#include <string> #if !defined (STLPORT) || !defined (_STLP_USE_NO_IOSTREAMS) # include <fstream> # include <locale> # include <stdexcept> # include <cstdio> // for WEOF # include "cppunit/cppunit_proxy.h" # if !defined (STLPORT) || defined(_STLP_USE_NAMESPACES) using namespace std; # endif // // TestCase class // class CodecvtTest : public CPPUNIT_NS::TestCase { CPPUNIT_TEST_SUITE(CodecvtTest); #if defined (STLPORT) && defined (_STLP_NO_MEMBER_TEMPLATES) CPPUNIT_IGNORE; #endif CPPUNIT_TEST(variable_encoding); CPPUNIT_STOP_IGNORE; #if defined (STLPORT) && (defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS)) CPPUNIT_IGNORE; #endif CPPUNIT_TEST(in_out_test); CPPUNIT_TEST(length_test); CPPUNIT_TEST(imbue_while_reading); CPPUNIT_TEST(special_encodings); CPPUNIT_TEST_SUITE_END(); protected: void variable_encoding(); void in_out_test(); void length_test(); void imbue_while_reading(); void special_encodings(); }; CPPUNIT_TEST_SUITE_REGISTRATION(CodecvtTest); #if defined (STLPORT) # define __NO_THROW _STLP_NOTHROW #else # define __NO_THROW throw() #endif /* Codecvt facet eating some characters from the external buffer. * Transform '01' in 'a' */ struct eater_codecvt : public codecvt<char, char, mbstate_t> { typedef codecvt<char,char,mbstate_t> base; explicit eater_codecvt(size_t refs = 0) : base(refs) {} // primitive conversion virtual base::result do_in(mbstate_t& mb, const char* ebegin, const char* eend, const char*& ecur, char* ibegin, char* iend, char*& icur) const __NO_THROW { char *state = (char*)&mb; ecur = ebegin; icur = ibegin; while (ecur != eend) { if (icur == iend) return partial; if (*ecur == '0' || *state == 1) { if (*state != 1) { ++ecur; } if (ecur == eend) { *state = 1; return ok; } if (*ecur == '1') { *icur = 'a'; } else { *(icur++) = '0'; if (icur == iend) { if (*state != 1) { --ecur; } return partial; } *icur = *ecur; } } else { *icur = *ecur; } *state = 0; ++icur; ++ecur; } return ok; } // claim it's not a null-conversion virtual bool do_always_noconv() const __NO_THROW { return false; } // claim it doesn't have a fixed-length encoding virtual int do_encoding() const __NO_THROW { return 0; } // implemented for consistency with do_in overload virtual int do_length(mbstate_t &state, const char *efrom, const char *eend, size_t m) const { char *ibegin = new char[m]; const char *ecur = efrom; char *icur = ibegin; mbstate_t tmp = state; do_in(tmp, efrom, eend, ecur, ibegin, ibegin + m, icur); delete[] ibegin; return ecur - efrom; } virtual int do_max_length() const __NO_THROW { return 2; } #ifdef __DMC__ static locale::id id; #endif }; #ifdef __DMC__ locale::id eater_codecvt::id; locale::id& _GetFacetId(const eater_codecvt*) { return eater_codecvt::id; } #endif /* Codecvt facet generating more characters than the ones read from the * external buffer, transform '01' in 'abc' * This kind of facet do not allow systematical positionning in the external * buffer (tellg -> -1), when you just read a 'a' you are at an undefined * external buffer position. */ struct generator_codecvt : public codecvt<char, char, mbstate_t> { typedef codecvt<char,char,mbstate_t> base; explicit generator_codecvt(size_t refs = 0) : base(refs) {} // primitive conversion virtual base::result do_in(mbstate_t& mb, const char* ebegin, const char* eend, const char*& ecur, char* ibegin, char* iend, char*& icur) const __NO_THROW { //Access the mbstate information in a portable way: char *state = (char*)&mb; ecur = ebegin; icur = ibegin; if (icur == iend) return ok; if (*state == 2) { *(icur++) = 'b'; if (icur == iend) { *state = 3; return ok; } *(icur++) = 'c'; *state = 0; } else if (*state == 3) { *(icur++) = 'c'; *state = 0; } while (ecur != eend) { if (icur == iend) return ok; if (*ecur == '0' || *state == 1) { if (*state != 1) { ++ecur; } if (ecur == eend) { *state = 1; return partial; } if (*ecur == '1') { *(icur++) = 'a'; if (icur == iend) { *state = 2; return ok; } *(icur++) = 'b'; if (icur == iend) { *state = 3; return ok; } *icur = 'c'; } else { *(icur++) = '0'; if (icur == iend) { if (*state != 1) { --ecur; } return ok; } *icur = *ecur; } } else { *icur = *ecur; } *state = 0; ++icur; ++ecur; } return ok; } // claim it's not a null-conversion virtual bool do_always_noconv() const __NO_THROW { return false; } // claim it doesn't have a fixed-length encoding virtual int do_encoding() const __NO_THROW { return 0; } // implemented for consistency with do_in overload virtual int do_length(mbstate_t &mb, const char *efrom, const char *eend, size_t m) const { const char *state = (const char*)&mb; int offset = 0; if (*state == 2) offset = 2; else if (*state == 3) offset = 1; char *ibegin = new char[m + offset]; const char *ecur = efrom; char *icur = ibegin; mbstate_t tmpState = mb; do_in(tmpState, efrom, eend, ecur, ibegin, ibegin + m + offset, icur); /* char *state = (char*)&tmpState; if (*state != 0) { if (*state == 1) --ecur; else if (*state == 2 || *state == 3) { //Undefined position, we return -1: ecur = efrom - 1; } } else { if (*((char*)&mb) != 0) { //We take into account the character that hasn't been counted yet in //the previous decoding step: ecur++; } } */ delete[] ibegin; return (int)min((size_t)(ecur - efrom), m); } virtual int do_max_length() const __NO_THROW { return 0; } #ifdef __DMC__ static locale::id id; #endif }; #ifdef __DMC__ locale::id generator_codecvt::id; locale::id& _GetFacetId(const generator_codecvt*) { return generator_codecvt::id; } #endif // // tests implementation // #include <iostream> void CodecvtTest::variable_encoding() { #if !defined (STLPORT) || !defined (_STLP_NO_MEMBER_TEMPLATES) //We first generate the file used for test: const char* fileName = "test_file.txt"; { ofstream ostr(fileName); //Maybe we simply do not have write access to repository CPPUNIT_ASSERT( ostr.good() ); for (int i = 0; i < 2048; ++i) { ostr << "0123456789"; } CPPUNIT_ASSERT( ostr.good() ); } { ifstream istr(fileName); CPPUNIT_ASSERT( istr.good() ); CPPUNIT_ASSERT( !istr.eof() ); eater_codecvt codec(1); locale loc(locale::classic(), &codec); istr.imbue(loc); CPPUNIT_ASSERT( istr.good() ); CPPUNIT_ASSERT( (int)istr.tellg() == 0 ); int theoricalPos = 0; do { int c = istr.get(); if (char_traits<char>::eq_int_type(c, char_traits<char>::eof())) { break; } ++theoricalPos; if (c == 'a') { ++theoricalPos; } CPPUNIT_ASSERT( (int)istr.tellg() == theoricalPos ); } while (!istr.eof()); cerr << "out!\n"; CPPUNIT_ASSERT( istr.eof() ); cerr << "fin!\n"; } # if 0 /* This test is broken, not sure if it is really possible to get a position in * a locale having a codecvt such as generator_codecvt. Maybe generator_codecvt * is not a valid theorical example of codecvt implementation. */ { ifstream istr(fileName); CPPUNIT_ASSERT( istr.good() ); CPPUNIT_ASSERT( !istr.eof() ); generator_codecvt codec(1); locale loc(locale::classic(), &codec); istr.imbue(loc); CPPUNIT_ASSERT( istr.good() ); CPPUNIT_ASSERT( (int)istr.tellg() == 0 ); int theoricalPos = 0; int theoricalTellg; do { char c = istr.get(); if (c == char_traits<char>::eof()) { break; } switch (c) { case 'a': case 'b': theoricalTellg = -1; break; case 'c': ++theoricalPos; default: ++theoricalPos; theoricalTellg = theoricalPos; break; } if ((int)istr.tellg() != theoricalTellg) { CPPUNIT_ASSERT( (int)istr.tellg() == theoricalTellg ); } } while (!istr.eof()); CPPUNIT_ASSERT( istr.eof() ); } # endif #endif } void CodecvtTest::in_out_test() { #if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS)) try { locale loc(""); typedef codecvt<wchar_t, char, mbstate_t> cdecvt_type; if (has_facet<cdecvt_type>(loc)) { cdecvt_type const& cdect = use_facet<cdecvt_type>(loc); { cdecvt_type::state_type state; memset(&state, 0, sizeof(cdecvt_type::state_type)); string from("abcdef"); const char* next_from; wchar_t to[1]; wchar_t *next_to; cdecvt_type::result res = cdect.in(state, from.data(), from.data() + from.size(), next_from, to, to + sizeof(to) / sizeof(wchar_t), next_to); CPPUNIT_ASSERT( res == cdecvt_type::ok ); CPPUNIT_ASSERT( next_from == from.data() + 1 ); CPPUNIT_ASSERT( next_to == &to[0] + 1 ); CPPUNIT_ASSERT( to[0] == L'a'); } { cdecvt_type::state_type state; memset(&state, 0, sizeof(cdecvt_type::state_type)); wstring from(L"abcdef"); const wchar_t* next_from; char to[1]; char *next_to; cdecvt_type::result res = cdect.out(state, from.data(), from.data() + from.size(), next_from, to, to + sizeof(to) / sizeof(char), next_to); CPPUNIT_ASSERT( res == cdecvt_type::ok ); CPPUNIT_ASSERT( next_from == from.data() + 1 ); CPPUNIT_ASSERT( next_to == &to[0] + 1 ); CPPUNIT_ASSERT( to[0] == 'a'); } } } catch (runtime_error const&) { } catch (...) { CPPUNIT_FAIL; } #endif } void CodecvtTest::length_test() { #if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS)) try { locale loc(""); typedef codecvt<wchar_t, char, mbstate_t> cdecvt_type; if (has_facet<cdecvt_type>(loc)) { cdecvt_type const& cdect = use_facet<cdecvt_type>(loc); { cdecvt_type::state_type state; memset(&state, 0, sizeof(cdecvt_type::state_type)); string from("abcdef"); int res = cdect.length(state, from.data(), from.data() + from.size(), from.size()); CPPUNIT_ASSERT( (size_t)res == from.size() ); } } } catch (runtime_error const&) { } catch (...) { CPPUNIT_FAIL; } #endif } #if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS)) typedef std::codecvt<wchar_t, char, mbstate_t> my_codecvt_base; class my_codecvt : public my_codecvt_base { public: explicit my_codecvt(size_t r = 0) : my_codecvt_base(r) {} protected: virtual result do_in(state_type& /*state*/, const extern_type* first1, const extern_type* last1, const extern_type*& next1, intern_type* first2, intern_type* last2, intern_type*& next2) const { for ( next1 = first1, next2 = first2; next1 < last1; next1 += 2 ) { if ( (last1 - next1) < 2 || (last2 - next2) < 1 ) return partial; *next2++ = (intern_type)((*(next1 + 1) << 8) | (*next1 & 255)); } return ok; } virtual bool do_always_noconv() const __NO_THROW { return false; } virtual int do_max_length() const __NO_THROW { return 2; } virtual int do_encoding() const __NO_THROW { return 2; } }; #endif void CodecvtTest::imbue_while_reading() { #if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS)) { wofstream ofs( "test.txt" ); const wchar_t buf[] = L" "; for ( int i = 0; i < 4098; ++i ) { ofs << buf[0]; } } wifstream ifs("test.txt"); // a file containing 4098 wchars ifs.imbue( locale(locale(), new my_codecvt) ); ifs.get(); ifs.seekg(0); ifs.imbue( locale() ); ifs.ignore(4096); int ch = ifs.get(); CPPUNIT_CHECK( ch != (int)WEOF ); #endif } void CodecvtTest::special_encodings() { #if !defined (STLPORT) || (!defined (_STLP_NO_WCHAR_T) && defined (_STLP_USE_EXCEPTIONS)) { locale loc(locale::classic(), new codecvt_byname<wchar_t, char, mbstate_t>("C")); codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc); mbstate_t state; memset(&state, 0, sizeof(mbstate_t)); char c = '0'; const char *from_next; wchar_t wc; wchar_t *to_next; CPPUNIT_ASSERT( cvt.in(state, &c, &c + 1, from_next, &wc, &wc, to_next) == codecvt_base::ok ); CPPUNIT_ASSERT( to_next == &wc ); CPPUNIT_ASSERT( cvt.in(state, &c, &c + 1, from_next, &wc, &wc + 1, to_next) == codecvt_base::ok ); CPPUNIT_ASSERT( wc == L'0' ); CPPUNIT_ASSERT( to_next == &wc + 1 ); } try { wstring cp936_wstr; const string cp936_str = "\xd6\xd0\xb9\xfa\xc9\xe7\xbb\xe1\xbf\xc6\xd1\xa7\xd4\xba\xb7\xa2\xb2\xbc\x32\x30\x30\x38\xc4\xea\xa1\xb6\xbe\xad\xbc\xc3\xc0\xb6\xc6\xa4\xca\xe9\xa1\xb7\xd6\xb8\xb3\xf6\xa3\xac\x32\x30\x30\x37\xc4\xea\xd6\xd0\xb9\xfa\xbe\xad\xbc\xc3\xd4\xf6\xb3\xa4\xd3\xc9\xc6\xab\xbf\xec\xd7\xaa\xcf\xf2\xb9\xfd\xc8\xc8\xb5\xc4\xc7\xf7\xca\xc6\xc3\xf7\xcf\xd4\xd4\xa4\xbc\xc6\xc8\xab\xc4\xea\x47\x44\x50\xd4\xf6\xcb\xd9\xbd\xab\xb4\xef\x31\x31\x2e\x36\x25\xa1\xa3"; locale loc(locale::classic(), ".936", locale::ctype); codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc); mbstate_t state; memset(&state, 0, sizeof(mbstate_t)); codecvt_base::result res; { wchar_t wbuf[4096]; // Check we will have enough room for the generated wide string generated from the whole char buffer: int len = cvt.length(state, cp936_str.data(), cp936_str.data() + cp936_str.size(), sizeof(wbuf) / sizeof(wchar_t)); CPPUNIT_ASSERT( cp936_str.size() == (size_t)len ); const char *from_next; wchar_t *to_next; res = cvt.in(state, cp936_str.data(), cp936_str.data() + cp936_str.size(), from_next, wbuf, wbuf + sizeof(wbuf) / sizeof(wchar_t), to_next); CPPUNIT_ASSERT( res == codecvt_base::ok ); CPPUNIT_ASSERT( from_next == cp936_str.data() + cp936_str.size() ); cp936_wstr.assign(wbuf, to_next); } { const wchar_t *from_next; char buf[4096]; char *to_next; res = cvt.out(state, cp936_wstr.data(), cp936_wstr.data() + cp936_wstr.size(), from_next, buf, buf + sizeof(buf), to_next); CPPUNIT_ASSERT( res == codecvt_base::ok ); CPPUNIT_CHECK( string(buf, to_next) == cp936_str ); } } catch (const runtime_error&) { CPPUNIT_MESSAGE("Not enough platform localization support to check 936 code page encoding."); } try { const string utf8_str = "\xe4\xb8\xad\xe5\x9b\xbd\xe7\xa4\xbe\xe4\xbc\x9a\xe7\xa7\x91\xe5\xad\xa6\xe9\x99\xa2\xe5\x8f\x91\xe5\xb8\x83\x32\x30\x30\x38\xe5\xb9\xb4\xe3\x80\x8a\xe7\xbb\x8f\xe6\xb5\x8e\xe8\x93\x9d\xe7\x9a\xae\xe4\xb9\xa6\xe3\x80\x8b\xe6\x8c\x87\xe5\x87\xba\xef\xbc\x8c\x32\x30\x30\x37\xe5\xb9\xb4\xe4\xb8\xad\xe5\x9b\xbd\xe7\xbb\x8f\xe6\xb5\x8e\xe5\xa2\x9e\xe9\x95\xbf\xe7\x94\xb1\xe5\x81\x8f\xe5\xbf\xab\xe8\xbd\xac\xe5\x90\x91\xe8\xbf\x87\xe7\x83\xad\xe7\x9a\x84\xe8\xb6\x8b\xe5\x8a\xbf\xe6\x98\x8e\xe6\x98\xbe\xe9\xa2\x84\xe8\xae\xa1\xe5\x85\xa8\xe5\xb9\xb4\x47\x44\x50\xe5\xa2\x9e\xe9\x80\x9f\xe5\xb0\x86\xe8\xbe\xbe\x31\x31\x2e\x36\x25\xe3\x80\x82"; wstring utf8_wstr; locale loc(locale::classic(), new codecvt_byname<wchar_t, char, mbstate_t>(".utf8")); codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc); mbstate_t state; memset(&state, 0, sizeof(mbstate_t)); codecvt_base::result res; { wchar_t wbuf[4096]; // Check we will have enough room for the wide string generated from the whole char buffer: int len = cvt.length(state, utf8_str.data(), utf8_str.data() + utf8_str.size(), sizeof(wbuf) / sizeof(wchar_t)); CPPUNIT_ASSERT( utf8_str.size() == (size_t)len ); const char *from_next; wchar_t *to_next; res = cvt.in(state, utf8_str.data(), utf8_str.data() + utf8_str.size(), from_next, wbuf, wbuf + sizeof(wbuf) / sizeof(wchar_t), to_next); CPPUNIT_ASSERT( res == codecvt_base::ok ); CPPUNIT_ASSERT( from_next == utf8_str.data() + utf8_str.size() ); utf8_wstr.assign(wbuf, to_next); // Try to read one char after the other: wchar_t wc; const char* from = utf8_str.data(); const char* from_end = from + utf8_str.size(); from_next = utf8_str.data(); size_t length = 1; size_t windex = 0; while (from + length <= from_end) { res = cvt.in(state, from, from + length, from_next, &wc, &wc + 1, to_next); switch (res) { case codecvt_base::ok: // reset length: from = from_next; length = 1; CPPUNIT_ASSERT( wc == utf8_wstr[windex++] ); wc = 0; break; case codecvt_base::partial: if (from_next == from) // from_next hasn't move so we have to pass more chars ++length; else // char between from and from_next has been eaten, we simply restart // conversion from from_next: from = from_next; continue; case codecvt_base::error: case codecvt_base::noconv: CPPUNIT_FAIL; //break; } } CPPUNIT_ASSERT( windex == utf8_wstr.size() ); } { const wchar_t *from_next; char buf[4096]; char *to_next; res = cvt.out(state, utf8_wstr.data(), utf8_wstr.data() + utf8_wstr.size(), from_next, buf, buf + sizeof(buf), to_next); CPPUNIT_ASSERT( res == codecvt_base::ok ); CPPUNIT_CHECK( string(buf, to_next) == utf8_str ); } { // Check that an obviously wrong UTF8 encoded string is correctly detected: const string bad_utf8_str("\xdf\xdf\xdf\xdf\xdf"); wchar_t wc; const char *from_next; wchar_t *to_next; res = cvt.in(state, bad_utf8_str.data(), bad_utf8_str.data() + bad_utf8_str.size(), from_next, &wc, &wc + 1, to_next); CPPUNIT_ASSERT( res == codecvt_base::error ); } } catch (const runtime_error&) { CPPUNIT_MESSAGE("Not enough platform localization support to check UTF8 encoding."); } #endif } #endif