/* * Copyright (C) 2008 Esmertec AG. * Copyright (C) 2008 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <stdio.h> #include <stdlib.h> #include <setjmp.h> #include <assert.h> #include "wbxml_parser.h" #include "csp13_data.h" #ifdef SUPPORT_SYNCML #include "syncml_data.h" #endif #ifdef PLATFORM_ANDROID extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb, size_t size, int (*compar)(const void *, const void *)); #endif #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) //#define WBXML_DEBUG 1 /* Major TODO items: - Attribute value tokens (not used by IMPS CSP) - EXT_* except EXT_T_0 (not used by IMPS CSP) - PI (not used by IMPS CSP) - cleanups Other TODO: - Support more public ID? Only IMPS is supported now. - Support other charsets than UTF-8 */ static int compareTokenData(const void * t1, const void * t2) { return ((TokenData *)t1)->token - ((TokenData *)t2)->token; } static int compareAttrData(const void * t1, const void * t2) { return ((AttrData *)t1)->token - ((AttrData *)t2)->token; } static bool isTagStart(int token) { if (token == TOKEN_SWITCH_PAGE) return true; token &= 0x3f; return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0); } static bool isAttrStart(int token) { return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0) || (token > TOKEN_LITERAL_C && token < 0x80); } WbxmlParser::WbxmlParser(uint32_t transportEncoding) : mTransportEncoding(transportEncoding) { reset(); } WbxmlParser::~WbxmlParser() { } void WbxmlParser::reset(void) { mContentHandler = NULL; mExternalChunk = NULL; mExternalChunkLen = 0; mLastChunk.clear(); mDataOffset = 0; mIsDataEnd = false; mStartElemStack.clear(); mStringTable.clear(); mCurrTagPage = mCurrAttrPage = 0; mPublicId = 0; mState = EXPECT_HEADER; mLastError = ERROR_NO_ERROR; } void WbxmlParser::setContentHandler(WbxmlContentHandler * handler) { mContentHandler = handler; } int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end) { if (data == NULL) { mLastError = ERROR_INVALID_DATA; return WBXML_STATUS_ERROR; } // All temporary C++ varaibles must be declared before setjmp to make // sure they get properly destructed after longjmp. vector<Attribute> attribs; Attribute attrib; string tagName; string characters; string opaque; #ifdef WBXML_DEBUG printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n", dataLen, end, getReadPos(), availDataSize()); #endif appendData(data, dataLen, end); volatile int readPos = getReadPos(); int setjmpRet; switch (setjmpRet = setjmp(mJmpbuf)) { case 0: break; case ERROR_NEED_MORE_DATA: if (!mIsDataEnd) { #ifdef WBXML_DEBUG printf("\nneed more data: readPos %d\n", readPos); #endif setReadPos(readPos); saveRemainingData(); return WBXML_STATUS_OK; } else { #ifdef WBXML_DEBUG printf("wbxml parser error: unexpected data end\n"); #endif mLastError = ERROR_NEED_MORE_DATA; return WBXML_STATUS_ERROR; } break; case ERROR_UNSUPPORTED_PUBID: case ERROR_UNSUPPORTED_CHARSET: case ERROR_INVALID_STRING_TABLE: case ERROR_INVALID_STRING_TABLE_REFERENCE: case ERROR_INVALID_EXT_TOKEN: case ERROR_INVALID_MBUINT: case ERROR_INVALID_ENTITY: case ERROR_UNRECOGNIZED_TAG: case ERROR_UNRECOGNIZED_ATTR: case ERROR_MISSING_ATTR: case ERROR_MISSING_TOKEN_END: #ifdef WBXML_DEBUG printf("wbxml parser error %d\n", setjmpRet); #endif mLastError = ParserError(setjmpRet); return WBXML_STATUS_ERROR; break; case ERROR_NOT_SUPPORTED_YET: printf("wbxml parser error: Not implemented feature.\n"); mLastError = ParserError(setjmpRet); return WBXML_STATUS_ERROR; break; default: printf("wbxml parser error: Impossible execution path.\n"); mLastError = ParserError(setjmpRet); return WBXML_STATUS_ERROR; break; } for (;;) { // save readPos for error recovery readPos = getReadPos(); switch (mState) { case EXPECT_HEADER: mDocVersion = readByte(); mPublicId = readMbuint32(); if (mPublicId != 0) { if (!selectTokenMapping(mPublicId)) { #ifdef WBXML_DEBUG printf("wbxml parser error: unsupported public id \n"); #endif longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID); } } else { mPublicId = -readMbuint32(); } mCharset = readMbuint32(); if (!mCharset) { mCharset = mTransportEncoding; if (!mCharset) { mCharset = CHARSET_UTF8; } } // TODO: support more charsets other than UTF-8 if (mCharset != CHARSET_UTF8) { #ifdef WBXML_DEBUG printf("wbxml parser error: unsupported charset\n"); #endif longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET); } // now advance to next state if (mContentHandler) { mContentHandler->handlePublicId(mPublicId); } mState = EXPECT_STRING_TABLE; break; case EXPECT_STRING_TABLE: { uint32_t len = readMbuint32(); if (availDataSize() < len) { longjmp(mJmpbuf, ERROR_NEED_MORE_DATA); } mStringTable.clear(); // TODO: optimize this while (len--) { mStringTable += readByte(); } if (mStringTable.size()) { if (mStringTable[mStringTable.size() - 1] != 0) { // must have an ending \0 //TODO:the byte array returned by SCTS does not contain '\0' at the //end,should this be fixed accordingly? #ifdef WBXML_DEBUG printf("wbxml parser error: invalid string table\n"); #endif longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE); } } mState = EXPECT_BODY_START; if (mPublicId <= 0) { const char * s = mStringTable.c_str() + (-mPublicId); #ifdef SUPPORT_SYNCML if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) { mPublicId = PUBLICID_SYNCML_1_2; } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) { mPublicId = PUBLICID_SYNCML_1_1; } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) { mPublicId = PUBLICID_SYNCML_1_0; } #endif if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) { longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID); } } break; } case EXPECT_BODY_START: //TODO: handle possible PIs mState = EXPECT_ELEMENT_START; break; case EXPECT_ELEMENT_START: { int stag = readByte(); const char * name; if ((stag & 0x3f) == TOKEN_LITERAL) { name = resolveStrTableRef(); } else { if (stag == TOKEN_SWITCH_PAGE) { mCurrTagPage = readByte(); stag = readByte(); } name = lookupTagName(stag); } if (name == NULL) { #ifdef WBXML_DEBUG printf("wbxml parser error: unrecognized tag\n"); #endif longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG); } attribs.clear(); if (stag & 0x80) { // followed by 1 or more attributes while (peekByte() != TOKEN_END) { readAttribute(&attrib); attribs.push_back(attrib); } if (!attribs.size()) { #ifdef WBXML_DEBUG printf("wbxml parser error: missing attributes\n"); #endif longjmp(mJmpbuf, ERROR_MISSING_ATTR); } // TOKEN_END readByte(); } if (mContentHandler) { mContentHandler->startElement(name, attribs); } if (stag & 0x40) { mState = EXPECT_CONTENT; } else { mState = ELEMENT_END; } tagName = name; mStartElemStack.push_back(name); break; } case EXPECT_CONTENT: { int byte = peekByte(); if (byte == TOKEN_SWITCH_PAGE) { readByte(); mCurrTagPage = readByte(); byte = peekByte(); } if (isTagStart(byte) || byte == TOKEN_END) { if (characters.size() && mContentHandler) { mContentHandler->characters(characters.c_str(), characters.size()); characters.clear(); } if (byte == TOKEN_END) { mState = EXPECT_ELEMENT_END; } else { mState = EXPECT_ELEMENT_START; } } else { // TODO: handle extension and pi switch (byte) { case TOKEN_ENTITY: case TOKEN_STR_I: case TOKEN_STR_T: readString(characters); break; case TOKEN_EXT_T_0: { readByte(); uint32_t valueToken = readMbuint32(); if (mPublicId == PUBLICID_IMPS_1_1 || mPublicId == PUBLICID_IMPS_1_2 || mPublicId == PUBLICID_IMPS_1_3) { TokenData t = {valueToken, NULL}; const TokenData * res = (TokenData *)bsearch(&t, csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens), sizeof(csp13ExtValueTokens[0]), compareTokenData); if (res) { characters.append(res->tagName); } else { longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN); } } else { printf ("Token 0x%x\n", byte); longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); } break; } case TOKEN_OPAQUE: { readByte(); uint32_t opaqueDataLen = readMbuint32(); opaque.clear(); while (opaqueDataLen--) { opaque += (char)readByte(); } if (mContentHandler) { mContentHandler->opaque(opaque.c_str(), opaque.size()); } break; } default: printf ("Token 0x%x\n", byte); longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); break; } } break; } case EXPECT_ELEMENT_END: if (readByte() != TOKEN_END) { #ifdef WBXML_DEBUG printf("wbxml parser error: TOKEN_END expected\n"); #endif longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END); } mState = ELEMENT_END; break; case ELEMENT_END: assert(!mStartElemStack.empty()); tagName = mStartElemStack.back(); mStartElemStack.pop_back(); if (mContentHandler) { mContentHandler->endElement(tagName.c_str()); } if (mStartElemStack.empty()) { mState = EXPECT_BODY_END; } else { mState = EXPECT_CONTENT; } break; case EXPECT_BODY_END: // TODO: handle possible PIs // we're done return WBXML_STATUS_OK; break; } } } /* * We don't make a copy of the data chunk for the current parse() until * it returns. * The remaining data will be saved in saveRemainingData() before parse() * returns. */ void WbxmlParser::appendData(const char * data, uint32_t len, bool end) { mExternalChunk = data; mExternalChunkLen = len; mIsDataEnd = end; } void WbxmlParser::saveRemainingData() { if (mDataOffset > mLastChunk.size()) { uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size(); assert(offsetToExtChunk <= mExternalChunkLen); mLastChunk.assign(mExternalChunk + offsetToExtChunk, mExternalChunkLen - offsetToExtChunk); mDataOffset = 0; } else { mLastChunk.append(mExternalChunk, mExternalChunkLen); } mExternalChunk = NULL; mExternalChunkLen = 0; } int WbxmlParser::readByte() { if (mDataOffset < mLastChunk.size()) { #ifdef WBXML_DEBUG printf ("rb 0x%x; ", (unsigned char)mLastChunk[mDataOffset]); #endif return (unsigned char)mLastChunk[mDataOffset++]; } else { uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size(); if (offsetToExtChunk < mExternalChunkLen) { mDataOffset++; #ifdef WBXML_DEBUG printf ("rb 0x%x; ", (unsigned char)mExternalChunk[offsetToExtChunk]); #endif return (unsigned char)mExternalChunk[offsetToExtChunk]; } longjmp(mJmpbuf, ERROR_NEED_MORE_DATA); } } int WbxmlParser::peekByte() { if (mDataOffset < mLastChunk.size()) { return (unsigned char)mLastChunk[mDataOffset]; } else { uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size(); if (offsetToExtChunk < mExternalChunkLen) { return (unsigned char)mExternalChunk[offsetToExtChunk]; } longjmp(mJmpbuf, ERROR_NEED_MORE_DATA); } } uint32_t WbxmlParser::readMbuint32() { uint32_t value = 0; uint32_t byte; do { if ((value >> 25) != 0) { // would go overflow. not a valid uint32. longjmp(mJmpbuf, ERROR_INVALID_MBUINT); } byte = readByte(); value = (value << 7) | (byte & 0x7f); } while (byte & 0x80); return value; } /** * Read STR_I | STR_T | ENTITY and *append* to str. * Yes this looks ugly... */ void WbxmlParser::readString(string & str) { int byte = readByte(); switch (byte) { case TOKEN_STR_I: //TODO: assuming UTF-8 while ((byte = readByte()) != 0) { str += (char)byte; } break; case TOKEN_ENTITY: { uint32_t ch = readMbuint32(); //TODO: assuming UTF-8 for now. if (ch <= 0x7f) { str += (char)ch; } else if (ch <= 0x7ff) { str += (char)((ch >> 6) | 0xc0); str += (char)((ch & 0x3f) | 0x80); } else if (ch <= 0xffff) { str += (char)((ch >> 12) | 0xe0); str += (char)(((ch >> 6) & 0x3f) | 0x80); str += (char)((ch & 0x3f) | 0x80); } else if (ch <= 0x10ffff) { // 010000 - 10FFFF str += (char)((ch >> 18) | 0xf0); str += (char)(((ch >> 12) & 0x3f) | 0x80); str += (char)(((ch >> 6) & 0x3f) | 0x80); str += (char)((ch & 0x3f) | 0x80); } else { // not a valid UCS-4 character longjmp(mJmpbuf, ERROR_INVALID_ENTITY); } break; } case TOKEN_STR_T: { const char * s = resolveStrTableRef(); str.append(s, strlen(s)); break; } default: // impossible printf ("Unknown token 0x%02x\n", byte); longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); break; } } const char * WbxmlParser::resolveStrTableRef(void) { uint32_t offset = readMbuint32(); if (offset >= mStringTable.size()) { longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE); } return mStringTable.c_str() + offset; } bool WbxmlParser::selectTokenMapping(int publicId) { switch (publicId) { case PUBLICID_IMPS_1_3: case PUBLICID_IMPS_1_2: case PUBLICID_IMPS_1_1: mTagPages = csp13TagPages; mNumTagPages = ARRAY_SIZE(csp13TagPages); mAttrPages = csp13AttrPages; mNumAttrPages = ARRAY_SIZE(csp13AttrPages); break; #ifdef SUPPORT_SYNCML case PUBLICID_SYNCML_1_0: case PUBLICID_SYNCML_1_1: case PUBLICID_SYNCML_1_2: case PUBLICID_SYNCML_METINF_1_2: mTagPages = syncmlTagPages; mNumTagPages = ARRAY_SIZE(syncmlTagPages); mAttrPages = NULL; mNumAttrPages = 0; break; case PUBLICID_SYNCML_DEVINF_1_2: mTagPages = syncmlDevInfTagPages; mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages); mAttrPages = NULL; mNumAttrPages = 0; break; #endif default: return false; } return true; } const char * WbxmlParser::lookupTagName(int tag) const { tag = tag & 0x3f; // TODO: optimize this if (mCurrTagPage >= mNumTagPages) { return NULL; } const TagCodePage * page = &mTagPages[mCurrTagPage]; if (page == NULL) { return NULL; } TokenData t = {tag, NULL}; const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens, sizeof(TokenData), compareTokenData); if (res) { return res->tagName; } return NULL; } const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const { // TODO: optimize this if (mCurrAttrPage >= mNumAttrPages) { return NULL; } const AttrCodePage * page = &mAttrPages[mCurrAttrPage]; if (page == NULL) { return NULL; } AttrData t = {token, NULL, NULL}; const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens, sizeof(AttrData), compareAttrData); if (res) { if (prefix) { *prefix = res->attrValuePrefix; } return res->attrName; } return NULL; } void WbxmlParser::readAttribute(Attribute * attrib) { // attribute start: attrib start token, LITERAL or END int attrStart = readByte(); const char * name; const char * valuePrefix = NULL; if (attrStart == TOKEN_LITERAL) { name = resolveStrTableRef(); } else { if (attrStart == TOKEN_SWITCH_PAGE) { mCurrAttrPage = readByte(); attrStart = readByte(); } name = lookupAttrName(attrStart, &valuePrefix); } if (name == NULL) { longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR); } attrib->name = name; attrib->value = ""; if (valuePrefix != NULL) { attrib->value = valuePrefix; } // now attribute value: zero or more value, string, entity or extension tokens for (;;) { int valueToken = peekByte(); if (isAttrStart(valueToken) || valueToken == TOKEN_END) { // An attribute start token, a LITERAL token or the END token // indicates the end of an attribute value. return; } switch (valueToken) { case TOKEN_ENTITY: case TOKEN_STR_I: case TOKEN_STR_T: readString(attrib->value); break; case TOKEN_EXT_I_0: case TOKEN_EXT_I_1: case TOKEN_EXT_I_2: case TOKEN_EXT_0: case TOKEN_EXT_1: case TOKEN_EXT_2: //TODO: document type specific printf ("Unsupported Token 0x%x\n", valueToken); longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); break; default: //TODO printf ("Unknown Token 0x%x\n", valueToken); longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); break; } } }