libwbxml/src/wbxml_parser.cpp

/*
 * Copyright (C) 2008 Esmertec AG.
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>
#include <assert.h>
#include "wbxml_parser.h"
#include "csp13_data.h"
#ifdef SUPPORT_SYNCML
#include "syncml_data.h"
#endif

// bsearch() is declared by hand for PLATFORM_ANDROID builds.
// NOTE(review): presumably the platform's <stdlib.h> did not provide this
// prototype when the code was written -- confirm whether it is still needed.
#ifdef PLATFORM_ANDROID
extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb,
        size_t size, int (*compar)(const void *, const void *));
#endif

// Number of elements in a true array (not a pointer). The argument is
// parenthesised so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(a)   (sizeof(a) / sizeof((a)[0]))

//#define WBXML_DEBUG 1

/* Major TODO items:
   - Attribute value tokens (not used by IMPS CSP)
   - EXT_* except EXT_T_0 (not used by IMPS CSP)
   - PI (not used by IMPS CSP)
   - cleanups

   Other TODO:
   - Support more public ID? Only IMPS is supported now.
   - Support other charsets than UTF-8
 */

static int compareTokenData(const void * t1, const void * t2)
{
    return ((TokenData *)t1)->token - ((TokenData *)t2)->token;
}

static int compareAttrData(const void * t1, const void * t2)
{
    return ((AttrData *)t1)->token - ((AttrData *)t2)->token;
}

/**
 * Tells whether a content byte begins a new element.
 *
 * A code-page switch counts as a tag start. Otherwise the two high bits
 * (tested by parse() as the "has attributes" 0x80 and "has content" 0x40
 * flags) are stripped and the remaining identity must lie in
 * [TOKEN_LITERAL, TOKEN_EXT_I_0).
 */
static bool isTagStart(int token)
{
    if (token != TOKEN_SWITCH_PAGE) {
        const int tagId = token & 0x3f;
        return (TOKEN_LITERAL <= tagId) && (tagId < TOKEN_EXT_I_0);
    }
    return true;
}

/**
 * Tells whether a byte inside an attribute list begins a new attribute.
 *
 * Two ranges qualify: [TOKEN_LITERAL, TOKEN_EXT_I_0) and the tokens strictly
 * between TOKEN_LITERAL_C and 0x80.
 */
static bool isAttrStart(int token)
{
    const bool inLowRange = (TOKEN_LITERAL <= token) && (token < TOKEN_EXT_I_0);
    const bool inHighRange = (TOKEN_LITERAL_C < token) && (token < 0x80);
    return inLowRange || inHighRange;
}

/**
 * Creates a parser.
 *
 * @param transportEncoding charset used by parse() when the document header
 *        carries a zero charset; if this is zero as well, parse() falls back
 *        to CHARSET_UTF8.
 */
WbxmlParser::WbxmlParser(uint32_t transportEncoding)
    : mTransportEncoding(transportEncoding)
{
    // Start out in the initial (EXPECT_HEADER) state.
    reset();
}

// The destructor body is empty: no explicit cleanup is performed here.
WbxmlParser::~WbxmlParser() {}

void WbxmlParser::reset(void)
{
    mContentHandler = NULL;

    mExternalChunk = NULL;
    mExternalChunkLen = 0;
    mLastChunk.clear();
    mDataOffset = 0;
    mIsDataEnd = false;

    mStartElemStack.clear();
    mStringTable.clear();

    mCurrTagPage = mCurrAttrPage = 0;
    mPublicId = 0;

    mState = EXPECT_HEADER;
    mLastError = ERROR_NO_ERROR;
}

/**
 * Registers the object that receives parse events.
 *
 * @param handler callback target; may be NULL, in which case parse() skips
 *        the callbacks. Note that reset() clears the handler again.
 */
void WbxmlParser::setContentHandler(WbxmlContentHandler * handler)
{
    this->mContentHandler = handler;
}

/**
 * Feeds one chunk of a WBXML document to the parser and runs the state
 * machine as far as the data allows, firing content-handler callbacks
 * (handlePublicId, startElement, characters, opaque, endElement) on the way.
 *
 * The chunk is not copied while parse() runs; bytes that could not be
 * consumed yet are kept via saveRemainingData() so the next call can resume.
 *
 * @param data    chunk to parse; must not be NULL.
 * @param dataLen number of bytes in data.
 * @param end     true when this is the last chunk of the document; running
 *                out of data is then an error instead of a request for more.
 * @return WBXML_STATUS_OK when the end of the body was reached or when more
 *         data is needed and end is false; WBXML_STATUS_ERROR otherwise, with
 *         the reason stored in mLastError.
 *
 * Errors raised by the helpers (readByte() etc.) arrive through
 * longjmp(mJmpbuf, <ParserError>) and are handled by the setjmp switch below.
 *
 * NOTE(review): text collected in the local `characters` is only delivered
 * when the next tag start or END token is seen. If a chunk ends between the
 * string token and that next token, the ERROR_NEED_MORE_DATA path returns and
 * the buffered text appears to be dropped -- confirm and, if so, keep the
 * pending text in a member.
 */
int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end)
{
    if (data == NULL) {
        mLastError = ERROR_INVALID_DATA;
        return WBXML_STATUS_ERROR;
    }

    // All temporary C++ variables must be declared before setjmp to make
    // sure they get properly destructed after longjmp.
    vector<Attribute> attribs;
    Attribute attrib;
    string tagName;
    string characters;
    string opaque;

#ifdef WBXML_DEBUG
    printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n",
        dataLen, end, getReadPos(), availDataSize());
#endif
    appendData(data, dataLen, end);
    // volatile: written after setjmp and read again in the longjmp handler.
    volatile int readPos = getReadPos();
    int setjmpRet;
    switch (setjmpRet = setjmp(mJmpbuf)) {
        case 0:
            break;

        case ERROR_NEED_MORE_DATA:
            if (!mIsDataEnd) {
#ifdef WBXML_DEBUG
                printf("\nneed more data: readPos %d\n", readPos);
#endif
                // Rewind to the start of the interrupted step and keep the
                // unconsumed bytes for the next call.
                setReadPos(readPos);
                saveRemainingData();
                return WBXML_STATUS_OK;
            } else {
#ifdef WBXML_DEBUG
                printf("wbxml parser error: unexpected data end\n");
#endif
                mLastError = ERROR_NEED_MORE_DATA;
                return WBXML_STATUS_ERROR;
            }
            break;

        case ERROR_UNSUPPORTED_PUBID:
        case ERROR_UNSUPPORTED_CHARSET:
        case ERROR_INVALID_STRING_TABLE:
        case ERROR_INVALID_STRING_TABLE_REFERENCE:
        case ERROR_INVALID_EXT_TOKEN:
        case ERROR_INVALID_MBUINT:
        case ERROR_INVALID_ENTITY:
        case ERROR_UNRECOGNIZED_TAG:
        case ERROR_UNRECOGNIZED_ATTR:
        case ERROR_MISSING_ATTR:
        case ERROR_MISSING_TOKEN_END:
#ifdef WBXML_DEBUG
            printf("wbxml parser error %d\n", setjmpRet);
#endif
            mLastError = ParserError(setjmpRet);
            return WBXML_STATUS_ERROR;
            break;

        case ERROR_NOT_SUPPORTED_YET:
            printf("wbxml parser error: Not implemented feature.\n");
            mLastError = ParserError(setjmpRet);
            return WBXML_STATUS_ERROR;
            break;

        default:
            printf("wbxml parser error: Impossible execution path.\n");
            mLastError = ParserError(setjmpRet);
            return WBXML_STATUS_ERROR;
            break;
    }

    for (;;) {
        // save readPos for error recovery
        readPos = getReadPos();

        switch (mState) {
            case EXPECT_HEADER:
                mDocVersion = readByte();

                mPublicId = readMbuint32();
                if (mPublicId != 0) {
                    if (!selectTokenMapping(mPublicId)) {
#ifdef WBXML_DEBUG
                        printf("wbxml parser error: unsupported public id \n");
#endif
                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                    }
                } else {
                    // The public id is given as a string table index. It is
                    // kept negated until the string table has been read.
                    uint32_t pubIdIndex = readMbuint32();
                    if (pubIdIndex > 0x7fffffff) {
                        // Negating such an index would wrap around into a
                        // positive value that looks like a well-known public
                        // id and would skip selectTokenMapping() entirely.
                        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
                    }
                    mPublicId = -static_cast<int>(pubIdIndex);
                }
                mCharset = readMbuint32();
                if (!mCharset) {
                    mCharset = mTransportEncoding;
                    if (!mCharset) {
                        mCharset = CHARSET_UTF8;
                    }
                }
                // TODO: support more charsets other than UTF-8
                if (mCharset != CHARSET_UTF8) {
#ifdef WBXML_DEBUG
                    printf("wbxml parser error: unsupported charset\n");
#endif
                    longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET);
                }

                // now advance to next state
                if (mContentHandler) {
                    mContentHandler->handlePublicId(mPublicId);
                }
                mState = EXPECT_STRING_TABLE;
                break;

            case EXPECT_STRING_TABLE:
            {
                uint32_t len = readMbuint32();
                if (availDataSize() < len) {
                    longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
                }
                mStringTable.clear();
                // len was just checked against the buffered data, so this
                // reservation is bounded by what is actually available.
                mStringTable.reserve(len);
                while (len--) {
                    mStringTable += readByte();
                }
                if (mStringTable.size()) {
                    if (mStringTable[mStringTable.size() - 1] != 0) {
                        // must have an ending \0
                        //TODO:the byte array returned by SCTS does not contain '\0' at the
                        //end,should this be fixed accordingly?
#ifdef WBXML_DEBUG
                        printf("wbxml parser error: invalid string table\n");
#endif
                        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE);
                    }
                }
                mState = EXPECT_BODY_START;
                if (mPublicId <= 0) {
                    // Resolve the public id that the header gave as a string
                    // table index. The index comes from the document, so it
                    // must be validated before it is used as an offset (same
                    // rule as resolveStrTableRef()).
                    uint32_t pubIdOffset = static_cast<uint32_t>(-mPublicId);
                    if (pubIdOffset >= mStringTable.size()) {
                        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
                    }
                    const char * s = mStringTable.c_str() + pubIdOffset;
#ifdef SUPPORT_SYNCML
                    if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) {
                        mPublicId = PUBLICID_SYNCML_1_2;
                    } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) {
                        mPublicId = PUBLICID_SYNCML_1_1;
                    } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) {
                        mPublicId = PUBLICID_SYNCML_1_0;
                    }
#endif
                    if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) {
                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                    }
                }
                break;
            }

            case EXPECT_BODY_START:
                //TODO: handle possible PIs
                mState = EXPECT_ELEMENT_START;
                break;

            case EXPECT_ELEMENT_START:
            {
                int stag = readByte();
                const char * name;
                if ((stag & 0x3f) == TOKEN_LITERAL) {
                    name = resolveStrTableRef();
                } else {
                    if (stag == TOKEN_SWITCH_PAGE) {
                        mCurrTagPage = readByte();
                        stag = readByte();
                    }
                    name = lookupTagName(stag);
                }
                if (name == NULL) {
#ifdef WBXML_DEBUG
                    printf("wbxml parser error: unrecognized tag\n");
#endif
                    longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG);
                }
                attribs.clear();
                if (stag & 0x80) {
                    // followed by 1 or more attributes
                    while (peekByte() != TOKEN_END) {
                        readAttribute(&attrib);
                        attribs.push_back(attrib);
                    }
                    if (!attribs.size()) {
#ifdef WBXML_DEBUG
                        printf("wbxml parser error: missing attributes\n");
#endif
                        longjmp(mJmpbuf, ERROR_MISSING_ATTR);
                    }
                    // TOKEN_END
                    readByte();
                }
                if (mContentHandler) {
                    mContentHandler->startElement(name, attribs);
                }
                if (stag & 0x40) {
                    mState = EXPECT_CONTENT;
                } else {
                    // Empty element: close it right away.
                    mState = ELEMENT_END;
                }
                mStartElemStack.push_back(name);
                break;
            }

            case EXPECT_CONTENT:
            {
                int byte = peekByte();
                if (byte == TOKEN_SWITCH_PAGE) {
                    readByte();
                    mCurrTagPage = readByte();
                    byte = peekByte();
                }
                if (isTagStart(byte) || byte == TOKEN_END) {
                    // A run of text ends here: deliver what was collected.
                    if (characters.size() && mContentHandler) {
                        mContentHandler->characters(characters.c_str(), characters.size());
                        characters.clear();
                    }
                    if (byte == TOKEN_END) {
                        mState = EXPECT_ELEMENT_END;
                    } else {
                        mState = EXPECT_ELEMENT_START;
                    }
                } else {
                    // TODO: handle extension and pi
                    switch (byte) {
                        case TOKEN_ENTITY:
                        case TOKEN_STR_I:
                        case TOKEN_STR_T:
                            readString(characters);
                            break;

                        case TOKEN_EXT_T_0:
                        {
                            readByte();
                            uint32_t valueToken = readMbuint32();
                            if (mPublicId == PUBLICID_IMPS_1_1
                                    || mPublicId == PUBLICID_IMPS_1_2
                                    || mPublicId == PUBLICID_IMPS_1_3) {
                                TokenData t = {valueToken, NULL};
                                const TokenData * res = (TokenData *)bsearch(&t,
                                        csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens),
                                        sizeof(csp13ExtValueTokens[0]), compareTokenData);
                                if (res) {
                                    characters.append(res->tagName);
                                } else {
                                    longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN);
                                }
                            } else {
                                printf ("Token 0x%x\n", byte);
                                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                            }
                            break;
                        }

                        case TOKEN_OPAQUE:
                        {
                            readByte();
                            uint32_t opaqueDataLen = readMbuint32();
                            if (availDataSize() < opaqueDataLen) {
                                // Same outcome as running dry inside
                                // readByte(), but without first copying a
                                // partial payload that is thrown away anyway.
                                longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
                            }
                            opaque.clear();
                            opaque.reserve(opaqueDataLen);
                            while (opaqueDataLen--) {
                                opaque += (char)readByte();
                            }
                            if (mContentHandler) {
                                mContentHandler->opaque(opaque.c_str(), opaque.size());
                            }
                            break;
                        }

                        default:
                            printf ("Token 0x%x\n", byte);
                            longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                            break;
                    }
                }
                break;
            }

            case EXPECT_ELEMENT_END:
                if (readByte() != TOKEN_END) {
#ifdef WBXML_DEBUG
                    printf("wbxml parser error: TOKEN_END expected\n");
#endif
                    longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END);
                }
                mState = ELEMENT_END;
                break;

            case ELEMENT_END:
                assert(!mStartElemStack.empty());

                tagName = mStartElemStack.back();
                mStartElemStack.pop_back();
                if (mContentHandler) {
                    mContentHandler->endElement(tagName.c_str());
                }
                if (mStartElemStack.empty()) {
                    // The root element has just been closed.
                    mState = EXPECT_BODY_END;
                } else {
                    mState = EXPECT_CONTENT;
                }
                break;

            case EXPECT_BODY_END:
                // TODO: handle possible PIs

                // we're done
                return WBXML_STATUS_OK;
                break;
        }
    }
}

/*
 * Registers the caller's buffer as the current external chunk.
 *
 * Only the pointer and the length are recorded here; no bytes are copied
 * while parse() is running. Whatever is left unconsumed is copied by
 * saveRemainingData() before parse() returns to ask for more data.
 */
void WbxmlParser::appendData(const char * data, uint32_t len, bool end)
{
    mIsDataEnd = end;
    mExternalChunkLen = len;
    mExternalChunk = data;
}

void WbxmlParser::saveRemainingData()
{
    if (mDataOffset > mLastChunk.size()) {
        uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
        assert(offsetToExtChunk <= mExternalChunkLen);
        mLastChunk.assign(mExternalChunk + offsetToExtChunk,
                mExternalChunkLen - offsetToExtChunk);
        mDataOffset = 0;
    } else {
        mLastChunk.append(mExternalChunk, mExternalChunkLen);
    }
    mExternalChunk = NULL;
    mExternalChunkLen = 0;
}

/**
 * Consumes and returns the next input byte (0..255).
 *
 * Bytes come from mLastChunk first and from the external chunk once the
 * read offset has moved past it. When neither has a byte left, control
 * leaves through longjmp(mJmpbuf, ERROR_NEED_MORE_DATA).
 */
int WbxmlParser::readByte()
{
    const size_t savedLen = mLastChunk.size();
    if (mDataOffset >= savedLen) {
        uint32_t extPos = mDataOffset - savedLen;
        if (extPos >= mExternalChunkLen) {
            longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
        }
        ++mDataOffset;
#ifdef WBXML_DEBUG
        printf ("rb 0x%x; ", (unsigned char)mExternalChunk[extPos]);
#endif
        return (unsigned char)mExternalChunk[extPos];
    }

#ifdef WBXML_DEBUG
    printf ("rb 0x%x; ", (unsigned char)mLastChunk[mDataOffset]);
#endif
    return (unsigned char)mLastChunk[mDataOffset++];
}

/**
 * Returns the next input byte (0..255) without consuming it.
 *
 * Looks in mLastChunk first, then in the external chunk. When no byte is
 * available, control leaves through longjmp(mJmpbuf, ERROR_NEED_MORE_DATA).
 */
int WbxmlParser::peekByte()
{
    const size_t savedLen = mLastChunk.size();
    if (mDataOffset >= savedLen) {
        uint32_t extPos = mDataOffset - savedLen;
        if (extPos >= mExternalChunkLen) {
            longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
        }
        return (unsigned char)mExternalChunk[extPos];
    }
    return (unsigned char)mLastChunk[mDataOffset];
}

/**
 * Reads a multi-byte unsigned integer: seven payload bits per byte, most
 * significant group first, with bit 0x80 set on every byte but the last.
 *
 * Leaves through longjmp(mJmpbuf, ERROR_INVALID_MBUINT) when another group
 * would not fit into 32 bits; readByte() may leave with
 * ERROR_NEED_MORE_DATA.
 */
uint32_t WbxmlParser::readMbuint32()
{
    uint32_t result = 0;
    for (;;) {
        // Any of the top seven bits set means the next shift would lose them.
        if ((result >> 25) != 0) {
            longjmp(mJmpbuf, ERROR_INVALID_MBUINT);
        }
        const uint32_t octet = readByte();
        result = (result << 7) | (octet & 0x7f);
        if ((octet & 0x80) == 0) {
            return result;
        }
    }
}

/**
 * Consumes one STR_I, STR_T or ENTITY token and *appends* its text to str.
 *
 *  - STR_I:  inline bytes up to (not including) the terminating zero byte.
 *  - STR_T:  the zero-terminated string the string table reference points at.
 *  - ENTITY: the character code, written out as UTF-8 (1 to 4 bytes); codes
 *            above 0x10ffff raise ERROR_INVALID_ENTITY.
 *
 * Any other token is reported and raises ERROR_NOT_SUPPORTED_YET.
 */
void WbxmlParser::readString(string & str)
{
    const int token = readByte();

    if (token == TOKEN_STR_I) {
        //TODO: assuming UTF-8
        for (int c = readByte(); c != 0; c = readByte()) {
            str += (char)c;
        }
        return;
    }

    if (token == TOKEN_STR_T) {
        str.append(resolveStrTableRef());
        return;
    }

    if (token != TOKEN_ENTITY) {
        // impossible
        printf ("Unknown token 0x%02x\n", token);
        longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
    }

    //TODO: assuming UTF-8 for now.
    const uint32_t ch = readMbuint32();
    if (ch > 0x10ffff) {
        // not a valid UCS-4 character
        longjmp(mJmpbuf, ERROR_INVALID_ENTITY);
    }
    if (ch <= 0x7f) {
        str += (char)ch;
        return;
    }

    // Multi-byte form: one lead byte followed by `trailing` continuation
    // bytes carrying six bits each.
    int trailing;
    uint32_t leadMark;
    if (ch <= 0x7ff) {
        trailing = 1;
        leadMark = 0xc0;
    } else if (ch <= 0xffff) {
        trailing = 2;
        leadMark = 0xe0;
    } else {
        // 010000 - 10FFFF
        trailing = 3;
        leadMark = 0xf0;
    }
    str += (char)((ch >> (6 * trailing)) | leadMark);
    while (trailing-- > 0) {
        str += (char)(((ch >> (6 * trailing)) & 0x3f) | 0x80);
    }
}

/**
 * Reads a string table offset from the input and returns a pointer to the
 * string stored there.
 *
 * The pointer refers to mStringTable's own buffer. An offset at or past the
 * end of the table raises ERROR_INVALID_STRING_TABLE_REFERENCE.
 */
const char * WbxmlParser::resolveStrTableRef(void)
{
    const uint32_t refOffset = readMbuint32();
    const bool insideTable = refOffset < mStringTable.size();
    if (!insideTable) {
        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
    }
    return mStringTable.c_str() + refOffset;
}

/**
 * Selects the tag and attribute code pages that belong to a public id.
 *
 * @param publicId document public identifier.
 * @return true when the id is supported and the tables were switched; false
 *         otherwise, in which case the current tables are left untouched.
 */
bool WbxmlParser::selectTokenMapping(int publicId)
{
    switch (publicId) {
        case PUBLICID_IMPS_1_1:
        case PUBLICID_IMPS_1_2:
        case PUBLICID_IMPS_1_3:
            // All IMPS versions share the CSP 1.3 tables.
            mTagPages = csp13TagPages;
            mNumTagPages = ARRAY_SIZE(csp13TagPages);
            mAttrPages = csp13AttrPages;
            mNumAttrPages = ARRAY_SIZE(csp13AttrPages);
            return true;

#ifdef SUPPORT_SYNCML
        case PUBLICID_SYNCML_1_0:
        case PUBLICID_SYNCML_1_1:
        case PUBLICID_SYNCML_1_2:
        case PUBLICID_SYNCML_METINF_1_2:
            // No attribute pages are installed for SyncML.
            mAttrPages = NULL;
            mNumAttrPages = 0;
            mTagPages = syncmlTagPages;
            mNumTagPages = ARRAY_SIZE(syncmlTagPages);
            return true;

        case PUBLICID_SYNCML_DEVINF_1_2:
            mAttrPages = NULL;
            mNumAttrPages = 0;
            mTagPages = syncmlDevInfTagPages;
            mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages);
            return true;
#endif

        default:
            break;
    }
    return false;
}

const char * WbxmlParser::lookupTagName(int tag) const
{
    tag = tag & 0x3f;

    // TODO: optimize this
    if (mCurrTagPage >= mNumTagPages) {
        return NULL;
    }
    const TagCodePage * page = &mTagPages[mCurrTagPage];
    if (page == NULL) {
        return NULL;
    }

    TokenData t = {tag, NULL};
    const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens,
            sizeof(TokenData), compareTokenData);
    if (res) {
        return res->tagName;
    }

    return NULL;
}

const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const
{
    // TODO: optimize this
    if (mCurrAttrPage >= mNumAttrPages) {
        return NULL;
    }
    const AttrCodePage * page = &mAttrPages[mCurrAttrPage];
    if (page == NULL) {
        return NULL;
    }

    AttrData t = {token, NULL, NULL};
    const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens,
            sizeof(AttrData), compareAttrData);
    if (res) {
        if (prefix) {
            *prefix = res->attrValuePrefix;
        }
        return res->attrName;
    }

    return NULL;
}

/**
 * Reads one attribute (start token plus value) into *attrib.
 *
 * The name comes from the string table for a LITERAL token and from the
 * current attribute code page otherwise (an optional page switch is
 * honoured first). The value starts with the prefix attached to the start
 * token, if any, and is extended by every following string/entity token.
 * Reading stops, without consuming anything, at the next attribute start or
 * at TOKEN_END.
 *
 * Raises ERROR_UNRECOGNIZED_ATTR for an unknown start token and
 * ERROR_NOT_SUPPORTED_YET for value tokens that are not handled.
 */
void WbxmlParser::readAttribute(Attribute * attrib)
{
    // attribute start: attrib start token, LITERAL or END
    int startToken = readByte();
    const char * prefix = NULL;
    const char * attrName = NULL;

    if (startToken != TOKEN_LITERAL) {
        if (startToken == TOKEN_SWITCH_PAGE) {
            mCurrAttrPage = readByte();
            startToken = readByte();
        }
        attrName = lookupAttrName(startToken, &prefix);
    } else {
        attrName = resolveStrTableRef();
    }
    if (attrName == NULL) {
        longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR);
    }

    attrib->name = attrName;
    attrib->value = (prefix != NULL) ? prefix : "";

    // now attribute value: zero or more value, string, entity or extension tokens
    for (;;) {
        const int next = peekByte();

        // An attribute start token, a LITERAL token or the END token
        // indicates the end of an attribute value.
        if (next == TOKEN_END || isAttrStart(next)) {
            return;
        }

        if (next == TOKEN_ENTITY || next == TOKEN_STR_I || next == TOKEN_STR_T) {
            readString(attrib->value);
            continue;
        }

        const bool isExtension =
                next == TOKEN_EXT_I_0 || next == TOKEN_EXT_I_1 || next == TOKEN_EXT_I_2 ||
                next == TOKEN_EXT_0 || next == TOKEN_EXT_1 || next == TOKEN_EXT_2;
        if (isExtension) {
            //TODO: document type specific
            printf ("Unsupported Token 0x%x\n", next);
        } else {
            //TODO
            printf ("Unknown Token 0x%x\n", next);
        }
        longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
    }
}