1 /* 2 * Copyright (C) 2008 Esmertec AG. 3 * Copyright (C) 2008 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 #include <stdio.h> 19 #include <stdlib.h> 20 #include <setjmp.h> 21 #include <assert.h> 22 #include "wbxml_parser.h" 23 #include "csp13_data.h" 24 #ifdef SUPPORT_SYNCML 25 #include "syncml_data.h" 26 #endif 27 28 #ifdef PLATFORM_ANDROID 29 extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb, 30 size_t size, int (*compar)(const void *, const void *)); 31 #endif 32 33 #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) 34 35 //#define WBXML_DEBUG 1 36 37 /* Major TODO items: 38 - Attribute value tokens (not used by IMPS CSP) 39 - EXT_* except EXT_T_0 (not used by IMPS CSP) 40 - PI (not used by IMPS CSP) 41 - cleanups 42 43 Other TODO: 44 - Support more public ID? Only IMPS is supported now. 45 - Support other charsets than UTF-8 46 */ 47 48 static int compareTokenData(const void * t1, const void * t2) 49 { 50 return ((TokenData *)t1)->token - ((TokenData *)t2)->token; 51 } 52 53 static int compareAttrData(const void * t1, const void * t2) 54 { 55 return ((AttrData *)t1)->token - ((AttrData *)t2)->token; 56 } 57 58 static bool isTagStart(int token) 59 { 60 if (token == TOKEN_SWITCH_PAGE) 61 return true; 62 63 token &= 0x3f; 64 return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0); 65 } 66 67 static bool isAttrStart(int token) 68 { 69 return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0) || 70 (token > TOKEN_LITERAL_C && token < 0x80); 71 } 72 73 WbxmlParser::WbxmlParser(uint32_t transportEncoding) : 74 mTransportEncoding(transportEncoding) 75 { 76 reset(); 77 } 78 79 WbxmlParser::~WbxmlParser() 80 { 81 } 82 83 void WbxmlParser::reset(void) 84 { 85 mContentHandler = NULL; 86 87 mExternalChunk = NULL; 88 mExternalChunkLen = 0; 89 mLastChunk.clear(); 90 mDataOffset = 0; 91 mIsDataEnd = false; 92 93 mStartElemStack.clear(); 94 mStringTable.clear(); 95 96 mCurrTagPage = mCurrAttrPage = 0; 97 mPublicId = 0; 98 99 mState = EXPECT_HEADER; 100 mLastError = ERROR_NO_ERROR; 101 } 102 103 void WbxmlParser::setContentHandler(WbxmlContentHandler * handler) 104 { 105 mContentHandler = handler; 106 } 107 108 int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end) 109 { 110 if (data == NULL) { 111 mLastError = ERROR_INVALID_DATA; 112 return WBXML_STATUS_ERROR; 113 } 114 115 // All temporary C++ varaibles must be declared before setjmp to make 116 // sure they get properly destructed after longjmp. 117 vector<Attribute> attribs; 118 Attribute attrib; 119 string tagName; 120 string characters; 121 string opaque; 122 123 #ifdef WBXML_DEBUG 124 printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n", 125 dataLen, end, getReadPos(), availDataSize()); 126 #endif 127 appendData(data, dataLen, end); 128 volatile int readPos = getReadPos(); 129 int setjmpRet; 130 switch (setjmpRet = setjmp(mJmpbuf)) { 131 case 0: 132 break; 133 134 case ERROR_NEED_MORE_DATA: 135 if (!mIsDataEnd) { 136 #ifdef WBXML_DEBUG 137 printf("\nneed more data: readPos %d\n", readPos); 138 #endif 139 setReadPos(readPos); 140 saveRemainingData(); 141 return WBXML_STATUS_OK; 142 } else { 143 #ifdef WBXML_DEBUG 144 printf("wbxml parser error: unexpected data end\n"); 145 #endif 146 mLastError = ERROR_NEED_MORE_DATA; 147 return WBXML_STATUS_ERROR; 148 } 149 break; 150 151 case ERROR_UNSUPPORTED_PUBID: 152 case ERROR_UNSUPPORTED_CHARSET: 153 case ERROR_INVALID_STRING_TABLE: 154 case ERROR_INVALID_STRING_TABLE_REFERENCE: 155 case ERROR_INVALID_EXT_TOKEN: 156 case ERROR_INVALID_MBUINT: 157 case ERROR_INVALID_ENTITY: 158 case ERROR_UNRECOGNIZED_TAG: 159 case ERROR_UNRECOGNIZED_ATTR: 160 case ERROR_MISSING_ATTR: 161 case ERROR_MISSING_TOKEN_END: 162 #ifdef WBXML_DEBUG 163 printf("wbxml parser error %d\n", setjmpRet); 164 #endif 165 mLastError = ParserError(setjmpRet); 166 return WBXML_STATUS_ERROR; 167 break; 168 169 case ERROR_NOT_SUPPORTED_YET: 170 printf("wbxml parser error: Not implemented feature.\n"); 171 mLastError = ParserError(setjmpRet); 172 return WBXML_STATUS_ERROR; 173 break; 174 175 default: 176 printf("wbxml parser error: Impossible execution path.\n"); 177 mLastError = ParserError(setjmpRet); 178 return WBXML_STATUS_ERROR; 179 break; 180 } 181 182 for (;;) { 183 // save readPos for error recovery 184 readPos = getReadPos(); 185 186 switch (mState) { 187 case EXPECT_HEADER: 188 mDocVersion = readByte(); 189 190 mPublicId = readMbuint32(); 191 if (mPublicId != 0) { 192 if (!selectTokenMapping(mPublicId)) { 193 #ifdef WBXML_DEBUG 194 printf("wbxml parser error: unsupported public id \n"); 195 #endif 196 longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID); 197 } 198 } else { 199 mPublicId = -readMbuint32(); 200 } 201 mCharset = readMbuint32(); 202 if (!mCharset) { 203 mCharset = mTransportEncoding; 204 if (!mCharset) { 205 mCharset = CHARSET_UTF8; 206 } 207 } 208 // TODO: support more charsets other than UTF-8 209 if (mCharset != CHARSET_UTF8) { 210 #ifdef WBXML_DEBUG 211 printf("wbxml parser error: unsupported charset\n"); 212 #endif 213 longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET); 214 } 215 216 // now advance to next state 217 if (mContentHandler) { 218 mContentHandler->handlePublicId(mPublicId); 219 } 220 mState = EXPECT_STRING_TABLE; 221 break; 222 223 case EXPECT_STRING_TABLE: 224 { 225 uint32_t len = readMbuint32(); 226 if (availDataSize() < len) { 227 longjmp(mJmpbuf, ERROR_NEED_MORE_DATA); 228 } 229 mStringTable.clear(); 230 // TODO: optimize this 231 while (len--) { 232 mStringTable += readByte(); 233 } 234 if (mStringTable.size()) { 235 if (mStringTable[mStringTable.size() - 1] != 0) { 236 // must have an ending \0 237 //TODO:the byte array returned by SCTS does not contain '\0' at the 238 //end,should this be fixed accordingly? 239 #ifdef WBXML_DEBUG 240 printf("wbxml parser error: invalid string table\n"); 241 #endif 242 longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE); 243 } 244 } 245 mState = EXPECT_BODY_START; 246 if (mPublicId <= 0) { 247 const char * s = mStringTable.c_str() + (-mPublicId); 248 #ifdef SUPPORT_SYNCML 249 if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) { 250 mPublicId = PUBLICID_SYNCML_1_2; 251 } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) { 252 mPublicId = PUBLICID_SYNCML_1_1; 253 } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) { 254 mPublicId = PUBLICID_SYNCML_1_0; 255 } 256 #endif 257 if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) { 258 longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID); 259 } 260 } 261 break; 262 } 263 264 case EXPECT_BODY_START: 265 //TODO: handle possible PIs 266 mState = EXPECT_ELEMENT_START; 267 break; 268 269 case EXPECT_ELEMENT_START: 270 { 271 int stag = readByte(); 272 const char * name; 273 if ((stag & 0x3f) == TOKEN_LITERAL) { 274 name = resolveStrTableRef(); 275 } else { 276 if (stag == TOKEN_SWITCH_PAGE) { 277 mCurrTagPage = readByte(); 278 stag = readByte(); 279 } 280 name = lookupTagName(stag); 281 } 282 if (name == NULL) { 283 #ifdef WBXML_DEBUG 284 printf("wbxml parser error: unrecognized tag\n"); 285 #endif 286 longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG); 287 } 288 attribs.clear(); 289 if (stag & 0x80) { 290 // followed by 1 or more attributes 291 while (peekByte() != TOKEN_END) { 292 readAttribute(&attrib); 293 attribs.push_back(attrib); 294 } 295 if (!attribs.size()) { 296 #ifdef WBXML_DEBUG 297 printf("wbxml parser error: missing attributes\n"); 298 #endif 299 longjmp(mJmpbuf, ERROR_MISSING_ATTR); 300 } 301 // TOKEN_END 302 readByte(); 303 } 304 if (mContentHandler) { 305 mContentHandler->startElement(name, attribs); 306 } 307 if (stag & 0x40) { 308 mState = EXPECT_CONTENT; 309 } else { 310 mState = ELEMENT_END; 311 } 312 tagName = name; 313 mStartElemStack.push_back(name); 314 break; 315 } 316 317 case EXPECT_CONTENT: 318 { 319 int byte = peekByte(); 320 if (byte == TOKEN_SWITCH_PAGE) { 321 readByte(); 322 mCurrTagPage = readByte(); 323 byte = peekByte(); 324 } 325 if (isTagStart(byte) || byte == TOKEN_END) { 326 if (characters.size() && mContentHandler) { 327 mContentHandler->characters(characters.c_str(), characters.size()); 328 characters.clear(); 329 } 330 if (byte == TOKEN_END) { 331 mState = EXPECT_ELEMENT_END; 332 } else { 333 mState = EXPECT_ELEMENT_START; 334 } 335 } else { 336 // TODO: handle extension and pi 337 switch (byte) { 338 case TOKEN_ENTITY: 339 case TOKEN_STR_I: 340 case TOKEN_STR_T: 341 readString(characters); 342 break; 343 344 case TOKEN_EXT_T_0: 345 { 346 readByte(); 347 uint32_t valueToken = readMbuint32(); 348 if (mPublicId == PUBLICID_IMPS_1_1 349 || mPublicId == PUBLICID_IMPS_1_2 350 || mPublicId == PUBLICID_IMPS_1_3) { 351 TokenData t = {valueToken, NULL}; 352 const TokenData * res = (TokenData *)bsearch(&t, 353 csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens), 354 sizeof(csp13ExtValueTokens[0]), compareTokenData); 355 if (res) { 356 characters.append(res->tagName); 357 } else { 358 longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN); 359 } 360 } else { 361 printf ("Token 0x%x\n", byte); 362 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); 363 } 364 break; 365 } 366 367 case TOKEN_OPAQUE: 368 { 369 readByte(); 370 uint32_t opaqueDataLen = readMbuint32(); 371 opaque.clear(); 372 while (opaqueDataLen--) { 373 opaque += (char)readByte(); 374 } 375 if (mContentHandler) { 376 mContentHandler->opaque(opaque.c_str(), opaque.size()); 377 } 378 break; 379 } 380 381 default: 382 printf ("Token 0x%x\n", byte); 383 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); 384 break; 385 } 386 } 387 break; 388 } 389 390 case EXPECT_ELEMENT_END: 391 if (readByte() != TOKEN_END) { 392 #ifdef WBXML_DEBUG 393 printf("wbxml parser error: TOKEN_END expected\n"); 394 #endif 395 longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END); 396 } 397 mState = ELEMENT_END; 398 break; 399 400 case ELEMENT_END: 401 assert(!mStartElemStack.empty()); 402 403 tagName = mStartElemStack.back(); 404 mStartElemStack.pop_back(); 405 if (mContentHandler) { 406 mContentHandler->endElement(tagName.c_str()); 407 } 408 if (mStartElemStack.empty()) { 409 mState = EXPECT_BODY_END; 410 } else { 411 mState = EXPECT_CONTENT; 412 } 413 break; 414 415 case EXPECT_BODY_END: 416 // TODO: handle possible PIs 417 418 // we're done 419 return WBXML_STATUS_OK; 420 break; 421 } 422 } 423 } 424 425 /* 426 * We don't make a copy of the data chunk for the current parse() until 427 * it returns. 428 * The remaining data will be saved in saveRemainingData() before parse() 429 * returns. 430 */ 431 void WbxmlParser::appendData(const char * data, uint32_t len, bool end) 432 { 433 mExternalChunk = data; 434 mExternalChunkLen = len; 435 mIsDataEnd = end; 436 } 437 438 void WbxmlParser::saveRemainingData() 439 { 440 if (mDataOffset > mLastChunk.size()) { 441 uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size(); 442 assert(offsetToExtChunk <= mExternalChunkLen); 443 mLastChunk.assign(mExternalChunk + offsetToExtChunk, 444 mExternalChunkLen - offsetToExtChunk); 445 mDataOffset = 0; 446 } else { 447 mLastChunk.append(mExternalChunk, mExternalChunkLen); 448 } 449 mExternalChunk = NULL; 450 mExternalChunkLen = 0; 451 } 452 453 int WbxmlParser::readByte() 454 { 455 if (mDataOffset < mLastChunk.size()) { 456 #ifdef WBXML_DEBUG 457 printf ("rb 0x%x; ", (unsigned char)mLastChunk[mDataOffset]); 458 #endif 459 return (unsigned char)mLastChunk[mDataOffset++]; 460 } else { 461 uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size(); 462 if (offsetToExtChunk < mExternalChunkLen) { 463 mDataOffset++; 464 #ifdef WBXML_DEBUG 465 printf ("rb 0x%x; ", (unsigned char)mExternalChunk[offsetToExtChunk]); 466 #endif 467 return (unsigned char)mExternalChunk[offsetToExtChunk]; 468 } 469 longjmp(mJmpbuf, ERROR_NEED_MORE_DATA); 470 } 471 } 472 473 int WbxmlParser::peekByte() 474 { 475 if (mDataOffset < mLastChunk.size()) { 476 return (unsigned char)mLastChunk[mDataOffset]; 477 } else { 478 uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size(); 479 if (offsetToExtChunk < mExternalChunkLen) { 480 return (unsigned char)mExternalChunk[offsetToExtChunk]; 481 } 482 longjmp(mJmpbuf, ERROR_NEED_MORE_DATA); 483 } 484 } 485 486 uint32_t WbxmlParser::readMbuint32() 487 { 488 uint32_t value = 0; 489 uint32_t byte; 490 do { 491 if ((value >> 25) != 0) { 492 // would go overflow. not a valid uint32. 493 longjmp(mJmpbuf, ERROR_INVALID_MBUINT); 494 } 495 byte = readByte(); 496 value = (value << 7) | (byte & 0x7f); 497 } while (byte & 0x80); 498 return value; 499 } 500 501 /** 502 * Read STR_I | STR_T | ENTITY and *append* to str. 503 * Yes this looks ugly... 504 */ 505 void WbxmlParser::readString(string & str) 506 { 507 int byte = readByte(); 508 switch (byte) { 509 case TOKEN_STR_I: 510 //TODO: assuming UTF-8 511 while ((byte = readByte()) != 0) { 512 str += (char)byte; 513 } 514 break; 515 516 case TOKEN_ENTITY: 517 { 518 uint32_t ch = readMbuint32(); 519 //TODO: assuming UTF-8 for now. 520 if (ch <= 0x7f) { 521 str += (char)ch; 522 } else if (ch <= 0x7ff) { 523 str += (char)((ch >> 6) | 0xc0); 524 str += (char)((ch & 0x3f) | 0x80); 525 } else if (ch <= 0xffff) { 526 str += (char)((ch >> 12) | 0xe0); 527 str += (char)(((ch >> 6) & 0x3f) | 0x80); 528 str += (char)((ch & 0x3f) | 0x80); 529 } else if (ch <= 0x10ffff) { 530 // 010000 - 10FFFF 531 str += (char)((ch >> 18) | 0xf0); 532 str += (char)(((ch >> 12) & 0x3f) | 0x80); 533 str += (char)(((ch >> 6) & 0x3f) | 0x80); 534 str += (char)((ch & 0x3f) | 0x80); 535 } else { 536 // not a valid UCS-4 character 537 longjmp(mJmpbuf, ERROR_INVALID_ENTITY); 538 } 539 break; 540 } 541 542 case TOKEN_STR_T: 543 { 544 const char * s = resolveStrTableRef(); 545 str.append(s, strlen(s)); 546 break; 547 } 548 549 default: 550 // impossible 551 printf ("Unknown token 0x%02x\n", byte); 552 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); 553 break; 554 } 555 } 556 557 const char * WbxmlParser::resolveStrTableRef(void) 558 { 559 uint32_t offset = readMbuint32(); 560 if (offset >= mStringTable.size()) { 561 longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE); 562 } 563 return mStringTable.c_str() + offset; 564 } 565 566 bool WbxmlParser::selectTokenMapping(int publicId) 567 { 568 switch (publicId) { 569 case PUBLICID_IMPS_1_3: 570 case PUBLICID_IMPS_1_2: 571 case PUBLICID_IMPS_1_1: 572 mTagPages = csp13TagPages; 573 mNumTagPages = ARRAY_SIZE(csp13TagPages); 574 mAttrPages = csp13AttrPages; 575 mNumAttrPages = ARRAY_SIZE(csp13AttrPages); 576 break; 577 578 #ifdef SUPPORT_SYNCML 579 case PUBLICID_SYNCML_1_0: 580 case PUBLICID_SYNCML_1_1: 581 case PUBLICID_SYNCML_1_2: 582 case PUBLICID_SYNCML_METINF_1_2: 583 mTagPages = syncmlTagPages; 584 mNumTagPages = ARRAY_SIZE(syncmlTagPages); 585 mAttrPages = NULL; 586 mNumAttrPages = 0; 587 break; 588 589 case PUBLICID_SYNCML_DEVINF_1_2: 590 mTagPages = syncmlDevInfTagPages; 591 mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages); 592 mAttrPages = NULL; 593 mNumAttrPages = 0; 594 break; 595 #endif 596 default: 597 return false; 598 } 599 return true; 600 } 601 602 const char * WbxmlParser::lookupTagName(int tag) const 603 { 604 tag = tag & 0x3f; 605 606 // TODO: optimize this 607 if (mCurrTagPage >= mNumTagPages) { 608 return NULL; 609 } 610 const TagCodePage * page = &mTagPages[mCurrTagPage]; 611 if (page == NULL) { 612 return NULL; 613 } 614 615 TokenData t = {tag, NULL}; 616 const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens, 617 sizeof(TokenData), compareTokenData); 618 if (res) { 619 return res->tagName; 620 } 621 622 return NULL; 623 } 624 625 const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const 626 { 627 // TODO: optimize this 628 if (mCurrAttrPage >= mNumAttrPages) { 629 return NULL; 630 } 631 const AttrCodePage * page = &mAttrPages[mCurrAttrPage]; 632 if (page == NULL) { 633 return NULL; 634 } 635 636 AttrData t = {token, NULL, NULL}; 637 const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens, 638 sizeof(AttrData), compareAttrData); 639 if (res) { 640 if (prefix) { 641 *prefix = res->attrValuePrefix; 642 } 643 return res->attrName; 644 } 645 646 return NULL; 647 } 648 649 void WbxmlParser::readAttribute(Attribute * attrib) 650 { 651 // attribute start: attrib start token, LITERAL or END 652 int attrStart = readByte(); 653 const char * name; 654 const char * valuePrefix = NULL; 655 656 if (attrStart == TOKEN_LITERAL) { 657 name = resolveStrTableRef(); 658 } else { 659 if (attrStart == TOKEN_SWITCH_PAGE) { 660 mCurrAttrPage = readByte(); 661 attrStart = readByte(); 662 } 663 name = lookupAttrName(attrStart, &valuePrefix); 664 } 665 if (name == NULL) { 666 longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR); 667 } 668 attrib->name = name; 669 attrib->value = ""; 670 if (valuePrefix != NULL) { 671 attrib->value = valuePrefix; 672 } 673 674 // now attribute value: zero or more value, string, entity or extension tokens 675 for (;;) { 676 int valueToken = peekByte(); 677 if (isAttrStart(valueToken) || valueToken == TOKEN_END) { 678 // An attribute start token, a LITERAL token or the END token 679 // indicates the end of an attribute value. 680 return; 681 } 682 switch (valueToken) { 683 case TOKEN_ENTITY: 684 case TOKEN_STR_I: 685 case TOKEN_STR_T: 686 readString(attrib->value); 687 break; 688 689 case TOKEN_EXT_I_0: 690 case TOKEN_EXT_I_1: 691 case TOKEN_EXT_I_2: 692 case TOKEN_EXT_0: 693 case TOKEN_EXT_1: 694 case TOKEN_EXT_2: 695 //TODO: document type specific 696 printf ("Unsupported Token 0x%x\n", valueToken); 697 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); 698 break; 699 700 default: 701 //TODO 702 printf ("Unknown Token 0x%x\n", valueToken); 703 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET); 704 break; 705 } 706 } 707 } 708 709