1 /*------------------------------------------------------------------------- 2 * drawElements Quality Program Test Executor 3 * ------------------------------------------ 4 * 5 * Copyright 2014 The Android Open Source Project 6 * 7 * Licensed under the Apache License, Version 2.0 (the "License"); 8 * you may not use this file except in compliance with the License. 9 * You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, software 14 * distributed under the License is distributed on an "AS IS" BASIS, 15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 * 19 *//*! 20 * \file 21 * \brief XML Parser. 22 *//*--------------------------------------------------------------------*/ 23 24 #include "xeXMLParser.hpp" 25 #include "deInt32.h" 26 27 namespace xe 28 { 29 namespace xml 30 { 31 32 enum 33 { 34 TOKENIZER_INITIAL_BUFFER_SIZE = 1024 35 }; 36 37 static inline bool isIdentifierStartChar (int ch) 38 { 39 return de::inRange<int>(ch, 'a', 'z') || de::inRange<int>(ch, 'A', 'Z'); 40 } 41 42 static inline bool isIdentifierChar (int ch) 43 { 44 return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_'); 45 } 46 47 static inline bool isWhitespaceChar (int ch) 48 { 49 return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'; 50 } 51 52 static int getNextBufferSize (int curSize, int minNewSize) 53 { 54 return de::max(curSize*2, 1<<deLog2Ceil32(minNewSize)); 55 } 56 57 Tokenizer::Tokenizer (void) 58 : m_curToken (TOKEN_INCOMPLETE) 59 , m_curTokenLen (0) 60 , m_state (STATE_DATA) 61 , m_buf (TOKENIZER_INITIAL_BUFFER_SIZE) 62 { 63 } 64 65 Tokenizer::~Tokenizer (void) 66 { 67 } 68 69 void Tokenizer::clear (void) 70 { 71 m_curToken = TOKEN_INCOMPLETE; 72 m_curTokenLen = 0; 73 m_state = STATE_DATA; 74 m_buf.clear(); 75 } 76 77 void Tokenizer::error (const std::string& what) 78 { 79 throw ParseError(what); 80 } 81 82 void Tokenizer::feed (const deUint8* bytes, int numBytes) 83 { 84 // Grow buffer if necessary. 85 if (m_buf.getNumFree() < numBytes) 86 { 87 m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements()+numBytes)); 88 } 89 90 // Append to front. 91 m_buf.pushFront(bytes, numBytes); 92 93 // If we haven't parsed complete token, re-try after data feed. 94 if (m_curToken == TOKEN_INCOMPLETE) 95 advance(); 96 } 97 98 int Tokenizer::getChar (int offset) const 99 { 100 DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements())); 101 102 if (offset < m_buf.getNumElements()) 103 return m_buf.peekBack(offset); 104 else 105 return END_OF_BUFFER; 106 } 107 108 void Tokenizer::advance (void) 109 { 110 if (m_curToken != TOKEN_INCOMPLETE) 111 { 112 // Parser should not try to advance beyond end of string. 113 DE_ASSERT(m_curToken != TOKEN_END_OF_STRING); 114 115 // If current token is tag end, change state to data. 116 if (m_curToken == TOKEN_TAG_END || 117 m_curToken == TOKEN_EMPTY_ELEMENT_END || 118 m_curToken == TOKEN_PROCESSING_INSTRUCTION_END || 119 m_curToken == TOKEN_COMMENT || 120 m_curToken == TOKEN_ENTITY) 121 m_state = STATE_DATA; 122 123 // Advance buffer by length of last token. 124 m_buf.popBack(m_curTokenLen); 125 126 // Reset state. 127 m_curToken = TOKEN_INCOMPLETE; 128 m_curTokenLen = 0; 129 130 // If we hit end of string here, report it as end of string. 131 if (getChar(0) == END_OF_STRING) 132 { 133 m_curToken = TOKEN_END_OF_STRING; 134 m_curTokenLen = 1; 135 return; 136 } 137 } 138 139 int curChar = getChar(m_curTokenLen); 140 141 for (;;) 142 { 143 if (m_state == STATE_DATA) 144 { 145 // Advance until we hit end of buffer or tag start and treat that as data token. 146 if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&') 147 { 148 if (curChar == '<') 149 m_state = STATE_TAG; 150 else if (curChar == '&') 151 m_state = STATE_ENTITY; 152 153 if (m_curTokenLen > 0) 154 { 155 // Report data token. 156 m_curToken = TOKEN_DATA; 157 return; 158 } 159 else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER) 160 { 161 // Just return incomplete token, no data parsed. 162 return; 163 } 164 else 165 { 166 DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY); 167 continue; 168 } 169 } 170 } 171 else 172 { 173 // Eat all whitespace if present. 174 if (m_curTokenLen == 0) 175 { 176 while (isWhitespaceChar(curChar)) 177 { 178 m_buf.popBack(); 179 curChar = getChar(0); 180 } 181 } 182 183 // Handle end of string / buffer. 184 if (curChar == END_OF_STRING) 185 error("Unexpected end of string"); 186 else if (curChar == (int)END_OF_BUFFER) 187 { 188 DE_ASSERT(m_curToken == TOKEN_INCOMPLETE); 189 return; 190 } 191 192 if (m_curTokenLen == 0) 193 { 194 // Expect start of identifier, value or special tag token. 195 if (curChar == '\'' || curChar == '"') 196 m_state = STATE_VALUE; 197 else if (isIdentifierStartChar(curChar)) 198 m_state = STATE_IDENTIFIER; 199 else if (curChar == '<' || curChar == '?' || curChar == '/') 200 m_state = STATE_TAG; 201 else if (curChar == '&') 202 DE_ASSERT(m_state == STATE_ENTITY); 203 else if (curChar == '=') 204 { 205 m_curToken = TOKEN_EQUAL; 206 m_curTokenLen = 1; 207 return; 208 } 209 else if (curChar == '>') 210 { 211 m_curToken = TOKEN_TAG_END; 212 m_curTokenLen = 1; 213 return; 214 } 215 else 216 error("Unexpected character"); 217 } 218 else if (m_state == STATE_IDENTIFIER) 219 { 220 if (!isIdentifierChar(curChar)) 221 { 222 m_curToken = TOKEN_IDENTIFIER; 223 return; 224 } 225 } 226 else if (m_state == STATE_VALUE) 227 { 228 // \todo [2012-06-07 pyry] Escapes. 229 if (curChar == '\'' || curChar == '"') 230 { 231 // \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)? 232 if (curChar != getChar(0)) 233 error("Mismatched quote"); 234 m_curToken = TOKEN_STRING; 235 m_curTokenLen += 1; 236 return; 237 } 238 } 239 else if (m_state == STATE_COMMENT) 240 { 241 DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state. 242 243 if (m_curTokenLen <= 3) 244 { 245 if (curChar != '-') 246 error("Invalid comment start"); 247 } 248 else 249 { 250 int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen-2) : 0; 251 int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen-1) : 0; 252 253 if (prev2 == '-' && prev1 == '-') 254 { 255 if (curChar != '>') 256 error("Invalid comment end"); 257 m_curToken = TOKEN_COMMENT; 258 m_curTokenLen += 1; 259 return; 260 } 261 } 262 } 263 else if (m_state == STATE_ENTITY) 264 { 265 if (m_curTokenLen >= 1) 266 { 267 if (curChar == ';') 268 { 269 m_curToken = TOKEN_ENTITY; 270 m_curTokenLen += 1; 271 return; 272 } 273 else if (!de::inRange<int>(curChar, '0', '9') && 274 !de::inRange<int>(curChar, 'a', 'z') && 275 !de::inRange<int>(curChar, 'A', 'Z')) 276 error("Invalid entity"); 277 } 278 } 279 else 280 { 281 // Special tokens are at most 2 characters. 282 DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1); 283 284 int prevChar = getChar(m_curTokenLen-1); 285 286 if (prevChar == '<') 287 { 288 // Tag start. 289 if (curChar == '/') 290 { 291 m_curToken = TOKEN_END_TAG_START; 292 m_curTokenLen = 2; 293 return; 294 } 295 else if (curChar == '?') 296 { 297 m_curToken = TOKEN_PROCESSING_INSTRUCTION_START; 298 m_curTokenLen = 2; 299 return; 300 } 301 else if (curChar == '!') 302 { 303 m_state = STATE_COMMENT; 304 } 305 else 306 { 307 m_curToken = TOKEN_TAG_START; 308 m_curTokenLen = 1; 309 return; 310 } 311 } 312 else if (prevChar == '?') 313 { 314 if (curChar != '>') 315 error("Invalid processing instruction end"); 316 m_curToken = TOKEN_PROCESSING_INSTRUCTION_END; 317 m_curTokenLen = 2; 318 return; 319 } 320 else if (prevChar == '/') 321 { 322 if (curChar != '>') 323 error("Invalid empty element end"); 324 m_curToken = TOKEN_EMPTY_ELEMENT_END; 325 m_curTokenLen = 2; 326 return; 327 } 328 else 329 error("Could not parse special token"); 330 } 331 } 332 333 m_curTokenLen += 1; 334 curChar = getChar(m_curTokenLen); 335 } 336 } 337 338 void Tokenizer::getString (std::string& dst) const 339 { 340 DE_ASSERT(m_curToken == TOKEN_STRING); 341 dst.resize(m_curTokenLen-2); 342 for (int ndx = 0; ndx < m_curTokenLen-2; ndx++) 343 dst[ndx] = m_buf.peekBack(ndx+1); 344 } 345 346 Parser::Parser (void) 347 : m_element (ELEMENT_INCOMPLETE) 348 , m_state (STATE_DATA) 349 { 350 } 351 352 Parser::~Parser (void) 353 { 354 } 355 356 void Parser::clear (void) 357 { 358 m_tokenizer.clear(); 359 m_elementName.clear(); 360 m_attributes.clear(); 361 m_attribName.clear(); 362 m_entityValue.clear(); 363 364 m_element = ELEMENT_INCOMPLETE; 365 m_state = STATE_DATA; 366 } 367 368 void Parser::error (const std::string& what) 369 { 370 throw ParseError(what); 371 } 372 373 void Parser::feed (const deUint8* bytes, int numBytes) 374 { 375 m_tokenizer.feed(bytes, numBytes); 376 377 if (m_element == ELEMENT_INCOMPLETE) 378 advance(); 379 } 380 381 void Parser::advance (void) 382 { 383 if (m_element == ELEMENT_START) 384 m_attributes.clear(); 385 386 // \note No token is advanced when element end is reported. 387 if (m_state == STATE_YIELD_EMPTY_ELEMENT_END) 388 { 389 DE_ASSERT(m_element == ELEMENT_START); 390 m_element = ELEMENT_END; 391 m_state = STATE_DATA; 392 return; 393 } 394 395 if (m_element != ELEMENT_INCOMPLETE) 396 { 397 m_tokenizer.advance(); 398 m_element = ELEMENT_INCOMPLETE; 399 } 400 401 for (;;) 402 { 403 Token curToken = m_tokenizer.getToken(); 404 405 // Skip comments. 406 while (curToken == TOKEN_COMMENT) 407 { 408 m_tokenizer.advance(); 409 curToken = m_tokenizer.getToken(); 410 } 411 412 if (curToken == TOKEN_INCOMPLETE) 413 { 414 DE_ASSERT(m_element == ELEMENT_INCOMPLETE); 415 return; 416 } 417 418 switch (m_state) 419 { 420 case STATE_ENTITY: 421 m_state = STATE_DATA; 422 // Fall-through 423 424 case STATE_DATA: 425 switch (curToken) 426 { 427 case TOKEN_DATA: 428 m_element = ELEMENT_DATA; 429 return; 430 431 case TOKEN_END_OF_STRING: 432 m_element = ELEMENT_END_OF_STRING; 433 return; 434 435 case TOKEN_TAG_START: 436 m_state = STATE_START_TAG_OPEN; 437 break; 438 439 case TOKEN_END_TAG_START: 440 m_state = STATE_END_TAG_OPEN; 441 break; 442 443 case TOKEN_PROCESSING_INSTRUCTION_START: 444 m_state = STATE_IN_PROCESSING_INSTRUCTION; 445 break; 446 447 case TOKEN_ENTITY: 448 m_state = STATE_ENTITY; 449 m_element = ELEMENT_DATA; 450 parseEntityValue(); 451 return; 452 453 default: 454 error("Unexpected token"); 455 } 456 break; 457 458 case STATE_IN_PROCESSING_INSTRUCTION: 459 if (curToken == TOKEN_PROCESSING_INSTRUCTION_END) 460 m_state = STATE_DATA; 461 else 462 if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING) 463 error("Unexpected token in processing instruction"); 464 break; 465 466 case STATE_START_TAG_OPEN: 467 if (curToken != TOKEN_IDENTIFIER) 468 error("Expected identifier"); 469 m_tokenizer.getTokenStr(m_elementName); 470 m_state = STATE_ATTRIBUTE_LIST; 471 break; 472 473 case STATE_END_TAG_OPEN: 474 if (curToken != TOKEN_IDENTIFIER) 475 error("Expected identifier"); 476 m_tokenizer.getTokenStr(m_elementName); 477 m_state = STATE_EXPECTING_END_TAG_CLOSE; 478 break; 479 480 case STATE_EXPECTING_END_TAG_CLOSE: 481 if (curToken != TOKEN_TAG_END) 482 error("Expected tag end"); 483 m_state = STATE_DATA; 484 m_element = ELEMENT_END; 485 return; 486 487 case STATE_ATTRIBUTE_LIST: 488 if (curToken == TOKEN_IDENTIFIER) 489 { 490 m_tokenizer.getTokenStr(m_attribName); 491 m_state = STATE_EXPECTING_ATTRIBUTE_EQ; 492 } 493 else if (curToken == TOKEN_EMPTY_ELEMENT_END) 494 { 495 m_state = STATE_YIELD_EMPTY_ELEMENT_END; 496 m_element = ELEMENT_START; 497 return; 498 } 499 else if (curToken == TOKEN_TAG_END) 500 { 501 m_state = STATE_DATA; 502 m_element = ELEMENT_START; 503 return; 504 } 505 else 506 error("Unexpected token"); 507 break; 508 509 case STATE_EXPECTING_ATTRIBUTE_EQ: 510 if (curToken != TOKEN_EQUAL) 511 error("Expected '='"); 512 m_state = STATE_EXPECTING_ATTRIBUTE_VALUE; 513 break; 514 515 case STATE_EXPECTING_ATTRIBUTE_VALUE: 516 if (curToken != TOKEN_STRING) 517 error("Expected value"); 518 if (hasAttribute(m_attribName.c_str())) 519 error("Duplicate attribute"); 520 521 m_tokenizer.getString(m_attributes[m_attribName]); 522 m_state = STATE_ATTRIBUTE_LIST; 523 break; 524 525 default: 526 DE_ASSERT(false); 527 } 528 529 m_tokenizer.advance(); 530 } 531 } 532 533 static char getEntityValue (const std::string& entity) 534 { 535 static const struct 536 { 537 const char* name; 538 char value; 539 } s_entities[] = 540 { 541 { "<", '<' }, 542 { ">", '>' }, 543 { "&", '&' }, 544 { "'", '\''}, 545 { """, '"' }, 546 }; 547 548 for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(s_entities); ndx++) 549 { 550 if (entity == s_entities[ndx].name) 551 return s_entities[ndx].value; 552 } 553 554 return 0; 555 } 556 557 void Parser::parseEntityValue (void) 558 { 559 DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY); 560 561 std::string entity; 562 m_tokenizer.getTokenStr(entity); 563 564 const char value = getEntityValue(entity); 565 if (value == 0) 566 error("Invalid entity '" + entity + "'"); 567 568 m_entityValue.resize(1); 569 m_entityValue[0] = value; 570 } 571 572 } // xml 573 } // xe 574