1 /* 2 * Copyright 2005 Maksim Orlovich <maksim (at) kde.org> 3 * Copyright (C) 2006 Apple Computer, Inc. 4 * Copyright (C) 2007 Alexey Proskuryakov <ap (at) webkit.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "core/xml/XPathParser.h" 30 31 #include "bindings/v8/ExceptionState.h" 32 #include "core/dom/ExceptionCode.h" 33 #include "core/xml/XPathEvaluator.h" 34 #include "core/xml/XPathNSResolver.h" 35 #include "core/xml/XPathPath.h" 36 #include "wtf/StdLibExtras.h" 37 #include "wtf/text/StringHash.h" 38 39 using namespace WebCore; 40 using namespace WTF; 41 using namespace Unicode; 42 using namespace XPath; 43 44 extern int xpathyyparse(WebCore::XPath::Parser*); 45 #include "XPathGrammar.h" 46 47 Parser* Parser::currentParser = 0; 48 49 enum XMLCat { NameStart, NameCont, NotPartOfName }; 50 51 typedef HashMap<String, Step::Axis> AxisNamesMap; 52 53 static XMLCat charCat(UChar aChar) 54 { 55 //### might need to add some special cases from the XML spec. 56 57 if (aChar == '_') 58 return NameStart; 59 60 if (aChar == '.' || aChar == '-') 61 return NameCont; 62 CharCategory category = Unicode::category(aChar); 63 if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter)) 64 return NameStart; 65 if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit)) 66 return NameCont; 67 return NotPartOfName; 68 } 69 70 static void setUpAxisNamesMap(AxisNamesMap& axisNames) 71 { 72 struct AxisName { 73 const char* name; 74 Step::Axis axis; 75 }; 76 const AxisName axisNameList[] = { 77 { "ancestor", Step::AncestorAxis }, 78 { "ancestor-or-self", Step::AncestorOrSelfAxis }, 79 { "attribute", Step::AttributeAxis }, 80 { "child", Step::ChildAxis }, 81 { "descendant", Step::DescendantAxis }, 82 { "descendant-or-self", Step::DescendantOrSelfAxis }, 83 { "following", Step::FollowingAxis }, 84 { "following-sibling", Step::FollowingSiblingAxis }, 85 { "namespace", Step::NamespaceAxis }, 86 { "parent", Step::ParentAxis }, 87 { "preceding", Step::PrecedingAxis }, 88 { "preceding-sibling", Step::PrecedingSiblingAxis }, 89 { "self", Step::SelfAxis } 90 }; 91 for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i) 92 axisNames.set(axisNameList[i].name, axisNameList[i].axis); 93 } 94 95 static bool isAxisName(const String& name, Step::Axis& type) 96 { 97 DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ()); 98 99 if (axisNames.isEmpty()) 100 setUpAxisNamesMap(axisNames); 101 102 AxisNamesMap::iterator it = axisNames.find(name); 103 if (it == axisNames.end()) 104 return false; 105 type = it->value; 106 return true; 107 } 108 109 static bool isNodeTypeName(const String& name) 110 { 111 DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ()); 112 if (nodeTypeNames.isEmpty()) { 113 nodeTypeNames.add("comment"); 114 nodeTypeNames.add("text"); 115 nodeTypeNames.add("processing-instruction"); 116 nodeTypeNames.add("node"); 117 } 118 return nodeTypeNames.contains(name); 119 } 120 121 // Returns whether the current token can possibly be a binary operator, given 122 // the previous token. Necessary to disambiguate some of the operators 123 // (* (multiply), div, and, or, mod) in the [32] Operator rule 124 // (check http://www.w3.org/TR/xpath#exprlex). 125 bool Parser::isBinaryOperatorContext() const 126 { 127 switch (m_lastTokenType) { 128 case 0: 129 case '@': case AXISNAME: case '(': case '[': case ',': 130 case AND: case OR: case MULOP: 131 case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: 132 case EQOP: case RELOP: 133 return false; 134 default: 135 return true; 136 } 137 } 138 139 void Parser::skipWS() 140 { 141 while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos])) 142 ++m_nextPos; 143 } 144 145 Token Parser::makeTokenAndAdvance(int code, int advance) 146 { 147 m_nextPos += advance; 148 return Token(code); 149 } 150 151 Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance) 152 { 153 m_nextPos += advance; 154 return Token(code, val); 155 } 156 157 Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance) 158 { 159 m_nextPos += advance; 160 return Token(code, val); 161 } 162 163 // Returns next char if it's there and interesting, 0 otherwise 164 char Parser::peekAheadHelper() 165 { 166 if (m_nextPos + 1 >= m_data.length()) 167 return 0; 168 UChar next = m_data[m_nextPos + 1]; 169 if (next >= 0xff) 170 return 0; 171 return next; 172 } 173 174 char Parser::peekCurHelper() 175 { 176 if (m_nextPos >= m_data.length()) 177 return 0; 178 UChar next = m_data[m_nextPos]; 179 if (next >= 0xff) 180 return 0; 181 return next; 182 } 183 184 Token Parser::lexString() 185 { 186 UChar delimiter = m_data[m_nextPos]; 187 int startPos = m_nextPos + 1; 188 189 for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { 190 if (m_data[m_nextPos] == delimiter) { 191 String value = m_data.substring(startPos, m_nextPos - startPos); 192 if (value.isNull()) 193 value = ""; 194 ++m_nextPos; // Consume the char. 195 return Token(LITERAL, value); 196 } 197 } 198 199 // Ouch, went off the end -- report error. 200 return Token(XPATH_ERROR); 201 } 202 203 Token Parser::lexNumber() 204 { 205 int startPos = m_nextPos; 206 bool seenDot = false; 207 208 // Go until end or a non-digits character. 209 for (; m_nextPos < m_data.length(); ++m_nextPos) { 210 UChar aChar = m_data[m_nextPos]; 211 if (aChar >= 0xff) break; 212 213 if (aChar < '0' || aChar > '9') { 214 if (aChar == '.' && !seenDot) 215 seenDot = true; 216 else 217 break; 218 } 219 } 220 221 return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos)); 222 } 223 224 bool Parser::lexNCName(String& name) 225 { 226 int startPos = m_nextPos; 227 if (m_nextPos >= m_data.length()) 228 return false; 229 230 if (charCat(m_data[m_nextPos]) != NameStart) 231 return false; 232 233 // Keep going until we get a character that's not good for names. 234 for (; m_nextPos < m_data.length(); ++m_nextPos) 235 if (charCat(m_data[m_nextPos]) == NotPartOfName) 236 break; 237 238 name = m_data.substring(startPos, m_nextPos - startPos); 239 return true; 240 } 241 242 bool Parser::lexQName(String& name) 243 { 244 String n1; 245 if (!lexNCName(n1)) 246 return false; 247 248 skipWS(); 249 250 // If the next character is :, what we just got it the prefix, if not, 251 // it's the whole thing. 252 if (peekAheadHelper() != ':') { 253 name = n1; 254 return true; 255 } 256 257 String n2; 258 if (!lexNCName(n2)) 259 return false; 260 261 name = n1 + ":" + n2; 262 return true; 263 } 264 265 Token Parser::nextTokenInternal() 266 { 267 skipWS(); 268 269 if (m_nextPos >= m_data.length()) 270 return Token(0); 271 272 char code = peekCurHelper(); 273 switch (code) { 274 case '(': case ')': case '[': case ']': 275 case '@': case ',': case '|': 276 return makeTokenAndAdvance(code); 277 case '\'': 278 case '\"': 279 return lexString(); 280 case '0': case '1': case '2': case '3': case '4': 281 case '5': case '6': case '7': case '8': case '9': 282 return lexNumber(); 283 case '.': { 284 char next = peekAheadHelper(); 285 if (next == '.') 286 return makeTokenAndAdvance(DOTDOT, 2); 287 if (next >= '0' && next <= '9') 288 return lexNumber(); 289 return makeTokenAndAdvance('.'); 290 } 291 case '/': 292 if (peekAheadHelper() == '/') 293 return makeTokenAndAdvance(SLASHSLASH, 2); 294 return makeTokenAndAdvance('/'); 295 case '+': 296 return makeTokenAndAdvance(PLUS); 297 case '-': 298 return makeTokenAndAdvance(MINUS); 299 case '=': 300 return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ); 301 case '!': 302 if (peekAheadHelper() == '=') 303 return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2); 304 return Token(XPATH_ERROR); 305 case '<': 306 if (peekAheadHelper() == '=') 307 return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2); 308 return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT); 309 case '>': 310 if (peekAheadHelper() == '=') 311 return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2); 312 return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT); 313 case '*': 314 if (isBinaryOperatorContext()) 315 return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul); 316 ++m_nextPos; 317 return Token(NAMETEST, "*"); 318 case '$': { // $ QName 319 m_nextPos++; 320 String name; 321 if (!lexQName(name)) 322 return Token(XPATH_ERROR); 323 return Token(VARIABLEREFERENCE, name); 324 } 325 } 326 327 String name; 328 if (!lexNCName(name)) 329 return Token(XPATH_ERROR); 330 331 skipWS(); 332 // If we're in an operator context, check for any operator names 333 if (isBinaryOperatorContext()) { 334 if (name == "and") //### hash? 335 return Token(AND); 336 if (name == "or") 337 return Token(OR); 338 if (name == "mod") 339 return Token(MULOP, NumericOp::OP_Mod); 340 if (name == "div") 341 return Token(MULOP, NumericOp::OP_Div); 342 } 343 344 // See whether we are at a : 345 if (peekCurHelper() == ':') { 346 m_nextPos++; 347 // Any chance it's an axis name? 348 if (peekCurHelper() == ':') { 349 m_nextPos++; 350 351 //It might be an axis name. 352 Step::Axis axis; 353 if (isAxisName(name, axis)) 354 return Token(AXISNAME, axis); 355 // Ugh, :: is only valid in axis names -> error 356 return Token(XPATH_ERROR); 357 } 358 359 // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest 360 skipWS(); 361 if (peekCurHelper() == '*') { 362 m_nextPos++; 363 return Token(NAMETEST, name + ":*"); 364 } 365 366 // Make a full qname. 367 String n2; 368 if (!lexNCName(n2)) 369 return Token(XPATH_ERROR); 370 371 name = name + ":" + n2; 372 } 373 374 skipWS(); 375 if (peekCurHelper() == '(') { 376 //note: we don't swallow the (here! 377 378 //either node type of function name 379 if (isNodeTypeName(name)) { 380 if (name == "processing-instruction") 381 return Token(PI, name); 382 383 return Token(NODETYPE, name); 384 } 385 //must be a function name. 386 return Token(FUNCTIONNAME, name); 387 } 388 389 // At this point, it must be NAMETEST. 390 return Token(NAMETEST, name); 391 } 392 393 Token Parser::nextToken() 394 { 395 Token toRet = nextTokenInternal(); 396 m_lastTokenType = toRet.type; 397 return toRet; 398 } 399 400 Parser::Parser() 401 { 402 reset(String()); 403 } 404 405 Parser::~Parser() 406 { 407 } 408 409 void Parser::reset(const String& data) 410 { 411 m_nextPos = 0; 412 m_data = data; 413 m_lastTokenType = 0; 414 415 m_topExpr = 0; 416 m_gotNamespaceError = false; 417 } 418 419 int Parser::lex(void* data) 420 { 421 YYSTYPE* yylval = static_cast<YYSTYPE*>(data); 422 Token tok = nextToken(); 423 424 switch (tok.type) { 425 case AXISNAME: 426 yylval->axis = tok.axis; 427 break; 428 case MULOP: 429 yylval->numop = tok.numop; 430 break; 431 case RELOP: 432 case EQOP: 433 yylval->eqop = tok.eqop; 434 break; 435 case NODETYPE: 436 case PI: 437 case FUNCTIONNAME: 438 case LITERAL: 439 case VARIABLEREFERENCE: 440 case NUMBER: 441 case NAMETEST: 442 yylval->str = new String(tok.str); 443 registerString(yylval->str); 444 break; 445 } 446 447 return tok.type; 448 } 449 450 bool Parser::expandQName(const String& qName, AtomicString& localName, AtomicString& namespaceURI) 451 { 452 size_t colon = qName.find(':'); 453 if (colon != kNotFound) { 454 if (!m_resolver) 455 return false; 456 namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon)); 457 if (namespaceURI.isNull()) 458 return false; 459 localName = AtomicString(qName.substring(colon + 1)); 460 } else { 461 localName = AtomicString(qName); 462 } 463 464 return true; 465 } 466 467 Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionState& exceptionState) 468 { 469 reset(statement); 470 471 m_resolver = resolver; 472 473 Parser* oldParser = currentParser; 474 currentParser = this; 475 int parseError = xpathyyparse(this); 476 currentParser = oldParser; 477 478 if (parseError) { 479 deleteAllValues(m_parseNodes); 480 m_parseNodes.clear(); 481 482 HashSet<Vector<OwnPtr<Predicate> >*>::iterator pend = m_predicateVectors.end(); 483 for (HashSet<Vector<OwnPtr<Predicate> >*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) 484 delete *it; 485 m_predicateVectors.clear(); 486 487 HashSet<Vector<OwnPtr<Expression> >*>::iterator eend = m_expressionVectors.end(); 488 for (HashSet<Vector<OwnPtr<Expression> >*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) 489 delete *it; 490 m_expressionVectors.clear(); 491 492 deleteAllValues(m_strings); 493 m_strings.clear(); 494 495 deleteAllValues(m_nodeTests); 496 m_nodeTests.clear(); 497 498 m_topExpr = 0; 499 500 if (m_gotNamespaceError) 501 exceptionState.throwDOMException(NamespaceError, "The string '" + statement + "' contains unresolvable namespaces."); 502 else 503 exceptionState.throwDOMException(SyntaxError, "The string '" + statement + "' is not a valid XPath expression."); 504 return 0; 505 } 506 507 ASSERT(m_parseNodes.size() == 1); 508 ASSERT(*m_parseNodes.begin() == m_topExpr); 509 ASSERT(m_expressionVectors.size() == 0); 510 ASSERT(m_predicateVectors.size() == 0); 511 ASSERT(m_strings.size() == 0); 512 ASSERT(m_nodeTests.size() == 0); 513 514 m_parseNodes.clear(); 515 Expression* result = m_topExpr; 516 m_topExpr = 0; 517 518 return result; 519 } 520 521 void Parser::registerParseNode(ParseNode* node) 522 { 523 if (node == 0) 524 return; 525 526 ASSERT(!m_parseNodes.contains(node)); 527 528 m_parseNodes.add(node); 529 } 530 531 void Parser::unregisterParseNode(ParseNode* node) 532 { 533 if (node == 0) 534 return; 535 536 ASSERT(m_parseNodes.contains(node)); 537 538 m_parseNodes.remove(node); 539 } 540 541 void Parser::registerPredicateVector(Vector<OwnPtr<Predicate> >* vector) 542 { 543 if (vector == 0) 544 return; 545 546 ASSERT(!m_predicateVectors.contains(vector)); 547 548 m_predicateVectors.add(vector); 549 } 550 551 void Parser::deletePredicateVector(Vector<OwnPtr<Predicate> >* vector) 552 { 553 if (vector == 0) 554 return; 555 556 ASSERT(m_predicateVectors.contains(vector)); 557 558 m_predicateVectors.remove(vector); 559 delete vector; 560 } 561 562 563 void Parser::registerExpressionVector(Vector<OwnPtr<Expression> >* vector) 564 { 565 if (vector == 0) 566 return; 567 568 ASSERT(!m_expressionVectors.contains(vector)); 569 570 m_expressionVectors.add(vector); 571 } 572 573 void Parser::deleteExpressionVector(Vector<OwnPtr<Expression> >* vector) 574 { 575 if (vector == 0) 576 return; 577 578 ASSERT(m_expressionVectors.contains(vector)); 579 580 m_expressionVectors.remove(vector); 581 delete vector; 582 } 583 584 void Parser::registerString(String* s) 585 { 586 if (s == 0) 587 return; 588 589 ASSERT(!m_strings.contains(s)); 590 591 m_strings.add(s); 592 } 593 594 void Parser::deleteString(String* s) 595 { 596 if (s == 0) 597 return; 598 599 ASSERT(m_strings.contains(s)); 600 601 m_strings.remove(s); 602 delete s; 603 } 604 605 void Parser::registerNodeTest(Step::NodeTest* t) 606 { 607 if (t == 0) 608 return; 609 610 ASSERT(!m_nodeTests.contains(t)); 611 612 m_nodeTests.add(t); 613 } 614 615 void Parser::deleteNodeTest(Step::NodeTest* t) 616 { 617 if (t == 0) 618 return; 619 620 ASSERT(m_nodeTests.contains(t)); 621 622 m_nodeTests.remove(t); 623 delete t; 624 } 625 626