1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include <algorithm> 8 #include <memory> 9 #include <sstream> 10 #include <string> 11 #include <utility> 12 #include <vector> 13 14 #include "core/fxcrt/cfx_utf8decoder.h" 15 #include "core/fxcrt/cfx_widetextbuf.h" 16 #include "core/fxcrt/fx_extension.h" 17 #include "core/fxcrt/xml/cxml_content.h" 18 #include "core/fxcrt/xml/cxml_element.h" 19 #include "core/fxcrt/xml/cxml_parser.h" 20 #include "third_party/base/ptr_util.h" 21 #include "third_party/base/stl_util.h" 22 23 namespace { 24 25 #define FXCRTM_XML_CHARTYPE_Normal 0x00 26 #define FXCRTM_XML_CHARTYPE_SpaceChar 0x01 27 #define FXCRTM_XML_CHARTYPE_Letter 0x02 28 #define FXCRTM_XML_CHARTYPE_Digital 0x04 29 #define FXCRTM_XML_CHARTYPE_NameIntro 0x08 30 #define FXCRTM_XML_CHARTYPE_NameChar 0x10 31 #define FXCRTM_XML_CHARTYPE_HexDigital 0x20 32 #define FXCRTM_XML_CHARTYPE_HexLowerLetter 0x40 33 #define FXCRTM_XML_CHARTYPE_HexUpperLetter 0x60 34 #define FXCRTM_XML_CHARTYPE_HexChar 0x60 35 36 const uint8_t g_FXCRT_XML_ByteTypes[256] = { 37 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 38 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 39 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 40 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00, 41 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x08, 0x00, 42 0x00, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A, 43 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 44 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18, 45 0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 46 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 47 0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x1A, 0x1A, 0x1A, 48 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 49 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 50 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 51 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 52 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 53 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 54 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 55 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 56 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 57 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 58 0x1A, 0x1A, 0x01, 0x01, 59 }; 60 61 constexpr int kMaxDepth = 1024; 62 63 bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) { 64 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar); 65 } 66 67 bool g_FXCRT_XML_IsDigital(uint8_t ch) { 68 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital); 69 } 70 71 bool g_FXCRT_XML_IsNameIntro(uint8_t ch) { 72 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro); 73 } 74 75 bool g_FXCRT_XML_IsNameChar(uint8_t ch) { 76 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar); 77 } 78 79 } // namespace 80 81 CXML_Parser::CXML_Parser() 82 : m_nOffset(0), 83 m_pBuffer(nullptr), 84 m_dwBufferSize(0), 85 m_nBufferOffset(0), 86 m_dwIndex(0) {} 87 88 CXML_Parser::~CXML_Parser() {} 89 90 bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) { 91 m_pDataAcc = pdfium::MakeUnique<CXML_DataBufAcc>(pBuffer, size); 92 m_nOffset = 0; 93 return ReadNextBlock(); 94 } 95 96 bool CXML_Parser::ReadNextBlock() { 97 if (!m_pDataAcc->ReadNextBlock()) 98 return false; 99 100 m_pBuffer = m_pDataAcc->GetBlockBuffer(); 101 m_dwBufferSize = m_pDataAcc->GetBlockSize(); 102 m_nBufferOffset = 0; 103 m_dwIndex = 0; 104 return m_dwBufferSize > 0; 105 } 106 107 bool CXML_Parser::IsEOF() { 108 return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize; 109 } 110 111 void CXML_Parser::SkipWhiteSpaces() { 112 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 113 if (IsEOF()) 114 return; 115 116 do { 117 while (m_dwIndex < m_dwBufferSize && 118 g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) { 119 m_dwIndex++; 120 } 121 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 122 if (m_dwIndex < m_dwBufferSize || IsEOF()) 123 break; 124 } while (ReadNextBlock()); 125 } 126 127 void CXML_Parser::GetName(ByteString* space, ByteString* name) { 128 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 129 if (IsEOF()) 130 return; 131 132 std::ostringstream buf; 133 do { 134 while (m_dwIndex < m_dwBufferSize) { 135 uint8_t ch = m_pBuffer[m_dwIndex]; 136 if (ch == ':') { 137 *space = ByteString(buf); 138 buf.str(""); 139 } else if (g_FXCRT_XML_IsNameChar(ch)) { 140 buf << static_cast<char>(ch); 141 } else { 142 break; 143 } 144 m_dwIndex++; 145 } 146 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 147 if (m_dwIndex < m_dwBufferSize || IsEOF()) 148 break; 149 } while (ReadNextBlock()); 150 *name = ByteString(buf); 151 } 152 153 void CXML_Parser::SkipLiterals(const ByteStringView& str) { 154 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 155 if (IsEOF()) { 156 return; 157 } 158 int32_t i = 0, iLen = str.GetLength(); 159 do { 160 while (m_dwIndex < m_dwBufferSize) { 161 if (str[i] != m_pBuffer[m_dwIndex++]) { 162 i = 0; 163 continue; 164 } 165 i++; 166 if (i == iLen) 167 break; 168 } 169 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 170 if (i == iLen) 171 return; 172 173 if (m_dwIndex < m_dwBufferSize || IsEOF()) 174 break; 175 } while (ReadNextBlock()); 176 while (!m_pDataAcc->IsEOF()) { 177 ReadNextBlock(); 178 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize); 179 } 180 m_dwIndex = m_dwBufferSize; 181 } 182 183 uint32_t CXML_Parser::GetCharRef() { 184 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 185 if (IsEOF()) 186 return 0; 187 188 uint8_t ch; 189 int32_t iState = 0; 190 std::ostringstream buf; 191 uint32_t code = 0; 192 do { 193 while (m_dwIndex < m_dwBufferSize) { 194 ch = m_pBuffer[m_dwIndex]; 195 switch (iState) { 196 case 0: 197 if (ch == '#') { 198 m_dwIndex++; 199 iState = 2; 200 break; 201 } 202 iState = 1; 203 case 1: 204 m_dwIndex++; 205 if (ch == ';') { 206 std::string ref = buf.str(); 207 if (ref == "gt") 208 code = '>'; 209 else if (ref == "lt") 210 code = '<'; 211 else if (ref == "amp") 212 code = '&'; 213 else if (ref == "apos") 214 code = '\''; 215 else if (ref == "quot") 216 code = '"'; 217 iState = 10; 218 break; 219 } 220 buf << static_cast<char>(ch); 221 break; 222 case 2: 223 if (ch == 'x') { 224 m_dwIndex++; 225 iState = 4; 226 break; 227 } 228 iState = 3; 229 case 3: 230 m_dwIndex++; 231 if (ch == ';') { 232 iState = 10; 233 break; 234 } 235 if (g_FXCRT_XML_IsDigital(ch)) 236 code = code * 10 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch)); 237 break; 238 case 4: 239 m_dwIndex++; 240 if (ch == ';') { 241 iState = 10; 242 break; 243 } 244 uint8_t nHex = 245 g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar; 246 if (nHex) { 247 if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) { 248 code = (code << 4) + 249 FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch)); 250 } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) { 251 code = (code << 4) + ch - 87; 252 } else { 253 code = (code << 4) + ch - 55; 254 } 255 } 256 break; 257 } 258 if (iState == 10) 259 break; 260 } 261 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 262 if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) { 263 break; 264 } 265 } while (ReadNextBlock()); 266 return code; 267 } 268 269 WideString CXML_Parser::GetAttrValue() { 270 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 271 if (IsEOF()) 272 return WideString(); 273 274 CFX_UTF8Decoder decoder; 275 uint8_t mark = 0; 276 uint8_t ch = 0; 277 do { 278 while (m_dwIndex < m_dwBufferSize) { 279 ch = m_pBuffer[m_dwIndex]; 280 if (mark == 0) { 281 if (ch != '\'' && ch != '"') 282 return WideString(); 283 284 mark = ch; 285 m_dwIndex++; 286 ch = 0; 287 continue; 288 } 289 m_dwIndex++; 290 if (ch == mark) 291 break; 292 293 if (ch == '&') { 294 decoder.AppendCodePoint(GetCharRef()); 295 if (IsEOF()) 296 return WideString(decoder.GetResult()); 297 } else { 298 decoder.Input(ch); 299 } 300 } 301 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 302 if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF()) 303 break; 304 } while (ReadNextBlock()); 305 return WideString(decoder.GetResult()); 306 } 307 308 void CXML_Parser::GetTagName(bool bStartTag, 309 bool* bEndTag, 310 ByteString* space, 311 ByteString* name) { 312 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 313 if (IsEOF()) 314 return; 315 316 *bEndTag = false; 317 uint8_t ch; 318 int32_t iState = bStartTag ? 1 : 0; 319 do { 320 while (m_dwIndex < m_dwBufferSize) { 321 ch = m_pBuffer[m_dwIndex]; 322 switch (iState) { 323 case 0: 324 m_dwIndex++; 325 if (ch != '<') 326 break; 327 328 iState = 1; 329 break; 330 case 1: 331 if (ch == '?') { 332 m_dwIndex++; 333 SkipLiterals("?>"); 334 iState = 0; 335 break; 336 } 337 if (ch == '!') { 338 m_dwIndex++; 339 SkipLiterals("-->"); 340 iState = 0; 341 break; 342 } 343 if (ch == '/') { 344 m_dwIndex++; 345 GetName(space, name); 346 *bEndTag = true; 347 } else { 348 GetName(space, name); 349 *bEndTag = false; 350 } 351 return; 352 } 353 } 354 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 355 if (m_dwIndex < m_dwBufferSize || IsEOF()) 356 break; 357 } while (ReadNextBlock()); 358 } 359 360 std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent, 361 bool bStartTag) { 362 return ParseElementInternal(pParent, bStartTag, 0); 363 } 364 365 std::unique_ptr<CXML_Element> CXML_Parser::ParseElementInternal( 366 CXML_Element* pParent, 367 bool bStartTag, 368 int nDepth) { 369 if (nDepth > kMaxDepth) 370 return nullptr; 371 372 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 373 if (IsEOF()) 374 return nullptr; 375 376 ByteString tag_name; 377 ByteString tag_space; 378 bool bEndTag; 379 GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name); 380 if (tag_name.IsEmpty() || bEndTag) 381 return nullptr; 382 383 auto pElement = pdfium::MakeUnique<CXML_Element>( 384 pParent, tag_space.AsStringView(), tag_name.AsStringView()); 385 do { 386 ByteString attr_space; 387 ByteString attr_name; 388 while (m_dwIndex < m_dwBufferSize) { 389 SkipWhiteSpaces(); 390 if (IsEOF()) 391 break; 392 393 if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex])) 394 break; 395 396 GetName(&attr_space, &attr_name); 397 SkipWhiteSpaces(); 398 if (IsEOF()) 399 break; 400 401 if (m_pBuffer[m_dwIndex] != '=') 402 break; 403 404 m_dwIndex++; 405 SkipWhiteSpaces(); 406 if (IsEOF()) 407 break; 408 409 WideString attr_value = GetAttrValue(); 410 pElement->SetAttribute(attr_space, attr_name, attr_value); 411 } 412 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 413 if (m_dwIndex < m_dwBufferSize || IsEOF()) 414 break; 415 } while (ReadNextBlock()); 416 SkipWhiteSpaces(); 417 if (IsEOF()) 418 return pElement; 419 420 uint8_t ch = m_pBuffer[m_dwIndex++]; 421 if (ch == '/') { 422 m_dwIndex++; 423 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 424 return pElement; 425 } 426 if (ch != '>') { 427 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 428 return nullptr; 429 } 430 SkipWhiteSpaces(); 431 if (IsEOF()) 432 return pElement; 433 434 CFX_UTF8Decoder decoder; 435 CFX_WideTextBuf content; 436 bool bCDATA = false; 437 int32_t iState = 0; 438 do { 439 while (m_dwIndex < m_dwBufferSize) { 440 ch = m_pBuffer[m_dwIndex++]; 441 switch (iState) { 442 case 0: 443 if (ch == '<') { 444 iState = 1; 445 } else if (ch == '&') { 446 decoder.ClearStatus(); 447 decoder.AppendCodePoint(GetCharRef()); 448 } else { 449 decoder.Input(ch); 450 } 451 break; 452 case 1: 453 if (ch == '!') { 454 iState = 2; 455 } else if (ch == '?') { 456 SkipLiterals("?>"); 457 SkipWhiteSpaces(); 458 iState = 0; 459 } else if (ch == '/') { 460 ByteString space; 461 ByteString name; 462 GetName(&space, &name); 463 SkipWhiteSpaces(); 464 m_dwIndex++; 465 iState = 10; 466 } else { 467 content << decoder.GetResult(); 468 WideString dataStr = content.MakeString(); 469 if (!bCDATA) 470 dataStr.TrimRight(L" \t\r\n"); 471 472 InsertContentSegment(bCDATA, dataStr.AsStringView(), 473 pElement.get()); 474 content.Clear(); 475 decoder.Clear(); 476 bCDATA = false; 477 iState = 0; 478 m_dwIndex--; 479 std::unique_ptr<CXML_Element> pSubElement = 480 ParseElementInternal(pElement.get(), true, nDepth + 1); 481 if (!pSubElement) 482 break; 483 484 pElement->AppendChild(std::move(pSubElement)); 485 SkipWhiteSpaces(); 486 } 487 break; 488 case 2: 489 if (ch == '[') { 490 SkipLiterals("]]>"); 491 } else if (ch == '-') { 492 m_dwIndex++; 493 SkipLiterals("-->"); 494 } else { 495 SkipLiterals(">"); 496 } 497 decoder.Clear(); 498 SkipWhiteSpaces(); 499 iState = 0; 500 break; 501 } 502 if (iState == 10) { 503 break; 504 } 505 } 506 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex); 507 if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) 508 break; 509 } while (ReadNextBlock()); 510 content << decoder.GetResult(); 511 WideString dataStr = content.MakeString(); 512 dataStr.TrimRight(L" \t\r\n"); 513 514 InsertContentSegment(bCDATA, dataStr.AsStringView(), pElement.get()); 515 content.Clear(); 516 decoder.Clear(); 517 bCDATA = false; 518 return pElement; 519 } 520 521 void CXML_Parser::InsertContentSegment(bool bCDATA, 522 const WideStringView& content, 523 CXML_Element* pElement) { 524 if (content.IsEmpty()) 525 return; 526 527 pElement->AppendChild(pdfium::MakeUnique<CXML_Content>(bCDATA, content)); 528 } 529