Home | History | Annotate | Download | only in xml
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include <algorithm>
      8 #include <memory>
      9 #include <sstream>
     10 #include <string>
     11 #include <utility>
     12 #include <vector>
     13 
     14 #include "core/fxcrt/cfx_utf8decoder.h"
     15 #include "core/fxcrt/cfx_widetextbuf.h"
     16 #include "core/fxcrt/fx_extension.h"
     17 #include "core/fxcrt/xml/cxml_content.h"
     18 #include "core/fxcrt/xml/cxml_element.h"
     19 #include "core/fxcrt/xml/cxml_parser.h"
     20 #include "third_party/base/ptr_util.h"
     21 #include "third_party/base/stl_util.h"
     22 
     23 namespace {
     24 
     25 #define FXCRTM_XML_CHARTYPE_Normal 0x00
     26 #define FXCRTM_XML_CHARTYPE_SpaceChar 0x01
     27 #define FXCRTM_XML_CHARTYPE_Letter 0x02
     28 #define FXCRTM_XML_CHARTYPE_Digital 0x04
     29 #define FXCRTM_XML_CHARTYPE_NameIntro 0x08
     30 #define FXCRTM_XML_CHARTYPE_NameChar 0x10
     31 #define FXCRTM_XML_CHARTYPE_HexDigital 0x20
     32 #define FXCRTM_XML_CHARTYPE_HexLowerLetter 0x40
     33 #define FXCRTM_XML_CHARTYPE_HexUpperLetter 0x60
     34 #define FXCRTM_XML_CHARTYPE_HexChar 0x60
     35 
     36 const uint8_t g_FXCRT_XML_ByteTypes[256] = {
     37     0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     38     0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     39     0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
     40     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
     41     0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x08, 0x00,
     42     0x00, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
     43     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     44     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
     45     0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     46     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     47     0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x1A, 0x1A, 0x1A,
     48     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     49     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     50     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     51     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     52     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     53     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     54     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     55     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     56     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     57     0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
     58     0x1A, 0x1A, 0x01, 0x01,
     59 };
     60 
// Deepest element nesting CXML_Parser::ParseElementInternal() will recurse
// into; a call made with a larger depth returns nullptr instead of parsing.
constexpr int kMaxDepth = 1024;
     62 
     63 bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
     64   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar);
     65 }
     66 
     67 bool g_FXCRT_XML_IsDigital(uint8_t ch) {
     68   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital);
     69 }
     70 
     71 bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
     72   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro);
     73 }
     74 
     75 bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
     76   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar);
     77 }
     78 
     79 }  // namespace
     80 
// Creates a parser with no input attached: all offsets, the block index and
// the block size start at zero and no block buffer is referenced. Init() must
// be called to supply data before parsing.
CXML_Parser::CXML_Parser()
    : m_nOffset(0),
      m_pBuffer(nullptr),
      m_dwBufferSize(0),
      m_nBufferOffset(0),
      m_dwIndex(0) {}
     87 
     88 CXML_Parser::~CXML_Parser() {}
     89 
     90 bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
     91   m_pDataAcc = pdfium::MakeUnique<CXML_DataBufAcc>(pBuffer, size);
     92   m_nOffset = 0;
     93   return ReadNextBlock();
     94 }
     95 
     96 bool CXML_Parser::ReadNextBlock() {
     97   if (!m_pDataAcc->ReadNextBlock())
     98     return false;
     99 
    100   m_pBuffer = m_pDataAcc->GetBlockBuffer();
    101   m_dwBufferSize = m_pDataAcc->GetBlockSize();
    102   m_nBufferOffset = 0;
    103   m_dwIndex = 0;
    104   return m_dwBufferSize > 0;
    105 }
    106 
    107 bool CXML_Parser::IsEOF() {
    108   return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
    109 }
    110 
    111 void CXML_Parser::SkipWhiteSpaces() {
    112   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    113   if (IsEOF())
    114     return;
    115 
    116   do {
    117     while (m_dwIndex < m_dwBufferSize &&
    118            g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
    119       m_dwIndex++;
    120     }
    121     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    122     if (m_dwIndex < m_dwBufferSize || IsEOF())
    123       break;
    124   } while (ReadNextBlock());
    125 }
    126 
    127 void CXML_Parser::GetName(ByteString* space, ByteString* name) {
    128   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    129   if (IsEOF())
    130     return;
    131 
    132   std::ostringstream buf;
    133   do {
    134     while (m_dwIndex < m_dwBufferSize) {
    135       uint8_t ch = m_pBuffer[m_dwIndex];
    136       if (ch == ':') {
    137         *space = ByteString(buf);
    138         buf.str("");
    139       } else if (g_FXCRT_XML_IsNameChar(ch)) {
    140         buf << static_cast<char>(ch);
    141       } else {
    142         break;
    143       }
    144       m_dwIndex++;
    145     }
    146     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    147     if (m_dwIndex < m_dwBufferSize || IsEOF())
    148       break;
    149   } while (ReadNextBlock());
    150   *name = ByteString(buf);
    151 }
    152 
    153 void CXML_Parser::SkipLiterals(const ByteStringView& str) {
    154   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    155   if (IsEOF()) {
    156     return;
    157   }
    158   int32_t i = 0, iLen = str.GetLength();
    159   do {
    160     while (m_dwIndex < m_dwBufferSize) {
    161       if (str[i] != m_pBuffer[m_dwIndex++]) {
    162         i = 0;
    163         continue;
    164       }
    165       i++;
    166       if (i == iLen)
    167         break;
    168     }
    169     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    170     if (i == iLen)
    171       return;
    172 
    173     if (m_dwIndex < m_dwBufferSize || IsEOF())
    174       break;
    175   } while (ReadNextBlock());
    176   while (!m_pDataAcc->IsEOF()) {
    177     ReadNextBlock();
    178     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
    179   }
    180   m_dwIndex = m_dwBufferSize;
    181 }
    182 
    183 uint32_t CXML_Parser::GetCharRef() {
    184   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    185   if (IsEOF())
    186     return 0;
    187 
    188   uint8_t ch;
    189   int32_t iState = 0;
    190   std::ostringstream buf;
    191   uint32_t code = 0;
    192   do {
    193     while (m_dwIndex < m_dwBufferSize) {
    194       ch = m_pBuffer[m_dwIndex];
    195       switch (iState) {
    196         case 0:
    197           if (ch == '#') {
    198             m_dwIndex++;
    199             iState = 2;
    200             break;
    201           }
    202           iState = 1;
    203         case 1:
    204           m_dwIndex++;
    205           if (ch == ';') {
    206             std::string ref = buf.str();
    207             if (ref == "gt")
    208               code = '>';
    209             else if (ref == "lt")
    210               code = '<';
    211             else if (ref == "amp")
    212               code = '&';
    213             else if (ref == "apos")
    214               code = '\'';
    215             else if (ref == "quot")
    216               code = '"';
    217             iState = 10;
    218             break;
    219           }
    220           buf << static_cast<char>(ch);
    221           break;
    222         case 2:
    223           if (ch == 'x') {
    224             m_dwIndex++;
    225             iState = 4;
    226             break;
    227           }
    228           iState = 3;
    229         case 3:
    230           m_dwIndex++;
    231           if (ch == ';') {
    232             iState = 10;
    233             break;
    234           }
    235           if (g_FXCRT_XML_IsDigital(ch))
    236             code = code * 10 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
    237           break;
    238         case 4:
    239           m_dwIndex++;
    240           if (ch == ';') {
    241             iState = 10;
    242             break;
    243           }
    244           uint8_t nHex =
    245               g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
    246           if (nHex) {
    247             if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
    248               code = (code << 4) +
    249                      FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
    250             } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
    251               code = (code << 4) + ch - 87;
    252             } else {
    253               code = (code << 4) + ch - 55;
    254             }
    255           }
    256           break;
    257       }
    258       if (iState == 10)
    259         break;
    260     }
    261     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    262     if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
    263       break;
    264     }
    265   } while (ReadNextBlock());
    266   return code;
    267 }
    268 
    269 WideString CXML_Parser::GetAttrValue() {
    270   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    271   if (IsEOF())
    272     return WideString();
    273 
    274   CFX_UTF8Decoder decoder;
    275   uint8_t mark = 0;
    276   uint8_t ch = 0;
    277   do {
    278     while (m_dwIndex < m_dwBufferSize) {
    279       ch = m_pBuffer[m_dwIndex];
    280       if (mark == 0) {
    281         if (ch != '\'' && ch != '"')
    282           return WideString();
    283 
    284         mark = ch;
    285         m_dwIndex++;
    286         ch = 0;
    287         continue;
    288       }
    289       m_dwIndex++;
    290       if (ch == mark)
    291         break;
    292 
    293       if (ch == '&') {
    294         decoder.AppendCodePoint(GetCharRef());
    295         if (IsEOF())
    296           return WideString(decoder.GetResult());
    297       } else {
    298         decoder.Input(ch);
    299       }
    300     }
    301     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    302     if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
    303       break;
    304   } while (ReadNextBlock());
    305   return WideString(decoder.GetResult());
    306 }
    307 
    308 void CXML_Parser::GetTagName(bool bStartTag,
    309                              bool* bEndTag,
    310                              ByteString* space,
    311                              ByteString* name) {
    312   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    313   if (IsEOF())
    314     return;
    315 
    316   *bEndTag = false;
    317   uint8_t ch;
    318   int32_t iState = bStartTag ? 1 : 0;
    319   do {
    320     while (m_dwIndex < m_dwBufferSize) {
    321       ch = m_pBuffer[m_dwIndex];
    322       switch (iState) {
    323         case 0:
    324           m_dwIndex++;
    325           if (ch != '<')
    326             break;
    327 
    328           iState = 1;
    329           break;
    330         case 1:
    331           if (ch == '?') {
    332             m_dwIndex++;
    333             SkipLiterals("?>");
    334             iState = 0;
    335             break;
    336           }
    337           if (ch == '!') {
    338             m_dwIndex++;
    339             SkipLiterals("-->");
    340             iState = 0;
    341             break;
    342           }
    343           if (ch == '/') {
    344             m_dwIndex++;
    345             GetName(space, name);
    346             *bEndTag = true;
    347           } else {
    348             GetName(space, name);
    349             *bEndTag = false;
    350           }
    351           return;
    352       }
    353     }
    354     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    355     if (m_dwIndex < m_dwBufferSize || IsEOF())
    356       break;
    357   } while (ReadNextBlock());
    358 }
    359 
    360 std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
    361                                                         bool bStartTag) {
    362   return ParseElementInternal(pParent, bStartTag, 0);
    363 }
    364 
    365 std::unique_ptr<CXML_Element> CXML_Parser::ParseElementInternal(
    366     CXML_Element* pParent,
    367     bool bStartTag,
    368     int nDepth) {
    369   if (nDepth > kMaxDepth)
    370     return nullptr;
    371 
    372   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    373   if (IsEOF())
    374     return nullptr;
    375 
    376   ByteString tag_name;
    377   ByteString tag_space;
    378   bool bEndTag;
    379   GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
    380   if (tag_name.IsEmpty() || bEndTag)
    381     return nullptr;
    382 
    383   auto pElement = pdfium::MakeUnique<CXML_Element>(
    384       pParent, tag_space.AsStringView(), tag_name.AsStringView());
    385   do {
    386     ByteString attr_space;
    387     ByteString attr_name;
    388     while (m_dwIndex < m_dwBufferSize) {
    389       SkipWhiteSpaces();
    390       if (IsEOF())
    391         break;
    392 
    393       if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
    394         break;
    395 
    396       GetName(&attr_space, &attr_name);
    397       SkipWhiteSpaces();
    398       if (IsEOF())
    399         break;
    400 
    401       if (m_pBuffer[m_dwIndex] != '=')
    402         break;
    403 
    404       m_dwIndex++;
    405       SkipWhiteSpaces();
    406       if (IsEOF())
    407         break;
    408 
    409       WideString attr_value = GetAttrValue();
    410       pElement->SetAttribute(attr_space, attr_name, attr_value);
    411     }
    412     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    413     if (m_dwIndex < m_dwBufferSize || IsEOF())
    414       break;
    415   } while (ReadNextBlock());
    416   SkipWhiteSpaces();
    417   if (IsEOF())
    418     return pElement;
    419 
    420   uint8_t ch = m_pBuffer[m_dwIndex++];
    421   if (ch == '/') {
    422     m_dwIndex++;
    423     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    424     return pElement;
    425   }
    426   if (ch != '>') {
    427     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    428     return nullptr;
    429   }
    430   SkipWhiteSpaces();
    431   if (IsEOF())
    432     return pElement;
    433 
    434   CFX_UTF8Decoder decoder;
    435   CFX_WideTextBuf content;
    436   bool bCDATA = false;
    437   int32_t iState = 0;
    438   do {
    439     while (m_dwIndex < m_dwBufferSize) {
    440       ch = m_pBuffer[m_dwIndex++];
    441       switch (iState) {
    442         case 0:
    443           if (ch == '<') {
    444             iState = 1;
    445           } else if (ch == '&') {
    446             decoder.ClearStatus();
    447             decoder.AppendCodePoint(GetCharRef());
    448           } else {
    449             decoder.Input(ch);
    450           }
    451           break;
    452         case 1:
    453           if (ch == '!') {
    454             iState = 2;
    455           } else if (ch == '?') {
    456             SkipLiterals("?>");
    457             SkipWhiteSpaces();
    458             iState = 0;
    459           } else if (ch == '/') {
    460             ByteString space;
    461             ByteString name;
    462             GetName(&space, &name);
    463             SkipWhiteSpaces();
    464             m_dwIndex++;
    465             iState = 10;
    466           } else {
    467             content << decoder.GetResult();
    468             WideString dataStr = content.MakeString();
    469             if (!bCDATA)
    470               dataStr.TrimRight(L" \t\r\n");
    471 
    472             InsertContentSegment(bCDATA, dataStr.AsStringView(),
    473                                  pElement.get());
    474             content.Clear();
    475             decoder.Clear();
    476             bCDATA = false;
    477             iState = 0;
    478             m_dwIndex--;
    479             std::unique_ptr<CXML_Element> pSubElement =
    480                 ParseElementInternal(pElement.get(), true, nDepth + 1);
    481             if (!pSubElement)
    482               break;
    483 
    484             pElement->AppendChild(std::move(pSubElement));
    485             SkipWhiteSpaces();
    486           }
    487           break;
    488         case 2:
    489           if (ch == '[') {
    490             SkipLiterals("]]>");
    491           } else if (ch == '-') {
    492             m_dwIndex++;
    493             SkipLiterals("-->");
    494           } else {
    495             SkipLiterals(">");
    496           }
    497           decoder.Clear();
    498           SkipWhiteSpaces();
    499           iState = 0;
    500           break;
    501       }
    502       if (iState == 10) {
    503         break;
    504       }
    505     }
    506     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    507     if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
    508       break;
    509   } while (ReadNextBlock());
    510   content << decoder.GetResult();
    511   WideString dataStr = content.MakeString();
    512   dataStr.TrimRight(L" \t\r\n");
    513 
    514   InsertContentSegment(bCDATA, dataStr.AsStringView(), pElement.get());
    515   content.Clear();
    516   decoder.Clear();
    517   bCDATA = false;
    518   return pElement;
    519 }
    520 
    521 void CXML_Parser::InsertContentSegment(bool bCDATA,
    522                                        const WideStringView& content,
    523                                        CXML_Element* pElement) {
    524   if (content.IsEmpty())
    525     return;
    526 
    527   pElement->AppendChild(pdfium::MakeUnique<CXML_Content>(bCDATA, content));
    528 }
    529