Home | History | Annotate | Download | only in fpdf_parser
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/include/fpdfapi/fpdf_parser.h"
      8 
      9 #include "core/include/fxcrt/fx_ext.h"
     10 
     11 // Indexed by 8-bit character code, contains either:
     12 //   'W' - for whitespace: NUL, TAB, CR, LF, FF, SPACE, 0x80, 0xff
     13 //   'N' - for numeric: 0123456789+-.
     14 //   'D' - for delimiter: %()/<>[]{}
     15 //   'R' - otherwise.
     16 const char PDF_CharType[256] = {
     17     // NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL  BS   HT   LF   VT   FF   CR   SO
     18     // SI
     19     'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W', 'W', 'R', 'W', 'W', 'R',
     20     'R',
     21 
     22     // DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN  EM   SUB  ESC  FS   GS   RS
     23     // US
     24     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     25     'R',
     26 
     27     // SP    !    "    #    $    %    &        (    )    *    +    ,    -    .
     28     // /
     29     'W', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'D', 'D', 'R', 'N', 'R', 'N', 'N',
     30     'D',
     31 
     32     // 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    > ?
     33     'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'R', 'R', 'D', 'R', 'D',
     34     'R',
     35 
     36     // @    A    B    C    D    E    F    G    H    I    J    K    L    M    N O
     37     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     38     'R',
     39 
     40     // P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^ _
     41     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'R',
     42     'R',
     43 
     44     // `    a    b    c    d    e    f    g    h    i    j    k    l    m    n o
     45     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     46     'R',
     47 
     48     // p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
     49     // DEL
     50     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'R',
     51     'R',
     52 
     53     'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     54     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     55     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     56     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     57     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     58     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     59     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     60     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     61     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W'};
     62 
     63 CPDF_SimpleParser::CPDF_SimpleParser(const uint8_t* pData, FX_DWORD dwSize) {
     64   m_pData = pData;
     65   m_dwSize = dwSize;
     66   m_dwCurPos = 0;
     67 }
     68 CPDF_SimpleParser::CPDF_SimpleParser(const CFX_ByteStringC& str) {
     69   m_pData = str.GetPtr();
     70   m_dwSize = str.GetLength();
     71   m_dwCurPos = 0;
     72 }
     73 void CPDF_SimpleParser::ParseWord(const uint8_t*& pStart,
     74                                   FX_DWORD& dwSize,
     75                                   int& type) {
     76   pStart = NULL;
     77   dwSize = 0;
     78   type = PDFWORD_EOF;
     79   uint8_t ch;
     80   while (1) {
     81     if (m_dwSize <= m_dwCurPos)
     82       return;
     83     ch = m_pData[m_dwCurPos++];
     84     while (PDFCharIsWhitespace(ch)) {
     85       if (m_dwSize <= m_dwCurPos)
     86         return;
     87       ch = m_pData[m_dwCurPos++];
     88     }
     89 
     90     if (ch != '%')
     91       break;
     92 
     93     while (1) {
     94       if (m_dwSize <= m_dwCurPos)
     95         return;
     96       ch = m_pData[m_dwCurPos++];
     97       if (ch == '\r' || ch == '\n')
     98         break;
     99     }
    100   }
    101 
    102   FX_DWORD start_pos = m_dwCurPos - 1;
    103   pStart = m_pData + start_pos;
    104   if (PDFCharIsDelimiter(ch)) {
    105     if (ch == '/') {
    106       while (1) {
    107         if (m_dwSize <= m_dwCurPos)
    108           return;
    109         ch = m_pData[m_dwCurPos++];
    110         if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
    111           m_dwCurPos--;
    112           dwSize = m_dwCurPos - start_pos;
    113           type = PDFWORD_NAME;
    114           return;
    115         }
    116       }
    117     } else {
    118       type = PDFWORD_DELIMITER;
    119       dwSize = 1;
    120       if (ch == '<') {
    121         if (m_dwSize <= m_dwCurPos)
    122           return;
    123         ch = m_pData[m_dwCurPos++];
    124         if (ch == '<')
    125           dwSize = 2;
    126         else
    127           m_dwCurPos--;
    128       } else if (ch == '>') {
    129         if (m_dwSize <= m_dwCurPos)
    130           return;
    131         ch = m_pData[m_dwCurPos++];
    132         if (ch == '>')
    133           dwSize = 2;
    134         else
    135           m_dwCurPos--;
    136       }
    137     }
    138     return;
    139   }
    140 
    141   type = PDFWORD_NUMBER;
    142   dwSize = 1;
    143   while (1) {
    144     if (!PDFCharIsNumeric(ch))
    145       type = PDFWORD_TEXT;
    146     if (m_dwSize <= m_dwCurPos)
    147       return;
    148     ch = m_pData[m_dwCurPos++];
    149 
    150     if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
    151       m_dwCurPos--;
    152       break;
    153     }
    154     dwSize++;
    155   }
    156 }
    157 CFX_ByteStringC CPDF_SimpleParser::GetWord() {
    158   const uint8_t* pStart;
    159   FX_DWORD dwSize;
    160   int type;
    161   ParseWord(pStart, dwSize, type);
    162   if (dwSize == 1 && pStart[0] == '<') {
    163     while (m_dwCurPos < m_dwSize && m_pData[m_dwCurPos] != '>') {
    164       m_dwCurPos++;
    165     }
    166     if (m_dwCurPos < m_dwSize) {
    167       m_dwCurPos++;
    168     }
    169     return CFX_ByteStringC(pStart,
    170                            (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
    171   }
    172   if (dwSize == 1 && pStart[0] == '(') {
    173     int level = 1;
    174     while (m_dwCurPos < m_dwSize) {
    175       if (m_pData[m_dwCurPos] == ')') {
    176         level--;
    177         if (level == 0) {
    178           break;
    179         }
    180       }
    181       if (m_pData[m_dwCurPos] == '\\') {
    182         if (m_dwSize <= m_dwCurPos) {
    183           break;
    184         }
    185         m_dwCurPos++;
    186       } else if (m_pData[m_dwCurPos] == '(') {
    187         level++;
    188       }
    189       if (m_dwSize <= m_dwCurPos) {
    190         break;
    191       }
    192       m_dwCurPos++;
    193     }
    194     if (m_dwCurPos < m_dwSize) {
    195       m_dwCurPos++;
    196     }
    197     return CFX_ByteStringC(pStart,
    198                            (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
    199   }
    200   return CFX_ByteStringC(pStart, dwSize);
    201 }
    202 FX_BOOL CPDF_SimpleParser::SearchToken(const CFX_ByteStringC& token) {
    203   int token_len = token.GetLength();
    204   while (m_dwCurPos < m_dwSize - token_len) {
    205     if (FXSYS_memcmp(m_pData + m_dwCurPos, token.GetPtr(), token_len) == 0) {
    206       break;
    207     }
    208     m_dwCurPos++;
    209   }
    210   if (m_dwCurPos == m_dwSize - token_len) {
    211     return FALSE;
    212   }
    213   m_dwCurPos += token_len;
    214   return TRUE;
    215 }
    216 FX_BOOL CPDF_SimpleParser::SkipWord(const CFX_ByteStringC& token) {
    217   while (1) {
    218     CFX_ByteStringC word = GetWord();
    219     if (word.IsEmpty()) {
    220       return FALSE;
    221     }
    222     if (word == token) {
    223       return TRUE;
    224     }
    225   }
    226   return FALSE;
    227 }
    228 FX_BOOL CPDF_SimpleParser::FindTagPair(const CFX_ByteStringC& start_token,
    229                                        const CFX_ByteStringC& end_token,
    230                                        FX_DWORD& start_pos,
    231                                        FX_DWORD& end_pos) {
    232   if (!start_token.IsEmpty()) {
    233     if (!SkipWord(start_token)) {
    234       return FALSE;
    235     }
    236     start_pos = m_dwCurPos;
    237   }
    238   while (1) {
    239     end_pos = m_dwCurPos;
    240     CFX_ByteStringC word = GetWord();
    241     if (word.IsEmpty()) {
    242       return FALSE;
    243     }
    244     if (word == end_token) {
    245       return TRUE;
    246     }
    247   }
    248   return FALSE;
    249 }
    250 FX_BOOL CPDF_SimpleParser::FindTagParam(const CFX_ByteStringC& token,
    251                                         int nParams) {
    252   nParams++;
    253   FX_DWORD* pBuf = FX_Alloc(FX_DWORD, nParams);
    254   int buf_index = 0;
    255   int buf_count = 0;
    256   while (1) {
    257     pBuf[buf_index++] = m_dwCurPos;
    258     if (buf_index == nParams) {
    259       buf_index = 0;
    260     }
    261     buf_count++;
    262     if (buf_count > nParams) {
    263       buf_count = nParams;
    264     }
    265     CFX_ByteStringC word = GetWord();
    266     if (word.IsEmpty()) {
    267       FX_Free(pBuf);
    268       return FALSE;
    269     }
    270     if (word == token) {
    271       if (buf_count < nParams) {
    272         continue;
    273       }
    274       m_dwCurPos = pBuf[buf_index];
    275       FX_Free(pBuf);
    276       return TRUE;
    277     }
    278   }
    279   return FALSE;
    280 }
    281 
    282 CFX_ByteString PDF_NameDecode(const CFX_ByteStringC& bstr) {
    283   int size = bstr.GetLength();
    284   const FX_CHAR* pSrc = bstr.GetCStr();
    285   if (!FXSYS_memchr(pSrc, '#', size)) {
    286     return bstr;
    287   }
    288   CFX_ByteString result;
    289   FX_CHAR* pDestStart = result.GetBuffer(size);
    290   FX_CHAR* pDest = pDestStart;
    291   for (int i = 0; i < size; i++) {
    292     if (pSrc[i] == '#' && i < size - 2) {
    293       *pDest++ =
    294           FXSYS_toHexDigit(pSrc[i + 1]) * 16 + FXSYS_toHexDigit(pSrc[i + 2]);
    295       i += 2;
    296     } else {
    297       *pDest++ = pSrc[i];
    298     }
    299   }
    300   result.ReleaseBuffer((FX_STRSIZE)(pDest - pDestStart));
    301   return result;
    302 }
    303 CFX_ByteString PDF_NameDecode(const CFX_ByteString& orig) {
    304   if (!FXSYS_memchr(orig.c_str(), '#', orig.GetLength())) {
    305     return orig;
    306   }
    307   return PDF_NameDecode(CFX_ByteStringC(orig));
    308 }
    309 CFX_ByteString PDF_NameEncode(const CFX_ByteString& orig) {
    310   uint8_t* src_buf = (uint8_t*)orig.c_str();
    311   int src_len = orig.GetLength();
    312   int dest_len = 0;
    313   int i;
    314   for (i = 0; i < src_len; i++) {
    315     uint8_t ch = src_buf[i];
    316     if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' ||
    317         PDFCharIsDelimiter(ch)) {
    318       dest_len += 3;
    319     } else {
    320       dest_len++;
    321     }
    322   }
    323   if (dest_len == src_len)
    324     return orig;
    325 
    326   CFX_ByteString res;
    327   FX_CHAR* dest_buf = res.GetBuffer(dest_len);
    328   dest_len = 0;
    329   for (i = 0; i < src_len; i++) {
    330     uint8_t ch = src_buf[i];
    331     if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' ||
    332         PDFCharIsDelimiter(ch)) {
    333       dest_buf[dest_len++] = '#';
    334       dest_buf[dest_len++] = "0123456789ABCDEF"[ch / 16];
    335       dest_buf[dest_len++] = "0123456789ABCDEF"[ch % 16];
    336     } else {
    337       dest_buf[dest_len++] = ch;
    338     }
    339   }
    340   dest_buf[dest_len] = 0;
    341   res.ReleaseBuffer();
    342   return res;
    343 }
    344 CFX_ByteTextBuf& operator<<(CFX_ByteTextBuf& buf, const CPDF_Object* pObj) {
    345   if (!pObj) {
    346     buf << " null";
    347     return buf;
    348   }
    349   switch (pObj->GetType()) {
    350     case PDFOBJ_NULL:
    351       buf << " null";
    352       break;
    353     case PDFOBJ_BOOLEAN:
    354     case PDFOBJ_NUMBER:
    355       buf << " " << pObj->GetString();
    356       break;
    357     case PDFOBJ_STRING:
    358       buf << PDF_EncodeString(pObj->GetString(), pObj->AsString()->IsHex());
    359       break;
    360     case PDFOBJ_NAME: {
    361       CFX_ByteString str = pObj->GetString();
    362       buf << "/" << PDF_NameEncode(str);
    363       break;
    364     }
    365     case PDFOBJ_REFERENCE: {
    366       buf << " " << pObj->AsReference()->GetRefObjNum() << " 0 R ";
    367       break;
    368     }
    369     case PDFOBJ_ARRAY: {
    370       const CPDF_Array* p = pObj->AsArray();
    371       buf << "[";
    372       for (FX_DWORD i = 0; i < p->GetCount(); i++) {
    373         CPDF_Object* pElement = p->GetElement(i);
    374         if (pElement->GetObjNum()) {
    375           buf << " " << pElement->GetObjNum() << " 0 R";
    376         } else {
    377           buf << pElement;
    378         }
    379       }
    380       buf << "]";
    381       break;
    382     }
    383     case PDFOBJ_DICTIONARY: {
    384       const CPDF_Dictionary* p = pObj->AsDictionary();
    385       buf << "<<";
    386       for (const auto& it : *p) {
    387         const CFX_ByteString& key = it.first;
    388         CPDF_Object* pValue = it.second;
    389         buf << "/" << PDF_NameEncode(key);
    390         if (pValue && pValue->GetObjNum()) {
    391           buf << " " << pValue->GetObjNum() << " 0 R ";
    392         } else {
    393           buf << pValue;
    394         }
    395       }
    396       buf << ">>";
    397       break;
    398     }
    399     case PDFOBJ_STREAM: {
    400       const CPDF_Stream* p = pObj->AsStream();
    401       buf << p->GetDict() << "stream\r\n";
    402       CPDF_StreamAcc acc;
    403       acc.LoadAllData(p, TRUE);
    404       buf.AppendBlock(acc.GetData(), acc.GetSize());
    405       buf << "\r\nendstream";
    406       break;
    407     }
    408     default:
    409       ASSERT(FALSE);
    410       break;
    411   }
    412   return buf;
    413 }
    414 FX_FLOAT PDF_ClipFloat(FX_FLOAT f) {
    415   if (f < 0) {
    416     return 0;
    417   }
    418   if (f > 1.0f) {
    419     return 1.0f;
    420   }
    421   return f;
    422 }
    423 static CPDF_Object* SearchNumberNode(CPDF_Dictionary* pNode, int num) {
    424   CPDF_Array* pLimits = pNode->GetArray("Limits");
    425   if (pLimits &&
    426       (num < pLimits->GetInteger(0) || num > pLimits->GetInteger(1))) {
    427     return NULL;
    428   }
    429   CPDF_Array* pNumbers = pNode->GetArray("Nums");
    430   if (pNumbers) {
    431     FX_DWORD dwCount = pNumbers->GetCount() / 2;
    432     for (FX_DWORD i = 0; i < dwCount; i++) {
    433       int index = pNumbers->GetInteger(i * 2);
    434       if (num == index) {
    435         return pNumbers->GetElementValue(i * 2 + 1);
    436       }
    437       if (index > num) {
    438         break;
    439       }
    440     }
    441     return NULL;
    442   }
    443   CPDF_Array* pKids = pNode->GetArray("Kids");
    444   if (!pKids) {
    445     return NULL;
    446   }
    447   for (FX_DWORD i = 0; i < pKids->GetCount(); i++) {
    448     CPDF_Dictionary* pKid = pKids->GetDict(i);
    449     if (!pKid) {
    450       continue;
    451     }
    452     CPDF_Object* pFound = SearchNumberNode(pKid, num);
    453     if (pFound) {
    454       return pFound;
    455     }
    456   }
    457   return NULL;
    458 }
    459 CPDF_Object* CPDF_NumberTree::LookupValue(int num) {
    460   return SearchNumberNode(m_pRoot, num);
    461 }
    462