Home | History | Annotate | Download | only in fpdf_parser
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "../../../include/fpdfapi/fpdf_parser.h"
      8 const char PDF_CharType[256] = {
      9   //NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL  BS   HT   LF   VT   FF   CR   SO   SI
     10     'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W', 'W', 'R', 'W', 'W', 'R', 'R',
     11 
     12   //DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN  EM   SUB  ESC  FS   GS   RS   US
     13     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     14 
     15   //SP    !    "    #    $    %    &        (    )    *    +    ,    -    .    /
     16     'W', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'D', 'D', 'R', 'N', 'R', 'N', 'N', 'D',
     17 
     18   // 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
     19     'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'R', 'R', 'D', 'R', 'D', 'R',
     20 
     21   // @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     22     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     23 
     24   // P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
     25     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'R', 'R',
     26 
     27   // `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     28     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     29 
     30   // p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
     31     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'R', 'R',
     32 
     33     'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     34     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     35     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     36     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     37     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     38     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     39     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
     40     'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W'
     41 };
     42 
     43 #ifndef MAX_PATH
     44 #define MAX_PATH 4096
     45 #endif
     46 CPDF_SimpleParser::CPDF_SimpleParser(FX_LPCBYTE pData, FX_DWORD dwSize)
     47 {
     48     m_pData = pData;
     49     m_dwSize = dwSize;
     50     m_dwCurPos = 0;
     51 }
     52 CPDF_SimpleParser::CPDF_SimpleParser(FX_BSTR str)
     53 {
     54     m_pData = str.GetPtr();
     55     m_dwSize = str.GetLength();
     56     m_dwCurPos = 0;
     57 }
     58 void CPDF_SimpleParser::ParseWord(FX_LPCBYTE& pStart, FX_DWORD& dwSize, int& type)
     59 {
     60     pStart = NULL;
     61     dwSize = 0;
     62     type = PDFWORD_EOF;
     63     FX_BYTE ch;
     64     char chartype;
     65     while (1) {
     66         if (m_dwSize <= m_dwCurPos) {
     67             return;
     68         }
     69         ch = m_pData[m_dwCurPos++];
     70         chartype = PDF_CharType[ch];
     71         while (chartype == 'W') {
     72             if (m_dwSize <= m_dwCurPos) {
     73                 return;
     74             }
     75             ch = m_pData[m_dwCurPos++];
     76             chartype = PDF_CharType[ch];
     77         }
     78         if (ch != '%') {
     79             break;
     80         }
     81         while (1) {
     82             if (m_dwSize <= m_dwCurPos) {
     83                 return;
     84             }
     85             ch = m_pData[m_dwCurPos++];
     86             if (ch == '\r' || ch == '\n') {
     87                 break;
     88             }
     89         }
     90         chartype = PDF_CharType[ch];
     91     }
     92     FX_DWORD start_pos = m_dwCurPos - 1;
     93     pStart = m_pData + start_pos;
     94     if (chartype == 'D') {
     95         if (ch == '/') {
     96             while (1) {
     97                 if (m_dwSize <= m_dwCurPos) {
     98                     return;
     99                 }
    100                 ch = m_pData[m_dwCurPos++];
    101                 chartype = PDF_CharType[ch];
    102                 if (chartype != 'R' && chartype != 'N') {
    103                     m_dwCurPos --;
    104                     dwSize = m_dwCurPos - start_pos;
    105                     type = PDFWORD_NAME;
    106                     return;
    107                 }
    108             }
    109         } else {
    110             type = PDFWORD_DELIMITER;
    111             dwSize = 1;
    112             if (ch == '<') {
    113                 if (m_dwSize <= m_dwCurPos) {
    114                     return;
    115                 }
    116                 ch = m_pData[m_dwCurPos++];
    117                 if (ch == '<') {
    118                     dwSize = 2;
    119                 } else {
    120                     m_dwCurPos --;
    121                 }
    122             } else if (ch == '>') {
    123                 if (m_dwSize <= m_dwCurPos) {
    124                     return;
    125                 }
    126                 ch = m_pData[m_dwCurPos++];
    127                 if (ch == '>') {
    128                     dwSize = 2;
    129                 } else {
    130                     m_dwCurPos --;
    131                 }
    132             }
    133         }
    134         return;
    135     }
    136     type = PDFWORD_NUMBER;
    137     dwSize = 1;
    138     while (1) {
    139         if (chartype != 'N') {
    140             type = PDFWORD_TEXT;
    141         }
    142         if (m_dwSize <= m_dwCurPos) {
    143             return;
    144         }
    145         ch = m_pData[m_dwCurPos++];
    146         chartype = PDF_CharType[ch];
    147         if (chartype == 'D' || chartype == 'W') {
    148             m_dwCurPos --;
    149             break;
    150         }
    151         dwSize ++;
    152     }
    153 }
    154 CFX_ByteStringC CPDF_SimpleParser::GetWord()
    155 {
    156     FX_LPCBYTE pStart;
    157     FX_DWORD dwSize;
    158     int type;
    159     ParseWord(pStart, dwSize, type);
    160     if (dwSize == 1 && pStart[0] == '<') {
    161         while (m_dwCurPos < m_dwSize && m_pData[m_dwCurPos] != '>') {
    162             m_dwCurPos ++;
    163         }
    164         if (m_dwCurPos < m_dwSize) {
    165             m_dwCurPos ++;
    166         }
    167         return CFX_ByteStringC(pStart, (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
    168     } else if (dwSize == 1 && pStart[0] == '(') {
    169         int level = 1;
    170         while (m_dwCurPos < m_dwSize) {
    171             if (m_pData[m_dwCurPos] == ')') {
    172                 level --;
    173                 if (level == 0) {
    174                     break;
    175                 }
    176             }
    177             if (m_pData[m_dwCurPos] == '\\') {
    178                 if (m_dwSize <= m_dwCurPos) {
    179                     break;
    180                 }
    181                 m_dwCurPos ++;
    182             } else if (m_pData[m_dwCurPos] == '(') {
    183                 level ++;
    184             }
    185             if (m_dwSize <= m_dwCurPos) {
    186                 break;
    187             }
    188             m_dwCurPos ++;
    189         }
    190         if (m_dwCurPos < m_dwSize) {
    191             m_dwCurPos ++;
    192         }
    193         return CFX_ByteStringC(pStart, (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
    194     }
    195     return CFX_ByteStringC(pStart, dwSize);
    196 }
    197 FX_BOOL CPDF_SimpleParser::SearchToken(FX_BSTR token)
    198 {
    199     int token_len = token.GetLength();
    200     while (m_dwCurPos < m_dwSize - token_len) {
    201         if (FXSYS_memcmp32(m_pData + m_dwCurPos, token.GetPtr(), token_len) == 0) {
    202             break;
    203         }
    204         m_dwCurPos ++;
    205     }
    206     if (m_dwCurPos == m_dwSize - token_len) {
    207         return FALSE;
    208     }
    209     m_dwCurPos += token_len;
    210     return TRUE;
    211 }
    212 FX_BOOL CPDF_SimpleParser::SkipWord(FX_BSTR token)
    213 {
    214     while (1) {
    215         CFX_ByteStringC word = GetWord();
    216         if (word.IsEmpty()) {
    217             return FALSE;
    218         }
    219         if (word == token) {
    220             return TRUE;
    221         }
    222     }
    223     return FALSE;
    224 }
    225 FX_BOOL CPDF_SimpleParser::FindTagPair(FX_BSTR start_token, FX_BSTR end_token,
    226                                        FX_DWORD& start_pos, FX_DWORD& end_pos)
    227 {
    228     if (!start_token.IsEmpty()) {
    229         if (!SkipWord(start_token)) {
    230             return FALSE;
    231         }
    232         start_pos = m_dwCurPos;
    233     }
    234     while (1) {
    235         end_pos = m_dwCurPos;
    236         CFX_ByteStringC word = GetWord();
    237         if (word.IsEmpty()) {
    238             return FALSE;
    239         }
    240         if (word == end_token) {
    241             return TRUE;
    242         }
    243     }
    244     return FALSE;
    245 }
    246 FX_BOOL CPDF_SimpleParser::FindTagParam(FX_BSTR token, int nParams)
    247 {
    248     nParams ++;
    249     FX_DWORD* pBuf = FX_Alloc(FX_DWORD, nParams);
    250     int buf_index = 0;
    251     int buf_count = 0;
    252     while (1) {
    253         pBuf[buf_index++] = m_dwCurPos;
    254         if (buf_index == nParams) {
    255             buf_index = 0;
    256         }
    257         buf_count ++;
    258         if (buf_count > nParams) {
    259             buf_count = nParams;
    260         }
    261         CFX_ByteStringC word = GetWord();
    262         if (word.IsEmpty()) {
    263             FX_Free(pBuf);
    264             return FALSE;
    265         }
    266         if (word == token) {
    267             if (buf_count < nParams) {
    268                 continue;
    269             }
    270             m_dwCurPos = pBuf[buf_index];
    271             FX_Free(pBuf);
    272             return TRUE;
    273         }
    274     }
    275     return FALSE;
    276 }
    277 static int _hex2dec(char ch)
    278 {
    279     if (ch >= '0' && ch <= '9') {
    280         return ch - '0';
    281     }
    282     if (ch >= 'a' && ch <= 'f') {
    283         return ch - 'a' + 10;
    284     }
    285     if (ch >= 'A' && ch <= 'F') {
    286         return ch - 'A' + 10;
    287     }
    288     return 0;
    289 }
    290 CFX_ByteString PDF_NameDecode(FX_BSTR bstr)
    291 {
    292     int size = bstr.GetLength();
    293     FX_LPCSTR pSrc = bstr.GetCStr();
    294     if (FXSYS_memchr(pSrc, '#', size) == NULL) {
    295         return bstr;
    296     }
    297     CFX_ByteString result;
    298     FX_LPSTR pDestStart = result.GetBuffer(size);
    299     FX_LPSTR pDest = pDestStart;
    300     for (int i = 0; i < size; i ++) {
    301         if (pSrc[i] == '#' && i < size - 2) {
    302             *pDest ++ = _hex2dec(pSrc[i + 1]) * 16 + _hex2dec(pSrc[i + 2]);
    303             i += 2;
    304         } else {
    305             *pDest ++ = pSrc[i];
    306         }
    307     }
    308     result.ReleaseBuffer((FX_STRSIZE)(pDest - pDestStart));
    309     return result;
    310 }
    311 CFX_ByteString PDF_NameDecode(const CFX_ByteString& orig)
    312 {
    313     if (FXSYS_memchr(orig.c_str(), '#', orig.GetLength()) == NULL) {
    314         return orig;
    315     }
    316     return PDF_NameDecode(CFX_ByteStringC(orig));
    317 }
    318 CFX_ByteString PDF_NameEncode(const CFX_ByteString& orig)
    319 {
    320     FX_LPBYTE src_buf = (FX_LPBYTE)orig.c_str();
    321     int src_len = orig.GetLength();
    322     int dest_len = 0;
    323     int i;
    324     for (i = 0; i < src_len; i ++) {
    325         FX_BYTE ch = src_buf[i];
    326         if (ch >= 0x80 || PDF_CharType[ch] == 'W' || ch == '#' ||
    327                 PDF_CharType[ch] == 'D') {
    328             dest_len += 3;
    329         } else {
    330             dest_len ++;
    331         }
    332     }
    333     if (dest_len == src_len) {
    334         return orig;
    335     }
    336     CFX_ByteString res;
    337     FX_LPSTR dest_buf = res.GetBuffer(dest_len);
    338     dest_len = 0;
    339     for (i = 0; i < src_len; i ++) {
    340         FX_BYTE ch = src_buf[i];
    341         if (ch >= 0x80 || PDF_CharType[ch] == 'W' || ch == '#' ||
    342                 PDF_CharType[ch] == 'D') {
    343             dest_buf[dest_len++] = '#';
    344             dest_buf[dest_len++] = "0123456789ABCDEF"[ch / 16];
    345             dest_buf[dest_len++] = "0123456789ABCDEF"[ch % 16];
    346         } else {
    347             dest_buf[dest_len++] = ch;
    348         }
    349     }
    350     dest_buf[dest_len] = 0;
    351     res.ReleaseBuffer();
    352     return res;
    353 }
    354 CFX_ByteTextBuf& operator << (CFX_ByteTextBuf& buf, const CPDF_Object* pObj)
    355 {
    356     if (pObj == NULL) {
    357         buf << FX_BSTRC(" null");
    358         return buf;
    359     }
    360     switch (pObj->GetType()) {
    361         case PDFOBJ_NULL:
    362             buf << FX_BSTRC(" null");
    363             break;
    364         case PDFOBJ_BOOLEAN:
    365         case PDFOBJ_NUMBER:
    366             buf << " " << pObj->GetString();
    367             break;
    368         case PDFOBJ_STRING: {
    369                 CFX_ByteString str = pObj->GetString();
    370                 FX_BOOL bHex = ((CPDF_String*)pObj)->IsHex();
    371                 buf << PDF_EncodeString(str, bHex);
    372                 break;
    373             }
    374         case PDFOBJ_NAME: {
    375                 CFX_ByteString str = pObj->GetString();
    376                 buf << FX_BSTRC("/") << PDF_NameEncode(str);
    377                 break;
    378             }
    379         case PDFOBJ_REFERENCE: {
    380                 CPDF_Reference* p = (CPDF_Reference*)pObj;
    381                 buf << " " << p->GetRefObjNum() << FX_BSTRC(" 0 R ");
    382                 break;
    383             }
    384         case PDFOBJ_ARRAY: {
    385                 CPDF_Array* p = (CPDF_Array*)pObj;
    386                 buf << FX_BSTRC("[");
    387                 for (FX_DWORD i = 0; i < p->GetCount(); i ++) {
    388                     CPDF_Object* pElement = p->GetElement(i);
    389                     if (pElement->GetObjNum()) {
    390                         buf << " " << pElement->GetObjNum() << FX_BSTRC(" 0 R");
    391                     } else {
    392                         buf << pElement;
    393                     }
    394                 }
    395                 buf << FX_BSTRC("]");
    396                 break;
    397             }
    398         case PDFOBJ_DICTIONARY: {
    399                 CPDF_Dictionary* p = (CPDF_Dictionary*)pObj;
    400                 buf << FX_BSTRC("<<");
    401                 FX_POSITION pos = p->GetStartPos();
    402                 while (pos) {
    403                     CFX_ByteString key;
    404                     CPDF_Object* pValue = p->GetNextElement(pos, key);
    405                     buf << FX_BSTRC("/") << PDF_NameEncode(key);
    406                     if (pValue->GetObjNum()) {
    407                         buf << " " << pValue->GetObjNum() << FX_BSTRC(" 0 R ");
    408                     } else {
    409                         buf << pValue;
    410                     }
    411                 }
    412                 buf << FX_BSTRC(">>");
    413                 break;
    414             }
    415         case PDFOBJ_STREAM: {
    416                 CPDF_Stream* p = (CPDF_Stream*)pObj;
    417                 buf << p->GetDict() << FX_BSTRC("stream\r\n");
    418                 CPDF_StreamAcc acc;
    419                 acc.LoadAllData(p, TRUE);
    420                 buf.AppendBlock(acc.GetData(), acc.GetSize());
    421                 buf << FX_BSTRC("\r\nendstream");
    422                 break;
    423             }
    424         default:
    425             ASSERT(FALSE);
    426             break;
    427     }
    428     return buf;
    429 }
    430 FX_FLOAT PDF_ClipFloat(FX_FLOAT f)
    431 {
    432     if (f < 0) {
    433         return 0;
    434     }
    435     if (f > 1.0f) {
    436         return 1.0f;
    437     }
    438     return f;
    439 }
    440 static CPDF_Object* SearchNumberNode(CPDF_Dictionary* pNode, int num)
    441 {
    442     CPDF_Array* pLimits = pNode->GetArray("Limits");
    443     if (pLimits && (num < pLimits->GetInteger(0) || num > pLimits->GetInteger(1))) {
    444         return NULL;
    445     }
    446     CPDF_Array* pNumbers = pNode->GetArray("Nums");
    447     if (pNumbers) {
    448         FX_DWORD dwCount = pNumbers->GetCount() / 2;
    449         for (FX_DWORD i = 0; i < dwCount; i ++) {
    450             int index = pNumbers->GetInteger(i * 2);
    451             if (num == index) {
    452                 return pNumbers->GetElementValue(i * 2 + 1);
    453             }
    454             if (index > num) {
    455                 break;
    456             }
    457         }
    458         return NULL;
    459     }
    460     CPDF_Array* pKids = pNode->GetArray("Kids");
    461     if (pKids == NULL) {
    462         return NULL;
    463     }
    464     for (FX_DWORD i = 0; i < pKids->GetCount(); i ++) {
    465         CPDF_Dictionary* pKid = pKids->GetDict(i);
    466         if (pKid == NULL) {
    467             continue;
    468         }
    469         CPDF_Object* pFound = SearchNumberNode(pKid, num);
    470         if (pFound) {
    471             return pFound;
    472         }
    473     }
    474     return NULL;
    475 }
    476 CPDF_Object* CPDF_NumberTree::LookupValue(int num)
    477 {
    478     return SearchNumberNode(m_pRoot, num);
    479 }
    480