Home | History | Annotate | Download | only in fpdf_parser
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "../../../include/fpdfapi/fpdf_parser.h"
      // Byte classification table used by the tokenizer code in this file. It is
      // indexed by byte value: 256 entries, written as 8 rows of 32 characters.
      // The classes, as tested by the code below, are:
      //   'W' - whitespace: skipped between words and terminates a word
      //   'D' - delimiter: % ( ) / < > [ ] { }
      //   'N' - numeric: the digits 0-9 and + - .
      //   'R' - regular: every other byte
      // Besides the ASCII whitespace bytes 0x00, 0x09, 0x0A, 0x0C, 0x0D and 0x20,
      // the table also marks 0x80 and 0xFF as 'W'.
      8 extern const FX_LPCSTR _PDF_CharType =
      9     "WRRRRRRRRWWRWWRRRRRRRRRRRRRRRRRR"
     10     "WRRRRDRRDDRNRNNDNNNNNNNNNNRRDRDR"
     11     "RRRRRRRRRRRRRRRRRRRRRRRRRRRDRDRR"
     12     "RRRRRRRRRRRRRRRRRRRRRRRRRRRDRDRR"
     13     "WRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR"
     14     "RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR"
     15     "RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR"
     16     "RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRW";
     // Fallback definition of MAX_PATH for builds where no earlier header has
     // already defined it. Nothing in the visible part of this file uses it.
     17 #ifndef MAX_PATH
     18 #define MAX_PATH 4096
     19 #endif
     20 CPDF_SimpleParser::CPDF_SimpleParser(FX_LPCBYTE pData, FX_DWORD dwSize)
     21 {
     22     m_pData = pData;
     23     m_dwSize = dwSize;
     24     m_dwCurPos = 0;
     25 }
     26 CPDF_SimpleParser::CPDF_SimpleParser(FX_BSTR str)
     27 {
     28     m_pData = str;
     29     m_dwSize = str.GetLength();
     30     m_dwCurPos = 0;
     31 }
     32 void CPDF_SimpleParser::ParseWord(FX_LPCBYTE& pStart, FX_DWORD& dwSize, int& type)
     33 {
     34     pStart = NULL;
     35     dwSize = 0;
     36     type = PDFWORD_EOF;
     37     FX_BYTE ch;
     38     char chartype;
     39     while (1) {
     40         if (m_dwSize <= m_dwCurPos) {
     41             return;
     42         }
     43         ch = m_pData[m_dwCurPos++];
     44         chartype = _PDF_CharType[ch];
     45         while (chartype == 'W') {
     46             if (m_dwSize <= m_dwCurPos) {
     47                 return;
     48             }
     49             ch = m_pData[m_dwCurPos++];
     50             chartype = _PDF_CharType[ch];
     51         }
     52         if (ch != '%') {
     53             break;
     54         }
     55         while (1) {
     56             if (m_dwSize <= m_dwCurPos) {
     57                 return;
     58             }
     59             ch = m_pData[m_dwCurPos++];
     60             if (ch == '\r' || ch == '\n') {
     61                 break;
     62             }
     63         }
     64         chartype = _PDF_CharType[ch];
     65     }
     66     FX_DWORD start_pos = m_dwCurPos - 1;
     67     pStart = m_pData + start_pos;
     68     if (chartype == 'D') {
     69         if (ch == '/') {
     70             while (1) {
     71                 if (m_dwSize <= m_dwCurPos) {
     72                     return;
     73                 }
     74                 ch = m_pData[m_dwCurPos++];
     75                 chartype = _PDF_CharType[ch];
     76                 if (chartype != 'R' && chartype != 'N') {
     77                     m_dwCurPos --;
     78                     dwSize = m_dwCurPos - start_pos;
     79                     type = PDFWORD_NAME;
     80                     return;
     81                 }
     82             }
     83         } else {
     84             type = PDFWORD_DELIMITER;
     85             dwSize = 1;
     86             if (ch == '<') {
     87                 if (m_dwSize <= m_dwCurPos) {
     88                     return;
     89                 }
     90                 ch = m_pData[m_dwCurPos++];
     91                 if (ch == '<') {
     92                     dwSize = 2;
     93                 } else {
     94                     m_dwCurPos --;
     95                 }
     96             } else if (ch == '>') {
     97                 if (m_dwSize <= m_dwCurPos) {
     98                     return;
     99                 }
    100                 ch = m_pData[m_dwCurPos++];
    101                 if (ch == '>') {
    102                     dwSize = 2;
    103                 } else {
    104                     m_dwCurPos --;
    105                 }
    106             }
    107         }
    108         return;
    109     }
    110     type = PDFWORD_NUMBER;
    111     dwSize = 1;
    112     while (1) {
    113         if (chartype != 'N') {
    114             type = PDFWORD_TEXT;
    115         }
    116         if (m_dwSize <= m_dwCurPos) {
    117             return;
    118         }
    119         ch = m_pData[m_dwCurPos++];
    120         chartype = _PDF_CharType[ch];
    121         if (chartype == 'D' || chartype == 'W') {
    122             m_dwCurPos --;
    123             break;
    124         }
    125         dwSize ++;
    126     }
    127 }
    128 CFX_ByteStringC CPDF_SimpleParser::GetWord()
    129 {
    130     FX_LPCBYTE pStart;
    131     FX_DWORD dwSize;
    132     int type;
    133     ParseWord(pStart, dwSize, type);
    134     if (dwSize == 1 && pStart[0] == '<') {
    135         while (m_dwCurPos < m_dwSize && m_pData[m_dwCurPos] != '>') {
    136             m_dwCurPos ++;
    137         }
    138         if (m_dwCurPos < m_dwSize) {
    139             m_dwCurPos ++;
    140         }
    141         return CFX_ByteStringC(pStart, (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
    142     } else if (dwSize == 1 && pStart[0] == '(') {
    143         int level = 1;
    144         while (m_dwCurPos < m_dwSize) {
    145             if (m_pData[m_dwCurPos] == ')') {
    146                 level --;
    147                 if (level == 0) {
    148                     break;
    149                 }
    150             }
    151             if (m_pData[m_dwCurPos] == '\\') {
    152                 if (m_dwSize <= m_dwCurPos) {
    153                     break;
    154                 }
    155                 m_dwCurPos ++;
    156             } else if (m_pData[m_dwCurPos] == '(') {
    157                 level ++;
    158             }
    159             if (m_dwSize <= m_dwCurPos) {
    160                 break;
    161             }
    162             m_dwCurPos ++;
    163         }
    164         if (m_dwCurPos < m_dwSize) {
    165             m_dwCurPos ++;
    166         }
    167         return CFX_ByteStringC(pStart, (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
    168     }
    169     return CFX_ByteStringC(pStart, dwSize);
    170 }
    171 FX_BOOL CPDF_SimpleParser::SearchToken(FX_BSTR token)
    172 {
    173     int token_len = token.GetLength();
    174     while (m_dwCurPos < m_dwSize - token_len) {
    175         if (FXSYS_memcmp32(m_pData + m_dwCurPos, token, token_len) == 0) {
    176             break;
    177         }
    178         m_dwCurPos ++;
    179     }
    180     if (m_dwCurPos == m_dwSize - token_len) {
    181         return FALSE;
    182     }
    183     m_dwCurPos += token_len;
    184     return TRUE;
    185 }
    186 FX_BOOL CPDF_SimpleParser::SkipWord(FX_BSTR token)
    187 {
    188     while (1) {
    189         CFX_ByteStringC word = GetWord();
    190         if (word.IsEmpty()) {
    191             return FALSE;
    192         }
    193         if (word == token) {
    194             return TRUE;
    195         }
    196     }
    197     return FALSE;
    198 }
    199 FX_BOOL CPDF_SimpleParser::FindTagPair(FX_BSTR start_token, FX_BSTR end_token,
    200                                        FX_DWORD& start_pos, FX_DWORD& end_pos)
    201 {
    202     if (!start_token.IsEmpty()) {
    203         if (!SkipWord(start_token)) {
    204             return FALSE;
    205         }
    206         start_pos = m_dwCurPos;
    207     }
    208     while (1) {
    209         end_pos = m_dwCurPos;
    210         CFX_ByteStringC word = GetWord();
    211         if (word.IsEmpty()) {
    212             return FALSE;
    213         }
    214         if (word == end_token) {
    215             return TRUE;
    216         }
    217     }
    218     return FALSE;
    219 }
    220 FX_BOOL CPDF_SimpleParser::FindTagParam(FX_BSTR token, int nParams)
    221 {
    222     nParams ++;
    223     FX_DWORD* pBuf = FX_Alloc(FX_DWORD, nParams);
    224     int buf_index = 0;
    225     int buf_count = 0;
    226     while (1) {
    227         pBuf[buf_index++] = m_dwCurPos;
    228         if (buf_index == nParams) {
    229             buf_index = 0;
    230         }
    231         buf_count ++;
    232         if (buf_count > nParams) {
    233             buf_count = nParams;
    234         }
    235         CFX_ByteStringC word = GetWord();
    236         if (word.IsEmpty()) {
    237             FX_Free(pBuf);
    238             return FALSE;
    239         }
    240         if (word == token) {
    241             if (buf_count < nParams) {
    242                 continue;
    243             }
    244             m_dwCurPos = pBuf[buf_index];
    245             FX_Free(pBuf);
    246             return TRUE;
    247         }
    248     }
    249     return FALSE;
    250 }
    // Converts one hexadecimal digit character (0-9, a-f, A-F) to its value.
    // Any other character yields 0.
    static int _hex2dec(char ch)
    {
        const int nDecimal = ch - '0';
        if (nDecimal >= 0 && nDecimal <= 9) {
            return nDecimal;
        }
        const int nLower = ch - 'a';
        if (nLower >= 0 && nLower <= 5) {
            return nLower + 10;
        }
        const int nUpper = ch - 'A';
        if (nUpper >= 0 && nUpper <= 5) {
            return nUpper + 10;
        }
        return 0;
    }
    264 CFX_ByteString PDF_NameDecode(FX_BSTR bstr)
    265 {
    266     int size = bstr.GetLength();
    267     FX_LPCSTR pSrc = bstr.GetCStr();
    268     if (FXSYS_memchr(pSrc, '#', size) == NULL) {
    269         return bstr;
    270     }
    271     CFX_ByteString result;
    272     FX_LPSTR pDestStart = result.GetBuffer(size);
    273     FX_LPSTR pDest = pDestStart;
    274     for (int i = 0; i < size; i ++) {
    275         if (pSrc[i] == '#' && i < size - 2) {
    276             *pDest ++ = _hex2dec(pSrc[i + 1]) * 16 + _hex2dec(pSrc[i + 2]);
    277             i += 2;
    278         } else {
    279             *pDest ++ = pSrc[i];
    280         }
    281     }
    282     result.ReleaseBuffer((FX_STRSIZE)(pDest - pDestStart));
    283     return result;
    284 }
    285 CFX_ByteString PDF_NameDecode(const CFX_ByteString& orig)
    286 {
    287     if (FXSYS_memchr((FX_LPCSTR)orig, '#', orig.GetLength()) == NULL) {
    288         return orig;
    289     }
    290     return PDF_NameDecode(CFX_ByteStringC(orig));
    291 }
    292 CFX_ByteString PDF_NameEncode(const CFX_ByteString& orig)
    293 {
    294     FX_LPBYTE src_buf = (FX_LPBYTE)(FX_LPCSTR)orig;
    295     int src_len = orig.GetLength();
    296     int dest_len = 0;
    297     int i;
    298     for (i = 0; i < src_len; i ++) {
    299         FX_BYTE ch = src_buf[i];
    300         if (ch >= 0x80 || _PDF_CharType[ch] == 'W' || ch == '#' ||
    301                 _PDF_CharType[ch] == 'D') {
    302             dest_len += 3;
    303         } else {
    304             dest_len ++;
    305         }
    306     }
    307     if (dest_len == src_len) {
    308         return orig;
    309     }
    310     CFX_ByteString res;
    311     FX_LPSTR dest_buf = res.GetBuffer(dest_len);
    312     dest_len = 0;
    313     for (i = 0; i < src_len; i ++) {
    314         FX_BYTE ch = src_buf[i];
    315         if (ch >= 0x80 || _PDF_CharType[ch] == 'W' || ch == '#' ||
    316                 _PDF_CharType[ch] == 'D') {
    317             dest_buf[dest_len++] = '#';
    318             dest_buf[dest_len++] = "0123456789ABCDEF"[ch / 16];
    319             dest_buf[dest_len++] = "0123456789ABCDEF"[ch % 16];
    320         } else {
    321             dest_buf[dest_len++] = ch;
    322         }
    323     }
    324     dest_buf[dest_len] = 0;
    325     res.ReleaseBuffer();
    326     return res;
    327 }
    328 CFX_ByteTextBuf& operator << (CFX_ByteTextBuf& buf, const CPDF_Object* pObj)
    329 {
    330     if (pObj == NULL) {
    331         buf << FX_BSTRC(" null");
    332         return buf;
    333     }
    334     switch (pObj->GetType()) {
    335         case PDFOBJ_NULL:
    336             buf << FX_BSTRC(" null");
    337             break;
    338         case PDFOBJ_BOOLEAN:
    339         case PDFOBJ_NUMBER:
    340             buf << " " << pObj->GetString();
    341             break;
    342         case PDFOBJ_STRING: {
    343                 CFX_ByteString str = pObj->GetString();
    344                 FX_BOOL bHex = ((CPDF_String*)pObj)->IsHex();
    345                 buf << PDF_EncodeString(str, bHex);
    346                 break;
    347             }
    348         case PDFOBJ_NAME: {
    349                 CFX_ByteString str = pObj->GetString();
    350                 buf << FX_BSTRC("/") << PDF_NameEncode(str);
    351                 break;
    352             }
    353         case PDFOBJ_REFERENCE: {
    354                 CPDF_Reference* p = (CPDF_Reference*)pObj;
    355                 buf << " " << p->GetRefObjNum() << FX_BSTRC(" 0 R ");
    356                 break;
    357             }
    358         case PDFOBJ_ARRAY: {
    359                 CPDF_Array* p = (CPDF_Array*)pObj;
    360                 buf << FX_BSTRC("[");
    361                 for (FX_DWORD i = 0; i < p->GetCount(); i ++) {
    362                     CPDF_Object* pElement = p->GetElement(i);
    363                     if (pElement->GetObjNum()) {
    364                         buf << " " << pElement->GetObjNum() << FX_BSTRC(" 0 R");
    365                     } else {
    366                         buf << pElement;
    367                     }
    368                 }
    369                 buf << FX_BSTRC("]");
    370                 break;
    371             }
    372         case PDFOBJ_DICTIONARY: {
    373                 CPDF_Dictionary* p = (CPDF_Dictionary*)pObj;
    374                 buf << FX_BSTRC("<<");
    375                 FX_POSITION pos = p->GetStartPos();
    376                 while (pos) {
    377                     CFX_ByteString key;
    378                     CPDF_Object* pValue = p->GetNextElement(pos, key);
    379                     buf << FX_BSTRC("/") << PDF_NameEncode(key);
    380                     if (pValue->GetObjNum()) {
    381                         buf << " " << pValue->GetObjNum() << FX_BSTRC(" 0 R ");
    382                     } else {
    383                         buf << pValue;
    384                     }
    385                 }
    386                 buf << FX_BSTRC(">>");
    387                 break;
    388             }
    389         case PDFOBJ_STREAM: {
    390                 CPDF_Stream* p = (CPDF_Stream*)pObj;
    391                 buf << p->GetDict() << FX_BSTRC("stream\r\n");
    392                 CPDF_StreamAcc acc;
    393                 acc.LoadAllData(p, TRUE);
    394                 buf.AppendBlock(acc.GetData(), acc.GetSize());
    395                 buf << FX_BSTRC("\r\nendstream");
    396                 break;
    397             }
    398         default:
    399             ASSERT(FALSE);
    400             break;
    401     }
    402     return buf;
    403 }
    404 FX_FLOAT PDF_ClipFloat(FX_FLOAT f)
    405 {
    406     if (f < 0) {
    407         return 0;
    408     }
    409     if (f > 1.0f) {
    410         return 1.0f;
    411     }
    412     return f;
    413 }
    414 static CPDF_Object* SearchNumberNode(CPDF_Dictionary* pNode, int num)
    415 {
    416     CPDF_Array* pLimits = pNode->GetArray("Limits");
    417     if (pLimits && (num < pLimits->GetInteger(0) || num > pLimits->GetInteger(1))) {
    418         return NULL;
    419     }
    420     CPDF_Array* pNumbers = pNode->GetArray("Nums");
    421     if (pNumbers) {
    422         FX_DWORD dwCount = pNumbers->GetCount() / 2;
    423         for (FX_DWORD i = 0; i < dwCount; i ++) {
    424             int index = pNumbers->GetInteger(i * 2);
    425             if (num == index) {
    426                 return pNumbers->GetElementValue(i * 2 + 1);
    427             }
    428             if (index > num) {
    429                 break;
    430             }
    431         }
    432         return NULL;
    433     }
    434     CPDF_Array* pKids = pNode->GetArray("Kids");
    435     if (pKids == NULL) {
    436         return NULL;
    437     }
    438     for (FX_DWORD i = 0; i < pKids->GetCount(); i ++) {
    439         CPDF_Dictionary* pKid = pKids->GetDict(i);
    440         if (pKid == NULL) {
    441             continue;
    442         }
    443         CPDF_Object* pFound = SearchNumberNode(pKid, num);
    444         if (pFound) {
    445             return pFound;
    446         }
    447     }
    448     return NULL;
    449 }
    450 CPDF_Object* CPDF_NumberTree::LookupValue(int num)
    451 {
    452     return SearchNumberNode(m_pRoot, num);
    453 }
    454