1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/include/fpdfapi/fpdf_parser.h" 8 9 #include "core/include/fxcrt/fx_ext.h" 10 11 // Indexed by 8-bit character code, contains either: 12 // 'W' - for whitespace: NUL, TAB, CR, LF, FF, SPACE, 0x80, 0xff 13 // 'N' - for numeric: 0123456789+-. 14 // 'D' - for delimiter: %()/<>[]{} 15 // 'R' - otherwise. 16 const char PDF_CharType[256] = { 17 // NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO 18 // SI 19 'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W', 'W', 'R', 'W', 'W', 'R', 20 'R', 21 22 // DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS 23 // US 24 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 25 'R', 26 27 // SP ! " # $ % & ( ) * + , - . 28 // / 29 'W', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'D', 'D', 'R', 'N', 'R', 'N', 'N', 30 'D', 31 32 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 33 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'R', 'R', 'D', 'R', 'D', 34 'R', 35 36 // @ A B C D E F G H I J K L M N O 37 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 38 'R', 39 40 // P Q R S T U V W X Y Z [ \ ] ^ _ 41 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'R', 42 'R', 43 44 // ` a b c d e f g h i j k l m n o 45 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 46 'R', 47 48 // p q r s t u v w x y z { | } ~ 49 // DEL 50 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'R', 51 'R', 52 53 'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 54 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 55 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 56 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 57 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 58 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 59 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 60 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 61 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W'}; 62 63 CPDF_SimpleParser::CPDF_SimpleParser(const uint8_t* pData, FX_DWORD dwSize) { 64 m_pData = pData; 65 m_dwSize = dwSize; 66 m_dwCurPos = 0; 67 } 68 CPDF_SimpleParser::CPDF_SimpleParser(const CFX_ByteStringC& str) { 69 m_pData = str.GetPtr(); 70 m_dwSize = str.GetLength(); 71 m_dwCurPos = 0; 72 } 73 void CPDF_SimpleParser::ParseWord(const uint8_t*& pStart, 74 FX_DWORD& dwSize, 75 int& type) { 76 pStart = NULL; 77 dwSize = 0; 78 type = PDFWORD_EOF; 79 uint8_t ch; 80 while (1) { 81 if (m_dwSize <= m_dwCurPos) 82 return; 83 ch = m_pData[m_dwCurPos++]; 84 while (PDFCharIsWhitespace(ch)) { 85 if (m_dwSize <= m_dwCurPos) 86 return; 87 ch = m_pData[m_dwCurPos++]; 88 } 89 90 if (ch != '%') 91 break; 92 93 while (1) { 94 if (m_dwSize <= m_dwCurPos) 95 return; 96 ch = m_pData[m_dwCurPos++]; 97 if (ch == '\r' || ch == '\n') 98 break; 99 } 100 } 101 102 FX_DWORD start_pos = m_dwCurPos - 1; 103 pStart = m_pData + start_pos; 104 if (PDFCharIsDelimiter(ch)) { 105 if (ch == '/') { 106 while (1) { 107 if (m_dwSize <= m_dwCurPos) 108 return; 109 ch = m_pData[m_dwCurPos++]; 110 if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) { 111 m_dwCurPos--; 112 dwSize = m_dwCurPos - start_pos; 113 type = PDFWORD_NAME; 114 return; 115 } 116 } 117 } else { 118 type = PDFWORD_DELIMITER; 119 dwSize = 1; 120 if (ch == '<') { 121 if (m_dwSize <= m_dwCurPos) 122 return; 123 ch = m_pData[m_dwCurPos++]; 124 if (ch == '<') 125 dwSize = 2; 126 else 127 m_dwCurPos--; 128 } else if (ch == '>') { 129 if (m_dwSize <= m_dwCurPos) 130 return; 131 ch = m_pData[m_dwCurPos++]; 132 if (ch == '>') 133 dwSize = 2; 134 else 135 m_dwCurPos--; 136 } 137 } 138 return; 139 } 140 141 type = PDFWORD_NUMBER; 142 dwSize = 1; 143 while (1) { 144 if (!PDFCharIsNumeric(ch)) 145 type = PDFWORD_TEXT; 146 if (m_dwSize <= m_dwCurPos) 147 return; 148 ch = m_pData[m_dwCurPos++]; 149 150 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) { 151 m_dwCurPos--; 152 break; 153 } 154 dwSize++; 155 } 156 } 157 CFX_ByteStringC CPDF_SimpleParser::GetWord() { 158 const uint8_t* pStart; 159 FX_DWORD dwSize; 160 int type; 161 ParseWord(pStart, dwSize, type); 162 if (dwSize == 1 && pStart[0] == '<') { 163 while (m_dwCurPos < m_dwSize && m_pData[m_dwCurPos] != '>') { 164 m_dwCurPos++; 165 } 166 if (m_dwCurPos < m_dwSize) { 167 m_dwCurPos++; 168 } 169 return CFX_ByteStringC(pStart, 170 (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData))); 171 } 172 if (dwSize == 1 && pStart[0] == '(') { 173 int level = 1; 174 while (m_dwCurPos < m_dwSize) { 175 if (m_pData[m_dwCurPos] == ')') { 176 level--; 177 if (level == 0) { 178 break; 179 } 180 } 181 if (m_pData[m_dwCurPos] == '\\') { 182 if (m_dwSize <= m_dwCurPos) { 183 break; 184 } 185 m_dwCurPos++; 186 } else if (m_pData[m_dwCurPos] == '(') { 187 level++; 188 } 189 if (m_dwSize <= m_dwCurPos) { 190 break; 191 } 192 m_dwCurPos++; 193 } 194 if (m_dwCurPos < m_dwSize) { 195 m_dwCurPos++; 196 } 197 return CFX_ByteStringC(pStart, 198 (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData))); 199 } 200 return CFX_ByteStringC(pStart, dwSize); 201 } 202 FX_BOOL CPDF_SimpleParser::SearchToken(const CFX_ByteStringC& token) { 203 int token_len = token.GetLength(); 204 while (m_dwCurPos < m_dwSize - token_len) { 205 if (FXSYS_memcmp(m_pData + m_dwCurPos, token.GetPtr(), token_len) == 0) { 206 break; 207 } 208 m_dwCurPos++; 209 } 210 if (m_dwCurPos == m_dwSize - token_len) { 211 return FALSE; 212 } 213 m_dwCurPos += token_len; 214 return TRUE; 215 } 216 FX_BOOL CPDF_SimpleParser::SkipWord(const CFX_ByteStringC& token) { 217 while (1) { 218 CFX_ByteStringC word = GetWord(); 219 if (word.IsEmpty()) { 220 return FALSE; 221 } 222 if (word == token) { 223 return TRUE; 224 } 225 } 226 return FALSE; 227 } 228 FX_BOOL CPDF_SimpleParser::FindTagPair(const CFX_ByteStringC& start_token, 229 const CFX_ByteStringC& end_token, 230 FX_DWORD& start_pos, 231 FX_DWORD& end_pos) { 232 if (!start_token.IsEmpty()) { 233 if (!SkipWord(start_token)) { 234 return FALSE; 235 } 236 start_pos = m_dwCurPos; 237 } 238 while (1) { 239 end_pos = m_dwCurPos; 240 CFX_ByteStringC word = GetWord(); 241 if (word.IsEmpty()) { 242 return FALSE; 243 } 244 if (word == end_token) { 245 return TRUE; 246 } 247 } 248 return FALSE; 249 } 250 FX_BOOL CPDF_SimpleParser::FindTagParam(const CFX_ByteStringC& token, 251 int nParams) { 252 nParams++; 253 FX_DWORD* pBuf = FX_Alloc(FX_DWORD, nParams); 254 int buf_index = 0; 255 int buf_count = 0; 256 while (1) { 257 pBuf[buf_index++] = m_dwCurPos; 258 if (buf_index == nParams) { 259 buf_index = 0; 260 } 261 buf_count++; 262 if (buf_count > nParams) { 263 buf_count = nParams; 264 } 265 CFX_ByteStringC word = GetWord(); 266 if (word.IsEmpty()) { 267 FX_Free(pBuf); 268 return FALSE; 269 } 270 if (word == token) { 271 if (buf_count < nParams) { 272 continue; 273 } 274 m_dwCurPos = pBuf[buf_index]; 275 FX_Free(pBuf); 276 return TRUE; 277 } 278 } 279 return FALSE; 280 } 281 282 CFX_ByteString PDF_NameDecode(const CFX_ByteStringC& bstr) { 283 int size = bstr.GetLength(); 284 const FX_CHAR* pSrc = bstr.GetCStr(); 285 if (!FXSYS_memchr(pSrc, '#', size)) { 286 return bstr; 287 } 288 CFX_ByteString result; 289 FX_CHAR* pDestStart = result.GetBuffer(size); 290 FX_CHAR* pDest = pDestStart; 291 for (int i = 0; i < size; i++) { 292 if (pSrc[i] == '#' && i < size - 2) { 293 *pDest++ = 294 FXSYS_toHexDigit(pSrc[i + 1]) * 16 + FXSYS_toHexDigit(pSrc[i + 2]); 295 i += 2; 296 } else { 297 *pDest++ = pSrc[i]; 298 } 299 } 300 result.ReleaseBuffer((FX_STRSIZE)(pDest - pDestStart)); 301 return result; 302 } 303 CFX_ByteString PDF_NameDecode(const CFX_ByteString& orig) { 304 if (!FXSYS_memchr(orig.c_str(), '#', orig.GetLength())) { 305 return orig; 306 } 307 return PDF_NameDecode(CFX_ByteStringC(orig)); 308 } 309 CFX_ByteString PDF_NameEncode(const CFX_ByteString& orig) { 310 uint8_t* src_buf = (uint8_t*)orig.c_str(); 311 int src_len = orig.GetLength(); 312 int dest_len = 0; 313 int i; 314 for (i = 0; i < src_len; i++) { 315 uint8_t ch = src_buf[i]; 316 if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' || 317 PDFCharIsDelimiter(ch)) { 318 dest_len += 3; 319 } else { 320 dest_len++; 321 } 322 } 323 if (dest_len == src_len) 324 return orig; 325 326 CFX_ByteString res; 327 FX_CHAR* dest_buf = res.GetBuffer(dest_len); 328 dest_len = 0; 329 for (i = 0; i < src_len; i++) { 330 uint8_t ch = src_buf[i]; 331 if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' || 332 PDFCharIsDelimiter(ch)) { 333 dest_buf[dest_len++] = '#'; 334 dest_buf[dest_len++] = "0123456789ABCDEF"[ch / 16]; 335 dest_buf[dest_len++] = "0123456789ABCDEF"[ch % 16]; 336 } else { 337 dest_buf[dest_len++] = ch; 338 } 339 } 340 dest_buf[dest_len] = 0; 341 res.ReleaseBuffer(); 342 return res; 343 } 344 CFX_ByteTextBuf& operator<<(CFX_ByteTextBuf& buf, const CPDF_Object* pObj) { 345 if (!pObj) { 346 buf << " null"; 347 return buf; 348 } 349 switch (pObj->GetType()) { 350 case PDFOBJ_NULL: 351 buf << " null"; 352 break; 353 case PDFOBJ_BOOLEAN: 354 case PDFOBJ_NUMBER: 355 buf << " " << pObj->GetString(); 356 break; 357 case PDFOBJ_STRING: 358 buf << PDF_EncodeString(pObj->GetString(), pObj->AsString()->IsHex()); 359 break; 360 case PDFOBJ_NAME: { 361 CFX_ByteString str = pObj->GetString(); 362 buf << "/" << PDF_NameEncode(str); 363 break; 364 } 365 case PDFOBJ_REFERENCE: { 366 buf << " " << pObj->AsReference()->GetRefObjNum() << " 0 R "; 367 break; 368 } 369 case PDFOBJ_ARRAY: { 370 const CPDF_Array* p = pObj->AsArray(); 371 buf << "["; 372 for (FX_DWORD i = 0; i < p->GetCount(); i++) { 373 CPDF_Object* pElement = p->GetElement(i); 374 if (pElement->GetObjNum()) { 375 buf << " " << pElement->GetObjNum() << " 0 R"; 376 } else { 377 buf << pElement; 378 } 379 } 380 buf << "]"; 381 break; 382 } 383 case PDFOBJ_DICTIONARY: { 384 const CPDF_Dictionary* p = pObj->AsDictionary(); 385 buf << "<<"; 386 for (const auto& it : *p) { 387 const CFX_ByteString& key = it.first; 388 CPDF_Object* pValue = it.second; 389 buf << "/" << PDF_NameEncode(key); 390 if (pValue && pValue->GetObjNum()) { 391 buf << " " << pValue->GetObjNum() << " 0 R "; 392 } else { 393 buf << pValue; 394 } 395 } 396 buf << ">>"; 397 break; 398 } 399 case PDFOBJ_STREAM: { 400 const CPDF_Stream* p = pObj->AsStream(); 401 buf << p->GetDict() << "stream\r\n"; 402 CPDF_StreamAcc acc; 403 acc.LoadAllData(p, TRUE); 404 buf.AppendBlock(acc.GetData(), acc.GetSize()); 405 buf << "\r\nendstream"; 406 break; 407 } 408 default: 409 ASSERT(FALSE); 410 break; 411 } 412 return buf; 413 } 414 FX_FLOAT PDF_ClipFloat(FX_FLOAT f) { 415 if (f < 0) { 416 return 0; 417 } 418 if (f > 1.0f) { 419 return 1.0f; 420 } 421 return f; 422 } 423 static CPDF_Object* SearchNumberNode(CPDF_Dictionary* pNode, int num) { 424 CPDF_Array* pLimits = pNode->GetArray("Limits"); 425 if (pLimits && 426 (num < pLimits->GetInteger(0) || num > pLimits->GetInteger(1))) { 427 return NULL; 428 } 429 CPDF_Array* pNumbers = pNode->GetArray("Nums"); 430 if (pNumbers) { 431 FX_DWORD dwCount = pNumbers->GetCount() / 2; 432 for (FX_DWORD i = 0; i < dwCount; i++) { 433 int index = pNumbers->GetInteger(i * 2); 434 if (num == index) { 435 return pNumbers->GetElementValue(i * 2 + 1); 436 } 437 if (index > num) { 438 break; 439 } 440 } 441 return NULL; 442 } 443 CPDF_Array* pKids = pNode->GetArray("Kids"); 444 if (!pKids) { 445 return NULL; 446 } 447 for (FX_DWORD i = 0; i < pKids->GetCount(); i++) { 448 CPDF_Dictionary* pKid = pKids->GetDict(i); 449 if (!pKid) { 450 continue; 451 } 452 CPDF_Object* pFound = SearchNumberNode(pKid, num); 453 if (pFound) { 454 return pFound; 455 } 456 } 457 return NULL; 458 } 459 CPDF_Object* CPDF_NumberTree::LookupValue(int num) { 460 return SearchNumberNode(m_pRoot, num); 461 } 462