1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "pdf/pdfium/pdfium_page.h" 6 7 #include <math.h> 8 9 #include "base/logging.h" 10 #include "base/strings/string_number_conversions.h" 11 #include "base/strings/string_util.h" 12 #include "base/strings/utf_string_conversions.h" 13 #include "base/values.h" 14 #include "pdf/pdfium/pdfium_engine.h" 15 16 // Used when doing hit detection. 17 #define kTolerance 20.0 18 19 // Dictionary Value key names for returning the accessible page content as JSON. 20 const char kPageWidth[] = "width"; 21 const char kPageHeight[] = "height"; 22 const char kPageTextBox[] = "textBox"; 23 const char kTextBoxLeft[] = "left"; 24 const char kTextBoxTop[] = "top"; 25 const char kTextBoxWidth[] = "width"; 26 const char kTextBoxHeight[] = "height"; 27 const char kTextBoxFontSize[] = "fontSize"; 28 const char kTextBoxNodes[] = "textNodes"; 29 const char kTextNodeType[] = "type"; 30 const char kTextNodeText[] = "text"; 31 const char kTextNodeURL[] = "url"; 32 const char kTextNodeTypeText[] = "text"; 33 const char kTextNodeTypeURL[] = "url"; 34 const char kDocLinkURLPrefix[] = "#page"; 35 36 namespace chrome_pdf { 37 38 PDFiumPage::PDFiumPage(PDFiumEngine* engine, 39 int i, 40 const pp::Rect& r, 41 bool available) 42 : engine_(engine), 43 page_(NULL), 44 text_page_(NULL), 45 index_(i), 46 rect_(r), 47 calculated_links_(false), 48 available_(available) { 49 } 50 51 PDFiumPage::~PDFiumPage() { 52 Unload(); 53 } 54 55 void PDFiumPage::Unload() { 56 if (text_page_) { 57 FPDFText_ClosePage(text_page_); 58 text_page_ = NULL; 59 } 60 61 if (page_) { 62 if (engine_->form()) { 63 FORM_OnBeforeClosePage(page_, engine_->form()); 64 } 65 FPDF_ClosePage(page_); 66 page_ = NULL; 67 } 68 } 69 70 FPDF_PAGE PDFiumPage::GetPage() { 71 ScopedUnsupportedFeature scoped_unsupported_feature(engine_); 72 if (!available_) 73 return NULL; 74 if (!page_) { 75 page_ = FPDF_LoadPage(engine_->doc(), index_); 76 if (page_ && engine_->form()) { 77 FORM_OnAfterLoadPage(page_, engine_->form()); 78 } 79 } 80 return page_; 81 } 82 83 FPDF_PAGE PDFiumPage::GetPrintPage() { 84 ScopedUnsupportedFeature scoped_unsupported_feature(engine_); 85 if (!available_) 86 return NULL; 87 if (!page_) 88 page_ = FPDF_LoadPage(engine_->doc(), index_); 89 return page_; 90 } 91 92 void PDFiumPage::ClosePrintPage() { 93 if (page_) { 94 FPDF_ClosePage(page_); 95 page_ = NULL; 96 } 97 } 98 99 FPDF_TEXTPAGE PDFiumPage::GetTextPage() { 100 if (!available_) 101 return NULL; 102 if (!text_page_) 103 text_page_ = FPDFText_LoadPage(GetPage()); 104 return text_page_; 105 } 106 107 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { 108 base::DictionaryValue* node = new base::DictionaryValue(); 109 110 if (!available_) 111 return node; 112 113 double width = FPDF_GetPageWidth(GetPage()); 114 double height = FPDF_GetPageHeight(GetPage()); 115 116 base::ListValue* text = new base::ListValue(); 117 int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount()); 118 for (int i = 0; i < box_count; i++) { 119 double left, top, right, bottom; 120 FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom); 121 text->Append( 122 GetTextBoxAsValue(height, left, top, right, bottom, rotation)); 123 } 124 125 node->SetDouble(kPageWidth, width); 126 node->SetDouble(kPageHeight, height); 127 node->Set(kPageTextBox, text); // Takes ownership of |text| 128 129 return node; 130 } 131 132 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height, 133 double left, double top, 134 double right, double bottom, 135 int rotation) { 136 base::string16 text_utf16; 137 int char_count = 138 FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0); 139 if (char_count > 0) { 140 unsigned short* data = reinterpret_cast<unsigned short*>( 141 WriteInto(&text_utf16, char_count + 1)); 142 FPDFText_GetBoundedText(GetTextPage(), 143 left, top, right, bottom, 144 data, char_count); 145 } 146 std::string text_utf8 = base::UTF16ToUTF8(text_utf16); 147 148 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top); 149 Area area; 150 std::vector<LinkTarget> targets; 151 if (link) { 152 targets.push_back(LinkTarget()); 153 area = GetLinkTarget(link, &targets[0]); 154 } else { 155 pp::Rect rect( 156 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation)); 157 GetLinks(rect, &targets); 158 area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA; 159 } 160 161 int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top, 162 kTolerance, kTolerance); 163 double font_size = FPDFText_GetFontSize(GetTextPage(), char_index); 164 165 base::DictionaryValue* node = new base::DictionaryValue(); 166 node->SetDouble(kTextBoxLeft, left); 167 node->SetDouble(kTextBoxTop, page_height - top); 168 node->SetDouble(kTextBoxWidth, right - left); 169 node->SetDouble(kTextBoxHeight, top - bottom); 170 node->SetDouble(kTextBoxFontSize, font_size); 171 172 base::ListValue* text_nodes = new base::ListValue(); 173 174 if (area == DOCLINK_AREA) { 175 std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page); 176 text_nodes->Append(CreateURLNode(text_utf8, url)); 177 } else if (area == WEBLINK_AREA && link) { 178 text_nodes->Append(CreateURLNode(text_utf8, targets[0].url)); 179 } else if (area == WEBLINK_AREA && !link) { 180 size_t start = 0; 181 for (size_t i = 0; i < targets.size(); ++i) { 182 // Remove the extra NULL character at end. 183 // Otherwise, find() will not return any matches. 184 if (targets[i].url.size() > 0 && 185 targets[i].url[targets[i].url.size() - 1] == '\0') { 186 targets[i].url.resize(targets[i].url.size() - 1); 187 } 188 // There should only ever be one NULL character 189 DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0'); 190 191 // PDFium may change the case of generated links. 192 std::string lowerCaseURL = StringToLowerASCII(targets[i].url); 193 std::string lowerCaseText = StringToLowerASCII(text_utf8); 194 size_t pos = lowerCaseText.find(lowerCaseURL, start); 195 size_t length = targets[i].url.size(); 196 if (pos == std::string::npos) { 197 // Check if the link is a "mailto:" URL 198 if (lowerCaseURL.compare(0, 7, "mailto:") == 0) { 199 pos = lowerCaseText.find(lowerCaseURL.substr(7), start); 200 length -= 7; 201 } 202 203 if (pos == std::string::npos) { 204 // No match has been found. This should never happen. 205 continue; 206 } 207 } 208 209 std::string before_text = text_utf8.substr(start, pos - start); 210 if (before_text.size() > 0) 211 text_nodes->Append(CreateTextNode(before_text)); 212 std::string link_text = text_utf8.substr(pos, length); 213 text_nodes->Append(CreateURLNode(link_text, targets[i].url)); 214 215 start = pos + length; 216 } 217 std::string before_text = text_utf8.substr(start); 218 if (before_text.size() > 0) 219 text_nodes->Append(CreateTextNode(before_text)); 220 } else { 221 text_nodes->Append(CreateTextNode(text_utf8)); 222 } 223 224 node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|. 225 return node; 226 } 227 228 base::Value* PDFiumPage::CreateTextNode(std::string text) { 229 base::DictionaryValue* node = new base::DictionaryValue(); 230 node->SetString(kTextNodeType, kTextNodeTypeText); 231 node->SetString(kTextNodeText, text); 232 return node; 233 } 234 235 base::Value* PDFiumPage::CreateURLNode(std::string text, std::string url) { 236 base::DictionaryValue* node = new base::DictionaryValue(); 237 node->SetString(kTextNodeType, kTextNodeTypeURL); 238 node->SetString(kTextNodeText, text); 239 node->SetString(kTextNodeURL, url); 240 return node; 241 } 242 243 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, 244 int rotation, 245 int* char_index, 246 LinkTarget* target) { 247 if (!available_) 248 return NONSELECTABLE_AREA; 249 pp::Point point2 = point - rect_.point(); 250 double new_x, new_y; 251 FPDF_DeviceToPage(GetPage(), 0, 0, rect_.width(), rect_.height(), 252 rotation, point2.x(), point2.y(), &new_x, &new_y); 253 254 int rv = FPDFText_GetCharIndexAtPos( 255 GetTextPage(), new_x, new_y, kTolerance, kTolerance); 256 *char_index = rv; 257 258 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), new_x, new_y); 259 if (link) { 260 // We don't handle all possible link types of the PDF. For example, 261 // launch actions, cross-document links, etc. 262 // In that case, GetLinkTarget() will return NONSELECTABLE_AREA 263 // and we should proceed with area detection. 264 PDFiumPage::Area area = GetLinkTarget(link, target); 265 if (area != PDFiumPage::NONSELECTABLE_AREA) 266 return area; 267 } 268 269 if (rv < 0) 270 return NONSELECTABLE_AREA; 271 272 return GetLink(*char_index, target) != -1 ? WEBLINK_AREA : TEXT_AREA; 273 } 274 275 base::char16 PDFiumPage::GetCharAtIndex(int index) { 276 if (!available_) 277 return L'\0'; 278 return static_cast<base::char16>(FPDFText_GetUnicode(GetTextPage(), index)); 279 } 280 281 int PDFiumPage::GetCharCount() { 282 if (!available_) 283 return 0; 284 return FPDFText_CountChars(GetTextPage()); 285 } 286 287 PDFiumPage::Area PDFiumPage::GetLinkTarget( 288 FPDF_LINK link, PDFiumPage::LinkTarget* target) { 289 FPDF_DEST dest = FPDFLink_GetDest(engine_->doc(), link); 290 if (dest != NULL) 291 return GetDestinationTarget(dest, target); 292 293 FPDF_ACTION action = FPDFLink_GetAction(link); 294 if (action) { 295 switch (FPDFAction_GetType(action)) { 296 case PDFACTION_GOTO: { 297 FPDF_DEST dest = FPDFAction_GetDest(engine_->doc(), action); 298 if (dest) 299 return GetDestinationTarget(dest, target); 300 // TODO(gene): We don't fully support all types of the in-document 301 // links. Need to implement that. There is a bug to track that: 302 // http://code.google.com/p/chromium/issues/detail?id=55776 303 } break; 304 case PDFACTION_URI: { 305 if (target) { 306 size_t buffer_size = 307 FPDFAction_GetURIPath(engine_->doc(), action, NULL, 0); 308 if (buffer_size > 1) { 309 void* data = WriteInto(&target->url, buffer_size); 310 FPDFAction_GetURIPath(engine_->doc(), action, data, buffer_size); 311 } 312 } 313 return WEBLINK_AREA; 314 } break; 315 // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH 316 // at the moment. 317 } 318 } 319 320 return NONSELECTABLE_AREA; 321 } 322 323 PDFiumPage::Area PDFiumPage::GetDestinationTarget( 324 FPDF_DEST destination, PDFiumPage::LinkTarget* target) { 325 int page_index = FPDFDest_GetPageIndex(engine_->doc(), destination); 326 if (target) { 327 target->page = page_index; 328 } 329 return DOCLINK_AREA; 330 } 331 332 int PDFiumPage::GetLink(int char_index, PDFiumPage::LinkTarget* target) { 333 if (!available_) 334 return -1; 335 336 CalculateLinks(); 337 338 // Get the bounding box of the rect again, since it might have moved because 339 // of the tolerance above. 340 double left, right, bottom, top; 341 FPDFText_GetCharBox(GetTextPage(), char_index, &left, &right, &bottom, &top); 342 343 pp::Point origin( 344 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0).point()); 345 for (size_t i = 0; i < links_.size(); ++i) { 346 for (size_t j = 0; j < links_[i].rects.size(); ++j) { 347 if (links_[i].rects[j].Contains(origin)) { 348 if (target) 349 target->url = links_[i].url; 350 return i; 351 } 352 } 353 } 354 return -1; 355 } 356 357 std::vector<int> PDFiumPage::GetLinks(pp::Rect text_area, 358 std::vector<LinkTarget>* targets) { 359 if (!available_) 360 return std::vector<int>(); 361 362 CalculateLinks(); 363 364 std::vector<int> links; 365 366 for (size_t i = 0; i < links_.size(); ++i) { 367 for (size_t j = 0; j < links_[i].rects.size(); ++j) { 368 if (links_[i].rects[j].Intersects(text_area)) { 369 if (targets) { 370 LinkTarget target; 371 target.url = links_[i].url; 372 targets->push_back(target); 373 } 374 links.push_back(i); 375 } 376 } 377 } 378 return links; 379 } 380 381 void PDFiumPage::CalculateLinks() { 382 if (calculated_links_) 383 return; 384 385 calculated_links_ = true; 386 FPDF_PAGELINK links = FPDFLink_LoadWebLinks(GetTextPage()); 387 int count = FPDFLink_CountWebLinks(links); 388 for (int i = 0; i < count; ++i) { 389 base::string16 url; 390 int url_length = FPDFLink_GetURL(links, i, NULL, 0); 391 if (url_length > 1) { // WriteInto needs at least 2 characters. 392 unsigned short* data = 393 reinterpret_cast<unsigned short*>(WriteInto(&url, url_length)); 394 FPDFLink_GetURL(links, i, data, url_length); 395 } 396 Link link; 397 link.url = base::UTF16ToUTF8(url); 398 399 // If the link cannot be converted to a pp::Var, then it is not possible to 400 // pass it to JS. In this case, ignore the link like other PDF viewers. 401 // See http://crbug.com/312882 for an example. 402 pp::Var link_var(link.url); 403 if (!link_var.is_string()) 404 continue; 405 406 // Make sure all the characters in the URL are valid per RFC 1738. 407 // http://crbug.com/340326 has a sample bad PDF. 408 // GURL does not work correctly, e.g. it just strips \t \r \n. 409 bool is_invalid_url = false; 410 for (size_t j = 0; j < link.url.length(); ++j) { 411 // Control characters are not allowed. 412 // 0x7F is also a control character. 413 // 0x80 and above are not in US-ASCII. 414 if (link.url[j] < ' ' || link.url[j] >= '\x7F') { 415 is_invalid_url = true; 416 break; 417 } 418 } 419 if (is_invalid_url) 420 continue; 421 422 int rect_count = FPDFLink_CountRects(links, i); 423 for (int j = 0; j < rect_count; ++j) { 424 double left, top, right, bottom; 425 FPDFLink_GetRect(links, i, j, &left, &top, &right, &bottom); 426 link.rects.push_back( 427 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0)); 428 } 429 links_.push_back(link); 430 } 431 FPDFLink_CloseWebLinks(links); 432 } 433 434 pp::Rect PDFiumPage::PageToScreen(const pp::Point& offset, 435 double zoom, 436 double left, 437 double top, 438 double right, 439 double bottom, 440 int rotation) { 441 if (!available_) 442 return pp::Rect(); 443 444 int new_left, new_top, new_right, new_bottom; 445 FPDF_PageToDevice( 446 page_, 447 static_cast<int>((rect_.x() - offset.x()) * zoom), 448 static_cast<int>((rect_.y() - offset.y()) * zoom), 449 static_cast<int>(ceil(rect_.width() * zoom)), 450 static_cast<int>(ceil(rect_.height() * zoom)), 451 rotation, left, top, &new_left, &new_top); 452 FPDF_PageToDevice( 453 page_, 454 static_cast<int>((rect_.x() - offset.x()) * zoom), 455 static_cast<int>((rect_.y() - offset.y()) * zoom), 456 static_cast<int>(ceil(rect_.width() * zoom)), 457 static_cast<int>(ceil(rect_.height() * zoom)), 458 rotation, right, bottom, &new_right, &new_bottom); 459 460 // If the PDF is rotated, the horizontal/vertical coordinates could be 461 // flipped. See 462 // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf 463 if (new_right < new_left) 464 std::swap(new_right, new_left); 465 if (new_bottom < new_top) 466 std::swap(new_bottom, new_top); 467 468 return pp::Rect( 469 new_left, new_top, new_right - new_left + 1, new_bottom - new_top + 1); 470 } 471 472 PDFiumPage::Link::Link() { 473 } 474 475 PDFiumPage::Link::~Link() { 476 } 477 478 } // namespace chrome_pdf 479