Home | History | Annotate | Download | only in pdfium
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "pdf/pdfium/pdfium_page.h"
      6 
      7 #include <math.h>
      8 
      9 #include "base/logging.h"
     10 #include "base/strings/string_number_conversions.h"
     11 #include "base/strings/string_util.h"
     12 #include "base/strings/utf_string_conversions.h"
     13 #include "base/values.h"
     14 #include "pdf/pdfium/pdfium_engine.h"
     15 
     16 // Used when doing hit detection.
     17 #define kTolerance 20.0
     18 
     19 // Dictionary Value key names for returning the accessible page content as JSON.
     20 const char kPageWidth[] = "width";
     21 const char kPageHeight[] = "height";
     22 const char kPageTextBox[] = "textBox";
     23 const char kTextBoxLeft[] = "left";
     24 const char kTextBoxTop[]  = "top";
     25 const char kTextBoxWidth[] = "width";
     26 const char kTextBoxHeight[]  = "height";
     27 const char kTextBoxFontSize[] = "fontSize";
     28 const char kTextBoxNodes[] = "textNodes";
     29 const char kTextNodeType[] = "type";
     30 const char kTextNodeText[] = "text";
     31 const char kTextNodeURL[] = "url";
     32 const char kTextNodeTypeText[] = "text";
     33 const char kTextNodeTypeURL[] = "url";
     34 const char kDocLinkURLPrefix[] = "#page";
     35 
     36 namespace chrome_pdf {
     37 
     38 PDFiumPage::PDFiumPage(PDFiumEngine* engine,
     39                        int i,
     40                        const pp::Rect& r,
     41                        bool available)
     42     : engine_(engine),
     43       page_(NULL),
     44       text_page_(NULL),
     45       index_(i),
     46       rect_(r),
     47       calculated_links_(false),
     48       available_(available) {
     49 }
     50 
     51 PDFiumPage::~PDFiumPage() {
     52   Unload();
     53 }
     54 
     55 void PDFiumPage::Unload() {
     56   if (text_page_) {
     57     FPDFText_ClosePage(text_page_);
     58     text_page_ = NULL;
     59   }
     60 
     61   if (page_) {
     62     if (engine_->form()) {
     63       FORM_OnBeforeClosePage(page_, engine_->form());
     64     }
     65     FPDF_ClosePage(page_);
     66     page_ = NULL;
     67   }
     68 }
     69 
     70 FPDF_PAGE PDFiumPage::GetPage() {
     71   ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
     72   if (!available_)
     73     return NULL;
     74   if (!page_) {
     75     page_ = FPDF_LoadPage(engine_->doc(), index_);
     76     if (page_ && engine_->form()) {
     77       FORM_OnAfterLoadPage(page_, engine_->form());
     78     }
     79   }
     80   return page_;
     81 }
     82 
     83 FPDF_PAGE PDFiumPage::GetPrintPage() {
     84   ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
     85   if (!available_)
     86     return NULL;
     87   if (!page_)
     88     page_ = FPDF_LoadPage(engine_->doc(), index_);
     89   return page_;
     90 }
     91 
     92 void PDFiumPage::ClosePrintPage() {
     93   if (page_) {
     94     FPDF_ClosePage(page_);
     95     page_ = NULL;
     96   }
     97 }
     98 
     99 FPDF_TEXTPAGE PDFiumPage::GetTextPage() {
    100   if (!available_)
    101     return NULL;
    102   if (!text_page_)
    103     text_page_ = FPDFText_LoadPage(GetPage());
    104   return text_page_;
    105 }
    106 
    107 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {
    108   base::DictionaryValue* node = new base::DictionaryValue();
    109 
    110   if (!available_)
    111     return node;
    112 
    113   double width = FPDF_GetPageWidth(GetPage());
    114   double height = FPDF_GetPageHeight(GetPage());
    115 
    116   base::ListValue* text = new base::ListValue();
    117   int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
    118   for (int i = 0; i < box_count; i++) {
    119     double left, top, right, bottom;
    120     FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom);
    121     text->Append(
    122         GetTextBoxAsValue(height, left, top, right, bottom, rotation));
    123   }
    124 
    125   node->SetDouble(kPageWidth, width);
    126   node->SetDouble(kPageHeight, height);
    127   node->Set(kPageTextBox, text);  // Takes ownership of |text|
    128 
    129   return node;
    130 }
    131 
    132 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height,
    133                                            double left, double top,
    134                                            double right, double bottom,
    135                                            int rotation) {
    136   base::string16 text_utf16;
    137   int char_count =
    138     FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0);
    139   if (char_count > 0) {
    140     unsigned short* data = reinterpret_cast<unsigned short*>(
    141         WriteInto(&text_utf16, char_count + 1));
    142     FPDFText_GetBoundedText(GetTextPage(),
    143                             left, top, right, bottom,
    144                             data, char_count);
    145   }
    146   std::string text_utf8 = base::UTF16ToUTF8(text_utf16);
    147 
    148   FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top);
    149   Area area;
    150   std::vector<LinkTarget> targets;
    151   if (link) {
    152     targets.push_back(LinkTarget());
    153     area = GetLinkTarget(link, &targets[0]);
    154   } else {
    155     pp::Rect rect(
    156         PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation));
    157     GetLinks(rect, &targets);
    158     area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA;
    159   }
    160 
    161   int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top,
    162                                               kTolerance, kTolerance);
    163   double font_size = FPDFText_GetFontSize(GetTextPage(), char_index);
    164 
    165   base::DictionaryValue* node = new base::DictionaryValue();
    166   node->SetDouble(kTextBoxLeft, left);
    167   node->SetDouble(kTextBoxTop, page_height - top);
    168   node->SetDouble(kTextBoxWidth, right - left);
    169   node->SetDouble(kTextBoxHeight, top - bottom);
    170   node->SetDouble(kTextBoxFontSize, font_size);
    171 
    172   base::ListValue* text_nodes = new base::ListValue();
    173 
    174   if (area == DOCLINK_AREA) {
    175     std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page);
    176     text_nodes->Append(CreateURLNode(text_utf8, url));
    177   } else if (area == WEBLINK_AREA && link) {
    178     text_nodes->Append(CreateURLNode(text_utf8, targets[0].url));
    179   } else if (area == WEBLINK_AREA && !link) {
    180     size_t start = 0;
    181     for (size_t i = 0; i < targets.size(); ++i) {
    182       // Remove the extra NULL character at end.
    183       // Otherwise, find() will not return any matches.
    184       if (targets[i].url.size() > 0 &&
    185           targets[i].url[targets[i].url.size() - 1] == '\0') {
    186         targets[i].url.resize(targets[i].url.size() - 1);
    187       }
    188       // There should only ever be one NULL character
    189       DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0');
    190 
    191       // PDFium may change the case of generated links.
    192       std::string lowerCaseURL = StringToLowerASCII(targets[i].url);
    193       std::string lowerCaseText = StringToLowerASCII(text_utf8);
    194       size_t pos = lowerCaseText.find(lowerCaseURL, start);
    195       size_t length = targets[i].url.size();
    196       if (pos == std::string::npos) {
    197         // Check if the link is a "mailto:" URL
    198         if (lowerCaseURL.compare(0, 7, "mailto:") == 0) {
    199           pos = lowerCaseText.find(lowerCaseURL.substr(7), start);
    200           length -= 7;
    201         }
    202 
    203         if (pos == std::string::npos) {
    204           // No match has been found.  This should never happen.
    205           continue;
    206         }
    207       }
    208 
    209       std::string before_text = text_utf8.substr(start, pos - start);
    210       if (before_text.size() > 0)
    211         text_nodes->Append(CreateTextNode(before_text));
    212       std::string link_text = text_utf8.substr(pos, length);
    213       text_nodes->Append(CreateURLNode(link_text, targets[i].url));
    214 
    215       start = pos + length;
    216     }
    217     std::string before_text = text_utf8.substr(start);
    218     if (before_text.size() > 0)
    219       text_nodes->Append(CreateTextNode(before_text));
    220   } else {
    221     text_nodes->Append(CreateTextNode(text_utf8));
    222   }
    223 
    224   node->Set(kTextBoxNodes, text_nodes);  // Takes ownership of |text_nodes|.
    225   return node;
    226 }
    227 
    228 base::Value* PDFiumPage::CreateTextNode(std::string text) {
    229   base::DictionaryValue* node = new base::DictionaryValue();
    230   node->SetString(kTextNodeType, kTextNodeTypeText);
    231   node->SetString(kTextNodeText, text);
    232   return node;
    233 }
    234 
    235 base::Value* PDFiumPage::CreateURLNode(std::string text, std::string url) {
    236   base::DictionaryValue* node = new base::DictionaryValue();
    237   node->SetString(kTextNodeType, kTextNodeTypeURL);
    238   node->SetString(kTextNodeText, text);
    239   node->SetString(kTextNodeURL, url);
    240   return node;
    241 }
    242 
    243 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,
    244                                           int rotation,
    245                                           int* char_index,
    246                                           LinkTarget* target) {
    247   if (!available_)
    248     return NONSELECTABLE_AREA;
    249   pp::Point point2 = point - rect_.point();
    250   double new_x, new_y;
    251   FPDF_DeviceToPage(GetPage(), 0, 0, rect_.width(), rect_.height(),
    252         rotation, point2.x(), point2.y(), &new_x, &new_y);
    253 
    254   int rv = FPDFText_GetCharIndexAtPos(
    255       GetTextPage(), new_x, new_y, kTolerance, kTolerance);
    256   *char_index = rv;
    257 
    258   FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), new_x, new_y);
    259   if (link) {
    260     // We don't handle all possible link types of the PDF. For example,
    261     // launch actions, cross-document links, etc.
    262     // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
    263     // and we should proceed with area detection.
    264     PDFiumPage::Area area = GetLinkTarget(link, target);
    265     if (area != PDFiumPage::NONSELECTABLE_AREA)
    266       return area;
    267   }
    268 
    269   if (rv < 0)
    270     return NONSELECTABLE_AREA;
    271 
    272   return GetLink(*char_index, target) != -1 ? WEBLINK_AREA : TEXT_AREA;
    273 }
    274 
    275 base::char16 PDFiumPage::GetCharAtIndex(int index) {
    276   if (!available_)
    277     return L'\0';
    278   return static_cast<base::char16>(FPDFText_GetUnicode(GetTextPage(), index));
    279 }
    280 
    281 int PDFiumPage::GetCharCount() {
    282   if (!available_)
    283     return 0;
    284   return FPDFText_CountChars(GetTextPage());
    285 }
    286 
    287 PDFiumPage::Area PDFiumPage::GetLinkTarget(
    288     FPDF_LINK link, PDFiumPage::LinkTarget* target) {
    289   FPDF_DEST dest = FPDFLink_GetDest(engine_->doc(), link);
    290   if (dest != NULL)
    291     return GetDestinationTarget(dest, target);
    292 
    293   FPDF_ACTION action = FPDFLink_GetAction(link);
    294   if (action) {
    295     switch (FPDFAction_GetType(action)) {
    296       case PDFACTION_GOTO: {
    297           FPDF_DEST dest = FPDFAction_GetDest(engine_->doc(), action);
    298           if (dest)
    299             return GetDestinationTarget(dest, target);
    300           // TODO(gene): We don't fully support all types of the in-document
    301           // links. Need to implement that. There is a bug to track that:
    302           // http://code.google.com/p/chromium/issues/detail?id=55776
    303         } break;
    304       case PDFACTION_URI: {
    305           if (target) {
    306             size_t buffer_size =
    307                 FPDFAction_GetURIPath(engine_->doc(), action, NULL, 0);
    308             if (buffer_size > 1) {
    309               void* data = WriteInto(&target->url, buffer_size);
    310               FPDFAction_GetURIPath(engine_->doc(), action, data, buffer_size);
    311             }
    312           }
    313           return WEBLINK_AREA;
    314         } break;
    315       // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
    316       // at the moment.
    317     }
    318   }
    319 
    320   return NONSELECTABLE_AREA;
    321 }
    322 
    323 PDFiumPage::Area PDFiumPage::GetDestinationTarget(
    324     FPDF_DEST destination, PDFiumPage::LinkTarget* target) {
    325   int page_index = FPDFDest_GetPageIndex(engine_->doc(), destination);
    326   if (target) {
    327     target->page = page_index;
    328   }
    329   return DOCLINK_AREA;
    330 }
    331 
    332 int PDFiumPage::GetLink(int char_index, PDFiumPage::LinkTarget* target) {
    333   if (!available_)
    334     return -1;
    335 
    336   CalculateLinks();
    337 
    338   // Get the bounding box of the rect again, since it might have moved because
    339   // of the tolerance above.
    340   double left, right, bottom, top;
    341   FPDFText_GetCharBox(GetTextPage(), char_index, &left, &right, &bottom, &top);
    342 
    343   pp::Point origin(
    344       PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0).point());
    345   for (size_t i = 0; i < links_.size(); ++i) {
    346     for (size_t j = 0; j < links_[i].rects.size(); ++j) {
    347       if (links_[i].rects[j].Contains(origin)) {
    348         if (target)
    349           target->url = links_[i].url;
    350         return i;
    351       }
    352     }
    353   }
    354   return -1;
    355 }
    356 
    357 std::vector<int> PDFiumPage::GetLinks(pp::Rect text_area,
    358                                       std::vector<LinkTarget>* targets) {
    359   if (!available_)
    360     return std::vector<int>();
    361 
    362   CalculateLinks();
    363 
    364   std::vector<int> links;
    365 
    366   for (size_t i = 0; i < links_.size(); ++i) {
    367     for (size_t j = 0; j < links_[i].rects.size(); ++j) {
    368       if (links_[i].rects[j].Intersects(text_area)) {
    369         if (targets) {
    370           LinkTarget target;
    371           target.url = links_[i].url;
    372           targets->push_back(target);
    373         }
    374         links.push_back(i);
    375       }
    376     }
    377   }
    378   return links;
    379 }
    380 
    381 void PDFiumPage::CalculateLinks() {
    382   if (calculated_links_)
    383     return;
    384 
    385   calculated_links_ = true;
    386   FPDF_PAGELINK links = FPDFLink_LoadWebLinks(GetTextPage());
    387   int count = FPDFLink_CountWebLinks(links);
    388   for (int i = 0; i < count; ++i) {
    389     base::string16 url;
    390     int url_length = FPDFLink_GetURL(links, i, NULL, 0);
    391     if (url_length > 1) {  // WriteInto needs at least 2 characters.
    392       unsigned short* data =
    393           reinterpret_cast<unsigned short*>(WriteInto(&url, url_length));
    394       FPDFLink_GetURL(links, i, data, url_length);
    395     }
    396     Link link;
    397     link.url = base::UTF16ToUTF8(url);
    398 
    399     // If the link cannot be converted to a pp::Var, then it is not possible to
    400     // pass it to JS. In this case, ignore the link like other PDF viewers.
    401     // See http://crbug.com/312882 for an example.
    402     pp::Var link_var(link.url);
    403     if (!link_var.is_string())
    404       continue;
    405 
    406     // Make sure all the characters in the URL are valid per RFC 1738.
    407     // http://crbug.com/340326 has a sample bad PDF.
    408     // GURL does not work correctly, e.g. it just strips \t \r \n.
    409     bool is_invalid_url = false;
    410     for (size_t j = 0; j < link.url.length(); ++j) {
    411       // Control characters are not allowed.
    412       // 0x7F is also a control character.
    413       // 0x80 and above are not in US-ASCII.
    414       if (link.url[j] < ' ' || link.url[j] >= '\x7F') {
    415         is_invalid_url = true;
    416         break;
    417       }
    418     }
    419     if (is_invalid_url)
    420       continue;
    421 
    422     int rect_count = FPDFLink_CountRects(links, i);
    423     for (int j = 0; j < rect_count; ++j) {
    424       double left, top, right, bottom;
    425       FPDFLink_GetRect(links, i, j, &left, &top, &right, &bottom);
    426       link.rects.push_back(
    427           PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0));
    428     }
    429     links_.push_back(link);
    430   }
    431   FPDFLink_CloseWebLinks(links);
    432 }
    433 
    434 pp::Rect PDFiumPage::PageToScreen(const pp::Point& offset,
    435                                   double zoom,
    436                                   double left,
    437                                   double top,
    438                                   double right,
    439                                   double bottom,
    440                                   int rotation) {
    441   if (!available_)
    442     return pp::Rect();
    443 
    444   int new_left, new_top, new_right, new_bottom;
    445   FPDF_PageToDevice(
    446       page_,
    447       static_cast<int>((rect_.x() - offset.x()) * zoom),
    448       static_cast<int>((rect_.y() - offset.y()) * zoom),
    449       static_cast<int>(ceil(rect_.width() * zoom)),
    450       static_cast<int>(ceil(rect_.height() * zoom)),
    451       rotation, left, top, &new_left, &new_top);
    452   FPDF_PageToDevice(
    453       page_,
    454       static_cast<int>((rect_.x() - offset.x()) * zoom),
    455       static_cast<int>((rect_.y() - offset.y()) * zoom),
    456       static_cast<int>(ceil(rect_.width() * zoom)),
    457       static_cast<int>(ceil(rect_.height() * zoom)),
    458       rotation, right, bottom, &new_right, &new_bottom);
    459 
    460   // If the PDF is rotated, the horizontal/vertical coordinates could be
    461   // flipped.  See
    462   // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
    463   if (new_right < new_left)
    464     std::swap(new_right, new_left);
    465   if (new_bottom < new_top)
    466     std::swap(new_bottom, new_top);
    467 
    468   return pp::Rect(
    469       new_left, new_top, new_right - new_left + 1, new_bottom - new_top + 1);
    470 }
    471 
    472 PDFiumPage::Link::Link() {
    473 }
    474 
    475 PDFiumPage::Link::~Link() {
    476 }
    477 
    478 }  // namespace chrome_pdf
    479