Home | History | Annotate | Download | only in pdf
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "pdf/document_loader.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_util.h"
      9 #include "net/http/http_util.h"
     10 #include "ppapi/c/pp_errors.h"
     11 #include "ppapi/cpp/url_loader.h"
     12 #include "ppapi/cpp/url_request_info.h"
     13 #include "ppapi/cpp/url_response_info.h"
     14 
     15 namespace chrome_pdf {
     16 
     17 // Document below size will be downloaded in one chunk.
     18 const uint32 kMinFileSize = 64*1024;
     19 
     20 DocumentLoader::DocumentLoader(Client* client)
     21     : client_(client), partial_document_(false), request_pending_(false),
     22       current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
     23       document_size_(0), header_request_(true), is_multipart_(false) {
     24   loader_factory_.Initialize(this);
     25 }
     26 
     27 DocumentLoader::~DocumentLoader() {
     28 }
     29 
     30 bool DocumentLoader::Init(const pp::URLLoader& loader,
     31                           const std::string& url,
     32                           const std::string& headers) {
     33   DCHECK(url_.empty());
     34   url_ = url;
     35   loader_ = loader;
     36 
     37   std::string response_headers;
     38   if (!headers.empty()) {
     39     response_headers = headers;
     40   } else {
     41     pp::URLResponseInfo response = loader_.GetResponseInfo();
     42     pp::Var headers_var = response.GetHeaders();
     43 
     44     if (headers_var.is_string()) {
     45       response_headers = headers_var.AsString();
     46     }
     47   }
     48 
     49   bool accept_ranges_bytes = false;
     50   bool content_encoded = false;
     51   uint32 content_length = 0;
     52   std::string type;
     53   std::string disposition;
     54   if (!response_headers.empty()) {
     55     net::HttpUtil::HeadersIterator it(response_headers.begin(),
     56                                       response_headers.end(), "\n");
     57     while (it.GetNext()) {
     58       if (LowerCaseEqualsASCII(it.name(), "content-length")) {
     59         content_length = atoi(it.values().c_str());
     60       } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
     61         accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
     62       } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
     63         content_encoded = true;
     64       } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
     65         type = it.values();
     66         size_t semi_colon_pos = type.find(';');
     67         if (semi_colon_pos != std::string::npos) {
     68           type = type.substr(0, semi_colon_pos);
     69         }
     70         TrimWhitespace(type, base::TRIM_ALL, &type);
     71       } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
     72         disposition = it.values();
     73       }
     74     }
     75   }
     76   if (!type.empty() &&
     77       !EndsWith(type, "/pdf", false) &&
     78       !EndsWith(type, ".pdf", false) &&
     79       !EndsWith(type, "/x-pdf", false) &&
     80       !EndsWith(type, "/*", false) &&
     81       !EndsWith(type, "/acrobat", false) &&
     82       !EndsWith(type, "/unknown", false)) {
     83     return false;
     84   }
     85   if (StartsWithASCII(disposition, "attachment", false)) {
     86     return false;
     87   }
     88 
     89   if (content_length > 0)
     90     chunk_stream_.Preallocate(content_length);
     91 
     92   document_size_ = content_length;
     93   requests_count_ = 0;
     94 
     95   // Enable partial loading only if file size is above the threshold.
     96   // It will allow avoiding latency for multiple requests.
     97   if (content_length > kMinFileSize &&
     98       accept_ranges_bytes &&
     99       !content_encoded) {
    100     LoadPartialDocument();
    101   } else {
    102     LoadFullDocument();
    103   }
    104   return true;
    105 }
    106 
    107 void DocumentLoader::LoadPartialDocument() {
    108   partial_document_ = true;
    109   // Force the main request to be cancelled, since if we're a full-frame plugin
    110   // there could be other references to the loader.
    111   loader_.Close();
    112   loader_ = pp::URLLoader();
    113   // Download file header.
    114   header_request_ = true;
    115   RequestData(0, std::min(GetRequestSize(), document_size_));
    116 }
    117 
    118 void DocumentLoader::LoadFullDocument() {
    119   partial_document_ = false;
    120   chunk_buffer_.clear();
    121   ReadMore();
    122 }
    123 
    124 bool DocumentLoader::IsDocumentComplete() const {
    125   if (document_size_ == 0)  // Document size unknown.
    126     return false;
    127   return IsDataAvailable(0, document_size_);
    128 }
    129 
    130 uint32 DocumentLoader::GetAvailableData() const {
    131   if (document_size_ == 0) {  // If document size is unknown.
    132     return current_pos_;
    133   }
    134 
    135   std::vector<std::pair<size_t, size_t> > ranges;
    136   chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
    137   uint32 available = document_size_;
    138   std::vector<std::pair<size_t, size_t> >::iterator it;
    139   for (it = ranges.begin(); it != ranges.end(); ++it) {
    140     available -= it->second;
    141   }
    142   return available;
    143 }
    144 
    145 void DocumentLoader::ClearPendingRequests() {
    146   // The first item in the queue is pending (need to keep it in the queue).
    147   if (pending_requests_.size() > 1) {
    148     // Remove all elements except the first one.
    149     pending_requests_.erase(++pending_requests_.begin(),
    150                             pending_requests_.end());
    151   }
    152 }
    153 
    154 bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
    155   return chunk_stream_.ReadData(position, size, buf);
    156 }
    157 
    158 bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
    159   return chunk_stream_.IsRangeAvailable(position, size);
    160 }
    161 
    162 void DocumentLoader::RequestData(uint32 position, uint32 size) {
    163   DCHECK(partial_document_);
    164 
    165   // We have some artefact request from
    166   // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
    167   // document is complete.
    168   // We need this fix in PDFIum. Adding this as a work around.
    169   // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
    170   // Test url:
    171   // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
    172   if (IsDocumentComplete())
    173     return;
    174 
    175   pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
    176   DownloadPendingRequests();
    177 }
    178 
    179 void DocumentLoader::DownloadPendingRequests() {
    180   if (request_pending_ || pending_requests_.empty())
    181     return;
    182 
    183   // Remove already completed requests.
    184   // By design DownloadPendingRequests() should have at least 1 request in the
    185   // queue. ReadComplete() will remove the last pending comment from the queue.
    186   while (pending_requests_.size() > 1) {
    187     if (IsDataAvailable(pending_requests_.front().first,
    188                         pending_requests_.front().second)) {
    189       pending_requests_.pop_front();
    190     } else {
    191       break;
    192     }
    193   }
    194 
    195   uint32 pos = pending_requests_.front().first;
    196   uint32 size = pending_requests_.front().second;
    197   if (IsDataAvailable(pos, size)) {
    198     ReadComplete();
    199     return;
    200   }
    201 
    202   // If current request has been partially downloaded already, split it into
    203   // a few smaller requests.
    204   std::vector<std::pair<size_t, size_t> > ranges;
    205   chunk_stream_.GetMissedRanges(pos, size, &ranges);
    206   if (ranges.size() > 0) {
    207     pending_requests_.pop_front();
    208     pending_requests_.insert(pending_requests_.begin(),
    209                              ranges.begin(), ranges.end());
    210     pos = pending_requests_.front().first;
    211     size = pending_requests_.front().second;
    212   }
    213 
    214   uint32 cur_request_size = GetRequestSize();
    215   // If size is less than default request, try to expand download range for
    216   // more optimal download.
    217   if (size < cur_request_size && partial_document_) {
    218     // First, try to expand block towards the end of the file.
    219     uint32 new_pos = pos;
    220     uint32 new_size = cur_request_size;
    221     if (pos + new_size > document_size_)
    222       new_size = document_size_ - pos;
    223 
    224     std::vector<std::pair<size_t, size_t> > ranges;
    225     if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
    226       new_pos = ranges[0].first;
    227       new_size = ranges[0].second;
    228     }
    229 
    230     // Second, try to expand block towards the beginning of the file.
    231     if (new_size < cur_request_size) {
    232       uint32 block_end = new_pos + new_size;
    233       if (block_end > cur_request_size) {
    234         new_pos = block_end - cur_request_size;
    235       } else {
    236         new_pos = 0;
    237       }
    238       new_size = block_end - new_pos;
    239 
    240       if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
    241         new_pos = ranges.back().first;
    242         new_size = ranges.back().second;
    243       }
    244     }
    245     pos = new_pos;
    246     size = new_size;
    247   }
    248 
    249   size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
    250   size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
    251   if (pos - last_byte_before < cur_request_size) {
    252     size = pos + size - last_byte_before;
    253     pos = last_byte_before;
    254   }
    255 
    256   if ((pos + size < first_byte_after) &&
    257       (pos + size + cur_request_size >= first_byte_after))
    258     size = first_byte_after - pos;
    259 
    260   request_pending_ = true;
    261 
    262   // Start downloading first pending request.
    263   loader_.Close();
    264   loader_ = client_->CreateURLLoader();
    265   pp::CompletionCallback callback =
    266       loader_factory_.NewCallback(&DocumentLoader::DidOpen);
    267   pp::URLRequestInfo request = GetRequest(pos, size);
    268   requests_count_++;
    269   int rv = loader_.Open(request, callback);
    270   if (rv != PP_OK_COMPLETIONPENDING)
    271     callback.Run(rv);
    272 }
    273 
    274 pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
    275                                               uint32 size) const {
    276   pp::URLRequestInfo request(client_->GetPluginInstance());
    277   request.SetURL(url_.c_str());
    278   request.SetMethod("GET");
    279   request.SetFollowRedirects(true);
    280 
    281   const size_t kBufSize = 100;
    282   char buf[kBufSize];
    283   // According to rfc2616, byte range specifies position of the first and last
    284   // bytes in the requested range inclusively. Therefore we should subtract 1
    285   // from the position + size, to get index of the last byte that needs to be
    286   // downloaded.
    287   base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
    288                  position + size - 1);
    289   pp::Var header(buf);
    290   request.SetHeaders(header);
    291 
    292   return request;
    293 }
    294 
    295 void DocumentLoader::DidOpen(int32_t result) {
    296   if (result != PP_OK) {
    297     NOTREACHED();
    298     return;
    299   }
    300 
    301   is_multipart_ = false;
    302   current_chunk_size_ = 0;
    303   current_chunk_read_ = 0;
    304 
    305   pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
    306   std::string headers;
    307   if (headers_var.is_string())
    308     headers = headers_var.AsString();
    309 
    310   std::string boundary = GetMultiPartBoundary(headers);
    311   if (boundary.size()) {
    312     // Leave position untouched for now, when we read the data we'll get it.
    313     is_multipart_ = true;
    314     multipart_boundary_ = boundary;
    315   } else {
    316     // Need to make sure that the server returned a byte-range, since it's
    317     // possible for a server to just ignore our bye-range request and just
    318     // return the entire document even if it supports byte-range requests.
    319     // i.e. sniff response to
    320     // http://www.act.org/compass/sample/pdf/geometry.pdf
    321     current_pos_ = 0;
    322     uint32 start_pos, end_pos;
    323     if (GetByteRange(headers, &start_pos, &end_pos)) {
    324       current_pos_ = start_pos;
    325       if (end_pos && end_pos > start_pos)
    326         current_chunk_size_ = end_pos - start_pos + 1;
    327     }
    328   }
    329 
    330   ReadMore();
    331 }
    332 
    333 bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
    334                                   uint32* end) {
    335   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
    336   while (it.GetNext()) {
    337     if (LowerCaseEqualsASCII(it.name(), "content-range")) {
    338       std::string range = it.values().c_str();
    339       if (StartsWithASCII(range, "bytes", false)) {
    340         range = range.substr(strlen("bytes"));
    341         std::string::size_type pos = range.find('-');
    342         std::string range_end;
    343         if (pos != std::string::npos)
    344           range_end = range.substr(pos + 1);
    345         TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
    346         TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
    347         *start = atoi(range.c_str());
    348         *end = atoi(range_end.c_str());
    349         return true;
    350       }
    351     }
    352   }
    353   return false;
    354 }
    355 
    356 std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
    357   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
    358   while (it.GetNext()) {
    359     if (LowerCaseEqualsASCII(it.name(), "content-type")) {
    360       std::string type = StringToLowerASCII(it.values());
    361       if (StartsWithASCII(type, "multipart/", true)) {
    362         const char* boundary = strstr(type.c_str(), "boundary=");
    363         if (!boundary) {
    364           NOTREACHED();
    365           break;
    366         }
    367 
    368         return std::string(boundary + 9);
    369       }
    370     }
    371   }
    372   return std::string();
    373 }
    374 
    375 void DocumentLoader::ReadMore() {
    376   pp::CompletionCallback callback =
    377         loader_factory_.NewCallback(&DocumentLoader::DidRead);
    378   int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
    379   if (rv != PP_OK_COMPLETIONPENDING)
    380     callback.Run(rv);
    381 }
    382 
    383 void DocumentLoader::DidRead(int32_t result) {
    384   if (result > 0) {
    385     char* start = buffer_;
    386     size_t length = result;
    387     if (is_multipart_ && result > 2) {
    388       for (int i = 2; i < result; ++i) {
    389         if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
    390             (i >= 4 &&
    391              buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
    392              buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
    393           uint32 start_pos, end_pos;
    394           if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
    395             current_pos_ = start_pos;
    396             start += i;
    397             length -= i;
    398             if (end_pos && end_pos > start_pos)
    399               current_chunk_size_ = end_pos - start_pos + 1;
    400           }
    401           break;
    402         }
    403       }
    404 
    405       // Reset this flag so we don't look inside the buffer in future calls of
    406       // DidRead for this response.  Note that this code DOES NOT handle multi-
    407       // part responses with more than one part (we don't issue them at the
    408       // moment, so they shouldn't arrive).
    409       is_multipart_ = false;
    410     }
    411 
    412     if (current_chunk_size_ &&
    413         current_chunk_read_ + length > current_chunk_size_)
    414       length = current_chunk_size_ - current_chunk_read_;
    415 
    416     if (length) {
    417       if (document_size_ > 0) {
    418         chunk_stream_.WriteData(current_pos_, start, length);
    419       } else {
    420         // If we did not get content-length in the response, we can't
    421         // preallocate buffer for the entire document. Resizing array causing
    422         // memory fragmentation issues on the large files and OOM exceptions.
    423         // To fix this, we collect all chunks of the file to the list and
    424         // concatenate them together after request is complete.
    425         chunk_buffer_.push_back(std::vector<unsigned char>());
    426         chunk_buffer_.back().resize(length);
    427         memcpy(&(chunk_buffer_.back()[0]), start, length);
    428       }
    429       current_pos_ += length;
    430       current_chunk_read_ += length;
    431       client_->OnNewDataAvailable();
    432     }
    433     ReadMore();
    434   } else if (result == PP_OK) {
    435     ReadComplete();
    436   } else {
    437     NOTREACHED();
    438   }
    439 }
    440 
    441 void DocumentLoader::ReadComplete() {
    442   if (!partial_document_) {
    443     if (document_size_ == 0) {
    444       // For the document with no 'content-length" specified we've collected all
    445       // the chunks already. Let's allocate final document buffer and copy them
    446       // over.
    447       chunk_stream_.Preallocate(current_pos_);
    448       uint32 pos = 0;
    449       std::list<std::vector<unsigned char> >::iterator it;
    450       for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
    451         chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
    452         pos += it->size();
    453       }
    454       chunk_buffer_.clear();
    455     }
    456     document_size_ = current_pos_;
    457     client_->OnDocumentComplete();
    458     return;
    459   }
    460 
    461   request_pending_ = false;
    462   pending_requests_.pop_front();
    463 
    464   // If there are more pending request - continue downloading.
    465   if (!pending_requests_.empty()) {
    466     DownloadPendingRequests();
    467     return;
    468   }
    469 
    470   if (IsDocumentComplete()) {
    471     client_->OnDocumentComplete();
    472     return;
    473   }
    474 
    475   if (header_request_)
    476     client_->OnPartialDocumentLoaded();
    477   else
    478     client_->OnPendingRequestComplete();
    479   header_request_ = false;
    480 
    481   // The OnPendingRequestComplete could have added more requests.
    482   if (!pending_requests_.empty()) {
    483     DownloadPendingRequests();
    484   } else {
    485     // Document is not complete and we have no outstanding requests.
    486     // Let's keep downloading PDF file in small chunks.
    487     uint32 pos = chunk_stream_.GetFirstMissingByte();
    488     std::vector<std::pair<size_t, size_t> > ranges;
    489     chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
    490     DCHECK(ranges.size() > 0);
    491     RequestData(ranges[0].first, ranges[0].second);
    492   }
    493 }
    494 
    495 uint32 DocumentLoader::GetRequestSize() const {
    496   // Document loading strategy:
    497   // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
    498   // double the size (64k), and so on, until we cap max request size at 2M for
    499   // 71 or more requests.
    500   uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u);
    501   return 32*1024 * (1 << ((limited_count - 1) / 10u));
    502 }
    503 
    504 }  // namespace chrome_pdf
    505