Home | History | Annotate | Download | only in core
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/dom_distiller/core/distiller.h"
      6 
      7 #include <map>
      8 #include <vector>
      9 
     10 #include "base/auto_reset.h"
     11 #include "base/bind.h"
     12 #include "base/callback.h"
     13 #include "base/location.h"
     14 #include "base/message_loop/message_loop.h"
     15 #include "base/strings/string_number_conversions.h"
     16 #include "base/strings/utf_string_conversions.h"
     17 #include "base/values.h"
     18 #include "components/dom_distiller/core/distiller_page.h"
     19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
     20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
     21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
     22 #include "net/url_request/url_request_context_getter.h"
     23 
     24 namespace {
     25 // Maximum number of distilled pages in an article.
     26 const size_t kMaxPagesInArticle = 32;
     27 }
     28 
     29 namespace dom_distiller {
     30 
     31 DistillerFactoryImpl::DistillerFactoryImpl(
     32     scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
     33     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
     34     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
     35       dom_distiller_options_(dom_distiller_options) {
     36 }
     37 
     38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
     39 
     40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
     41   scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
     42       *distiller_url_fetcher_factory_, dom_distiller_options_));
     43   return distiller.PassAs<Distiller>();
     44 }
     45 
     46 DistillerImpl::DistilledPageData::DistilledPageData() {}
     47 
     48 DistillerImpl::DistilledPageData::~DistilledPageData() {}
     49 
     50 DistillerImpl::DistillerImpl(
     51     const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
     52     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
     53     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
     54       dom_distiller_options_(dom_distiller_options),
     55       max_pages_in_article_(kMaxPagesInArticle),
     56       destruction_allowed_(true),
     57       weak_factory_(this) {
     58 }
     59 
     60 DistillerImpl::~DistillerImpl() {
     61   DCHECK(destruction_allowed_);
     62 }
     63 
     64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
     65   max_pages_in_article_ = max_num_pages;
     66 }
     67 
     68 bool DistillerImpl::AreAllPagesFinished() const {
     69   return started_pages_index_.empty() && waiting_pages_.empty();
     70 }
     71 
     72 size_t DistillerImpl::TotalPageCount() const {
     73   return waiting_pages_.size() + started_pages_index_.size() +
     74          finished_pages_index_.size();
     75 }
     76 
     77 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
     78   if (!IsPageNumberInUse(page_num) && url.is_valid() &&
     79       TotalPageCount() < max_pages_in_article_ &&
     80       seen_urls_.find(url.spec()) == seen_urls_.end()) {
     81     waiting_pages_[page_num] = url;
     82   }
     83 }
     84 
     85 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
     86   return waiting_pages_.find(page_num) != waiting_pages_.end() ||
     87          started_pages_index_.find(page_num) != started_pages_index_.end() ||
     88          finished_pages_index_.find(page_num) != finished_pages_index_.end();
     89 }
     90 
     91 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
     92     const {
     93   DCHECK_LT(index, pages_.size());
     94   DistilledPageData* page_data = pages_[index];
     95   DCHECK(page_data);
     96   return page_data;
     97 }
     98 
     99 void DistillerImpl::DistillPage(const GURL& url,
    100                                 scoped_ptr<DistillerPage> distiller_page,
    101                                 const DistillationFinishedCallback& finished_cb,
    102                                 const DistillationUpdateCallback& update_cb) {
    103   DCHECK(AreAllPagesFinished());
    104   distiller_page_ = distiller_page.Pass();
    105   finished_cb_ = finished_cb;
    106   update_cb_ = update_cb;
    107 
    108   AddToDistillationQueue(0, url);
    109   DistillNextPage();
    110 }
    111 
    112 void DistillerImpl::DistillNextPage() {
    113   if (!waiting_pages_.empty()) {
    114     std::map<int, GURL>::iterator front = waiting_pages_.begin();
    115     int page_num = front->first;
    116     const GURL url = front->second;
    117 
    118     waiting_pages_.erase(front);
    119     DCHECK(url.is_valid());
    120     DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
    121     DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
    122     seen_urls_.insert(url.spec());
    123     pages_.push_back(new DistilledPageData());
    124     started_pages_index_[page_num] = pages_.size() - 1;
    125     distiller_page_->DistillPage(
    126         url,
    127         dom_distiller_options_,
    128         base::Bind(&DistillerImpl::OnPageDistillationFinished,
    129                    weak_factory_.GetWeakPtr(),
    130                    page_num,
    131                    url));
    132   }
    133 }
    134 
    135 void DistillerImpl::OnPageDistillationFinished(
    136     int page_num,
    137     const GURL& page_url,
    138     scoped_ptr<proto::DomDistillerResult> distiller_result,
    139     bool distillation_successful) {
    140   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    141   if (distillation_successful) {
    142     DCHECK(distiller_result.get());
    143     DistilledPageData* page_data =
    144         GetPageAtIndex(started_pages_index_[page_num]);
    145     page_data->distilled_page_proto =
    146         new base::RefCountedData<DistilledPageProto>();
    147     page_data->page_num = page_num;
    148     if (distiller_result->has_title()) {
    149       page_data->distilled_page_proto->data.set_title(
    150           distiller_result->title());
    151     }
    152     page_data->distilled_page_proto->data.set_url(page_url.spec());
    153     if (distiller_result->has_distilled_content() &&
    154         distiller_result->distilled_content().has_html()) {
    155       page_data->distilled_page_proto->data.set_html(
    156           distiller_result->distilled_content().html());
    157     }
    158     if (distiller_result->has_debug_info() &&
    159         distiller_result->debug_info().has_log()) {
    160       page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
    161           distiller_result->debug_info().log());
    162     }
    163 
    164     if (distiller_result->has_pagination_info()) {
    165       proto::PaginationInfo pagination_info =
    166           distiller_result->pagination_info();
    167       if (pagination_info.has_next_page()) {
    168         GURL next_page_url(pagination_info.next_page());
    169         if (next_page_url.is_valid()) {
    170           // The pages should be in same origin.
    171           DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
    172           AddToDistillationQueue(page_num + 1, next_page_url);
    173         }
    174       }
    175 
    176       if (pagination_info.has_prev_page()) {
    177         GURL prev_page_url(pagination_info.prev_page());
    178         if (prev_page_url.is_valid()) {
    179           DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
    180           AddToDistillationQueue(page_num - 1, prev_page_url);
    181         }
    182       }
    183     }
    184 
    185     for (int img_num = 0; img_num < distiller_result->image_urls_size();
    186          ++img_num) {
    187       std::string image_id =
    188           base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
    189       FetchImage(page_num, image_id, distiller_result->image_urls(img_num));
    190     }
    191 
    192     AddPageIfDone(page_num);
    193     DistillNextPage();
    194   } else {
    195     started_pages_index_.erase(page_num);
    196     RunDistillerCallbackIfDone();
    197   }
    198 }
    199 
    200 void DistillerImpl::FetchImage(int page_num,
    201                                const std::string& image_id,
    202                                const std::string& item) {
    203   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    204   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
    205   DistillerURLFetcher* fetcher =
    206       distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
    207   page_data->image_fetchers_.push_back(fetcher);
    208 
    209   fetcher->FetchURL(item,
    210                     base::Bind(&DistillerImpl::OnFetchImageDone,
    211                                weak_factory_.GetWeakPtr(),
    212                                page_num,
    213                                base::Unretained(fetcher),
    214                                image_id));
    215 }
    216 
    217 void DistillerImpl::OnFetchImageDone(int page_num,
    218                                      DistillerURLFetcher* url_fetcher,
    219                                      const std::string& id,
    220                                      const std::string& response) {
    221   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    222   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
    223   DCHECK(page_data->distilled_page_proto.get());
    224   DCHECK(url_fetcher);
    225   ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
    226       std::find(page_data->image_fetchers_.begin(),
    227                 page_data->image_fetchers_.end(),
    228                 url_fetcher);
    229 
    230   DCHECK(fetcher_it != page_data->image_fetchers_.end());
    231   // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
    232   // callback is invoked by the |url_fetcher|.
    233   page_data->image_fetchers_.weak_erase(fetcher_it);
    234   base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
    235 
    236   DistilledPageProto_Image* image =
    237       page_data->distilled_page_proto->data.add_image();
    238   image->set_name(id);
    239   image->set_data(response);
    240 
    241   AddPageIfDone(page_num);
    242 }
    243 
    244 void DistillerImpl::AddPageIfDone(int page_num) {
    245   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    246   DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
    247   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
    248   if (page_data->image_fetchers_.empty()) {
    249     finished_pages_index_[page_num] = started_pages_index_[page_num];
    250     started_pages_index_.erase(page_num);
    251     const ArticleDistillationUpdate& article_update =
    252         CreateDistillationUpdate();
    253     DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
    254     update_cb_.Run(article_update);
    255     RunDistillerCallbackIfDone();
    256   }
    257 }
    258 
    259 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
    260     const {
    261   bool has_prev_page = false;
    262   bool has_next_page = false;
    263   if (!finished_pages_index_.empty()) {
    264     int prev_page_num = finished_pages_index_.begin()->first - 1;
    265     int next_page_num = finished_pages_index_.rbegin()->first + 1;
    266     has_prev_page = IsPageNumberInUse(prev_page_num);
    267     has_next_page = IsPageNumberInUse(next_page_num);
    268   }
    269 
    270   std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
    271       update_pages;
    272   for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
    273        it != finished_pages_index_.end();
    274        ++it) {
    275     update_pages.push_back(pages_[it->second]->distilled_page_proto);
    276   }
    277   return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
    278 }
    279 
    280 void DistillerImpl::RunDistillerCallbackIfDone() {
    281   DCHECK(!finished_cb_.is_null());
    282   if (AreAllPagesFinished()) {
    283     bool first_page = true;
    284     scoped_ptr<DistilledArticleProto> article_proto(
    285         new DistilledArticleProto());
    286     // Stitch the pages back into the article.
    287     for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
    288          it != finished_pages_index_.end();) {
    289       DistilledPageData* page_data = GetPageAtIndex(it->second);
    290       *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
    291 
    292       if (first_page) {
    293         article_proto->set_title(page_data->distilled_page_proto->data.title());
    294         first_page = false;
    295       }
    296 
    297       finished_pages_index_.erase(it++);
    298     }
    299 
    300     pages_.clear();
    301     DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
    302               max_pages_in_article_);
    303 
    304     DCHECK(pages_.empty());
    305     DCHECK(finished_pages_index_.empty());
    306 
    307     base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
    308                                                        false);
    309     finished_cb_.Run(article_proto.Pass());
    310     finished_cb_.Reset();
    311   }
    312 }
    313 
    314 }  // namespace dom_distiller
    315