Home | History | Annotate | Download | only in core
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/dom_distiller/core/distiller.h"
      6 
      7 #include <map>
      8 #include <vector>
      9 
     10 #include "base/auto_reset.h"
     11 #include "base/bind.h"
     12 #include "base/callback.h"
     13 #include "base/location.h"
     14 #include "base/message_loop/message_loop.h"
     15 #include "base/strings/string_number_conversions.h"
     16 #include "base/strings/utf_string_conversions.h"
     17 #include "base/values.h"
     18 #include "components/dom_distiller/core/distiller_page.h"
     19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
     20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
     21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
     22 #include "net/url_request/url_request_context_getter.h"
     23 
     namespace {

     // Upper bound on the number of distilled pages making up one article.
     // DistillerImpl's constructor uses it as the starting value of
     // |max_pages_in_article_|.
     const size_t kMaxPagesInArticle = 32;

     }  // namespace
     28 
     29 namespace dom_distiller {
     30 
     31 DistillerFactoryImpl::DistillerFactoryImpl(
     32     scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
     33     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
     34     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
     35       dom_distiller_options_(dom_distiller_options) {
     36 }
     37 
     38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
     39 
     40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
     41   scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
     42       *distiller_url_fetcher_factory_, dom_distiller_options_));
     43   return distiller.PassAs<Distiller>();
     44 }
     45 
     46 DistillerImpl::DistilledPageData::DistilledPageData() {}
     47 
     48 DistillerImpl::DistilledPageData::~DistilledPageData() {}
     49 
     50 DistillerImpl::DistillerImpl(
     51     const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
     52     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
     53     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
     54       dom_distiller_options_(dom_distiller_options),
     55       max_pages_in_article_(kMaxPagesInArticle),
     56       destruction_allowed_(true),
     57       weak_factory_(this) {
     58 }
     59 
     60 DistillerImpl::~DistillerImpl() {
     61   DCHECK(destruction_allowed_);
     62 }
     63 
     64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
     65   max_pages_in_article_ = max_num_pages;
     66 }
     67 
     68 bool DistillerImpl::AreAllPagesFinished() const {
     69   return started_pages_index_.empty() && waiting_pages_.empty();
     70 }
     71 
     72 size_t DistillerImpl::TotalPageCount() const {
     73   return waiting_pages_.size() + started_pages_index_.size() +
     74          finished_pages_index_.size();
     75 }
     76 
     77 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
     78   if (!IsPageNumberInUse(page_num) && url.is_valid() &&
     79       TotalPageCount() < max_pages_in_article_ &&
     80       seen_urls_.find(url.spec()) == seen_urls_.end()) {
     81     waiting_pages_[page_num] = url;
     82   }
     83 }
     84 
     85 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
     86   return waiting_pages_.find(page_num) != waiting_pages_.end() ||
     87          started_pages_index_.find(page_num) != started_pages_index_.end() ||
     88          finished_pages_index_.find(page_num) != finished_pages_index_.end();
     89 }
     90 
     91 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
     92     const {
     93   DCHECK_LT(index, pages_.size());
     94   DistilledPageData* page_data = pages_[index];
     95   DCHECK(page_data);
     96   return page_data;
     97 }
     98 
     99 void DistillerImpl::DistillPage(const GURL& url,
    100                                 scoped_ptr<DistillerPage> distiller_page,
    101                                 const DistillationFinishedCallback& finished_cb,
    102                                 const DistillationUpdateCallback& update_cb) {
    103   DCHECK(AreAllPagesFinished());
    104   distiller_page_ = distiller_page.Pass();
    105   finished_cb_ = finished_cb;
    106   update_cb_ = update_cb;
    107 
    108   AddToDistillationQueue(0, url);
    109   DistillNextPage();
    110 }
    111 
    112 void DistillerImpl::DistillNextPage() {
    113   if (!waiting_pages_.empty()) {
    114     std::map<int, GURL>::iterator front = waiting_pages_.begin();
    115     int page_num = front->first;
    116     const GURL url = front->second;
    117 
    118     waiting_pages_.erase(front);
    119     DCHECK(url.is_valid());
    120     DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
    121     DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
    122     seen_urls_.insert(url.spec());
    123     pages_.push_back(new DistilledPageData());
    124     started_pages_index_[page_num] = pages_.size() - 1;
    125     distiller_page_->DistillPage(
    126         url,
    127         dom_distiller_options_,
    128         base::Bind(&DistillerImpl::OnPageDistillationFinished,
    129                    weak_factory_.GetWeakPtr(),
    130                    page_num,
    131                    url));
    132   }
    133 }
    134 
    135 void DistillerImpl::OnPageDistillationFinished(
    136     int page_num,
    137     const GURL& page_url,
    138     scoped_ptr<DistilledPageInfo> distilled_page,
    139     bool distillation_successful) {
    140   DCHECK(distilled_page.get());
    141   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    142   if (distillation_successful) {
    143     DistilledPageData* page_data =
    144         GetPageAtIndex(started_pages_index_[page_num]);
    145     page_data->distilled_page_proto =
    146         new base::RefCountedData<DistilledPageProto>();
    147     page_data->page_num = page_num;
    148     page_data->distilled_page_proto->data.set_title(distilled_page->title);
    149     page_data->distilled_page_proto->data.set_url(page_url.spec());
    150     page_data->distilled_page_proto->data.set_html(distilled_page->html);
    151 
    152     GURL next_page_url(distilled_page->next_page_url);
    153     if (next_page_url.is_valid()) {
    154       // The pages should be in same origin.
    155       DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
    156       AddToDistillationQueue(page_num + 1, next_page_url);
    157     }
    158 
    159     GURL prev_page_url(distilled_page->prev_page_url);
    160     if (prev_page_url.is_valid()) {
    161       DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
    162       AddToDistillationQueue(page_num - 1, prev_page_url);
    163     }
    164 
    165     for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
    166          ++img_num) {
    167       std::string image_id =
    168           base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
    169       FetchImage(page_num, image_id, distilled_page->image_urls[img_num]);
    170     }
    171 
    172     AddPageIfDone(page_num);
    173     DistillNextPage();
    174   } else {
    175     started_pages_index_.erase(page_num);
    176     RunDistillerCallbackIfDone();
    177   }
    178 }
    179 
    180 void DistillerImpl::FetchImage(int page_num,
    181                                const std::string& image_id,
    182                                const std::string& item) {
    183   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    184   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
    185   DistillerURLFetcher* fetcher =
    186       distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
    187   page_data->image_fetchers_.push_back(fetcher);
    188 
    189   fetcher->FetchURL(item,
    190                     base::Bind(&DistillerImpl::OnFetchImageDone,
    191                                weak_factory_.GetWeakPtr(),
    192                                page_num,
    193                                base::Unretained(fetcher),
    194                                image_id));
    195 }
    196 
    197 void DistillerImpl::OnFetchImageDone(int page_num,
    198                                      DistillerURLFetcher* url_fetcher,
    199                                      const std::string& id,
    200                                      const std::string& response) {
    201   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    202   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
    203   DCHECK(page_data->distilled_page_proto);
    204   DCHECK(url_fetcher);
    205   ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
    206       std::find(page_data->image_fetchers_.begin(),
    207                 page_data->image_fetchers_.end(),
    208                 url_fetcher);
    209 
    210   DCHECK(fetcher_it != page_data->image_fetchers_.end());
    211   // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
    212   // callback is invoked by the |url_fetcher|.
    213   page_data->image_fetchers_.weak_erase(fetcher_it);
    214   base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
    215 
    216   DistilledPageProto_Image* image =
    217       page_data->distilled_page_proto->data.add_image();
    218   image->set_name(id);
    219   image->set_data(response);
    220 
    221   AddPageIfDone(page_num);
    222 }
    223 
    224 void DistillerImpl::AddPageIfDone(int page_num) {
    225   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
    226   DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
    227   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
    228   if (page_data->image_fetchers_.empty()) {
    229     finished_pages_index_[page_num] = started_pages_index_[page_num];
    230     started_pages_index_.erase(page_num);
    231     const ArticleDistillationUpdate& article_update =
    232         CreateDistillationUpdate();
    233     DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
    234     update_cb_.Run(article_update);
    235     RunDistillerCallbackIfDone();
    236   }
    237 }
    238 
    239 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
    240     const {
    241   bool has_prev_page = false;
    242   bool has_next_page = false;
    243   if (!finished_pages_index_.empty()) {
    244     int prev_page_num = finished_pages_index_.begin()->first - 1;
    245     int next_page_num = finished_pages_index_.rbegin()->first + 1;
    246     has_prev_page = IsPageNumberInUse(prev_page_num);
    247     has_next_page = IsPageNumberInUse(next_page_num);
    248   }
    249 
    250   std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
    251       update_pages;
    252   for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
    253        it != finished_pages_index_.end();
    254        ++it) {
    255     update_pages.push_back(pages_[it->second]->distilled_page_proto);
    256   }
    257   return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
    258 }
    259 
    260 void DistillerImpl::RunDistillerCallbackIfDone() {
    261   DCHECK(!finished_cb_.is_null());
    262   if (AreAllPagesFinished()) {
    263     bool first_page = true;
    264     scoped_ptr<DistilledArticleProto> article_proto(
    265         new DistilledArticleProto());
    266     // Stitch the pages back into the article.
    267     for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
    268          it != finished_pages_index_.end();) {
    269       DistilledPageData* page_data = GetPageAtIndex(it->second);
    270       *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
    271 
    272       if (first_page) {
    273         article_proto->set_title(page_data->distilled_page_proto->data.title());
    274         first_page = false;
    275       }
    276 
    277       finished_pages_index_.erase(it++);
    278     }
    279 
    280     pages_.clear();
    281     DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
    282               max_pages_in_article_);
    283 
    284     DCHECK(pages_.empty());
    285     DCHECK(finished_pages_index_.empty());
    286 
    287     base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
    288                                                        false);
    289     finished_cb_.Run(article_proto.Pass());
    290     finished_cb_.Reset();
    291   }
    292 }
    293 
    294 }  // namespace dom_distiller
    295