1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/dom_distiller/core/distiller.h" 6 7 #include <map> 8 #include <vector> 9 10 #include "base/auto_reset.h" 11 #include "base/bind.h" 12 #include "base/callback.h" 13 #include "base/location.h" 14 #include "base/message_loop/message_loop.h" 15 #include "base/strings/string_number_conversions.h" 16 #include "base/strings/utf_string_conversions.h" 17 #include "base/values.h" 18 #include "components/dom_distiller/core/distiller_page.h" 19 #include "components/dom_distiller/core/distiller_url_fetcher.h" 20 #include "components/dom_distiller/core/proto/distilled_article.pb.h" 21 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 22 #include "net/url_request/url_request_context_getter.h" 23 24 namespace { 25 // Maximum number of distilled pages in an article. 26 const size_t kMaxPagesInArticle = 32; 27 } 28 29 namespace dom_distiller { 30 31 DistillerFactoryImpl::DistillerFactoryImpl( 32 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, 33 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options) 34 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), 35 dom_distiller_options_(dom_distiller_options) { 36 } 37 38 DistillerFactoryImpl::~DistillerFactoryImpl() {} 39 40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { 41 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 42 *distiller_url_fetcher_factory_, dom_distiller_options_)); 43 return distiller.PassAs<Distiller>(); 44 } 45 46 DistillerImpl::DistilledPageData::DistilledPageData() {} 47 48 DistillerImpl::DistilledPageData::~DistilledPageData() {} 49 50 DistillerImpl::DistillerImpl( 51 const DistillerURLFetcherFactory& distiller_url_fetcher_factory, 52 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options) 53 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), 54 dom_distiller_options_(dom_distiller_options), 55 max_pages_in_article_(kMaxPagesInArticle), 56 destruction_allowed_(true), 57 weak_factory_(this) { 58 } 59 60 DistillerImpl::~DistillerImpl() { 61 DCHECK(destruction_allowed_); 62 } 63 64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) { 65 max_pages_in_article_ = max_num_pages; 66 } 67 68 bool DistillerImpl::AreAllPagesFinished() const { 69 return started_pages_index_.empty() && waiting_pages_.empty(); 70 } 71 72 size_t DistillerImpl::TotalPageCount() const { 73 return waiting_pages_.size() + started_pages_index_.size() + 74 finished_pages_index_.size(); 75 } 76 77 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) { 78 if (!IsPageNumberInUse(page_num) && url.is_valid() && 79 TotalPageCount() < max_pages_in_article_ && 80 seen_urls_.find(url.spec()) == seen_urls_.end()) { 81 waiting_pages_[page_num] = url; 82 } 83 } 84 85 bool DistillerImpl::IsPageNumberInUse(int page_num) const { 86 return waiting_pages_.find(page_num) != waiting_pages_.end() || 87 started_pages_index_.find(page_num) != started_pages_index_.end() || 88 finished_pages_index_.find(page_num) != finished_pages_index_.end(); 89 } 90 91 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index) 92 const { 93 DCHECK_LT(index, pages_.size()); 94 DistilledPageData* page_data = pages_[index]; 95 DCHECK(page_data); 96 return page_data; 97 } 98 99 void DistillerImpl::DistillPage(const GURL& url, 100 scoped_ptr<DistillerPage> distiller_page, 101 const DistillationFinishedCallback& finished_cb, 102 const DistillationUpdateCallback& update_cb) { 103 DCHECK(AreAllPagesFinished()); 104 distiller_page_ = distiller_page.Pass(); 105 finished_cb_ = finished_cb; 106 update_cb_ = update_cb; 107 108 AddToDistillationQueue(0, url); 109 DistillNextPage(); 110 } 111 112 void DistillerImpl::DistillNextPage() { 113 if (!waiting_pages_.empty()) { 114 std::map<int, GURL>::iterator front = waiting_pages_.begin(); 115 int page_num = front->first; 116 const GURL url = front->second; 117 118 waiting_pages_.erase(front); 119 DCHECK(url.is_valid()); 120 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end()); 121 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); 122 seen_urls_.insert(url.spec()); 123 pages_.push_back(new DistilledPageData()); 124 started_pages_index_[page_num] = pages_.size() - 1; 125 distiller_page_->DistillPage( 126 url, 127 dom_distiller_options_, 128 base::Bind(&DistillerImpl::OnPageDistillationFinished, 129 weak_factory_.GetWeakPtr(), 130 page_num, 131 url)); 132 } 133 } 134 135 void DistillerImpl::OnPageDistillationFinished( 136 int page_num, 137 const GURL& page_url, 138 scoped_ptr<DistilledPageInfo> distilled_page, 139 bool distillation_successful) { 140 DCHECK(distilled_page.get()); 141 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 142 if (distillation_successful) { 143 DistilledPageData* page_data = 144 GetPageAtIndex(started_pages_index_[page_num]); 145 page_data->distilled_page_proto = 146 new base::RefCountedData<DistilledPageProto>(); 147 page_data->page_num = page_num; 148 page_data->distilled_page_proto->data.set_title(distilled_page->title); 149 page_data->distilled_page_proto->data.set_url(page_url.spec()); 150 page_data->distilled_page_proto->data.set_html(distilled_page->html); 151 152 GURL next_page_url(distilled_page->next_page_url); 153 if (next_page_url.is_valid()) { 154 // The pages should be in same origin. 155 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); 156 AddToDistillationQueue(page_num + 1, next_page_url); 157 } 158 159 GURL prev_page_url(distilled_page->prev_page_url); 160 if (prev_page_url.is_valid()) { 161 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin()); 162 AddToDistillationQueue(page_num - 1, prev_page_url); 163 } 164 165 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); 166 ++img_num) { 167 std::string image_id = 168 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num); 169 FetchImage(page_num, image_id, distilled_page->image_urls[img_num]); 170 } 171 172 AddPageIfDone(page_num); 173 DistillNextPage(); 174 } else { 175 started_pages_index_.erase(page_num); 176 RunDistillerCallbackIfDone(); 177 } 178 } 179 180 void DistillerImpl::FetchImage(int page_num, 181 const std::string& image_id, 182 const std::string& item) { 183 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 184 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 185 DistillerURLFetcher* fetcher = 186 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 187 page_data->image_fetchers_.push_back(fetcher); 188 189 fetcher->FetchURL(item, 190 base::Bind(&DistillerImpl::OnFetchImageDone, 191 weak_factory_.GetWeakPtr(), 192 page_num, 193 base::Unretained(fetcher), 194 image_id)); 195 } 196 197 void DistillerImpl::OnFetchImageDone(int page_num, 198 DistillerURLFetcher* url_fetcher, 199 const std::string& id, 200 const std::string& response) { 201 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 202 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 203 DCHECK(page_data->distilled_page_proto); 204 DCHECK(url_fetcher); 205 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = 206 std::find(page_data->image_fetchers_.begin(), 207 page_data->image_fetchers_.end(), 208 url_fetcher); 209 210 DCHECK(fetcher_it != page_data->image_fetchers_.end()); 211 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone 212 // callback is invoked by the |url_fetcher|. 213 page_data->image_fetchers_.weak_erase(fetcher_it); 214 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); 215 216 DistilledPageProto_Image* image = 217 page_data->distilled_page_proto->data.add_image(); 218 image->set_name(id); 219 image->set_data(response); 220 221 AddPageIfDone(page_num); 222 } 223 224 void DistillerImpl::AddPageIfDone(int page_num) { 225 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 226 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); 227 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 228 if (page_data->image_fetchers_.empty()) { 229 finished_pages_index_[page_num] = started_pages_index_[page_num]; 230 started_pages_index_.erase(page_num); 231 const ArticleDistillationUpdate& article_update = 232 CreateDistillationUpdate(); 233 DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size()); 234 update_cb_.Run(article_update); 235 RunDistillerCallbackIfDone(); 236 } 237 } 238 239 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate() 240 const { 241 bool has_prev_page = false; 242 bool has_next_page = false; 243 if (!finished_pages_index_.empty()) { 244 int prev_page_num = finished_pages_index_.begin()->first - 1; 245 int next_page_num = finished_pages_index_.rbegin()->first + 1; 246 has_prev_page = IsPageNumberInUse(prev_page_num); 247 has_next_page = IsPageNumberInUse(next_page_num); 248 } 249 250 std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> > 251 update_pages; 252 for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin(); 253 it != finished_pages_index_.end(); 254 ++it) { 255 update_pages.push_back(pages_[it->second]->distilled_page_proto); 256 } 257 return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page); 258 } 259 260 void DistillerImpl::RunDistillerCallbackIfDone() { 261 DCHECK(!finished_cb_.is_null()); 262 if (AreAllPagesFinished()) { 263 bool first_page = true; 264 scoped_ptr<DistilledArticleProto> article_proto( 265 new DistilledArticleProto()); 266 // Stitch the pages back into the article. 267 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin(); 268 it != finished_pages_index_.end();) { 269 DistilledPageData* page_data = GetPageAtIndex(it->second); 270 *(article_proto->add_pages()) = page_data->distilled_page_proto->data; 271 272 if (first_page) { 273 article_proto->set_title(page_data->distilled_page_proto->data.title()); 274 first_page = false; 275 } 276 277 finished_pages_index_.erase(it++); 278 } 279 280 pages_.clear(); 281 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()), 282 max_pages_in_article_); 283 284 DCHECK(pages_.empty()); 285 DCHECK(finished_pages_index_.empty()); 286 287 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_, 288 false); 289 finished_cb_.Run(article_proto.Pass()); 290 finished_cb_.Reset(); 291 } 292 } 293 294 } // namespace dom_distiller 295