1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/dom_distiller/core/distiller.h" 6 7 #include <map> 8 #include <vector> 9 10 #include "base/auto_reset.h" 11 #include "base/bind.h" 12 #include "base/callback.h" 13 #include "base/location.h" 14 #include "base/message_loop/message_loop.h" 15 #include "base/strings/string_number_conversions.h" 16 #include "base/strings/utf_string_conversions.h" 17 #include "base/values.h" 18 #include "components/dom_distiller/core/distiller_page.h" 19 #include "components/dom_distiller/core/distiller_url_fetcher.h" 20 #include "components/dom_distiller/core/proto/distilled_article.pb.h" 21 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 22 #include "net/url_request/url_request_context_getter.h" 23 24 namespace { 25 // Maximum number of distilled pages in an article. 26 const size_t kMaxPagesInArticle = 32; 27 } 28 29 namespace dom_distiller { 30 31 DistillerFactoryImpl::DistillerFactoryImpl( 32 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, 33 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options) 34 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), 35 dom_distiller_options_(dom_distiller_options) { 36 } 37 38 DistillerFactoryImpl::~DistillerFactoryImpl() {} 39 40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { 41 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 42 *distiller_url_fetcher_factory_, dom_distiller_options_)); 43 return distiller.PassAs<Distiller>(); 44 } 45 46 DistillerImpl::DistilledPageData::DistilledPageData() {} 47 48 DistillerImpl::DistilledPageData::~DistilledPageData() {} 49 50 DistillerImpl::DistillerImpl( 51 const DistillerURLFetcherFactory& distiller_url_fetcher_factory, 52 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options) 53 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), 54 dom_distiller_options_(dom_distiller_options), 55 max_pages_in_article_(kMaxPagesInArticle), 56 destruction_allowed_(true), 57 weak_factory_(this) { 58 } 59 60 DistillerImpl::~DistillerImpl() { 61 DCHECK(destruction_allowed_); 62 } 63 64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) { 65 max_pages_in_article_ = max_num_pages; 66 } 67 68 bool DistillerImpl::AreAllPagesFinished() const { 69 return started_pages_index_.empty() && waiting_pages_.empty(); 70 } 71 72 size_t DistillerImpl::TotalPageCount() const { 73 return waiting_pages_.size() + started_pages_index_.size() + 74 finished_pages_index_.size(); 75 } 76 77 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) { 78 if (!IsPageNumberInUse(page_num) && url.is_valid() && 79 TotalPageCount() < max_pages_in_article_ && 80 seen_urls_.find(url.spec()) == seen_urls_.end()) { 81 waiting_pages_[page_num] = url; 82 } 83 } 84 85 bool DistillerImpl::IsPageNumberInUse(int page_num) const { 86 return waiting_pages_.find(page_num) != waiting_pages_.end() || 87 started_pages_index_.find(page_num) != started_pages_index_.end() || 88 finished_pages_index_.find(page_num) != finished_pages_index_.end(); 89 } 90 91 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index) 92 const { 93 DCHECK_LT(index, pages_.size()); 94 DistilledPageData* page_data = pages_[index]; 95 DCHECK(page_data); 96 return page_data; 97 } 98 99 void DistillerImpl::DistillPage(const GURL& url, 100 scoped_ptr<DistillerPage> distiller_page, 101 const DistillationFinishedCallback& finished_cb, 102 const DistillationUpdateCallback& update_cb) { 103 DCHECK(AreAllPagesFinished()); 104 distiller_page_ = distiller_page.Pass(); 105 finished_cb_ = finished_cb; 106 update_cb_ = update_cb; 107 108 AddToDistillationQueue(0, url); 109 DistillNextPage(); 110 } 111 112 void DistillerImpl::DistillNextPage() { 113 if (!waiting_pages_.empty()) { 114 std::map<int, GURL>::iterator front = waiting_pages_.begin(); 115 int page_num = front->first; 116 const GURL url = front->second; 117 118 waiting_pages_.erase(front); 119 DCHECK(url.is_valid()); 120 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end()); 121 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); 122 seen_urls_.insert(url.spec()); 123 pages_.push_back(new DistilledPageData()); 124 started_pages_index_[page_num] = pages_.size() - 1; 125 distiller_page_->DistillPage( 126 url, 127 dom_distiller_options_, 128 base::Bind(&DistillerImpl::OnPageDistillationFinished, 129 weak_factory_.GetWeakPtr(), 130 page_num, 131 url)); 132 } 133 } 134 135 void DistillerImpl::OnPageDistillationFinished( 136 int page_num, 137 const GURL& page_url, 138 scoped_ptr<proto::DomDistillerResult> distiller_result, 139 bool distillation_successful) { 140 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 141 if (distillation_successful) { 142 DCHECK(distiller_result.get()); 143 DistilledPageData* page_data = 144 GetPageAtIndex(started_pages_index_[page_num]); 145 page_data->distilled_page_proto = 146 new base::RefCountedData<DistilledPageProto>(); 147 page_data->page_num = page_num; 148 if (distiller_result->has_title()) { 149 page_data->distilled_page_proto->data.set_title( 150 distiller_result->title()); 151 } 152 page_data->distilled_page_proto->data.set_url(page_url.spec()); 153 if (distiller_result->has_distilled_content() && 154 distiller_result->distilled_content().has_html()) { 155 page_data->distilled_page_proto->data.set_html( 156 distiller_result->distilled_content().html()); 157 } 158 if (distiller_result->has_debug_info() && 159 distiller_result->debug_info().has_log()) { 160 page_data->distilled_page_proto->data.mutable_debug_info()->set_log( 161 distiller_result->debug_info().log()); 162 } 163 164 if (distiller_result->has_pagination_info()) { 165 proto::PaginationInfo pagination_info = 166 distiller_result->pagination_info(); 167 if (pagination_info.has_next_page()) { 168 GURL next_page_url(pagination_info.next_page()); 169 if (next_page_url.is_valid()) { 170 // The pages should be in same origin. 171 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); 172 AddToDistillationQueue(page_num + 1, next_page_url); 173 } 174 } 175 176 if (pagination_info.has_prev_page()) { 177 GURL prev_page_url(pagination_info.prev_page()); 178 if (prev_page_url.is_valid()) { 179 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin()); 180 AddToDistillationQueue(page_num - 1, prev_page_url); 181 } 182 } 183 } 184 185 for (int img_num = 0; img_num < distiller_result->image_urls_size(); 186 ++img_num) { 187 std::string image_id = 188 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num); 189 FetchImage(page_num, image_id, distiller_result->image_urls(img_num)); 190 } 191 192 AddPageIfDone(page_num); 193 DistillNextPage(); 194 } else { 195 started_pages_index_.erase(page_num); 196 RunDistillerCallbackIfDone(); 197 } 198 } 199 200 void DistillerImpl::FetchImage(int page_num, 201 const std::string& image_id, 202 const std::string& item) { 203 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 204 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 205 DistillerURLFetcher* fetcher = 206 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 207 page_data->image_fetchers_.push_back(fetcher); 208 209 fetcher->FetchURL(item, 210 base::Bind(&DistillerImpl::OnFetchImageDone, 211 weak_factory_.GetWeakPtr(), 212 page_num, 213 base::Unretained(fetcher), 214 image_id)); 215 } 216 217 void DistillerImpl::OnFetchImageDone(int page_num, 218 DistillerURLFetcher* url_fetcher, 219 const std::string& id, 220 const std::string& response) { 221 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 222 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 223 DCHECK(page_data->distilled_page_proto.get()); 224 DCHECK(url_fetcher); 225 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = 226 std::find(page_data->image_fetchers_.begin(), 227 page_data->image_fetchers_.end(), 228 url_fetcher); 229 230 DCHECK(fetcher_it != page_data->image_fetchers_.end()); 231 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone 232 // callback is invoked by the |url_fetcher|. 233 page_data->image_fetchers_.weak_erase(fetcher_it); 234 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); 235 236 DistilledPageProto_Image* image = 237 page_data->distilled_page_proto->data.add_image(); 238 image->set_name(id); 239 image->set_data(response); 240 241 AddPageIfDone(page_num); 242 } 243 244 void DistillerImpl::AddPageIfDone(int page_num) { 245 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 246 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); 247 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 248 if (page_data->image_fetchers_.empty()) { 249 finished_pages_index_[page_num] = started_pages_index_[page_num]; 250 started_pages_index_.erase(page_num); 251 const ArticleDistillationUpdate& article_update = 252 CreateDistillationUpdate(); 253 DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size()); 254 update_cb_.Run(article_update); 255 RunDistillerCallbackIfDone(); 256 } 257 } 258 259 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate() 260 const { 261 bool has_prev_page = false; 262 bool has_next_page = false; 263 if (!finished_pages_index_.empty()) { 264 int prev_page_num = finished_pages_index_.begin()->first - 1; 265 int next_page_num = finished_pages_index_.rbegin()->first + 1; 266 has_prev_page = IsPageNumberInUse(prev_page_num); 267 has_next_page = IsPageNumberInUse(next_page_num); 268 } 269 270 std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> > 271 update_pages; 272 for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin(); 273 it != finished_pages_index_.end(); 274 ++it) { 275 update_pages.push_back(pages_[it->second]->distilled_page_proto); 276 } 277 return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page); 278 } 279 280 void DistillerImpl::RunDistillerCallbackIfDone() { 281 DCHECK(!finished_cb_.is_null()); 282 if (AreAllPagesFinished()) { 283 bool first_page = true; 284 scoped_ptr<DistilledArticleProto> article_proto( 285 new DistilledArticleProto()); 286 // Stitch the pages back into the article. 287 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin(); 288 it != finished_pages_index_.end();) { 289 DistilledPageData* page_data = GetPageAtIndex(it->second); 290 *(article_proto->add_pages()) = page_data->distilled_page_proto->data; 291 292 if (first_page) { 293 article_proto->set_title(page_data->distilled_page_proto->data.title()); 294 first_page = false; 295 } 296 297 finished_pages_index_.erase(it++); 298 } 299 300 pages_.clear(); 301 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()), 302 max_pages_in_article_); 303 304 DCHECK(pages_.empty()); 305 DCHECK(finished_pages_index_.empty()); 306 307 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_, 308 false); 309 finished_cb_.Run(article_proto.Pass()); 310 finished_cb_.Reset(); 311 } 312 } 313 314 } // namespace dom_distiller 315