1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_ 6 #define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_ 7 8 #include <map> 9 #include <string> 10 11 #include "base/callback.h" 12 #include "base/containers/hash_tables.h" 13 #include "base/memory/ref_counted.h" 14 #include "base/memory/scoped_ptr.h" 15 #include "base/memory/scoped_vector.h" 16 #include "base/memory/weak_ptr.h" 17 #include "components/dom_distiller/core/article_distillation_update.h" 18 #include "components/dom_distiller/core/distiller_page.h" 19 #include "components/dom_distiller/core/distiller_url_fetcher.h" 20 #include "components/dom_distiller/core/proto/distilled_article.pb.h" 21 #include "net/url_request/url_request_context_getter.h" 22 #include "url/gurl.h" 23 24 namespace dom_distiller { 25 26 class DistillerImpl; 27 28 class Distiller { 29 public: 30 typedef base::Callback<void(scoped_ptr<DistilledArticleProto>)> 31 DistillationFinishedCallback; 32 typedef base::Callback<void(const ArticleDistillationUpdate&)> 33 DistillationUpdateCallback; 34 35 virtual ~Distiller() {} 36 37 // Distills a page, and asynchronously returns the article HTML to the 38 // supplied |finished_cb| callback. |update_cb| is invoked whenever article 39 // under distillation is updated with more data. 40 // E.g. when distilling a 2 page article, |update_cb| may be invoked each time 41 // a distilled page is added and |finished_cb| will be invoked once 42 // distillation is completed. 43 virtual void DistillPage(const GURL& url, 44 scoped_ptr<DistillerPage> distiller_page, 45 const DistillationFinishedCallback& finished_cb, 46 const DistillationUpdateCallback& update_cb) = 0; 47 }; 48 49 class DistillerFactory { 50 public: 51 virtual scoped_ptr<Distiller> CreateDistiller() = 0; 52 virtual ~DistillerFactory() {} 53 }; 54 55 // Factory for creating a Distiller. 56 class DistillerFactoryImpl : public DistillerFactory { 57 public: 58 DistillerFactoryImpl( 59 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, 60 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options); 61 virtual ~DistillerFactoryImpl(); 62 virtual scoped_ptr<Distiller> CreateDistiller() OVERRIDE; 63 64 private: 65 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; 66 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; 67 }; 68 69 // Distills a article from a page and associated pages. 70 class DistillerImpl : public Distiller { 71 public: 72 DistillerImpl( 73 const DistillerURLFetcherFactory& distiller_url_fetcher_factory, 74 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options); 75 virtual ~DistillerImpl(); 76 77 virtual void DistillPage( 78 const GURL& url, 79 scoped_ptr<DistillerPage> distiller_page, 80 const DistillationFinishedCallback& finished_cb, 81 const DistillationUpdateCallback& update_cb) OVERRIDE; 82 83 void SetMaxNumPagesInArticle(size_t max_num_pages); 84 85 private: 86 // In case of multiple pages, the Distiller maintains state of multiple pages 87 // as page numbers relative to the page number where distillation started. 88 // E.g. if distillation starts at page 2 for a 3 page article. The relative 89 // page numbers assigned to pages will be [-1,0,1]. 90 91 // Class representing the state of a page under distillation. 92 struct DistilledPageData { 93 DistilledPageData(); 94 virtual ~DistilledPageData(); 95 // Relative page number of the page. 96 int page_num; 97 ScopedVector<DistillerURLFetcher> image_fetchers_; 98 scoped_refptr<base::RefCountedData<DistilledPageProto> > 99 distilled_page_proto; 100 101 private: 102 DISALLOW_COPY_AND_ASSIGN(DistilledPageData); 103 }; 104 105 void OnFetchImageDone(int page_num, 106 DistillerURLFetcher* url_fetcher, 107 const std::string& id, 108 const std::string& response); 109 110 void OnPageDistillationFinished( 111 int page_num, 112 const GURL& page_url, 113 scoped_ptr<proto::DomDistillerResult> distilled_page, 114 bool distillation_successful); 115 116 virtual void FetchImage(int page_num, 117 const std::string& image_id, 118 const std::string& item); 119 120 // Distills the next page. 121 void DistillNextPage(); 122 123 // Adds the |url| to |pages_to_be_distilled| if |page_num| is a valid relative 124 // page number and |url| is valid. Ignores duplicate pages and urls. 125 void AddToDistillationQueue(int page_num, const GURL& url); 126 127 // Check if |page_num| is a valid relative page number, i.e. page with 128 // |page_num| is either under distillation or has already completed 129 // distillation. 130 bool IsPageNumberInUse(int page_num) const; 131 132 bool AreAllPagesFinished() const; 133 134 // Total number of pages in the article that the distiller knows of, this 135 // includes pages that are pending distillation. 136 size_t TotalPageCount() const; 137 138 // Runs |finished_cb_| if all distillation callbacks and image fetches are 139 // complete. 140 void RunDistillerCallbackIfDone(); 141 142 // Checks if page |distilled_page_data| has finished distillation, including 143 // all image fetches. 144 void AddPageIfDone(int page_num); 145 146 DistilledPageData* GetPageAtIndex(size_t index) const; 147 148 // Create an ArticleDistillationUpdate for the current distillation 149 // state. 150 const ArticleDistillationUpdate CreateDistillationUpdate() const; 151 152 const DistillerURLFetcherFactory& distiller_url_fetcher_factory_; 153 scoped_ptr<DistillerPage> distiller_page_; 154 155 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; 156 DistillationFinishedCallback finished_cb_; 157 DistillationUpdateCallback update_cb_; 158 159 // Set of pages that are under distillation or have finished distillation. 160 // |started_pages_index_| and |finished_pages_index_| maintains the mapping 161 // from page number to the indices in |pages_|. 162 ScopedVector<DistilledPageData> pages_; 163 164 // Maps page numbers of finished pages to the indices in |pages_|. 165 std::map<int, size_t> finished_pages_index_; 166 167 // Maps page numbers of pages under distillation to the indices in |pages_|. 168 // If a page is |started_pages_| that means it is still waiting for an action 169 // (distillation or image fetch) to finish. 170 base::hash_map<int, size_t> started_pages_index_; 171 172 // The list of pages that are still waiting for distillation to start. 173 // This is a map, to make distiller prefer distilling lower page numbers 174 // first. 175 std::map<int, GURL> waiting_pages_; 176 177 // Set to keep track of which urls are already seen by the distiller. Used to 178 // prevent distiller from distilling the same url twice. 179 base::hash_set<std::string> seen_urls_; 180 181 size_t max_pages_in_article_; 182 183 bool destruction_allowed_; 184 185 base::WeakPtrFactory<DistillerImpl> weak_factory_; 186 187 DISALLOW_COPY_AND_ASSIGN(DistillerImpl); 188 }; 189 190 } // namespace dom_distiller 191 192 #endif // COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_ 193