Home | History | Annotate | Download | only in core
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
      6 #define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
      7 
      8 #include <map>
      9 #include <string>
     10 
     11 #include "base/callback.h"
     12 #include "base/containers/hash_tables.h"
     13 #include "base/memory/ref_counted.h"
     14 #include "base/memory/scoped_ptr.h"
     15 #include "base/memory/scoped_vector.h"
     16 #include "base/memory/weak_ptr.h"
     17 #include "components/dom_distiller/core/article_distillation_update.h"
     18 #include "components/dom_distiller/core/distiller_page.h"
     19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
     20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
     21 #include "net/url_request/url_request_context_getter.h"
     22 #include "url/gurl.h"
     23 
     24 namespace dom_distiller {
     25 
     26 class DistillerImpl;
     27 
        // Abstract interface for distilling an article starting from a page URL.
        // Implementations provide DistillPage(); results are delivered through the
        // two callback types declared below.
     28 class Distiller {
     29  public:
          // Callback type that receives the DistilledArticleProto, handed over in a
          // scoped_ptr.
     30   typedef base::Callback<void(scoped_ptr<DistilledArticleProto>)>
     31       DistillationFinishedCallback;
          // Callback type that receives a const reference to an
          // ArticleDistillationUpdate.
     32   typedef base::Callback<void(const ArticleDistillationUpdate&)>
     33       DistillationUpdateCallback;
     34 
     35   virtual ~Distiller() {}
     36 
     37   // Distills a page, and asynchronously returns the article HTML to the
     38   // supplied |finished_cb| callback. |update_cb| is invoked whenever the
     39   // article under distillation is updated with more data.
     40   // E.g. when distilling a 2 page article, |update_cb| may be invoked each time
     41   // a distilled page is added and |finished_cb| will be invoked once
     42   // distillation is completed.
          // |url| is the page to distill and |distiller_page| is passed in a
          // scoped_ptr.
     43   virtual void DistillPage(const GURL& url,
     44                            scoped_ptr<DistillerPage> distiller_page,
     45                            const DistillationFinishedCallback& finished_cb,
     46                            const DistillationUpdateCallback& update_cb) = 0;
     47 };
     48 
        // Abstract factory interface for obtaining Distiller instances.
     49 class DistillerFactory {
     50  public:
          // Returns a Distiller wrapped in a scoped_ptr.
     51   virtual scoped_ptr<Distiller> CreateDistiller() = 0;
     52   virtual ~DistillerFactory() {}
     53 };
     54 
     55 // Factory for creating a Distiller. Concrete implementation of the
        // DistillerFactory interface.
     56 class DistillerFactoryImpl : public DistillerFactory {
     57  public:
          // Takes a DistillerURLFetcherFactory in a scoped_ptr and a
          // DomDistillerOptions by const reference.
          // NOTE(review): presumably these initialize the members of the same
          // names below -- confirm in the .cc file.
     58   DistillerFactoryImpl(
     59       scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
     60       const dom_distiller::proto::DomDistillerOptions& dom_distiller_options);
     61   virtual ~DistillerFactoryImpl();
          // NOTE(review): the concrete Distiller type returned is not visible in
          // this header; presumably a DistillerImpl -- confirm in the .cc file.
     62   virtual scoped_ptr<Distiller> CreateDistiller() OVERRIDE;
     63 
     64  private:
          // URL fetcher factory held in a scoped_ptr by this object.
     65   scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
          // Distiller options stored by value.
     66   dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
     67 };
     68 
     69 // Distills an article from a page and associated pages.
        // Concrete implementation of the Distiller interface. Not copyable or
        // assignable (DISALLOW_COPY_AND_ASSIGN).
     70 class DistillerImpl : public Distiller {
     71  public:
          // NOTE(review): |distiller_url_fetcher_factory| is presumably bound to the
          // const-reference member |distiller_url_fetcher_factory_|, in which case
          // the factory must outlive this object -- confirm in the .cc file.
     72   DistillerImpl(
     73       const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
     74       const dom_distiller::proto::DomDistillerOptions& dom_distiller_options);
     75   virtual ~DistillerImpl();
     76 
          // Distiller implementation.
     77   virtual void DistillPage(
     78       const GURL& url,
     79       scoped_ptr<DistillerPage> distiller_page,
     80       const DistillationFinishedCallback& finished_cb,
     81       const DistillationUpdateCallback& update_cb) OVERRIDE;
     82 
          // NOTE(review): presumably updates |max_pages_in_article_| -- confirm in
          // the .cc file.
     83   void SetMaxNumPagesInArticle(size_t max_num_pages);
     84 
     85  private:
     86   // In case of multiple pages, the Distiller maintains state of multiple pages
     87   // as page numbers relative to the page number where distillation started.
     88   // E.g. if distillation starts at page 2 for a 3 page article, the relative
     89   // page numbers assigned to pages will be [-1,0,1].
     90 
     91   // Struct representing the state of a page under distillation.
          // Not copyable or assignable (DISALLOW_COPY_AND_ASSIGN).
     92   struct DistilledPageData {
     93     DistilledPageData();
     94     virtual ~DistilledPageData();
     95     // Relative page number of the page.
     96     int page_num;
            // URL fetchers held in a ScopedVector.
            // NOTE(review): presumably the image fetchers started for this page --
            // confirm in the .cc file.
     97     ScopedVector<DistillerURLFetcher> image_fetchers_;
            // Ref-counted holder of the DistilledPageProto for this page.
     98     scoped_refptr<base::RefCountedData<DistilledPageProto> >
     99         distilled_page_proto;
    100 
    101    private:
    102     DISALLOW_COPY_AND_ASSIGN(DistilledPageData);
    103   };
    104 
          // NOTE(review): presumably invoked when the image fetch performed by
          // |url_fetcher| for page |page_num| completes, with |id| identifying the
          // image and |response| holding the fetched data -- confirm in the .cc
          // file.
    105   void OnFetchImageDone(int page_num,
    106                         DistillerURLFetcher* url_fetcher,
    107                         const std::string& id,
    108                         const std::string& response);
    109 
          // NOTE(review): presumably invoked when distillation of the page at
          // |page_url| (relative page number |page_num|) ends;
          // |distillation_successful| reports the outcome -- confirm in the .cc
          // file.
    110   void OnPageDistillationFinished(int page_num,
    111                                   const GURL& page_url,
    112                                   scoped_ptr<DistilledPageInfo> distilled_page,
    113                                   bool distillation_successful);
    114 
          // NOTE(review): private but virtual; presumably so a subclass (e.g. in
          // tests) can override image fetching -- confirm against callers.
    115   virtual void FetchImage(int page_num,
    116                           const std::string& image_id,
    117                           const std::string& item);
    118 
    119   // Distills the next page.
    120   void DistillNextPage();
    121 
    122   // Adds the |url| to the distillation queue if |page_num| is a valid relative
    123   // page number and |url| is valid. Ignores duplicate pages and urls.
          // NOTE(review): this comment originally named |pages_to_be_distilled|,
          // which is not declared in this header; presumably the queue is
          // |waiting_pages_| -- confirm in the .cc file.
    124   void AddToDistillationQueue(int page_num, const GURL& url);
    125 
    126   // Check if |page_num| is a valid relative page number, i.e. page with
    127   // |page_num| is either under distillation or has already completed
    128   // distillation.
    129   bool IsPageNumberInUse(int page_num) const;
    130 
    131   bool AreAllPagesFinished() const;
    132 
    133   // Total number of pages in the article that the distiller knows of, this
    134   // includes pages that are pending distillation.
    135   size_t TotalPageCount() const;
    136 
    137   // Runs |finished_cb_| if all distillation callbacks and image fetches are
    138   // complete.
    139   void RunDistillerCallbackIfDone();
    140 
    141   // Checks if the page with relative page number |page_num| has finished
    142   // distillation, including all image fetches.
    143   void AddPageIfDone(int page_num);
    144 
          // NOTE(review): presumably |index| is an index into |pages_| -- confirm in
          // the .cc file.
    145   DistilledPageData* GetPageAtIndex(size_t index) const;
    146 
    147   // Create an ArticleDistillationUpdate for the current distillation
    148   // state.
    149   const ArticleDistillationUpdate CreateDistillationUpdate() const;
    150 
          // Held as a const reference, not owned: the referenced factory must
          // outlive this object.
    151   const DistillerURLFetcherFactory& distiller_url_fetcher_factory_;
          // NOTE(review): presumably the |distiller_page| passed to DistillPage() --
          // confirm in the .cc file.
    152   scoped_ptr<DistillerPage> distiller_page_;
    153 
    154   dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
    155   DistillationFinishedCallback finished_cb_;
    156   DistillationUpdateCallback update_cb_;
    157 
    158   // Set of pages that are under distillation or have finished distillation.
    159   // |started_pages_index_| and |finished_pages_index_| maintain the mapping
    160   // from page number to the indices in |pages_|.
    161   ScopedVector<DistilledPageData> pages_;
    162 
    163   // Maps page numbers of finished pages to the indices in |pages_|.
    164   std::map<int, size_t> finished_pages_index_;
    165 
    166   // Maps page numbers of pages under distillation to the indices in |pages_|.
    167   // If a page is in |started_pages_index_| that means it is still waiting for
    168   // an action (distillation or image fetch) to finish.
    169   base::hash_map<int, size_t> started_pages_index_;
    170 
    171   // The list of pages that are still waiting for distillation to start.
    172   // This is a map, to make distiller prefer distilling lower page numbers
    173   // first.
    174   std::map<int, GURL> waiting_pages_;
    175 
    176   // Set to keep track of which urls are already seen by the distiller. Used to
    177   // prevent distiller from distilling the same url twice.
    178   base::hash_set<std::string> seen_urls_;
    179 
          // NOTE(review): presumably the limit set by SetMaxNumPagesInArticle() --
          // confirm in the .cc file.
    180   size_t max_pages_in_article_;
    181 
          // NOTE(review): usage is not visible in this header; presumably guards
          // against destruction at an unsafe time -- confirm in the .cc file.
    182   bool destruction_allowed_;
    183 
          // Factory for weak pointers to this object; declared after all other data
          // members.
    184   base::WeakPtrFactory<DistillerImpl> weak_factory_;
    185 
    186   DISALLOW_COPY_AND_ASSIGN(DistillerImpl);
    187 };
    188 
    189 }  // namespace dom_distiller
    190 
    191 #endif  // COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
    192