Home | History | Annotate | Download | only in standalone
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <sstream>
      6 
      7 #include "base/command_line.h"
      8 #include "base/files/scoped_temp_dir.h"
      9 #include "base/message_loop/message_loop.h"
     10 #include "base/path_service.h"
     11 #include "base/run_loop.h"
     12 #include "base/strings/string_number_conversions.h"
     13 #include "base/strings/string_split.h"
     14 #include "components/dom_distiller/content/distiller_page_web_contents.h"
     15 #include "components/dom_distiller/core/article_entry.h"
     16 #include "components/dom_distiller/core/distilled_page_prefs.h"
     17 #include "components/dom_distiller/core/distiller.h"
     18 #include "components/dom_distiller/core/dom_distiller_service.h"
     19 #include "components/dom_distiller/core/dom_distiller_store.h"
     20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
     21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
     22 #include "components/dom_distiller/core/task_tracker.h"
     23 #include "components/leveldb_proto/proto_database.h"
     24 #include "components/leveldb_proto/proto_database_impl.h"
     25 #include "components/pref_registry/testing_pref_service_syncable.h"
     26 #include "content/public/browser/browser_context.h"
     27 #include "content/public/browser/browser_thread.h"
     28 #include "content/public/test/content_browser_test.h"
     29 #include "content/shell/browser/shell.h"
     30 #include "google/protobuf/io/coded_stream.h"
     31 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
     32 #include "net/dns/mock_host_resolver.h"
     33 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
     34 #include "ui/base/resource/resource_bundle.h"
     35 
     36 using content::ContentBrowserTest;
     37 
     38 namespace dom_distiller {
     39 
     40 namespace {
     41 
     42 // The url to distill.
     43 const char* kUrlSwitch = "url";
     44 
     45 // A space-separated list of urls to distill.
     46 const char* kUrlsSwitch = "urls";
     47 
     48 // Indicates that DNS resolution should be disabled for this test.
     49 const char* kDisableDnsSwitch = "disable-dns";
     50 
     51 // Will write the distilled output to the given file instead of to stdout.
     52 const char* kOutputFile = "output-file";
     53 
     54 // Indicates to output a serialized protocol buffer instead of human-readable
     55 // output.
     56 const char* kShouldOutputBinary = "output-binary";
     57 
     58 // Indicates to output only the text of the article and not the enclosing html.
     59 const char* kExtractTextOnly = "extract-text-only";
     60 
     61 // Indicates to include debug output.
     62 const char* kDebugLevel = "debug-level";
     63 
     64 // Maximum number of concurrent started extractor requests.
     65 const int kMaxExtractorTasks = 8;
     66 
     67 scoped_ptr<DomDistillerService> CreateDomDistillerService(
     68     content::BrowserContext* context,
     69     const base::FilePath& db_path) {
     70   scoped_refptr<base::SequencedTaskRunner> background_task_runner =
     71       content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
     72           content::BrowserThread::GetBlockingPool()->GetSequenceToken());
     73 
     74   // TODO(cjhopman): use an in-memory database instead of an on-disk one with
     75   // temporary directory.
     76   scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
     77       new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
     78           background_task_runner));
     79   scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
     80       db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path));
     81 
     82   scoped_ptr<DistillerPageFactory> distiller_page_factory(
     83       new DistillerPageWebContentsFactory(context));
     84   scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
     85       new DistillerURLFetcherFactory(context->GetRequestContext()));
     86 
     87   dom_distiller::proto::DomDistillerOptions options;
     88   if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
     89     options.set_extract_text_only(true);
     90   }
     91   int debug_level = 0;
     92   if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
     93       base::StringToInt(
     94           base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
     95               kDebugLevel),
     96           &debug_level)) {
     97     options.set_debug_level(debug_level);
     98   }
     99   scoped_ptr<DistillerFactory> distiller_factory(
    100       new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options));
    101 
    102   // Setting up PrefService for DistilledPagePrefs.
    103   user_prefs::TestingPrefServiceSyncable* pref_service =
    104       new user_prefs::TestingPrefServiceSyncable();
    105   DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
    106 
    107   return scoped_ptr<DomDistillerService>(new DomDistillerService(
    108       dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
    109       distiller_factory.Pass(),
    110       distiller_page_factory.Pass(),
    111       scoped_ptr<DistilledPagePrefs>(
    112           new DistilledPagePrefs(pref_service))));
    113 }
    114 
    115 void AddComponentsResources() {
    116   base::FilePath pak_file;
    117   base::FilePath pak_dir;
    118   PathService::Get(base::DIR_MODULE, &pak_dir);
    119   pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak"));
    120   ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
    121       pak_file, ui::SCALE_FACTOR_NONE);
    122 }
    123 
    124 bool WriteProtobufWithSize(
    125     const google::protobuf::MessageLite& message,
    126     google::protobuf::io::ZeroCopyOutputStream* output_stream) {
    127   google::protobuf::io::CodedOutputStream coded_output(output_stream);
    128 
    129   // Write the size.
    130   const int size = message.ByteSize();
    131   coded_output.WriteLittleEndian32(size);
    132   message.SerializeWithCachedSizes(&coded_output);
    133   return !coded_output.HadError();
    134 }
    135 
    136 std::string GetReadableArticleString(
    137     const DistilledArticleProto& article_proto) {
    138   std::stringstream output;
    139   output << "Article Title: " << article_proto.title() << std::endl;
    140   output << "# of pages: " << article_proto.pages_size() << std::endl;
    141   for (int i = 0; i < article_proto.pages_size(); ++i) {
    142     const DistilledPageProto& page = article_proto.pages(i);
    143     output << "Page " << i << std::endl;
    144     output << "URL: " << page.url() << std::endl;
    145     output << "Content: " << page.html() << std::endl;
    146     if (page.has_debug_info() && page.debug_info().has_log())
    147       output << "Log: " << page.debug_info().log() << std::endl;
    148   }
    149   return output.str();
    150 }
    151 
    152 }  // namespace
    153 
    154 class ContentExtractionRequest : public ViewRequestDelegate {
    155  public:
    156   void Start(DomDistillerService* service, const gfx::Size& render_view_size,
    157              base::Closure finished_callback) {
    158     finished_callback_ = finished_callback;
    159     viewer_handle_ =
    160         service->ViewUrl(this,
    161                          service->CreateDefaultDistillerPage(render_view_size),
    162                          url_);
    163   }
    164 
    165   DistilledArticleProto GetArticleCopy() {
    166     return *article_proto_;
    167   }
    168 
    169   static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
    170       const CommandLine& command_line) {
    171     ScopedVector<ContentExtractionRequest> requests;
    172     if (command_line.HasSwitch(kUrlSwitch)) {
    173       GURL url;
    174       std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
    175       url = GURL(url_string);
    176       if (url.is_valid()) {
    177         requests.push_back(new ContentExtractionRequest(url));
    178       }
    179     } else if (command_line.HasSwitch(kUrlsSwitch)) {
    180       std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
    181       std::vector<std::string> urls;
    182       base::SplitString(urls_string, ' ', &urls);
    183       for (size_t i = 0; i < urls.size(); ++i) {
    184         GURL url(urls[i]);
    185         if (url.is_valid()) {
    186           requests.push_back(new ContentExtractionRequest(url));
    187         } else {
    188           ADD_FAILURE() << "Bad url";
    189         }
    190       }
    191     }
    192     if (requests.empty()) {
    193       ADD_FAILURE() << "No valid url provided";
    194     }
    195 
    196     return requests.Pass();
    197   }
    198 
    199  private:
    200   ContentExtractionRequest(const GURL& url) : url_(url) {}
    201 
    202   virtual void OnArticleUpdated(ArticleDistillationUpdate article_update)
    203       OVERRIDE {}
    204 
    205   virtual void OnArticleReady(const DistilledArticleProto* article_proto)
    206       OVERRIDE {
    207     article_proto_ = article_proto;
    208     CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
    209     base::MessageLoop::current()->PostTask(
    210         FROM_HERE,
    211         finished_callback_);
    212   }
    213 
    214   const DistilledArticleProto* article_proto_;
    215   scoped_ptr<ViewerHandle> viewer_handle_;
    216   GURL url_;
    217   base::Closure finished_callback_;
    218 };
    219 
    220 class ContentExtractor : public ContentBrowserTest {
    221  public:
    222   ContentExtractor()
    223       : pending_tasks_(0),
    224         max_tasks_(kMaxExtractorTasks),
    225         next_request_(0),
    226         output_data_(),
    227         protobuf_output_stream_(
    228             new google::protobuf::io::StringOutputStream(&output_data_)) {}
    229 
    230   // Change behavior of the default host resolver to avoid DNS lookup errors, so
    231   // we can make network calls.
    232   virtual void SetUpOnMainThread() OVERRIDE {
    233     if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
    234       EnableDNSLookupForThisTest();
    235     }
    236     CHECK(db_dir_.CreateUniqueTempDir());
    237     AddComponentsResources();
    238   }
    239 
    240   virtual void TearDownOnMainThread() OVERRIDE {
    241     DisableDNSLookupForThisTest();
    242   }
    243 
    244  protected:
    245   // Creates the DomDistillerService and creates and starts the extraction
    246   // request.
    247   void Start() {
    248     content::BrowserContext* context =
    249         shell()->web_contents()->GetBrowserContext();
    250     service_ = CreateDomDistillerService(context,
    251                                          db_dir_.path());
    252     const CommandLine& command_line = *CommandLine::ForCurrentProcess();
    253     requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
    254     PumpQueue();
    255   }
    256 
    257   void PumpQueue() {
    258     while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
    259       requests_[next_request_]->Start(
    260           service_.get(),
    261           shell()->web_contents()->GetContainerBounds().size(),
    262           base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
    263       ++next_request_;
    264       ++pending_tasks_;
    265     }
    266   }
    267 
    268  private:
    269   // Change behavior of the default host resolver to allow DNS lookup
    270   // to proceed instead of being blocked by the test infrastructure.
    271   void EnableDNSLookupForThisTest() {
    272     // mock_host_resolver_override_ takes ownership of the resolver.
    273     scoped_refptr<net::RuleBasedHostResolverProc> resolver =
    274         new net::RuleBasedHostResolverProc(host_resolver());
    275     resolver->AllowDirectLookup("*");
    276     mock_host_resolver_override_.reset(
    277         new net::ScopedDefaultHostResolverProc(resolver.get()));
    278   }
    279 
    280   // We need to reset the DNS lookup when we finish, or the test will fail.
    281   void DisableDNSLookupForThisTest() {
    282     mock_host_resolver_override_.reset();
    283   }
    284 
    285   void FinishRequest() {
    286     --pending_tasks_;
    287     if (next_request_ == requests_.size() && pending_tasks_ == 0) {
    288       Finish();
    289     } else {
    290       PumpQueue();
    291     }
    292   }
    293 
    294   void DoArticleOutput() {
    295     for (size_t i = 0; i < requests_.size(); ++i) {
    296       const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
    297       if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
    298         WriteProtobufWithSize(article, protobuf_output_stream_.get());
    299       } else {
    300         output_data_ += GetReadableArticleString(article) + "\n";
    301       }
    302     }
    303 
    304     if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
    305       base::FilePath filename =
    306           CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
    307       ASSERT_EQ(
    308           (int)output_data_.size(),
    309           base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
    310     } else {
    311       VLOG(0) << output_data_;
    312     }
    313   }
    314 
    315   void Finish() {
    316     DoArticleOutput();
    317     requests_.clear();
    318     service_.reset();
    319     base::MessageLoop::current()->PostTask(
    320         FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
    321   }
    322 
    323   size_t pending_tasks_;
    324   size_t max_tasks_;
    325   size_t next_request_;
    326 
    327   base::ScopedTempDir db_dir_;
    328   scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
    329   scoped_ptr<DomDistillerService> service_;
    330   ScopedVector<ContentExtractionRequest> requests_;
    331 
    332   std::string output_data_;
    333   scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
    334 };
    335 
    336 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
    337   Start();
    338   base::RunLoop().Run();
    339 }
    340 
    341 }  // namespace dom_distiller
    342