1 // Copyright 2014 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <sstream> 6 7 #include "base/command_line.h" 8 #include "base/files/scoped_temp_dir.h" 9 #include "base/message_loop/message_loop.h" 10 #include "base/path_service.h" 11 #include "base/run_loop.h" 12 #include "base/strings/string_number_conversions.h" 13 #include "base/strings/string_split.h" 14 #include "components/dom_distiller/content/distiller_page_web_contents.h" 15 #include "components/dom_distiller/core/article_entry.h" 16 #include "components/dom_distiller/core/distilled_page_prefs.h" 17 #include "components/dom_distiller/core/distiller.h" 18 #include "components/dom_distiller/core/dom_distiller_service.h" 19 #include "components/dom_distiller/core/dom_distiller_store.h" 20 #include "components/dom_distiller/core/proto/distilled_article.pb.h" 21 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 22 #include "components/dom_distiller/core/task_tracker.h" 23 #include "components/leveldb_proto/proto_database.h" 24 #include "components/leveldb_proto/proto_database_impl.h" 25 #include "components/pref_registry/testing_pref_service_syncable.h" 26 #include "content/public/browser/browser_context.h" 27 #include "content/public/browser/browser_thread.h" 28 #include "content/public/test/content_browser_test.h" 29 #include "content/shell/browser/shell.h" 30 #include "google/protobuf/io/coded_stream.h" 31 #include "google/protobuf/io/zero_copy_stream_impl_lite.h" 32 #include "net/dns/mock_host_resolver.h" 33 #include "third_party/dom_distiller_js/dom_distiller.pb.h" 34 #include "ui/base/resource/resource_bundle.h" 35 36 using content::ContentBrowserTest; 37 38 namespace dom_distiller { 39 40 namespace { 41 42 // The url to distill. 43 const char* kUrlSwitch = "url"; 44 45 // A space-separated list of urls to distill. 46 const char* kUrlsSwitch = "urls"; 47 48 // Indicates that DNS resolution should be disabled for this test. 49 const char* kDisableDnsSwitch = "disable-dns"; 50 51 // Will write the distilled output to the given file instead of to stdout. 52 const char* kOutputFile = "output-file"; 53 54 // Indicates to output a serialized protocol buffer instead of human-readable 55 // output. 56 const char* kShouldOutputBinary = "output-binary"; 57 58 // Indicates to output only the text of the article and not the enclosing html. 59 const char* kExtractTextOnly = "extract-text-only"; 60 61 // Indicates to include debug output. 62 const char* kDebugLevel = "debug-level"; 63 64 // Maximum number of concurrent started extractor requests. 65 const int kMaxExtractorTasks = 8; 66 67 scoped_ptr<DomDistillerService> CreateDomDistillerService( 68 content::BrowserContext* context, 69 const base::FilePath& db_path) { 70 scoped_refptr<base::SequencedTaskRunner> background_task_runner = 71 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( 72 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); 73 74 // TODO(cjhopman): use an in-memory database instead of an on-disk one with 75 // temporary directory. 76 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( 77 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( 78 background_task_runner)); 79 scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore( 80 db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path)); 81 82 scoped_ptr<DistillerPageFactory> distiller_page_factory( 83 new DistillerPageWebContentsFactory(context)); 84 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory( 85 new DistillerURLFetcherFactory(context->GetRequestContext())); 86 87 dom_distiller::proto::DomDistillerOptions options; 88 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) { 89 options.set_extract_text_only(true); 90 } 91 int debug_level = 0; 92 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && 93 base::StringToInt( 94 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( 95 kDebugLevel), 96 &debug_level)) { 97 options.set_debug_level(debug_level); 98 } 99 scoped_ptr<DistillerFactory> distiller_factory( 100 new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); 101 102 // Setting up PrefService for DistilledPagePrefs. 103 user_prefs::TestingPrefServiceSyncable* pref_service = 104 new user_prefs::TestingPrefServiceSyncable(); 105 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); 106 107 return scoped_ptr<DomDistillerService>(new DomDistillerService( 108 dom_distiller_store.PassAs<DomDistillerStoreInterface>(), 109 distiller_factory.Pass(), 110 distiller_page_factory.Pass(), 111 scoped_ptr<DistilledPagePrefs>( 112 new DistilledPagePrefs(pref_service)))); 113 } 114 115 void AddComponentsResources() { 116 base::FilePath pak_file; 117 base::FilePath pak_dir; 118 PathService::Get(base::DIR_MODULE, &pak_dir); 119 pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak")); 120 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath( 121 pak_file, ui::SCALE_FACTOR_NONE); 122 } 123 124 bool WriteProtobufWithSize( 125 const google::protobuf::MessageLite& message, 126 google::protobuf::io::ZeroCopyOutputStream* output_stream) { 127 google::protobuf::io::CodedOutputStream coded_output(output_stream); 128 129 // Write the size. 130 const int size = message.ByteSize(); 131 coded_output.WriteLittleEndian32(size); 132 message.SerializeWithCachedSizes(&coded_output); 133 return !coded_output.HadError(); 134 } 135 136 std::string GetReadableArticleString( 137 const DistilledArticleProto& article_proto) { 138 std::stringstream output; 139 output << "Article Title: " << article_proto.title() << std::endl; 140 output << "# of pages: " << article_proto.pages_size() << std::endl; 141 for (int i = 0; i < article_proto.pages_size(); ++i) { 142 const DistilledPageProto& page = article_proto.pages(i); 143 output << "Page " << i << std::endl; 144 output << "URL: " << page.url() << std::endl; 145 output << "Content: " << page.html() << std::endl; 146 if (page.has_debug_info() && page.debug_info().has_log()) 147 output << "Log: " << page.debug_info().log() << std::endl; 148 } 149 return output.str(); 150 } 151 152 } // namespace 153 154 class ContentExtractionRequest : public ViewRequestDelegate { 155 public: 156 void Start(DomDistillerService* service, const gfx::Size& render_view_size, 157 base::Closure finished_callback) { 158 finished_callback_ = finished_callback; 159 viewer_handle_ = 160 service->ViewUrl(this, 161 service->CreateDefaultDistillerPage(render_view_size), 162 url_); 163 } 164 165 DistilledArticleProto GetArticleCopy() { 166 return *article_proto_; 167 } 168 169 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( 170 const CommandLine& command_line) { 171 ScopedVector<ContentExtractionRequest> requests; 172 if (command_line.HasSwitch(kUrlSwitch)) { 173 GURL url; 174 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); 175 url = GURL(url_string); 176 if (url.is_valid()) { 177 requests.push_back(new ContentExtractionRequest(url)); 178 } 179 } else if (command_line.HasSwitch(kUrlsSwitch)) { 180 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); 181 std::vector<std::string> urls; 182 base::SplitString(urls_string, ' ', &urls); 183 for (size_t i = 0; i < urls.size(); ++i) { 184 GURL url(urls[i]); 185 if (url.is_valid()) { 186 requests.push_back(new ContentExtractionRequest(url)); 187 } else { 188 ADD_FAILURE() << "Bad url"; 189 } 190 } 191 } 192 if (requests.empty()) { 193 ADD_FAILURE() << "No valid url provided"; 194 } 195 196 return requests.Pass(); 197 } 198 199 private: 200 ContentExtractionRequest(const GURL& url) : url_(url) {} 201 202 virtual void OnArticleUpdated(ArticleDistillationUpdate article_update) 203 OVERRIDE {} 204 205 virtual void OnArticleReady(const DistilledArticleProto* article_proto) 206 OVERRIDE { 207 article_proto_ = article_proto; 208 CHECK(article_proto->pages_size()) << "Failed extracting " << url_; 209 base::MessageLoop::current()->PostTask( 210 FROM_HERE, 211 finished_callback_); 212 } 213 214 const DistilledArticleProto* article_proto_; 215 scoped_ptr<ViewerHandle> viewer_handle_; 216 GURL url_; 217 base::Closure finished_callback_; 218 }; 219 220 class ContentExtractor : public ContentBrowserTest { 221 public: 222 ContentExtractor() 223 : pending_tasks_(0), 224 max_tasks_(kMaxExtractorTasks), 225 next_request_(0), 226 output_data_(), 227 protobuf_output_stream_( 228 new google::protobuf::io::StringOutputStream(&output_data_)) {} 229 230 // Change behavior of the default host resolver to avoid DNS lookup errors, so 231 // we can make network calls. 232 virtual void SetUpOnMainThread() OVERRIDE { 233 if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) { 234 EnableDNSLookupForThisTest(); 235 } 236 CHECK(db_dir_.CreateUniqueTempDir()); 237 AddComponentsResources(); 238 } 239 240 virtual void TearDownOnMainThread() OVERRIDE { 241 DisableDNSLookupForThisTest(); 242 } 243 244 protected: 245 // Creates the DomDistillerService and creates and starts the extraction 246 // request. 247 void Start() { 248 content::BrowserContext* context = 249 shell()->web_contents()->GetBrowserContext(); 250 service_ = CreateDomDistillerService(context, 251 db_dir_.path()); 252 const CommandLine& command_line = *CommandLine::ForCurrentProcess(); 253 requests_ = ContentExtractionRequest::CreateForCommandLine(command_line); 254 PumpQueue(); 255 } 256 257 void PumpQueue() { 258 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { 259 requests_[next_request_]->Start( 260 service_.get(), 261 shell()->web_contents()->GetContainerBounds().size(), 262 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); 263 ++next_request_; 264 ++pending_tasks_; 265 } 266 } 267 268 private: 269 // Change behavior of the default host resolver to allow DNS lookup 270 // to proceed instead of being blocked by the test infrastructure. 271 void EnableDNSLookupForThisTest() { 272 // mock_host_resolver_override_ takes ownership of the resolver. 273 scoped_refptr<net::RuleBasedHostResolverProc> resolver = 274 new net::RuleBasedHostResolverProc(host_resolver()); 275 resolver->AllowDirectLookup("*"); 276 mock_host_resolver_override_.reset( 277 new net::ScopedDefaultHostResolverProc(resolver.get())); 278 } 279 280 // We need to reset the DNS lookup when we finish, or the test will fail. 281 void DisableDNSLookupForThisTest() { 282 mock_host_resolver_override_.reset(); 283 } 284 285 void FinishRequest() { 286 --pending_tasks_; 287 if (next_request_ == requests_.size() && pending_tasks_ == 0) { 288 Finish(); 289 } else { 290 PumpQueue(); 291 } 292 } 293 294 void DoArticleOutput() { 295 for (size_t i = 0; i < requests_.size(); ++i) { 296 const DistilledArticleProto& article = requests_[i]->GetArticleCopy(); 297 if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) { 298 WriteProtobufWithSize(article, protobuf_output_stream_.get()); 299 } else { 300 output_data_ += GetReadableArticleString(article) + "\n"; 301 } 302 } 303 304 if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) { 305 base::FilePath filename = 306 CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile); 307 ASSERT_EQ( 308 (int)output_data_.size(), 309 base::WriteFile(filename, output_data_.c_str(), output_data_.size())); 310 } else { 311 VLOG(0) << output_data_; 312 } 313 } 314 315 void Finish() { 316 DoArticleOutput(); 317 requests_.clear(); 318 service_.reset(); 319 base::MessageLoop::current()->PostTask( 320 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure()); 321 } 322 323 size_t pending_tasks_; 324 size_t max_tasks_; 325 size_t next_request_; 326 327 base::ScopedTempDir db_dir_; 328 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_; 329 scoped_ptr<DomDistillerService> service_; 330 ScopedVector<ContentExtractionRequest> requests_; 331 332 std::string output_data_; 333 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; 334 }; 335 336 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { 337 Start(); 338 base::RunLoop().Run(); 339 } 340 341 } // namespace dom_distiller 342