1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h" 6 7 #include <map> 8 #include <utility> 9 10 #include "base/bind.h" 11 #include "base/bind_helpers.h" 12 #include "base/format_macros.h" 13 #include "base/stl_util.h" 14 #include "base/strings/stringprintf.h" 15 #include "base/time/time.h" 16 #include "chrome/browser/common/cancelable_request.h" 17 #include "chrome/browser/history/history_service.h" 18 #include "chrome/browser/history/history_service_factory.h" 19 #include "chrome/browser/history/history_types.h" 20 #include "chrome/browser/profiles/profile.h" 21 #include "chrome/browser/safe_browsing/browser_features.h" 22 #include "chrome/browser/safe_browsing/client_side_detection_host.h" 23 #include "chrome/browser/safe_browsing/database_manager.h" 24 #include "chrome/common/safe_browsing/csd.pb.h" 25 #include "content/public/browser/browser_thread.h" 26 #include "content/public/browser/navigation_controller.h" 27 #include "content/public/browser/navigation_entry.h" 28 #include "content/public/browser/web_contents.h" 29 #include "content/public/common/page_transition_types.h" 30 #include "url/gurl.h" 31 32 using content::BrowserThread; 33 using content::NavigationController; 34 using content::NavigationEntry; 35 using content::WebContents; 36 37 namespace safe_browsing { 38 39 namespace { 40 41 const int kMaxMalwareIPPerRequest = 5; 42 43 void FilterBenignIpsOnIOThread( 44 scoped_refptr<SafeBrowsingDatabaseManager> database_manager, 45 IPUrlMap* ips) { 46 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 47 for (IPUrlMap::iterator it = ips->begin(); it != ips->end();) { 48 if (!database_manager.get() || 49 !database_manager->MatchMalwareIP(it->first)) { 50 // it++ here returns a copy of the old iterator and passes it to erase. 51 ips->erase(it++); 52 } else { 53 ++it; 54 } 55 } 56 } 57 } // namespace 58 59 IPUrlInfo::IPUrlInfo(const std::string& url, 60 const std::string& method, 61 const std::string& referrer, 62 const ResourceType::Type& resource_type) 63 : url(url), 64 method(method), 65 referrer(referrer), 66 resource_type(resource_type) { 67 } 68 69 IPUrlInfo::~IPUrlInfo() {} 70 71 BrowseInfo::BrowseInfo() : http_status_code(0) {} 72 73 BrowseInfo::~BrowseInfo() {} 74 75 static void AddFeature(const std::string& feature_name, 76 double feature_value, 77 ClientPhishingRequest* request) { 78 DCHECK(request); 79 ClientPhishingRequest::Feature* feature = 80 request->add_non_model_feature_map(); 81 feature->set_name(feature_name); 82 feature->set_value(feature_value); 83 VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value(); 84 } 85 86 static void AddMalwareIpUrlInfo(const std::string& ip, 87 const std::vector<IPUrlInfo>& meta_infos, 88 ClientMalwareRequest* request) { 89 DCHECK(request); 90 for (std::vector<IPUrlInfo>::const_iterator it = meta_infos.begin(); 91 it != meta_infos.end(); ++it) { 92 ClientMalwareRequest::UrlInfo* urlinfo = 93 request->add_bad_ip_url_info(); 94 // We add the information about url on the bad ip. 95 urlinfo->set_ip(ip); 96 urlinfo->set_url(it->url); 97 urlinfo->set_method(it->method); 98 urlinfo->set_referrer(it->referrer); 99 urlinfo->set_resource_type(static_cast<int>(it->resource_type)); 100 } 101 DVLOG(2) << "Added url info for bad ip: " << ip; 102 } 103 104 static void AddNavigationFeatures( 105 const std::string& feature_prefix, 106 const NavigationController& controller, 107 int index, 108 const std::vector<GURL>& redirect_chain, 109 ClientPhishingRequest* request) { 110 NavigationEntry* entry = controller.GetEntryAtIndex(index); 111 bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure(); 112 if (!is_secure_referrer) { 113 AddFeature(base::StringPrintf("%s%s=%s", 114 feature_prefix.c_str(), 115 features::kReferrer, 116 entry->GetReferrer().url.spec().c_str()), 117 1.0, 118 request); 119 } 120 AddFeature(feature_prefix + features::kHasSSLReferrer, 121 is_secure_referrer ? 1.0 : 0.0, 122 request); 123 AddFeature(feature_prefix + features::kPageTransitionType, 124 static_cast<double>( 125 content::PageTransitionStripQualifier( 126 entry->GetTransitionType())), 127 request); 128 AddFeature(feature_prefix + features::kIsFirstNavigation, 129 index == 0 ? 1.0 : 0.0, 130 request); 131 // Redirect chain should always be at least of size one, as the rendered 132 // url is the last element in the chain. 133 if (redirect_chain.empty()) { 134 NOTREACHED(); 135 return; 136 } 137 if (redirect_chain.back() != entry->GetURL()) { 138 // I originally had this as a DCHECK but I saw a failure once that I 139 // can't reproduce. It looks like it might be related to the 140 // navigation controller only keeping a limited number of navigation 141 // events. For now we'll just attach a feature specifying that this is 142 // a mismatch and try and figure out what to do with it on the server. 143 DLOG(WARNING) << "Expected:" << entry->GetURL() 144 << " Actual:" << redirect_chain.back(); 145 AddFeature(feature_prefix + features::kRedirectUrlMismatch, 146 1.0, 147 request); 148 return; 149 } 150 // We skip the last element since it should just be the current url. 151 for (size_t i = 0; i < redirect_chain.size() - 1; i++) { 152 std::string printable_redirect = redirect_chain[i].spec(); 153 if (redirect_chain[i].SchemeIsSecure()) { 154 printable_redirect = features::kSecureRedirectValue; 155 } 156 AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s", 157 feature_prefix.c_str(), 158 features::kRedirect, 159 i, 160 printable_redirect.c_str()), 161 1.0, 162 request); 163 } 164 } 165 166 BrowserFeatureExtractor::BrowserFeatureExtractor( 167 WebContents* tab, 168 ClientSideDetectionHost* host) 169 : tab_(tab), 170 host_(host), 171 weak_factory_(this) { 172 DCHECK(tab); 173 } 174 175 BrowserFeatureExtractor::~BrowserFeatureExtractor() { 176 weak_factory_.InvalidateWeakPtrs(); 177 // Delete all the pending extractions (delete callback and request objects). 178 STLDeleteContainerPairFirstPointers(pending_extractions_.begin(), 179 pending_extractions_.end()); 180 181 // Also cancel all the pending history service queries. 182 HistoryService* history; 183 bool success = GetHistoryService(&history); 184 DCHECK(success || pending_queries_.size() == 0); 185 // Cancel all the pending history lookups and cleanup the memory. 186 for (PendingQueriesMap::iterator it = pending_queries_.begin(); 187 it != pending_queries_.end(); ++it) { 188 if (history) { 189 history->CancelRequest(it->first); 190 } 191 ExtractionData& extraction = it->second; 192 delete extraction.first; // delete request 193 } 194 pending_queries_.clear(); 195 } 196 197 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info, 198 ClientPhishingRequest* request, 199 const DoneCallback& callback) { 200 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 201 DCHECK(request); 202 DCHECK(info); 203 DCHECK_EQ(0U, request->url().find("http:")); 204 DCHECK(!callback.is_null()); 205 // Extract features pertaining to this navigation. 206 const NavigationController& controller = tab_->GetController(); 207 int url_index = -1; 208 int first_host_index = -1; 209 210 GURL request_url(request->url()); 211 int index = controller.GetCurrentEntryIndex(); 212 // The url that we are extracting features for should already be commited. 213 DCHECK_NE(index, -1); 214 for (; index >= 0; index--) { 215 NavigationEntry* entry = controller.GetEntryAtIndex(index); 216 if (url_index == -1 && entry->GetURL() == request_url) { 217 // It's possible that we've been on the on the possibly phishy url before 218 // in this tab, so make sure that we use the latest navigation for 219 // features. 220 // Note that it's possible that the url_index should always be the 221 // latest entry, but I'm worried about possible races during a navigation 222 // and transient entries (i.e. interstiatials) so for now we will just 223 // be cautious. 224 url_index = index; 225 } else if (index < url_index) { 226 if (entry->GetURL().host() == request_url.host()) { 227 first_host_index = index; 228 } else { 229 // We have found the possibly phishing url, but we are no longer on the 230 // host. No reason to look back any further. 231 break; 232 } 233 } 234 } 235 236 // Add features pertaining to how we got to 237 // 1) The candidate url 238 // 2) The first url on the same host as the candidate url (assuming that 239 // it's different from the candidate url). 240 if (url_index != -1) { 241 AddNavigationFeatures( 242 std::string(), controller, url_index, info->url_redirects, request); 243 } 244 if (first_host_index != -1) { 245 AddNavigationFeatures(features::kHostPrefix, 246 controller, 247 first_host_index, 248 info->host_redirects, 249 request); 250 } 251 252 ExtractBrowseInfoFeatures(*info, request); 253 pending_extractions_[request] = callback; 254 base::MessageLoop::current()->PostTask( 255 FROM_HERE, 256 base::Bind(&BrowserFeatureExtractor::StartExtractFeatures, 257 weak_factory_.GetWeakPtr(), request, callback)); 258 } 259 260 void BrowserFeatureExtractor::ExtractMalwareFeatures( 261 BrowseInfo* info, 262 ClientMalwareRequest* request, 263 const MalwareDoneCallback& callback) { 264 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 265 DCHECK_EQ(0U, request->url().find("http:")); 266 DCHECK(!callback.is_null()); 267 268 // Grab the IPs because they might go away before we're done 269 // checking them against the IP blacklist on the IO thread. 270 scoped_ptr<IPUrlMap> ips(new IPUrlMap); 271 ips->swap(info->ips); 272 273 IPUrlMap* ips_ptr = ips.get(); 274 275 // The API doesn't take a scoped_ptr because the API gets mocked and we 276 // cannot mock an API that takes scoped_ptr as arguments. 277 scoped_ptr<ClientMalwareRequest> req(request); 278 279 // IP blacklist lookups have to happen on the IO thread. 280 BrowserThread::PostTaskAndReply( 281 BrowserThread::IO, 282 FROM_HERE, 283 base::Bind(&FilterBenignIpsOnIOThread, 284 host_->database_manager(), 285 ips_ptr), 286 base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures, 287 weak_factory_.GetWeakPtr(), 288 base::Passed(&ips), callback, base::Passed(&req))); 289 } 290 291 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures( 292 const BrowseInfo& info, 293 ClientPhishingRequest* request) { 294 if (info.unsafe_resource.get()) { 295 // A SafeBrowsing interstitial was shown for the current URL. 296 AddFeature(features::kSafeBrowsingMaliciousUrl + 297 info.unsafe_resource->url.spec(), 298 1.0, 299 request); 300 AddFeature(features::kSafeBrowsingOriginalUrl + 301 info.unsafe_resource->original_url.spec(), 302 1.0, 303 request); 304 AddFeature(features::kSafeBrowsingIsSubresource, 305 info.unsafe_resource->is_subresource ? 1.0 : 0.0, 306 request); 307 AddFeature(features::kSafeBrowsingThreatType, 308 static_cast<double>(info.unsafe_resource->threat_type), 309 request); 310 } 311 if (info.http_status_code != 0) { 312 AddFeature(features::kHttpStatusCode, info.http_status_code, request); 313 } 314 } 315 316 void BrowserFeatureExtractor::StartExtractFeatures( 317 ClientPhishingRequest* request, 318 const DoneCallback& callback) { 319 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 320 size_t removed = pending_extractions_.erase(request); 321 DCHECK_EQ(1U, removed); 322 HistoryService* history; 323 if (!request || !request->IsInitialized() || !GetHistoryService(&history)) { 324 callback.Run(false, request); 325 return; 326 } 327 CancelableRequestProvider::Handle handle = history->QueryURL( 328 GURL(request->url()), 329 true /* wants_visits */, 330 &request_consumer_, 331 base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone, 332 base::Unretained(this))); 333 334 StorePendingQuery(handle, request, callback); 335 } 336 337 void BrowserFeatureExtractor::QueryUrlHistoryDone( 338 CancelableRequestProvider::Handle handle, 339 bool success, 340 const history::URLRow* row, 341 history::VisitVector* visits) { 342 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 343 ClientPhishingRequest* request; 344 DoneCallback callback; 345 if (!GetPendingQuery(handle, &request, &callback)) { 346 DLOG(FATAL) << "No pending history query found"; 347 return; 348 } 349 DCHECK(request); 350 DCHECK(!callback.is_null()); 351 if (!success) { 352 // URL is not found in the history. In practice this should not 353 // happen (unless there is a real error) because we just visited 354 // that URL. 355 callback.Run(false, request); 356 return; 357 } 358 AddFeature(features::kUrlHistoryVisitCount, 359 static_cast<double>(row->visit_count()), 360 request); 361 362 base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1); 363 int num_visits_24h_ago = 0; 364 int num_visits_typed = 0; 365 int num_visits_link = 0; 366 for (history::VisitVector::const_iterator it = visits->begin(); 367 it != visits->end(); ++it) { 368 if (!content::PageTransitionIsMainFrame(it->transition)) { 369 continue; 370 } 371 if (it->visit_time < threshold) { 372 ++num_visits_24h_ago; 373 } 374 content::PageTransition transition = content::PageTransitionStripQualifier( 375 it->transition); 376 if (transition == content::PAGE_TRANSITION_TYPED) { 377 ++num_visits_typed; 378 } else if (transition == content::PAGE_TRANSITION_LINK) { 379 ++num_visits_link; 380 } 381 } 382 AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo, 383 static_cast<double>(num_visits_24h_ago), 384 request); 385 AddFeature(features::kUrlHistoryTypedCount, 386 static_cast<double>(num_visits_typed), 387 request); 388 AddFeature(features::kUrlHistoryLinkCount, 389 static_cast<double>(num_visits_link), 390 request); 391 392 // Issue next history lookup for host visits. 393 HistoryService* history; 394 if (!GetHistoryService(&history)) { 395 callback.Run(false, request); 396 return; 397 } 398 CancelableRequestProvider::Handle next_handle = 399 history->GetVisibleVisitCountToHost( 400 GURL(request->url()), 401 &request_consumer_, 402 base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone, 403 base::Unretained(this))); 404 StorePendingQuery(next_handle, request, callback); 405 } 406 407 void BrowserFeatureExtractor::QueryHttpHostVisitsDone( 408 CancelableRequestProvider::Handle handle, 409 bool success, 410 int num_visits, 411 base::Time first_visit) { 412 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 413 ClientPhishingRequest* request; 414 DoneCallback callback; 415 if (!GetPendingQuery(handle, &request, &callback)) { 416 DLOG(FATAL) << "No pending history query found"; 417 return; 418 } 419 DCHECK(request); 420 DCHECK(!callback.is_null()); 421 if (!success) { 422 callback.Run(false, request); 423 return; 424 } 425 SetHostVisitsFeatures(num_visits, first_visit, true, request); 426 427 // Same lookup but for the HTTPS URL. 428 HistoryService* history; 429 if (!GetHistoryService(&history)) { 430 callback.Run(false, request); 431 return; 432 } 433 std::string https_url = request->url(); 434 CancelableRequestProvider::Handle next_handle = 435 history->GetVisibleVisitCountToHost( 436 GURL(https_url.replace(0, 5, "https:")), 437 &request_consumer_, 438 base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone, 439 base::Unretained(this))); 440 StorePendingQuery(next_handle, request, callback); 441 } 442 443 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone( 444 CancelableRequestProvider::Handle handle, 445 bool success, 446 int num_visits, 447 base::Time first_visit) { 448 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 449 ClientPhishingRequest* request; 450 DoneCallback callback; 451 if (!GetPendingQuery(handle, &request, &callback)) { 452 DLOG(FATAL) << "No pending history query found"; 453 return; 454 } 455 DCHECK(request); 456 DCHECK(!callback.is_null()); 457 if (!success) { 458 callback.Run(false, request); 459 return; 460 } 461 SetHostVisitsFeatures(num_visits, first_visit, false, request); 462 callback.Run(true, request); // We're done with all the history lookups. 463 } 464 465 void BrowserFeatureExtractor::SetHostVisitsFeatures( 466 int num_visits, 467 base::Time first_visit, 468 bool is_http_query, 469 ClientPhishingRequest* request) { 470 DCHECK(request); 471 AddFeature(is_http_query ? 472 features::kHttpHostVisitCount : features::kHttpsHostVisitCount, 473 static_cast<double>(num_visits), 474 request); 475 if (num_visits > 0) { 476 AddFeature( 477 is_http_query ? 478 features::kFirstHttpHostVisitMoreThan24hAgo : 479 features::kFirstHttpsHostVisitMoreThan24hAgo, 480 (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ? 481 1.0 : 0.0, 482 request); 483 } 484 } 485 486 void BrowserFeatureExtractor::StorePendingQuery( 487 CancelableRequestProvider::Handle handle, 488 ClientPhishingRequest* request, 489 const DoneCallback& callback) { 490 DCHECK_EQ(0U, pending_queries_.count(handle)); 491 pending_queries_[handle] = std::make_pair(request, callback); 492 } 493 494 bool BrowserFeatureExtractor::GetPendingQuery( 495 CancelableRequestProvider::Handle handle, 496 ClientPhishingRequest** request, 497 DoneCallback* callback) { 498 PendingQueriesMap::iterator it = pending_queries_.find(handle); 499 DCHECK(it != pending_queries_.end()); 500 if (it != pending_queries_.end()) { 501 *request = it->second.first; 502 *callback = it->second.second; 503 pending_queries_.erase(it); 504 return true; 505 } 506 return false; 507 } 508 509 bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) { 510 *history = NULL; 511 if (tab_ && tab_->GetBrowserContext()) { 512 Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext()); 513 *history = HistoryServiceFactory::GetForProfile(profile, 514 Profile::EXPLICIT_ACCESS); 515 if (*history) { 516 return true; 517 } 518 } 519 VLOG(2) << "Unable to query history. No history service available."; 520 return false; 521 } 522 523 void BrowserFeatureExtractor::FinishExtractMalwareFeatures( 524 scoped_ptr<IPUrlMap> bad_ips, 525 MalwareDoneCallback callback, 526 scoped_ptr<ClientMalwareRequest> request) { 527 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 528 int matched_bad_ips = 0; 529 for (IPUrlMap::const_iterator it = bad_ips->begin(); 530 it != bad_ips->end(); ++it) { 531 AddMalwareIpUrlInfo(it->first, it->second, request.get()); 532 ++matched_bad_ips; 533 // Limit the number of matched bad IPs in one request to control 534 // the request's size 535 if (matched_bad_ips >= kMaxMalwareIPPerRequest) { 536 break; 537 } 538 } 539 callback.Run(true, request.Pass()); 540 } 541 542 } // namespace safe_browsing 543