1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h" 6 7 #include <map> 8 #include <utility> 9 10 #include "base/bind.h" 11 #include "base/bind_helpers.h" 12 #include "base/format_macros.h" 13 #include "base/stl_util.h" 14 #include "base/strings/stringprintf.h" 15 #include "base/time/time.h" 16 #include "chrome/browser/common/cancelable_request.h" 17 #include "chrome/browser/history/history_service.h" 18 #include "chrome/browser/history/history_service_factory.h" 19 #include "chrome/browser/history/history_types.h" 20 #include "chrome/browser/profiles/profile.h" 21 #include "chrome/browser/safe_browsing/browser_features.h" 22 #include "chrome/browser/safe_browsing/client_side_detection_service.h" 23 #include "chrome/common/safe_browsing/csd.pb.h" 24 #include "content/public/browser/browser_thread.h" 25 #include "content/public/browser/navigation_controller.h" 26 #include "content/public/browser/navigation_entry.h" 27 #include "content/public/browser/web_contents.h" 28 #include "content/public/common/page_transition_types.h" 29 #include "url/gurl.h" 30 31 using content::BrowserThread; 32 using content::NavigationController; 33 using content::NavigationEntry; 34 using content::WebContents; 35 36 namespace safe_browsing { 37 38 const int BrowserFeatureExtractor::kMaxMalwareIPPerRequest = 5; 39 40 BrowseInfo::BrowseInfo() : http_status_code(0) {} 41 42 BrowseInfo::~BrowseInfo() {} 43 44 static void AddFeature(const std::string& feature_name, 45 double feature_value, 46 ClientPhishingRequest* request) { 47 DCHECK(request); 48 ClientPhishingRequest::Feature* feature = 49 request->add_non_model_feature_map(); 50 feature->set_name(feature_name); 51 feature->set_value(feature_value); 52 VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value(); 53 } 54 55 static void AddMalwareFeature(const std::string& feature_name, 56 const std::set<std::string>& meta_infos, 57 double feature_value, 58 ClientMalwareRequest* request) { 59 DCHECK(request); 60 ClientMalwareRequest::Feature* feature = 61 request->add_feature_map(); 62 feature->set_name(feature_name); 63 feature->set_value(feature_value); 64 for (std::set<std::string>::const_iterator it = meta_infos.begin(); 65 it != meta_infos.end(); ++it) { 66 feature->add_metainfo(*it); 67 } 68 VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value(); 69 } 70 71 static void AddNavigationFeatures( 72 const std::string& feature_prefix, 73 const NavigationController& controller, 74 int index, 75 const std::vector<GURL>& redirect_chain, 76 ClientPhishingRequest* request) { 77 NavigationEntry* entry = controller.GetEntryAtIndex(index); 78 bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure(); 79 if (!is_secure_referrer) { 80 AddFeature(base::StringPrintf("%s%s=%s", 81 feature_prefix.c_str(), 82 features::kReferrer, 83 entry->GetReferrer().url.spec().c_str()), 84 1.0, 85 request); 86 } 87 AddFeature(feature_prefix + features::kHasSSLReferrer, 88 is_secure_referrer ? 1.0 : 0.0, 89 request); 90 AddFeature(feature_prefix + features::kPageTransitionType, 91 static_cast<double>( 92 content::PageTransitionStripQualifier( 93 entry->GetTransitionType())), 94 request); 95 AddFeature(feature_prefix + features::kIsFirstNavigation, 96 index == 0 ? 1.0 : 0.0, 97 request); 98 // Redirect chain should always be at least of size one, as the rendered 99 // url is the last element in the chain. 100 if (redirect_chain.empty()) { 101 NOTREACHED(); 102 return; 103 } 104 if (redirect_chain.back() != entry->GetURL()) { 105 // I originally had this as a DCHECK but I saw a failure once that I 106 // can't reproduce. It looks like it might be related to the 107 // navigation controller only keeping a limited number of navigation 108 // events. For now we'll just attach a feature specifying that this is 109 // a mismatch and try and figure out what to do with it on the server. 110 DLOG(WARNING) << "Expected:" << entry->GetURL() 111 << " Actual:" << redirect_chain.back(); 112 AddFeature(feature_prefix + features::kRedirectUrlMismatch, 113 1.0, 114 request); 115 return; 116 } 117 // We skip the last element since it should just be the current url. 118 for (size_t i = 0; i < redirect_chain.size() - 1; i++) { 119 std::string printable_redirect = redirect_chain[i].spec(); 120 if (redirect_chain[i].SchemeIsSecure()) { 121 printable_redirect = features::kSecureRedirectValue; 122 } 123 AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s", 124 feature_prefix.c_str(), 125 features::kRedirect, 126 i, 127 printable_redirect.c_str()), 128 1.0, 129 request); 130 } 131 } 132 133 BrowserFeatureExtractor::BrowserFeatureExtractor( 134 WebContents* tab, 135 ClientSideDetectionService* service) 136 : tab_(tab), 137 service_(service), 138 weak_factory_(this) { 139 DCHECK(tab); 140 } 141 142 BrowserFeatureExtractor::~BrowserFeatureExtractor() { 143 weak_factory_.InvalidateWeakPtrs(); 144 // Delete all the pending extractions (delete callback and request objects). 145 STLDeleteContainerPairFirstPointers(pending_extractions_.begin(), 146 pending_extractions_.end()); 147 148 // Also cancel all the pending history service queries. 149 HistoryService* history; 150 bool success = GetHistoryService(&history); 151 DCHECK(success || pending_queries_.size() == 0); 152 // Cancel all the pending history lookups and cleanup the memory. 153 for (PendingQueriesMap::iterator it = pending_queries_.begin(); 154 it != pending_queries_.end(); ++it) { 155 if (history) { 156 history->CancelRequest(it->first); 157 } 158 ExtractionData& extraction = it->second; 159 delete extraction.first; // delete request 160 } 161 pending_queries_.clear(); 162 } 163 164 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info, 165 ClientPhishingRequest* request, 166 const DoneCallback& callback) { 167 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 168 DCHECK(request); 169 DCHECK(info); 170 DCHECK_EQ(0U, request->url().find("http:")); 171 DCHECK(!callback.is_null()); 172 if (callback.is_null()) { 173 DLOG(ERROR) << "ExtractFeatures called without a callback object"; 174 return; 175 } 176 177 // Extract features pertaining to this navigation. 178 const NavigationController& controller = tab_->GetController(); 179 int url_index = -1; 180 int first_host_index = -1; 181 182 GURL request_url(request->url()); 183 int index = controller.GetCurrentEntryIndex(); 184 // The url that we are extracting features for should already be commited. 185 DCHECK_NE(index, -1); 186 for (; index >= 0; index--) { 187 NavigationEntry* entry = controller.GetEntryAtIndex(index); 188 if (url_index == -1 && entry->GetURL() == request_url) { 189 // It's possible that we've been on the on the possibly phishy url before 190 // in this tab, so make sure that we use the latest navigation for 191 // features. 192 // Note that it's possible that the url_index should always be the 193 // latest entry, but I'm worried about possible races during a navigation 194 // and transient entries (i.e. interstiatials) so for now we will just 195 // be cautious. 196 url_index = index; 197 } else if (index < url_index) { 198 if (entry->GetURL().host() == request_url.host()) { 199 first_host_index = index; 200 } else { 201 // We have found the possibly phishing url, but we are no longer on the 202 // host. No reason to look back any further. 203 break; 204 } 205 } 206 } 207 208 // Add features pertaining to how we got to 209 // 1) The candidate url 210 // 2) The first url on the same host as the candidate url (assuming that 211 // it's different from the candidate url). 212 if (url_index != -1) { 213 AddNavigationFeatures( 214 std::string(), controller, url_index, info->url_redirects, request); 215 } 216 if (first_host_index != -1) { 217 AddNavigationFeatures(features::kHostPrefix, 218 controller, 219 first_host_index, 220 info->host_redirects, 221 request); 222 } 223 224 ExtractBrowseInfoFeatures(*info, request); 225 pending_extractions_[request] = callback; 226 base::MessageLoop::current()->PostTask( 227 FROM_HERE, 228 base::Bind(&BrowserFeatureExtractor::StartExtractFeatures, 229 weak_factory_.GetWeakPtr(), request, callback)); 230 } 231 232 void BrowserFeatureExtractor::ExtractMalwareFeatures( 233 const BrowseInfo* info, 234 ClientMalwareRequest* request) { 235 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 236 DCHECK(request); 237 DCHECK(info); 238 DCHECK_EQ(0U, request->url().find("http:")); 239 // get the IPs and urls that match the malware blacklisted IP list. 240 if (service_) { 241 int matched_bad_ips = 0; 242 for (IPUrlMap::const_iterator it = info->ips.begin(); 243 it != info->ips.end(); ++it) { 244 if (service_->IsBadIpAddress(it->first)) { 245 AddMalwareFeature(features::kBadIpFetch + it->first, 246 it->second, 1.0, request); 247 ++matched_bad_ips; 248 // Limit the number of matched bad IPs in one request to control 249 // the request's size 250 if (matched_bad_ips >= kMaxMalwareIPPerRequest) { 251 return; 252 } 253 } 254 } 255 } 256 } 257 258 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures( 259 const BrowseInfo& info, 260 ClientPhishingRequest* request) { 261 if (service_) { 262 for (IPUrlMap::const_iterator it = info.ips.begin(); 263 it != info.ips.end(); ++it) { 264 if (service_->IsBadIpAddress(it->first)) { 265 AddFeature(features::kBadIpFetch + it->first, 1.0, request); 266 } 267 } 268 } 269 if (info.unsafe_resource.get()) { 270 // A SafeBrowsing interstitial was shown for the current URL. 271 AddFeature(features::kSafeBrowsingMaliciousUrl + 272 info.unsafe_resource->url.spec(), 273 1.0, 274 request); 275 AddFeature(features::kSafeBrowsingOriginalUrl + 276 info.unsafe_resource->original_url.spec(), 277 1.0, 278 request); 279 AddFeature(features::kSafeBrowsingIsSubresource, 280 info.unsafe_resource->is_subresource ? 1.0 : 0.0, 281 request); 282 AddFeature(features::kSafeBrowsingThreatType, 283 static_cast<double>(info.unsafe_resource->threat_type), 284 request); 285 } 286 if (info.http_status_code != 0) { 287 AddFeature(features::kHttpStatusCode, info.http_status_code, request); 288 } 289 } 290 291 void BrowserFeatureExtractor::StartExtractFeatures( 292 ClientPhishingRequest* request, 293 const DoneCallback& callback) { 294 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 295 size_t removed = pending_extractions_.erase(request); 296 DCHECK_EQ(1U, removed); 297 HistoryService* history; 298 if (!request || !request->IsInitialized() || !GetHistoryService(&history)) { 299 callback.Run(false, request); 300 return; 301 } 302 CancelableRequestProvider::Handle handle = history->QueryURL( 303 GURL(request->url()), 304 true /* wants_visits */, 305 &request_consumer_, 306 base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone, 307 base::Unretained(this))); 308 309 StorePendingQuery(handle, request, callback); 310 } 311 312 void BrowserFeatureExtractor::QueryUrlHistoryDone( 313 CancelableRequestProvider::Handle handle, 314 bool success, 315 const history::URLRow* row, 316 history::VisitVector* visits) { 317 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 318 ClientPhishingRequest* request; 319 DoneCallback callback; 320 if (!GetPendingQuery(handle, &request, &callback)) { 321 DLOG(FATAL) << "No pending history query found"; 322 return; 323 } 324 DCHECK(request); 325 DCHECK(!callback.is_null()); 326 if (!success) { 327 // URL is not found in the history. In practice this should not 328 // happen (unless there is a real error) because we just visited 329 // that URL. 330 callback.Run(false, request); 331 return; 332 } 333 AddFeature(features::kUrlHistoryVisitCount, 334 static_cast<double>(row->visit_count()), 335 request); 336 337 base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1); 338 int num_visits_24h_ago = 0; 339 int num_visits_typed = 0; 340 int num_visits_link = 0; 341 for (history::VisitVector::const_iterator it = visits->begin(); 342 it != visits->end(); ++it) { 343 if (!content::PageTransitionIsMainFrame(it->transition)) { 344 continue; 345 } 346 if (it->visit_time < threshold) { 347 ++num_visits_24h_ago; 348 } 349 content::PageTransition transition = content::PageTransitionStripQualifier( 350 it->transition); 351 if (transition == content::PAGE_TRANSITION_TYPED) { 352 ++num_visits_typed; 353 } else if (transition == content::PAGE_TRANSITION_LINK) { 354 ++num_visits_link; 355 } 356 } 357 AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo, 358 static_cast<double>(num_visits_24h_ago), 359 request); 360 AddFeature(features::kUrlHistoryTypedCount, 361 static_cast<double>(num_visits_typed), 362 request); 363 AddFeature(features::kUrlHistoryLinkCount, 364 static_cast<double>(num_visits_link), 365 request); 366 367 // Issue next history lookup for host visits. 368 HistoryService* history; 369 if (!GetHistoryService(&history)) { 370 callback.Run(false, request); 371 return; 372 } 373 CancelableRequestProvider::Handle next_handle = 374 history->GetVisibleVisitCountToHost( 375 GURL(request->url()), 376 &request_consumer_, 377 base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone, 378 base::Unretained(this))); 379 StorePendingQuery(next_handle, request, callback); 380 } 381 382 void BrowserFeatureExtractor::QueryHttpHostVisitsDone( 383 CancelableRequestProvider::Handle handle, 384 bool success, 385 int num_visits, 386 base::Time first_visit) { 387 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 388 ClientPhishingRequest* request; 389 DoneCallback callback; 390 if (!GetPendingQuery(handle, &request, &callback)) { 391 DLOG(FATAL) << "No pending history query found"; 392 return; 393 } 394 DCHECK(request); 395 DCHECK(!callback.is_null()); 396 if (!success) { 397 callback.Run(false, request); 398 return; 399 } 400 SetHostVisitsFeatures(num_visits, first_visit, true, request); 401 402 // Same lookup but for the HTTPS URL. 403 HistoryService* history; 404 if (!GetHistoryService(&history)) { 405 callback.Run(false, request); 406 return; 407 } 408 std::string https_url = request->url(); 409 CancelableRequestProvider::Handle next_handle = 410 history->GetVisibleVisitCountToHost( 411 GURL(https_url.replace(0, 5, "https:")), 412 &request_consumer_, 413 base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone, 414 base::Unretained(this))); 415 StorePendingQuery(next_handle, request, callback); 416 } 417 418 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone( 419 CancelableRequestProvider::Handle handle, 420 bool success, 421 int num_visits, 422 base::Time first_visit) { 423 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 424 ClientPhishingRequest* request; 425 DoneCallback callback; 426 if (!GetPendingQuery(handle, &request, &callback)) { 427 DLOG(FATAL) << "No pending history query found"; 428 return; 429 } 430 DCHECK(request); 431 DCHECK(!callback.is_null()); 432 if (!success) { 433 callback.Run(false, request); 434 return; 435 } 436 SetHostVisitsFeatures(num_visits, first_visit, false, request); 437 callback.Run(true, request); // We're done with all the history lookups. 438 } 439 440 void BrowserFeatureExtractor::SetHostVisitsFeatures( 441 int num_visits, 442 base::Time first_visit, 443 bool is_http_query, 444 ClientPhishingRequest* request) { 445 DCHECK(request); 446 AddFeature(is_http_query ? 447 features::kHttpHostVisitCount : features::kHttpsHostVisitCount, 448 static_cast<double>(num_visits), 449 request); 450 if (num_visits > 0) { 451 AddFeature( 452 is_http_query ? 453 features::kFirstHttpHostVisitMoreThan24hAgo : 454 features::kFirstHttpsHostVisitMoreThan24hAgo, 455 (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ? 456 1.0 : 0.0, 457 request); 458 } 459 } 460 461 void BrowserFeatureExtractor::StorePendingQuery( 462 CancelableRequestProvider::Handle handle, 463 ClientPhishingRequest* request, 464 const DoneCallback& callback) { 465 DCHECK_EQ(0U, pending_queries_.count(handle)); 466 pending_queries_[handle] = std::make_pair(request, callback); 467 } 468 469 bool BrowserFeatureExtractor::GetPendingQuery( 470 CancelableRequestProvider::Handle handle, 471 ClientPhishingRequest** request, 472 DoneCallback* callback) { 473 PendingQueriesMap::iterator it = pending_queries_.find(handle); 474 DCHECK(it != pending_queries_.end()); 475 if (it != pending_queries_.end()) { 476 *request = it->second.first; 477 *callback = it->second.second; 478 pending_queries_.erase(it); 479 return true; 480 } 481 return false; 482 } 483 484 bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) { 485 *history = NULL; 486 if (tab_ && tab_->GetBrowserContext()) { 487 Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext()); 488 *history = HistoryServiceFactory::GetForProfile(profile, 489 Profile::EXPLICIT_ACCESS); 490 if (*history) { 491 return true; 492 } 493 } 494 VLOG(2) << "Unable to query history. No history service available."; 495 return false; 496 } 497 498 } // namespace safe_browsing 499