1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // PhishingUrlFeatureExtractor handles computing URL-based features for 6 // the client-side phishing detection model. These include tokens in the 7 // host and path, features pertaining to host length, and IP addresses. 8 9 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ 10 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ 11 12 #include <string> 13 #include <vector> 14 15 #include "base/basictypes.h" 16 17 class GURL; 18 19 namespace safe_browsing { 20 class FeatureMap; 21 22 class PhishingUrlFeatureExtractor { 23 public: 24 PhishingUrlFeatureExtractor(); 25 ~PhishingUrlFeatureExtractor(); 26 27 // Extracts features for |url| into the given feature map. 28 // Returns true on success. 29 bool ExtractFeatures(const GURL& url, FeatureMap* features); 30 31 private: 32 friend class PhishingUrlFeatureExtractorTest; 33 34 static const size_t kMinPathComponentLength = 3; 35 36 // Given a string, finds all substrings of consecutive alphanumeric 37 // characters of length >= kMinPathComponentLength and inserts them into 38 // tokens. 39 static void SplitStringIntoLongAlphanumTokens( 40 const std::string& full, 41 std::vector<std::string>* tokens); 42 43 DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor); 44 }; 45 46 } // namespace safe_browsing 47 48 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ 49