Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // PhishingUrlFeatureExtractor handles computing URL-based features for
      6 // the client-side phishing detection model.  These include tokens in the
      7 // host and path, features pertaining to host length, and IP addresses.
      8 
      9 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
     10 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
     11 
     12 #include <string>
     13 #include <vector>
     14 
     15 #include "base/basictypes.h"
     16 
     17 class GURL;
     18 
     19 namespace safe_browsing {
     20 class FeatureMap;
     21 
     22 class PhishingUrlFeatureExtractor {
     23  public:
          // The constructor and destructor are only declared here; their
          // definitions live outside this header.
     24   PhishingUrlFeatureExtractor();
     25   ~PhishingUrlFeatureExtractor();
     26 
     27   // Extracts features for |url| into the given feature map.
     28   // Returns true on success.
          // |url| is taken by const reference and |features| is an output
          // parameter passed by pointer.
          // NOTE(review): presumably |features| must be non-NULL, and the state
          // of |features| after a false return is not visible from this header --
          // confirm against the implementation.
     29   bool ExtractFeatures(const GURL& url, FeatureMap* features);
     30 
     31  private:
          // Grants PhishingUrlFeatureExtractorTest access to the private members
          // below (presumably so the static helper can be unit-tested directly).
     32   friend class PhishingUrlFeatureExtractorTest;
     33 
          // Minimum number of consecutive alphanumeric characters a substring
          // must have to be emitted as a token by
          // SplitStringIntoLongAlphanumTokens().
     34   static const size_t kMinPathComponentLength = 3;
     35 
     36   // Given a string, finds all substrings of consecutive alphanumeric
     37   // characters of length >= kMinPathComponentLength and inserts them into
     38   // tokens.
          // |full| is the input string to scan; |tokens| is the output vector.
          // NOTE(review): whether existing contents of |tokens| are preserved or
          // cleared is not visible from this declaration -- check the definition.
     39   static void SplitStringIntoLongAlphanumTokens(
     40       const std::string& full,
     41       std::vector<std::string>* tokens);
     42 
          // Macro defined outside this header (presumably in base/basictypes.h).
          // Going by its name, it is expected to make this class non-copyable
          // and non-assignable -- confirm against the macro definition.
     43   DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor);
     44 };
     45 
     46 }  // namespace safe_browsing
     47 
     48 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
     49