browser/net/predictor.h

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// A Predictor object is instantiated once in the browser process, and manages
// both preresolution of hostnames, as well as TCP/IP preconnection to expected
// subresources.
// Most hostname lists are provided by the renderer processes, and include URLs
// that *might* be used in the near future by the browsing user.  One goal of
// this class is to cause the underlying DNS structure to lookup a hostname
// before it is really needed, and hence reduce latency in the standard lookup
// paths.
// Subresource relationships are usually acquired from the referrer field in a
// navigation.  A subresource URL may be associated with a referrer URL.  Later
// navigations may, if the likelihood of needing the subresource is high enough,
// cause this module to speculatively create a TCP/IP connection. If there is
// only a low likelihood, then a DNS pre-resolution operation may be performed.

#ifndef CHROME_BROWSER_NET_PREDICTOR_H_
#define CHROME_BROWSER_NET_PREDICTOR_H_

#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>

#include "base/gtest_prod_util.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "chrome/browser/net/referrer.h"
#include "chrome/browser/net/timed_cache.h"
#include "chrome/browser/net/url_info.h"
#include "chrome/common/net/predictor_common.h"
#include "net/base/host_port_pair.h"

class IOThread;
class PrefService;
class Profile;

namespace base {
class ListValue;
class WaitableEvent;
}

namespace net {
class HostResolver;
class URLRequestContextGetter;
}

namespace user_prefs {
class PrefRegistrySyncable;
}

namespace chrome_browser_net {

typedef chrome_common_net::UrlList UrlList;
typedef chrome_common_net::NameList NameList;
typedef std::map<GURL, UrlInfo> Results;

// Predictor is constructed during Profile construction (on the UI thread),
// but it is destroyed on the IO thread when ProfileIOData goes away. All of
// its core state and functionality happens on the IO thread. The only UI
// methods are initialization / shutdown related (including preconnect
// initialization), or convenience methods that internally forward calls to
// the IO thread.
class Predictor {
 public:
  // A version number for prefs that are saved. This should be incremented when
  // we change the format so that we discard old data.
  static const int kPredictorReferrerVersion;

  // Given that the underlying Chromium resolver defaults to a total maximum of
  // 8 paralell resolutions, we will avoid any chance of starving navigational
  // resolutions by limiting the number of paralell speculative resolutions.
  // This is used in the field trials and testing.
  // TODO(jar): Move this limitation into the resolver.
  static const size_t kMaxSpeculativeParallelResolves;

  // To control the congestion avoidance system, we need an estimate of how
  // many speculative requests may arrive at once.  Since we currently only
  // keep 8 subresource names for each frame, we'll use that as our basis.
  // Note that when scanning search results lists, we might actually get 10 at
  // a time, and wikipedia can often supply (during a page scan) upwards of 50.
  // In those odd cases, we may discard some of the later speculative requests
  // mistakenly assuming that the resolutions took too long.
  static const int kTypicalSpeculativeGroupSize;

  // The next constant specifies an amount of queueing delay that is
  // "too large," and indicative of problems with resolutions (perhaps due to
  // an overloaded router, or such).  When we exceed this delay, congestion
  // avoidance will kick in and all speculations in the queue will be discarded.
  static const int kMaxSpeculativeResolveQueueDelayMs;

  // We don't bother learning to preconnect via a GET if the original URL
  // navigation was so long ago, that a preconnection would have been dropped
  // anyway.  We believe most servers will drop the connection in 10 seconds, so
  // we currently estimate this time-till-drop at 10 seconds.
  // TODO(jar): We should do a persistent field trial to validate/optimize this.
  static const int kMaxUnusedSocketLifetimeSecondsWithoutAGet;

  // |max_concurrent| specifies how many concurrent (parallel) prefetches will
  // be performed. Host lookups will be issued through |host_resolver|.
  explicit Predictor(bool preconnect_enabled);

  virtual ~Predictor();

  // This function is used to create a predictor. For testing, we can create
  // a version which does a simpler shutdown.
  static Predictor* CreatePredictor(bool preconnect_enabled,
                                    bool simple_shutdown);

  static void RegisterProfilePrefs(user_prefs::PrefRegistrySyncable* registry);

  // ------------- Start UI thread methods.

  virtual void InitNetworkPredictor(PrefService* user_prefs,
                                    PrefService* local_state,
                                    IOThread* io_thread,
                                    net::URLRequestContextGetter* getter);

  // The Omnibox has proposed a given url to the user, and if it is a search
  // URL, then it also indicates that this is preconnectable (i.e., we could
  // preconnect to the search server).
  void AnticipateOmniboxUrl(const GURL& url, bool preconnectable);

  // Preconnect a URL and all of its subresource domains.
  void PreconnectUrlAndSubresources(const GURL& url,
                                    const GURL& first_party_for_cookies);

  static UrlList GetPredictedUrlListAtStartup(PrefService* user_prefs,
                                              PrefService* local_state);

  static void set_max_queueing_delay(int max_queueing_delay_ms);

  static void set_max_parallel_resolves(size_t max_parallel_resolves);

  virtual void ShutdownOnUIThread(PrefService* user_prefs);

  // ------------- End UI thread methods.

  // ------------- Start IO thread methods.

  // Cancel pending requests and prevent new ones from being made.
  void Shutdown();

  // In some circumstances, for privacy reasons, all results should be
  // discarded.  This method gracefully handles that activity.
  // Destroy all our internal state, which shows what names we've looked up, and
  // how long each has taken, etc. etc.  We also destroy records of suggesses
  // (cache hits etc.).
  void DiscardAllResults();

  // Add hostname(s) to the queue for processing.
  void ResolveList(const UrlList& urls,
                   UrlInfo::ResolutionMotivation motivation);

  void Resolve(const GURL& url, UrlInfo::ResolutionMotivation motivation);

  // Record details of a navigation so that we can preresolve the host name
  // ahead of time the next time the users navigates to the indicated host.
  // Should only be called when urls are distinct, and they should already be
  // canonicalized to not have a path.
  void LearnFromNavigation(const GURL& referring_url, const GURL& target_url);

  // When displaying info in about:dns, the following API is called.
  static void PredictorGetHtmlInfo(Predictor* predictor, std::string* output);

  // Dump HTML table containing list of referrers for about:dns.
  void GetHtmlReferrerLists(std::string* output);

  // Dump the list of currently known referrer domains and related prefetchable
  // domains for about:dns.
  void GetHtmlInfo(std::string* output);

  // Discards any referrer for which all the suggested host names are currently
  // annotated with negligible expected-use.  Scales down (diminishes) the
  // expected-use of those that remain, so that their use will go down by a
  // factor each time we trim (moving the referrer closer to being discarded in
  // a future call).
  // The task is performed synchronously and completes before returing.
  void TrimReferrersNow();

  // Construct a ListValue object that contains all the data in the referrers_
  // so that it can be persisted in a pref.
  void SerializeReferrers(base::ListValue* referral_list);

  // Process a ListValue that contains all the data from a previous reference
  // list, as constructed by SerializeReferrers(), and add all the identified
  // values into the current referrer list.
  void DeserializeReferrers(const base::ListValue& referral_list);

  void DeserializeReferrersThenDelete(base::ListValue* referral_list);

  void DiscardInitialNavigationHistory();

  void FinalizeInitializationOnIOThread(
      const std::vector<GURL>& urls_to_prefetch,
      base::ListValue* referral_list,
      IOThread* io_thread,
      bool predictor_enabled);

  // During startup, we learn what the first N urls visited are, and then
  // resolve the associated hosts ASAP during our next startup.
  void LearnAboutInitialNavigation(const GURL& url);

  // Renderer bundles up list and sends to this browser API via IPC.
  // TODO(jar): Use UrlList instead to include port and scheme.
  void DnsPrefetchList(const NameList& hostnames);

  // May be called from either the IO or UI thread and will PostTask
  // to the IO thread if necessary.
  void DnsPrefetchMotivatedList(const UrlList& urls,
                                UrlInfo::ResolutionMotivation motivation);

  // May be called from either the IO or UI thread and will PostTask
  // to the IO thread if necessary.
  void SaveStateForNextStartupAndTrim(PrefService* prefs);

  void SaveDnsPrefetchStateForNextStartupAndTrim(
      base::ListValue* startup_list,
      base::ListValue* referral_list,
      base::WaitableEvent* completion);

  // May be called from either the IO or UI thread and will PostTask
  // to the IO thread if necessary.
  void EnablePredictor(bool enable);

  void EnablePredictorOnIOThread(bool enable);

  // May be called from either the IO or UI thread and will PostTask
  // to the IO thread if necessary.
  void PreconnectUrl(const GURL& url, const GURL& first_party_for_cookies,
                     UrlInfo::ResolutionMotivation motivation, int count);

  void PreconnectUrlOnIOThread(const GURL& url,
                               const GURL& first_party_for_cookies,
                               UrlInfo::ResolutionMotivation motivation,
                               int count);

  void RecordPreconnectTrigger(const GURL& url);

  void RecordPreconnectNavigationStat(const std::vector<GURL>& url_chain,
                                      bool is_subresource);

  void RecordLinkNavigation(const GURL& url);

  // ------------- End IO thread methods.

  // The following methods may be called on either the IO or UI threads.

  // Instigate pre-connection to any URLs, or pre-resolution of related host,
  // that we predict will be needed after this navigation (typically
  // more-embedded resources on a page).  This method will actually post a task
  // to do the actual work, so as not to jump ahead of the frame navigation that
  // instigated this activity.
  void PredictFrameSubresources(const GURL& url,
                                const GURL& first_party_for_cookies);

  // Put URL in canonical form, including a scheme, host, and port.
  // Returns GURL::EmptyGURL() if the scheme is not http/https or if the url
  // cannot be otherwise canonicalized.
  static GURL CanonicalizeUrl(const GURL& url);

  // Used for testing.
  void SetHostResolver(net::HostResolver* host_resolver) {
    host_resolver_ = host_resolver;
  }
  // Used for testing.
  size_t max_concurrent_dns_lookups() const {
    return max_concurrent_dns_lookups_;
  }
  // Used for testing.
  void SetShutdown(bool shutdown) {
    shutdown_ = shutdown;
  }

  // Flag setting to use preconnection instead of just DNS pre-fetching.
  bool preconnect_enabled() const {
    return preconnect_enabled_;
  }

  // Flag setting for whether we are prefetching dns lookups.
  bool predictor_enabled() const {
    return predictor_enabled_;
  }


 private:
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, BenefitLookupTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, ShutdownWhenResolutionIsPendingTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, SingleLookupTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, ConcurrentLookupTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, MassiveConcurrentLookupTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, PriorityQueuePushPopTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, PriorityQueueReorderTest);
  FRIEND_TEST_ALL_PREFIXES(PredictorTest, ReferrerSerializationTrimTest);
  friend class WaitForResolutionHelper;  // For testing.

  class LookupRequest;

  // A simple priority queue for handling host names.
  // Some names that are queued up have |motivation| that requires very rapid
  // handling.  For example, a sub-resource name lookup MUST be done before the
  // actual sub-resource is fetched.  In contrast, a name that was speculatively
  // noted in a page has to be resolved before the user "gets around to"
  // clicking on a link.  By tagging (with a motivation) each push we make into
  // this FIFO queue, the queue can re-order the more important names to service
  // them sooner (relative to some low priority background resolutions).
  class HostNameQueue {
   public:
    HostNameQueue();
    ~HostNameQueue();
    void Push(const GURL& url,
              UrlInfo::ResolutionMotivation motivation);
    bool IsEmpty() const;
    GURL Pop();

   private:
    // The names in the queue that should be serviced (popped) ASAP.
    std::queue<GURL> rush_queue_;
    // The names in the queue that should only be serviced when rush_queue is
    // empty.
    std::queue<GURL> background_queue_;

  DISALLOW_COPY_AND_ASSIGN(HostNameQueue);
  };

  // The InitialObserver monitors navigations made by the network stack. This
  // is only used to identify startup time resolutions (for re-resolution
  // during our next process startup).
  // TODO(jar): Consider preconnecting at startup, which may be faster than
  // waiting for render process to start and request a connection.
  class InitialObserver {
   public:
    InitialObserver();
    ~InitialObserver();
    // Recording of when we observed each navigation.
    typedef std::map<GURL, base::TimeTicks> FirstNavigations;

    // Potentially add a new URL to our startup list.
    void Append(const GURL& url, Predictor* predictor);

    // Get an HTML version of our current planned first_navigations_.
    void GetFirstResolutionsHtml(std::string* output);

    // Persist the current first_navigations_ for storage in a list.
    void GetInitialDnsResolutionList(base::ListValue* startup_list);

    // Discards all initial loading history.
    void DiscardInitialNavigationHistory() { first_navigations_.clear(); }

   private:
    // List of the first N URL resolutions observed in this run.
    FirstNavigations first_navigations_;

    // The number of URLs we'll save for pre-resolving at next startup.
    static const size_t kStartupResolutionCount = 10;
  };

  // A map that is keyed with the host/port that we've learned were the cause
  // of loading additional URLs.  The list of additional targets is held
  // in a Referrer instance, which is a value in this map.
  typedef std::map<GURL, Referrer> Referrers;

  // Depending on the expected_subresource_use_, we may either make a TCP/IP
  // preconnection, or merely pre-resolve the hostname via DNS (or even do
  // nothing).  The following are the threasholds for taking those actions.
  static const double kPreconnectWorthyExpectedValue;
  static const double kDNSPreresolutionWorthyExpectedValue;
  // Referred hosts with a subresource_use_rate_ that are less than the
  // following threshold will be discarded when we Trim() the list.
  static const double kDiscardableExpectedValue;
  // During trimming operation to discard hosts for which we don't have likely
  // subresources, we multiply the expected_subresource_use_ value by the
  // following ratio until that value is less than kDiscardableExpectedValue.
  // This number should always be less than 1, an more than 0.
  static const double kReferrerTrimRatio;

  // Interval between periodic trimming of our whole referrer list.
  // We only do a major trimming about once an hour, and then only when the user
  // is actively browsing.
  static const int64 kDurationBetweenTrimmingsHours;
  // Interval between incremental trimmings (to avoid inducing Jank).
  static const int64 kDurationBetweenTrimmingIncrementsSeconds;
  // Number of referring URLs processed in an incremental trimming.
  static const size_t kUrlsTrimmedPerIncrement;

  // Only for testing. Returns true if hostname has been successfully resolved
  // (name found).
  bool WasFound(const GURL& url) const {
    Results::const_iterator it(results_.find(url));
    return (it != results_.end()) &&
            it->second.was_found();
  }

  // Only for testing. Return how long was the resolution
  // or UrlInfo::NullDuration() if it hasn't been resolved yet.
  base::TimeDelta GetResolutionDuration(const GURL& url) {
    if (results_.find(url) == results_.end())
      return UrlInfo::NullDuration();
    return results_[url].resolve_duration();
  }

  // Only for testing;
  size_t peak_pending_lookups() const { return peak_pending_lookups_; }

  // ------------- Start IO thread methods.

  // Perform actual resolution or preconnection to subresources now.  This is
  // an internal worker method that is reached via a post task from
  // PredictFrameSubresources().
  void PrepareFrameSubresources(const GURL& url,
                                const GURL& first_party_for_cookies);

  // Access method for use by async lookup request to pass resolution result.
  void OnLookupFinished(LookupRequest* request, const GURL& url, bool found);

  // Underlying method for both async and synchronous lookup to update state.
  void LookupFinished(LookupRequest* request,
                      const GURL& url, bool found);

  // Queue hostname for resolution.  If queueing was done, return the pointer
  // to the queued instance, otherwise return NULL.
  UrlInfo* AppendToResolutionQueue(const GURL& url,
      UrlInfo::ResolutionMotivation motivation);

  // Check to see if too much queuing delay has been noted for the given info,
  // which indicates that there is "congestion" or growing delay in handling the
  // resolution of names.  Rather than letting this congestion potentially grow
  // without bounds, we abandon our queued efforts at pre-resolutions in such a
  // case.
  // To do this, we will recycle |info|, as well as all queued items, back to
  // the state they had before they were queued up.  We can't do anything about
  // the resolutions we've already sent off for processing on another thread, so
  // we just let them complete.  On a slow system, subject to congestion, this
  // will greatly reduce the number of resolutions done, but it will assure that
  // any resolutions that are done, are in a timely and hence potentially
  // helpful manner.
  bool CongestionControlPerformed(UrlInfo* info);

  // Take lookup requests from work_queue_ and tell HostResolver to look them up
  // asynchronously, provided we don't exceed concurrent resolution limit.
  void StartSomeQueuedResolutions();

  // Performs trimming similar to TrimReferrersNow(), except it does it as a
  // series of short tasks by posting continuations again an again until done.
  void TrimReferrers();

  // Loads urls_being_trimmed_ from keys of current referrers_.
  void LoadUrlsForTrimming();

  // Posts a task to do additional incremental trimming of referrers_.
  void PostIncrementalTrimTask();

  // Calls Trim() on some or all of urls_being_trimmed_.
  // If it does not process all the URLs in that vector, it posts a task to
  // continue with them shortly (i.e., it yeilds and continues).
  void IncrementalTrimReferrers(bool trim_all_now);

  // ------------- End IO thread methods.

  scoped_ptr<InitialObserver> initial_observer_;

  // Reference to URLRequestContextGetter from the Profile which owns the
  // predictor. Used by Preconnect.
  scoped_refptr<net::URLRequestContextGetter> url_request_context_getter_;

  // Status of speculative DNS resolution and speculative TCP/IP connection
  // feature.
  bool predictor_enabled_;

  // work_queue_ holds a list of names we need to look up.
  HostNameQueue work_queue_;

  // results_ contains information for existing/prior prefetches.
  Results results_;

  std::set<LookupRequest*> pending_lookups_;

  // For testing, to verify that we don't exceed the limit.
  size_t peak_pending_lookups_;

  // When true, we don't make new lookup requests.
  bool shutdown_;

  // The number of concurrent speculative lookups currently allowed to be sent
  // to the resolver.  Any additional lookups will be queued to avoid exceeding
  // this value.  The queue is a priority queue that will accelerate
  // sub-resource speculation, and retard resolutions suggested by page scans.
  const size_t max_concurrent_dns_lookups_;

  // The maximum queueing delay that is acceptable before we enter congestion
  // reduction mode, and discard all queued (but not yet assigned) resolutions.
  const base::TimeDelta max_dns_queue_delay_;

  // The host resolver we warm DNS entries for.
  net::HostResolver* host_resolver_;

  // Are we currently using preconnection, rather than just DNS resolution, for
  // subresources and omni-box search URLs.
  bool preconnect_enabled_;

  // Most recent suggestion from Omnibox provided via AnticipateOmniboxUrl().
  std::string last_omnibox_host_;

  // The time when the last preresolve was done for last_omnibox_host_.
  base::TimeTicks last_omnibox_preresolve_;

  // The number of consecutive requests to AnticipateOmniboxUrl() that suggested
  // preconnecting (because it was to a search service).
  int consecutive_omnibox_preconnect_count_;

  // The time when the last preconnection was requested to a search service.
  base::TimeTicks last_omnibox_preconnect_;

  class PreconnectUsage;
  scoped_ptr<PreconnectUsage> preconnect_usage_;

  // For each URL that we might navigate to (that we've "learned about")
  // we have a Referrer list. Each Referrer list has all hostnames we might
  // need to pre-resolve or pre-connect to when there is a navigation to the
  // orginial hostname.
  Referrers referrers_;

  // List of URLs in referrers_ currently being trimmed (scaled down to
  // eventually be aged out of use).
  std::vector<GURL> urls_being_trimmed_;

  // A time after which we need to do more trimming of referrers.
  base::TimeTicks next_trim_time_;

  scoped_ptr<base::WeakPtrFactory<Predictor> > weak_factory_;

  DISALLOW_COPY_AND_ASSIGN(Predictor);
};

// This version of the predictor is used for testing.
class SimplePredictor : public Predictor {
 public:
  explicit SimplePredictor(bool preconnect_enabled)
      : Predictor(preconnect_enabled) {}
  virtual ~SimplePredictor() {}
  virtual void InitNetworkPredictor(
      PrefService* user_prefs,
      PrefService* local_state,
      IOThread* io_thread,
      net::URLRequestContextGetter* getter) OVERRIDE;
  virtual void ShutdownOnUIThread(PrefService* user_prefs) OVERRIDE;
};

}  // namespace chrome_browser_net

#endif  // CHROME_BROWSER_NET_PREDICTOR_H_