Home | History | Annotate | Download | only in shill
      1 //
      2 // Copyright (C) 2015 The Android Open Source Project
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 //
     16 
     17 #ifndef SHILL_CONNECTION_DIAGNOSTICS_H_
     18 #define SHILL_CONNECTION_DIAGNOSTICS_H_
     19 
     20 #include <map>
     21 #include <memory>
     22 #include <string>
     23 #include <vector>
     24 
     25 #include <base/callback.h>
     26 #include <base/cancelable_callback.h>
     27 #include <base/memory/weak_ptr.h>
     28 
     29 #include "shill/portal_detector.h"
     30 #include "shill/refptr_types.h"
     31 
     32 namespace shill {
     33 
     34 class ArpClient;
     35 class ByteString;
     36 class DeviceInfo;
     37 class DNSClient;
     38 class DNSClientFactory;
     39 class Error;
     40 class EventDispatcher;
     41 class HTTPURL;
     42 class IcmpSession;
     43 class IcmpSessionFactory;
     44 class Metrics;
     45 class RoutingTable;
     46 struct RoutingTableEntry;
     47 class RTNLHandler;
     48 class RTNLListener;
     49 class RTNLMessage;
     50 
     51 // The ConnectionDiagnostics class implements facilities to diagnose problems
     52 // that a connection encounters reaching a specific URL.
     53 //
     54 // Given a connection and a URL, ConnectionDiagnostics performs the following
     55 // actions:
     56 // (A) Start portal detection on the connection using the given URL.
     57 //     (B) If portal detection ends in the content phase, the connection is
     58 //         either functioning, or we are trapped in a captive portal. END.
     59 //     (C) If the portal detection ends in the DNS phase and failed for any
     60 //         reason other than a timeout, we have found a DNS server issue. END.
     61 //     (D) If the portal detection ends in the DNS phase and failed because of a
     62 //         timeout, ping all DNS servers.
//         (E) If none of the DNS servers reply to pings, then we might have a
//             problem reaching DNS servers. Send a request to the kernel for
//             a route to the first DNS server on our list (step M).
     66 //         (F) If at least one DNS server replies to pings, and we have DNS
     67 //             retries left, attempt DNS resolution again using the pingable DNS
     68 //             servers.
     69 //         (G) If at least one DNS server replies to pings but we are out of DNS
     70 //             retries, the DNS servers are at fault. END.
     71 //     (H) If portal detection ends in any other phase (i.e. HTTP or Connection)
     72 //         resolve the IP of the target web server via DNS.
     73 //         (I) If DNS resolution fails because of a timeout, ping all DNS
     74 //             servers (step D).
     75 //         (J) If DNS resolution fails for any other reason, we have found a
     76 //             DNS server issue. END.
     77 //         (K) Otherwise, ping the IP address of the target web server.
     78 //             (L) If ping is successful, we can reach the target web server. We
     79 //                 might have a HTTP issue or a broken portal. END.
     80 //             (M) If ping is unsuccessful, we send a request to the kernel for
     81 //                 a route to the IP address of the target web server.
     82 //                 (N) If no route is found, a routing issue has been found.
     83 //                     END.
     84 //                 (O) If a route is found, and the destination is a local IPv6
     85 //                     address, look for a neighbor table entry.
     86 //                     (P) If a neighbor table entry is found, then this
     87 //                         gateway/web server appears to be on the local
     88 //                         network, but is not responding to pings. END.
     89 //                     (Q) If a neighbor table entry is not found, then either
     90 //                         this gateway/web server does not exist on the local
     91 //                         network, or there are link layer issues.
     92 //                 (R) If a route is found and the destination is a remote
     93 //                     address, ping the local gateway.
//                     (S) If the local gateway responds to pings, then we have
     95 //                         found an upstream connectivity problem or gateway
     96 //                         problem. END.
     97 //                     (T) If the local gateway is at an IPv6 address and does
     98 //                         not respond to pings, look for a neighbor table
     99 //                         entry (step O).
    100 //                     (U) If the local gateway is at an IPv4 address and does
    101 //                         not respond to pings, check for an ARP table entry
    102 //                         for its address (step V).
    103 //                 (V) Otherwise, if a route is found and the destination is a
    104 //                     local IPv4 address, look for an ARP table entry for it.
    105 //                     (W) If an ARP table entry is found, then this gateway/
    106 //                         web server appears to be on the local network, but is
    107 //                         not responding to pings. END.
    108 //                     (X) If an ARP table entry is not found, check for IP
    109 //                         address collision in the local network by sending out
    110 //                         an ARP request for the local IP address of this
    111 //                         connection.
    112 //                         (Y) If a reply is received, an IP collision has been
    113 //                             detected. END.
    114 //                         (Z) If no reply was received, no IP address collision
    115 //                             was detected. Since we are here because ARP and
    116 //                             ping failed, either the web server or gateway
    117 //                             does not actually exist on the local network, or
    118 //                             there is a link layer issue. END.
    119 //
    120 // TODO(samueltan): Step F: if retry succeeds, remove the unresponsive DNS
    121 // servers so Chrome does not try to use them.
    122 // TODO(samueltan): Step X: find ways to disambiguate the cause (e.g. can we see
    123 // packets from other hosts?).
    124 class ConnectionDiagnostics {
    125  public:
    126   // The ConnectionDiagnostics::kEventNames string array depends on this enum.
    127   // Any changes to this enum should be synced with that array.
    128   enum Type {
    129     kTypePortalDetection = 0,
    130     kTypePingDNSServers = 1,
    131     kTypeResolveTargetServerIP = 2,
    132     kTypePingTargetServer = 3,
    133     kTypePingGateway = 4,
    134     kTypeFindRoute = 5,
    135     kTypeArpTableLookup = 6,
    136     kTypeNeighborTableLookup = 7,
    137     kTypeIPCollisionCheck = 8
    138   };
    139 
    140   // The ConnectionDiagnostics::kPhaseNames string array depends on this enum.
    141   // Any changes to this enum should be synced with that array.
    142   enum Phase {
    143     kPhaseStart = 0,
    144     kPhaseEnd = 1,
    145     // End phases specific to kTypePortalDetection.
    146     kPhasePortalDetectionEndContent = 2,
    147     kPhasePortalDetectionEndDNS = 3,
    148     kPhasePortalDetectionEndOther = 4
    149   };
    150 
    151   // The ConnectionDiagnostics::kResultNames string array depends on this enum.
    152   // Any changes to this enum should be synced with that array.
    153   enum Result {
    154     kResultSuccess = 0,
    155     kResultFailure = 1,
    156     kResultTimeout = 2
    157   };
    158 
    159   struct Event {
    160     Event(Type type_in, Phase phase_in, Result result_in,
    161           const std::string& message_in)
    162         : type(type_in),
    163           phase(phase_in),
    164           result(result_in),
    165           message(message_in) {}
    166     Type type;
    167     Phase phase;
    168     Result result;
    169     std::string message;
    170   };
    171 
    172   // The result of the diagnostics is a string describing the connection issue
    173   // detected (if any), and list of events (e.g. routing table
    174   // lookup, DNS resolution) performed during the diagnostics.
    175   using ResultCallback =
    176       base::Callback<void(const std::string&, const std::vector<Event>&)>;
    177 
    178   // Metrics::NotifyConnectionDiagnosticsIssue depends on these kIssue strings.
    179   // Any changes to these strings should be synced with that Metrics function.
    180   static const char kIssueIPCollision[];
    181   static const char kIssueRouting[];
    182   static const char kIssueHTTPBrokenPortal[];
    183   static const char kIssueDNSServerMisconfig[];
    184   static const char kIssueDNSServerNoResponse[];
    185   static const char kIssueNoDNSServersConfigured[];
    186   static const char kIssueDNSServersInvalid[];
    187   static const char kIssueNone[];
    188   static const char kIssueCaptivePortal[];
    189   static const char kIssueGatewayUpstream[];
    190   static const char kIssueGatewayNotResponding[];
    191   static const char kIssueServerNotResponding[];
    192   static const char kIssueGatewayArpFailed[];
    193   static const char kIssueServerArpFailed[];
    194   static const char kIssueInternalError[];
    195   static const char kIssueGatewayNoNeighborEntry[];
    196   static const char kIssueServerNoNeighborEntry[];
    197   static const char kIssueGatewayNeighborEntryNotConnected[];
    198   static const char kIssueServerNeighborEntryNotConnected[];
    199 
    200   ConnectionDiagnostics(ConnectionRefPtr connection,
    201                         EventDispatcher* dispatcher,
    202                         Metrics* metrics,
    203                         const DeviceInfo* device_info,
    204                         const ResultCallback& result_callback);
    205   ~ConnectionDiagnostics();
    206 
    207   // Starts diagnosing problems that |connection_| encounters reaching
    208   // |url_string|.
    209   bool Start(const std::string& url_string);
    210 
    211   // Skips the portal detection initiated in ConnectionDiagnostics::Start and
    212   // performs further diagnostics based on the |result| from a completed portal
    213   // detection attempt.
    214   bool StartAfterPortalDetection(const std::string& url_string,
    215                                  const PortalDetector::Result& result);
    216 
    217   void Stop();
    218 
    219   // Returns a string representation of |event|.
    220   static std::string EventToString(const Event& event);
    221 
    222   bool running() { return running_; }
    223 
    224  private:
    225   friend class ConnectionDiagnosticsTest;
    226 
    227   static const int kMaxDNSRetries;
    228   static const int kRouteQueryTimeoutSeconds;
    229   static const int kArpReplyTimeoutSeconds;
    230   static const int kNeighborTableRequestTimeoutSeconds;
    231   static const int kDNSTimeoutSeconds;
    232 
    233   // Create a new Event with |type|, |phase|, |result|, and an empty message,
    234   // and add it to the end of |diagnostic_events_|.
    235   void AddEvent(Type type, Phase phase, Result result);
    236 
    237   // Same as ConnectionDiagnostics::AddEvent, except that the added event
    238   // contains the string |message|.
    239   void AddEventWithMessage(Type type, Phase phase, Result result,
    240                            const std::string& message);
    241 
    242   // Calls |result_callback_|, then stops connection diagnostics.
    243   // |diagnostic_events_| and |issue| are passed as arguments to
    244   // |result_callback_| to report the results of the diagnostics.
    245   void ReportResultAndStop(const std::string &issue);
    246 
    247   void StartAfterPortalDetectionInternal(const PortalDetector::Result& result);
    248 
    249   // Attempts to resolve the IP address of |target_url_| using |dns_servers|.
    250   void ResolveTargetServerIPAddress(
    251       const std::vector<std::string>& dns_servers);
    252 
    253   // Pings all the DNS servers of |connection_|.
    254   void PingDNSServers();
    255 
    256   // Finds a route to the host at |address| by querying the kernel's routing
    257   // table.
    258   void FindRouteToHost(const IPAddress& address);
    259 
    260   // Finds an ARP table entry for |address| by querying the kernel's ARP table.
    261   void FindArpTableEntry(const IPAddress& address);
    262 
    263   // Finds a neighbor table entry for |address| by requesting an RTNL neighbor
    264   // table dump, and looking for a matching neighbor table entry for |address|
    265   // in ConnectionDiagnostics::OnNeighborMsgReceived.
    266   void FindNeighborTableEntry(const IPAddress& address);
    267 
    268   // Checks for an IP collision by sending out an ARP request for the local IP
    269   // address assigned to |connection_|.
    270   void CheckIpCollision();
    271 
    272   // Starts an IcmpSession with |address|. Called when we want to ping the
    273   // target web server or local gateway.
    274   void PingHost(const IPAddress& address);
    275 
    276   // Called after each IcmpSession started in
    277   // ConnectionDiagnostics::PingDNSServers finishes or times out. The DNS server
    278   // that was pinged can be uniquely identified with |dns_server_index|.
    279   // Attempts to resolve the IP address of |target_url_| again if at least one
    280   // DNS server was pinged successfully, and if |num_dns_attempts_| has not yet
    281   // reached |kMaxDNSRetries|.
    282   void OnPingDNSServerComplete(int dns_server_index,
    283                                const std::vector<base::TimeDelta>& result);
    284 
    285   // Called after the DNS IP address resolution on started in
    286   // ConnectionDiagnostics::ResolveTargetServerIPAddress completes.
    287   void OnDNSResolutionComplete(const Error& error, const IPAddress& address);
    288 
    289   // Called after the IcmpSession started in ConnectionDiagnostics::PingHost on
    290   // |address_pinged| finishes or times out. |ping_event_type| indicates the
    291   // type of ping that was started (gateway or target web server), and |result|
    292   // is the result of the IcmpSession.
    293   void OnPingHostComplete(Type ping_event_type, const IPAddress& address_pinged,
    294                           const std::vector<base::TimeDelta>& result);
    295 
    296   // This I/O callback is triggered whenever the ARP reception socket has data
    297   // available to be received.
    298   void OnArpReplyReceived(int fd);
    299 
    300   // Called if no replies to the ARP request sent in
    301   // ConnectionDiagnostics::CheckIpCollision are received within
    302   // |kArpReplyTimeoutSeconds| seconds.
    303   void OnArpRequestTimeout();
    304 
    305   // Called when replies are received to the neighbor table dump request issued
    306   // in ConnectionDiagnostics::FindNeighborTableEntry.
    307   void OnNeighborMsgReceived(const IPAddress& address_queried,
    308                              const RTNLMessage& msg);
    309 
    310   // Called if no neighbor table entry for |address_queried| is received within
    311   // |kNeighborTableRequestTimeoutSeconds| of issuing a dump request in
    312   // ConnectionDiagnostics::FindNeighborTableEntry.
    313   void OnNeighborTableRequestTimeout(const IPAddress& address_queried);
    314 
    315   // Called upon receiving a reply to the routing table query issued in
    316   // ConnectionDiagnostics::FindRoute.
    317   void OnRouteQueryResponse(int interface_index,
    318                             const RoutingTableEntry& entry);
    319 
    320   // Called if no replies to the routing table query issued in
    321   // ConnectionDiagnostics::FindRoute are received within
    322   // |kRouteQueryTimeoutSeconds|.
    323   void OnRouteQueryTimeout();
    324 
    325   // Utility function that returns true iff the event in |diagnostic_events_|
    326   // that is |num_events_ago| before the last event has a matching |type|,
    327   // |phase|, and |result|.
    328   bool DoesPreviousEventMatch(Type type, Phase phase, Result result,
    329                               size_t num_events_ago);
    330 
    331   base::WeakPtrFactory<ConnectionDiagnostics> weak_ptr_factory_;
    332   EventDispatcher* dispatcher_;
    333   Metrics* metrics_;
    334   RoutingTable* routing_table_;
    335   RTNLHandler* rtnl_handler_;
    336 
    337   // The connection being diagnosed.
    338   ConnectionRefPtr connection_;
    339 
    340   // Used to get the MAC address of the device associated with |connection_|.
    341   const DeviceInfo* device_info_;
    342 
    343   // The MAC address of device associated with |connection_|.
    344   ByteString local_mac_address_;
    345 
    346   DNSClientFactory* dns_client_factory_;
    347   std::unique_ptr<DNSClient> dns_client_;
    348   std::unique_ptr<PortalDetector> portal_detector_;
    349   std::unique_ptr<ArpClient> arp_client_;
    350   std::unique_ptr<IcmpSession> icmp_session_;
    351 
    352   // The URL being diagnosed. Stored in unique_ptr so that it can be cleared
    353   // when we stop diagnostics.
    354   std::unique_ptr<HTTPURL> target_url_;
    355 
    356   // Used to ping multiple DNS servers in |connection_| in parallel.
    357   IcmpSessionFactory* icmp_session_factory_;
    358   std::map<int, std::unique_ptr<IcmpSession>>
    359       id_to_pending_dns_server_icmp_session_;
    360   std::vector<std::string> pingable_dns_servers_;
    361 
    362   int num_dns_attempts_;
    363   bool running_;
    364 
    365   ResultCallback result_callback_;
    366   base::CancelableCallback<void(int, const RoutingTableEntry&)>
    367       route_query_callback_;
    368   base::CancelableClosure route_query_timeout_callback_;
    369   base::CancelableClosure arp_reply_timeout_callback_;
    370   base::CancelableClosure neighbor_request_timeout_callback_;
    371 
    372   // IOCallback that fires when the socket associated with |arp_client_| has a
    373   // packet to be received.  Calls ConnectionDiagnostics::OnArpReplyReceived.
    374   std::unique_ptr<IOHandler> receive_response_handler_;
    375 
    376   std::unique_ptr<RTNLListener> neighbor_msg_listener_;
    377 
    378   // Record of all diagnostic events that occurred, sorted in order of
    379   // occurrence.
    380   std::vector<Event> diagnostic_events_;
    381 
    382   DISALLOW_COPY_AND_ASSIGN(ConnectionDiagnostics);
    383 };
    384 
    385 }  // namespace shill
    386 
    387 #endif  // SHILL_CONNECTION_DIAGNOSTICS_H_
    388