Home | History | Annotate | Download | only in socket
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/socket/tcp_client_socket_win.h"
      6 
      7 #include <mstcpip.h>
      8 
      9 #include "base/basictypes.h"
     10 #include "base/compiler_specific.h"
     11 #include "base/memory/memory_debug.h"
     12 #include "base/metrics/stats_counters.h"
     13 #include "base/string_util.h"
     14 #include "base/sys_info.h"
     15 #include "base/win/object_watcher.h"
     16 #include "net/base/address_list_net_log_param.h"
     17 #include "net/base/connection_type_histograms.h"
     18 #include "net/base/io_buffer.h"
     19 #include "net/base/ip_endpoint.h"
     20 #include "net/base/net_errors.h"
     21 #include "net/base/net_log.h"
     22 #include "net/base/net_util.h"
     23 #include "net/base/network_change_notifier.h"
     24 #include "net/base/sys_addrinfo.h"
     25 #include "net/base/winsock_init.h"
     26 #include "net/base/winsock_util.h"
     27 
     28 namespace net {
     29 
     30 namespace {
     31 
     32 int MapConnectError(int os_error) {
     33   switch (os_error) {
     34     // connect fails with WSAEACCES when Windows Firewall blocks the
     35     // connection.
     36     case WSAEACCES:
     37       return ERR_NETWORK_ACCESS_DENIED;
     38     case WSAETIMEDOUT:
     39       return ERR_CONNECTION_TIMED_OUT;
     40     default: {
     41       int net_error = MapSystemError(os_error);
     42       if (net_error == ERR_FAILED)
     43         return ERR_CONNECTION_FAILED;  // More specific than ERR_FAILED.
     44 
     45       // Give a more specific error when the user is offline.
     46       if (net_error == ERR_ADDRESS_UNREACHABLE &&
     47           NetworkChangeNotifier::IsOffline()) {
     48         return ERR_INTERNET_DISCONNECTED;
     49       }
     50 
     51       return net_error;
     52     }
     53   }
     54 }
     55 
     56 }  // namespace
     57 
     58 //-----------------------------------------------------------------------------
     59 
     60 // This class encapsulates all the state that has to be preserved as long as
     61 // there is a network IO operation in progress. If the owner TCPClientSocketWin
     62 // is destroyed while an operation is in progress, the Core is detached and it
     63 // lives until the operation completes and the OS doesn't reference any resource
     64 // declared on this class anymore.
     65 class TCPClientSocketWin::Core : public base::RefCounted<Core> {
     66  public:
     67   explicit Core(TCPClientSocketWin* socket);
     68 
     69   // Start watching for the end of a read or write operation.
     70   void WatchForRead();
     71   void WatchForWrite();
     72 
     73   // The TCPClientSocketWin is going away.
     74   void Detach() { socket_ = NULL; }
     75 
     76   // The separate OVERLAPPED variables for asynchronous operation.
     77   // |read_overlapped_| is used for both Connect() and Read().
     78   // |write_overlapped_| is only used for Write();
     79   OVERLAPPED read_overlapped_;
     80   OVERLAPPED write_overlapped_;
     81 
     82   // The buffers used in Read() and Write().
     83   WSABUF read_buffer_;
     84   WSABUF write_buffer_;
     85   scoped_refptr<IOBuffer> read_iobuffer_;
     86   scoped_refptr<IOBuffer> write_iobuffer_;
     87   int write_buffer_length_;
     88 
     89   // Throttle the read size based on our current slow start state.
     90   // Returns the throttled read size.
     91   int ThrottleReadSize(int size) {
     92     if (slow_start_throttle_ < kMaxSlowStartThrottle) {
     93       size = std::min(size, slow_start_throttle_);
     94       slow_start_throttle_ *= 2;
     95     }
     96     return size;
     97   }
     98 
     99  private:
    100   friend class base::RefCounted<Core>;
    101 
    102   class ReadDelegate : public base::win::ObjectWatcher::Delegate {
    103    public:
    104     explicit ReadDelegate(Core* core) : core_(core) {}
    105     virtual ~ReadDelegate() {}
    106 
    107     // base::ObjectWatcher::Delegate methods:
    108     virtual void OnObjectSignaled(HANDLE object);
    109 
    110    private:
    111     Core* const core_;
    112   };
    113 
    114   class WriteDelegate : public base::win::ObjectWatcher::Delegate {
    115    public:
    116     explicit WriteDelegate(Core* core) : core_(core) {}
    117     virtual ~WriteDelegate() {}
    118 
    119     // base::ObjectWatcher::Delegate methods:
    120     virtual void OnObjectSignaled(HANDLE object);
    121 
    122    private:
    123     Core* const core_;
    124   };
    125 
    126   ~Core();
    127 
    128   // The socket that created this object.
    129   TCPClientSocketWin* socket_;
    130 
    131   // |reader_| handles the signals from |read_watcher_|.
    132   ReadDelegate reader_;
    133   // |writer_| handles the signals from |write_watcher_|.
    134   WriteDelegate writer_;
    135 
    136   // |read_watcher_| watches for events from Connect() and Read().
    137   base::win::ObjectWatcher read_watcher_;
    138   // |write_watcher_| watches for events from Write();
    139   base::win::ObjectWatcher write_watcher_;
    140 
    141   // When doing reads from the socket, we try to mirror TCP's slow start.
    142   // We do this because otherwise the async IO subsystem artifically delays
    143   // returning data to the application.
    144   static const int kInitialSlowStartThrottle = 1 * 1024;
    145   static const int kMaxSlowStartThrottle = 32 * kInitialSlowStartThrottle;
    146   int slow_start_throttle_;
    147 
    148   DISALLOW_COPY_AND_ASSIGN(Core);
    149 };
    150 
    151 TCPClientSocketWin::Core::Core(
    152     TCPClientSocketWin* socket)
    153     : write_buffer_length_(0),
    154       socket_(socket),
    155       ALLOW_THIS_IN_INITIALIZER_LIST(reader_(this)),
    156       ALLOW_THIS_IN_INITIALIZER_LIST(writer_(this)),
    157       slow_start_throttle_(kInitialSlowStartThrottle) {
    158   memset(&read_overlapped_, 0, sizeof(read_overlapped_));
    159   memset(&write_overlapped_, 0, sizeof(write_overlapped_));
    160 }
    161 
    162 TCPClientSocketWin::Core::~Core() {
    163   // Make sure the message loop is not watching this object anymore.
    164   read_watcher_.StopWatching();
    165   write_watcher_.StopWatching();
    166 
    167   WSACloseEvent(read_overlapped_.hEvent);
    168   memset(&read_overlapped_, 0xaf, sizeof(read_overlapped_));
    169   WSACloseEvent(write_overlapped_.hEvent);
    170   memset(&write_overlapped_, 0xaf, sizeof(write_overlapped_));
    171 }
    172 
    173 void TCPClientSocketWin::Core::WatchForRead() {
    174   // We grab an extra reference because there is an IO operation in progress.
    175   // Balanced in ReadDelegate::OnObjectSignaled().
    176   AddRef();
    177   read_watcher_.StartWatching(read_overlapped_.hEvent, &reader_);
    178 }
    179 
    180 void TCPClientSocketWin::Core::WatchForWrite() {
    181   // We grab an extra reference because there is an IO operation in progress.
    182   // Balanced in WriteDelegate::OnObjectSignaled().
    183   AddRef();
    184   write_watcher_.StartWatching(write_overlapped_.hEvent, &writer_);
    185 }
    186 
    187 void TCPClientSocketWin::Core::ReadDelegate::OnObjectSignaled(
    188     HANDLE object) {
    189   DCHECK_EQ(object, core_->read_overlapped_.hEvent);
    190   if (core_->socket_) {
    191     if (core_->socket_->waiting_connect()) {
    192       core_->socket_->DidCompleteConnect();
    193     } else {
    194       core_->socket_->DidCompleteRead();
    195     }
    196   }
    197 
    198   core_->Release();
    199 }
    200 
    201 void TCPClientSocketWin::Core::WriteDelegate::OnObjectSignaled(
    202     HANDLE object) {
    203   DCHECK_EQ(object, core_->write_overlapped_.hEvent);
    204   if (core_->socket_)
    205     core_->socket_->DidCompleteWrite();
    206 
    207   core_->Release();
    208 }
    209 
    210 //-----------------------------------------------------------------------------
    211 
    212 TCPClientSocketWin::TCPClientSocketWin(const AddressList& addresses,
    213                                        net::NetLog* net_log,
    214                                        const net::NetLog::Source& source)
    215     : socket_(INVALID_SOCKET),
    216       addresses_(addresses),
    217       current_ai_(NULL),
    218       waiting_read_(false),
    219       waiting_write_(false),
    220       read_callback_(NULL),
    221       write_callback_(NULL),
    222       next_connect_state_(CONNECT_STATE_NONE),
    223       connect_os_error_(0),
    224       net_log_(BoundNetLog::Make(net_log, NetLog::SOURCE_SOCKET)),
    225       previously_disconnected_(false) {
    226   scoped_refptr<NetLog::EventParameters> params;
    227   if (source.is_valid())
    228     params = new NetLogSourceParameter("source_dependency", source);
    229   net_log_.BeginEvent(NetLog::TYPE_SOCKET_ALIVE, params);
    230   EnsureWinsockInit();
    231 }
    232 
    233 TCPClientSocketWin::~TCPClientSocketWin() {
    234   Disconnect();
    235   net_log_.EndEvent(NetLog::TYPE_SOCKET_ALIVE, NULL);
    236 }
    237 
    238 void TCPClientSocketWin::AdoptSocket(SOCKET socket) {
    239   DCHECK_EQ(socket_, INVALID_SOCKET);
    240   socket_ = socket;
    241   int error = SetupSocket();
    242   DCHECK_EQ(0, error);
    243   core_ = new Core(this);
    244   current_ai_ = addresses_.head();
    245   use_history_.set_was_ever_connected();
    246 }
    247 
    248 #ifdef ANDROID
    249 // TODO(kristianm): handle the case when wait_for_connect is true
    250 // (sync requests)
    251 #endif
    252 int TCPClientSocketWin::Connect(CompletionCallback* callback
    253 #ifdef ANDROID
    254                                 , bool wait_for_connect
    255 #endif
    256                                ) {
    257   DCHECK(CalledOnValidThread());
    258 
    259   // If already connected, then just return OK.
    260   if (socket_ != INVALID_SOCKET)
    261     return OK;
    262 
    263   base::StatsCounter connects("tcp.connect");
    264   connects.Increment();
    265 
    266   net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT,
    267                       new AddressListNetLogParam(addresses_));
    268 
    269   // We will try to connect to each address in addresses_. Start with the
    270   // first one in the list.
    271   next_connect_state_ = CONNECT_STATE_CONNECT;
    272   current_ai_ = addresses_.head();
    273 
    274   int rv = DoConnectLoop(OK);
    275   if (rv == ERR_IO_PENDING) {
    276     // Synchronous operation not supported.
    277     DCHECK(callback);
    278     read_callback_ = callback;
    279   } else {
    280     LogConnectCompletion(rv);
    281   }
    282 
    283   return rv;
    284 }
    285 
    286 int TCPClientSocketWin::DoConnectLoop(int result) {
    287   DCHECK_NE(next_connect_state_, CONNECT_STATE_NONE);
    288 
    289   int rv = result;
    290   do {
    291     ConnectState state = next_connect_state_;
    292     next_connect_state_ = CONNECT_STATE_NONE;
    293     switch (state) {
    294       case CONNECT_STATE_CONNECT:
    295         DCHECK_EQ(OK, rv);
    296         rv = DoConnect();
    297         break;
    298       case CONNECT_STATE_CONNECT_COMPLETE:
    299         rv = DoConnectComplete(rv);
    300         break;
    301       default:
    302         LOG(DFATAL) << "bad state " << state;
    303         rv = ERR_UNEXPECTED;
    304         break;
    305     }
    306   } while (rv != ERR_IO_PENDING && next_connect_state_ != CONNECT_STATE_NONE);
    307 
    308   return rv;
    309 }
    310 
    311 int TCPClientSocketWin::DoConnect() {
    312   const struct addrinfo* ai = current_ai_;
    313   DCHECK(ai);
    314   DCHECK_EQ(0, connect_os_error_);
    315 
    316   if (previously_disconnected_) {
    317     use_history_.Reset();
    318     previously_disconnected_ = false;
    319   }
    320 
    321   net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT,
    322                       new NetLogStringParameter(
    323                           "address", NetAddressToStringWithPort(current_ai_)));
    324 
    325   next_connect_state_ = CONNECT_STATE_CONNECT_COMPLETE;
    326 
    327   connect_os_error_ = CreateSocket(ai);
    328   if (connect_os_error_ != 0)
    329     return MapSystemError(connect_os_error_);
    330 
    331   DCHECK(!core_);
    332   core_ = new Core(this);
    333 
    334   // WSACreateEvent creates a manual-reset event object.
    335   core_->read_overlapped_.hEvent = WSACreateEvent();
    336   // WSAEventSelect sets the socket to non-blocking mode as a side effect.
    337   // Our connect() and recv() calls require that the socket be non-blocking.
    338   WSAEventSelect(socket_, core_->read_overlapped_.hEvent, FD_CONNECT);
    339 
    340   core_->write_overlapped_.hEvent = WSACreateEvent();
    341 
    342   if (!connect(socket_, ai->ai_addr, static_cast<int>(ai->ai_addrlen))) {
    343     // Connected without waiting!
    344     //
    345     // The MSDN page for connect says:
    346     //   With a nonblocking socket, the connection attempt cannot be completed
    347     //   immediately. In this case, connect will return SOCKET_ERROR, and
    348     //   WSAGetLastError will return WSAEWOULDBLOCK.
    349     // which implies that for a nonblocking socket, connect never returns 0.
    350     // It's not documented whether the event object will be signaled or not
    351     // if connect does return 0.  So the code below is essentially dead code
    352     // and we don't know if it's correct.
    353     NOTREACHED();
    354 
    355     if (ResetEventIfSignaled(core_->read_overlapped_.hEvent))
    356       return OK;
    357   } else {
    358     int os_error = WSAGetLastError();
    359     if (os_error != WSAEWOULDBLOCK) {
    360       LOG(ERROR) << "connect failed: " << os_error;
    361       connect_os_error_ = os_error;
    362       return MapConnectError(os_error);
    363     }
    364   }
    365 
    366   core_->WatchForRead();
    367   return ERR_IO_PENDING;
    368 }
    369 
    370 int TCPClientSocketWin::DoConnectComplete(int result) {
    371   // Log the end of this attempt (and any OS error it threw).
    372   int os_error = connect_os_error_;
    373   connect_os_error_ = 0;
    374   scoped_refptr<NetLog::EventParameters> params;
    375   if (result != OK)
    376     params = new NetLogIntegerParameter("os_error", os_error);
    377   net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT, params);
    378 
    379   if (result == OK) {
    380     use_history_.set_was_ever_connected();
    381     return OK;  // Done!
    382   }
    383 
    384   // Close whatever partially connected socket we currently have.
    385   DoDisconnect();
    386 
    387   // Try to fall back to the next address in the list.
    388   if (current_ai_->ai_next) {
    389     next_connect_state_ = CONNECT_STATE_CONNECT;
    390     current_ai_ = current_ai_->ai_next;
    391     return OK;
    392   }
    393 
    394   // Otherwise there is nothing to fall back to, so give up.
    395   return result;
    396 }
    397 
    398 void TCPClientSocketWin::Disconnect() {
    399   DoDisconnect();
    400   current_ai_ = NULL;
    401 }
    402 
    403 void TCPClientSocketWin::DoDisconnect() {
    404   DCHECK(CalledOnValidThread());
    405 
    406   if (socket_ == INVALID_SOCKET)
    407     return;
    408 
    409   // Note: don't use CancelIo to cancel pending IO because it doesn't work
    410   // when there is a Winsock layered service provider.
    411 
    412   // In most socket implementations, closing a socket results in a graceful
    413   // connection shutdown, but in Winsock we have to call shutdown explicitly.
    414   // See the MSDN page "Graceful Shutdown, Linger Options, and Socket Closure"
    415   // at http://msdn.microsoft.com/en-us/library/ms738547.aspx
    416   shutdown(socket_, SD_SEND);
    417 
    418   // This cancels any pending IO.
    419   closesocket(socket_);
    420   socket_ = INVALID_SOCKET;
    421 
    422   if (waiting_connect()) {
    423     // We closed the socket, so this notification will never come.
    424     // From MSDN' WSAEventSelect documentation:
    425     // "Closing a socket with closesocket also cancels the association and
    426     // selection of network events specified in WSAEventSelect for the socket".
    427     core_->Release();
    428   }
    429 
    430   waiting_read_ = false;
    431   waiting_write_ = false;
    432 
    433   core_->Detach();
    434   core_ = NULL;
    435 
    436   previously_disconnected_ = true;
    437 }
    438 
    439 bool TCPClientSocketWin::IsConnected() const {
    440   DCHECK(CalledOnValidThread());
    441 
    442   if (socket_ == INVALID_SOCKET || waiting_connect())
    443     return false;
    444 
    445   // Check if connection is alive.
    446   char c;
    447   int rv = recv(socket_, &c, 1, MSG_PEEK);
    448   if (rv == 0)
    449     return false;
    450   if (rv == SOCKET_ERROR && WSAGetLastError() != WSAEWOULDBLOCK)
    451     return false;
    452 
    453   return true;
    454 }
    455 
    456 bool TCPClientSocketWin::IsConnectedAndIdle() const {
    457   DCHECK(CalledOnValidThread());
    458 
    459   if (socket_ == INVALID_SOCKET || waiting_connect())
    460     return false;
    461 
    462   // Check if connection is alive and we haven't received any data
    463   // unexpectedly.
    464   char c;
    465   int rv = recv(socket_, &c, 1, MSG_PEEK);
    466   if (rv >= 0)
    467     return false;
    468   if (WSAGetLastError() != WSAEWOULDBLOCK)
    469     return false;
    470 
    471   return true;
    472 }
    473 
    474 int TCPClientSocketWin::GetPeerAddress(AddressList* address) const {
    475   DCHECK(CalledOnValidThread());
    476   DCHECK(address);
    477   if (!IsConnected())
    478     return ERR_SOCKET_NOT_CONNECTED;
    479   address->Copy(current_ai_, false);
    480   return OK;
    481 }
    482 
    483 int TCPClientSocketWin::GetLocalAddress(IPEndPoint* address) const {
    484   DCHECK(CalledOnValidThread());
    485   DCHECK(address);
    486   if (!IsConnected())
    487     return ERR_SOCKET_NOT_CONNECTED;
    488 
    489   struct sockaddr_storage addr_storage;
    490   socklen_t addr_len = sizeof(addr_storage);
    491   struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
    492   if (getsockname(socket_, addr, &addr_len))
    493     return MapSystemError(WSAGetLastError());
    494   if (!address->FromSockAddr(addr, addr_len))
    495     return ERR_FAILED;
    496   return OK;
    497 }
    498 
    499 void TCPClientSocketWin::SetSubresourceSpeculation() {
    500   use_history_.set_subresource_speculation();
    501 }
    502 
    503 void TCPClientSocketWin::SetOmniboxSpeculation() {
    504   use_history_.set_omnibox_speculation();
    505 }
    506 
    507 bool TCPClientSocketWin::WasEverUsed() const {
    508   return use_history_.was_used_to_convey_data();
    509 }
    510 
    511 bool TCPClientSocketWin::UsingTCPFastOpen() const {
    512   // Not supported on windows.
    513   return false;
    514 }
    515 
    516 int TCPClientSocketWin::Read(IOBuffer* buf,
    517                              int buf_len,
    518                              CompletionCallback* callback) {
    519   DCHECK(CalledOnValidThread());
    520   DCHECK_NE(socket_, INVALID_SOCKET);
    521   DCHECK(!waiting_read_);
    522   DCHECK(!read_callback_);
    523   DCHECK(!core_->read_iobuffer_);
    524 
    525   buf_len = core_->ThrottleReadSize(buf_len);
    526 
    527   core_->read_buffer_.len = buf_len;
    528   core_->read_buffer_.buf = buf->data();
    529 
    530   // TODO(wtc): Remove the assertion after enough testing.
    531   AssertEventNotSignaled(core_->read_overlapped_.hEvent);
    532   DWORD num, flags = 0;
    533   int rv = WSARecv(socket_, &core_->read_buffer_, 1, &num, &flags,
    534                    &core_->read_overlapped_, NULL);
    535   if (rv == 0) {
    536     if (ResetEventIfSignaled(core_->read_overlapped_.hEvent)) {
    537       // Because of how WSARecv fills memory when used asynchronously, Purify
    538       // isn't able to detect that it's been initialized, so it scans for 0xcd
    539       // in the buffer and reports UMRs (uninitialized memory reads) for those
    540       // individual bytes. We override that in PURIFY builds to avoid the
    541       // false error reports.
    542       // See bug 5297.
    543       base::MemoryDebug::MarkAsInitialized(core_->read_buffer_.buf, num);
    544       base::StatsCounter read_bytes("tcp.read_bytes");
    545       read_bytes.Add(num);
    546       if (num > 0)
    547         use_history_.set_was_used_to_convey_data();
    548       LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_RECEIVED, num,
    549                       core_->read_buffer_.buf);
    550       return static_cast<int>(num);
    551     }
    552   } else {
    553     int os_error = WSAGetLastError();
    554     if (os_error != WSA_IO_PENDING)
    555       return MapSystemError(os_error);
    556   }
    557   core_->WatchForRead();
    558   waiting_read_ = true;
    559   read_callback_ = callback;
    560   core_->read_iobuffer_ = buf;
    561   return ERR_IO_PENDING;
    562 }
    563 
    564 int TCPClientSocketWin::Write(IOBuffer* buf,
    565                               int buf_len,
    566                               CompletionCallback* callback) {
    567   DCHECK(CalledOnValidThread());
    568   DCHECK_NE(socket_, INVALID_SOCKET);
    569   DCHECK(!waiting_write_);
    570   DCHECK(!write_callback_);
    571   DCHECK_GT(buf_len, 0);
    572   DCHECK(!core_->write_iobuffer_);
    573 
    574   base::StatsCounter writes("tcp.writes");
    575   writes.Increment();
    576 
    577   core_->write_buffer_.len = buf_len;
    578   core_->write_buffer_.buf = buf->data();
    579   core_->write_buffer_length_ = buf_len;
    580 
    581   // TODO(wtc): Remove the assertion after enough testing.
    582   AssertEventNotSignaled(core_->write_overlapped_.hEvent);
    583   DWORD num;
    584   int rv = WSASend(socket_, &core_->write_buffer_, 1, &num, 0,
    585                    &core_->write_overlapped_, NULL);
    586   if (rv == 0) {
    587     if (ResetEventIfSignaled(core_->write_overlapped_.hEvent)) {
    588       rv = static_cast<int>(num);
    589       if (rv > buf_len || rv < 0) {
    590         // It seems that some winsock interceptors report that more was written
    591         // than was available. Treat this as an error.  http://crbug.com/27870
    592         LOG(ERROR) << "Detected broken LSP: Asked to write " << buf_len
    593                    << " bytes, but " << rv << " bytes reported.";
    594         return ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
    595       }
    596       base::StatsCounter write_bytes("tcp.write_bytes");
    597       write_bytes.Add(rv);
    598       if (rv > 0)
    599         use_history_.set_was_used_to_convey_data();
    600       LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_SENT, rv,
    601                       core_->write_buffer_.buf);
    602       return rv;
    603     }
    604   } else {
    605     int os_error = WSAGetLastError();
    606     if (os_error != WSA_IO_PENDING)
    607       return MapSystemError(os_error);
    608   }
    609   core_->WatchForWrite();
    610   waiting_write_ = true;
    611   write_callback_ = callback;
    612   core_->write_iobuffer_ = buf;
    613   return ERR_IO_PENDING;
    614 }
    615 
    616 bool TCPClientSocketWin::SetReceiveBufferSize(int32 size) {
    617   DCHECK(CalledOnValidThread());
    618   int rv = setsockopt(socket_, SOL_SOCKET, SO_RCVBUF,
    619                       reinterpret_cast<const char*>(&size), sizeof(size));
    620   DCHECK(!rv) << "Could not set socket receive buffer size: " << GetLastError();
    621   return rv == 0;
    622 }
    623 
    624 bool TCPClientSocketWin::SetSendBufferSize(int32 size) {
    625   DCHECK(CalledOnValidThread());
    626   int rv = setsockopt(socket_, SOL_SOCKET, SO_SNDBUF,
    627                       reinterpret_cast<const char*>(&size), sizeof(size));
    628   DCHECK(!rv) << "Could not set socket send buffer size: " << GetLastError();
    629   return rv == 0;
    630 }
    631 
    632 int TCPClientSocketWin::CreateSocket(const struct addrinfo* ai) {
    633   socket_ = WSASocket(ai->ai_family, ai->ai_socktype, ai->ai_protocol, NULL, 0,
    634                       WSA_FLAG_OVERLAPPED);
    635   if (socket_ == INVALID_SOCKET) {
    636     int os_error = WSAGetLastError();
    637     LOG(ERROR) << "WSASocket failed: " << os_error;
    638     return os_error;
    639   }
    640   return SetupSocket();
    641 }
    642 
    643 int TCPClientSocketWin::SetupSocket() {
    644   // Increase the socket buffer sizes from the default sizes for WinXP.  In
    645   // performance testing, there is substantial benefit by increasing from 8KB
    646   // to 64KB.
    647   // See also:
    648   //    http://support.microsoft.com/kb/823764/EN-US
    649   // On Vista, if we manually set these sizes, Vista turns off its receive
    650   // window auto-tuning feature.
    651   //    http://blogs.msdn.com/wndp/archive/2006/05/05/Winhec-blog-tcpip-2.aspx
    652   // Since Vista's auto-tune is better than any static value we can could set,
    653   // only change these on pre-vista machines.
    654   int32 major_version, minor_version, fix_version;
    655   base::SysInfo::OperatingSystemVersionNumbers(&major_version, &minor_version,
    656     &fix_version);
    657   if (major_version < 6) {
    658     const int32 kSocketBufferSize = 64 * 1024;
    659     SetReceiveBufferSize(kSocketBufferSize);
    660     SetSendBufferSize(kSocketBufferSize);
    661   }
    662 
    663   // Disable Nagle.
    664   // The Nagle implementation on windows is governed by RFC 896.  The idea
    665   // behind Nagle is to reduce small packets on the network.  When Nagle is
    666   // enabled, if a partial packet has been sent, the TCP stack will disallow
    667   // further *partial* packets until an ACK has been received from the other
    668   // side.  Good applications should always strive to send as much data as
    669   // possible and avoid partial-packet sends.  However, in most real world
    670   // applications, there are edge cases where this does not happen, and two
    671   // partil packets may be sent back to back.  For a browser, it is NEVER
    672   // a benefit to delay for an RTT before the second packet is sent.
    673   //
    674   // As a practical example in Chromium today, consider the case of a small
    675   // POST.  I have verified this:
    676   //     Client writes 649 bytes of header  (partial packet #1)
    677   //     Client writes 50 bytes of POST data (partial packet #2)
    678   // In the above example, with Nagle, a RTT delay is inserted between these
    679   // two sends due to nagle.  RTTs can easily be 100ms or more.  The best
    680   // fix is to make sure that for POSTing data, we write as much data as
    681   // possible and minimize partial packets.  We will fix that.  But disabling
    682   // Nagle also ensure we don't run into this delay in other edge cases.
    683   // See also:
    684   //    http://technet.microsoft.com/en-us/library/bb726981.aspx
    685   const BOOL kDisableNagle = TRUE;
    686   int rv = setsockopt(socket_, IPPROTO_TCP, TCP_NODELAY,
    687                       reinterpret_cast<const char*>(&kDisableNagle),
    688                       sizeof(kDisableNagle));
    689   DCHECK(!rv) << "Could not disable nagle";
    690 
    691   // Enable TCP Keep-Alive to prevent NAT routers from timing out TCP
    692   // connections. See http://crbug.com/27400 for details.
    693 
    694   struct tcp_keepalive keepalive_vals = {
    695     1, // TCP keep-alive on.
    696     45000,  // Wait 45s until sending first TCP keep-alive packet.
    697     45000,  // Wait 45s between sending TCP keep-alive packets.
    698   };
    699   DWORD bytes_returned = 0xABAB;
    700   rv = WSAIoctl(socket_, SIO_KEEPALIVE_VALS, &keepalive_vals,
    701                 sizeof(keepalive_vals), NULL, 0,
    702                 &bytes_returned, NULL, NULL);
    703   DCHECK(!rv) << "Could not enable TCP Keep-Alive for socket: " << socket_
    704               << " [error: " << WSAGetLastError() << "].";
    705 
    706   // Disregard any failure in disabling nagle or enabling TCP Keep-Alive.
    707   return 0;
    708 }
    709 
    710 void TCPClientSocketWin::LogConnectCompletion(int net_error) {
    711   if (net_error == OK)
    712     UpdateConnectionTypeHistograms(CONNECTION_ANY);
    713 
    714   if (net_error != OK) {
    715     net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_CONNECT, net_error);
    716     return;
    717   }
    718 
    719   struct sockaddr_storage source_address;
    720   socklen_t addrlen = sizeof(source_address);
    721   int rv = getsockname(
    722       socket_, reinterpret_cast<struct sockaddr*>(&source_address), &addrlen);
    723   if (rv != 0) {
    724     LOG(ERROR) << "getsockname() [rv: " << rv
    725                << "] error: " << WSAGetLastError();
    726     NOTREACHED();
    727     net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_CONNECT, rv);
    728     return;
    729   }
    730 
    731   const std::string source_address_str =
    732       NetAddressToStringWithPort(
    733           reinterpret_cast<const struct sockaddr*>(&source_address),
    734           sizeof(source_address));
    735   net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT,
    736                     make_scoped_refptr(new NetLogStringParameter(
    737                         "source address",
    738                         source_address_str)));
    739 }
    740 
    741 void TCPClientSocketWin::DoReadCallback(int rv) {
    742   DCHECK_NE(rv, ERR_IO_PENDING);
    743   DCHECK(read_callback_);
    744 
    745   // since Run may result in Read being called, clear read_callback_ up front.
    746   CompletionCallback* c = read_callback_;
    747   read_callback_ = NULL;
    748   c->Run(rv);
    749 }
    750 
    751 void TCPClientSocketWin::DoWriteCallback(int rv) {
    752   DCHECK_NE(rv, ERR_IO_PENDING);
    753   DCHECK(write_callback_);
    754 
    755   // since Run may result in Write being called, clear write_callback_ up front.
    756   CompletionCallback* c = write_callback_;
    757   write_callback_ = NULL;
    758   c->Run(rv);
    759 }
    760 
    761 void TCPClientSocketWin::DidCompleteConnect() {
    762   DCHECK_EQ(next_connect_state_, CONNECT_STATE_CONNECT_COMPLETE);
    763   int result;
    764 
    765   WSANETWORKEVENTS events;
    766   int rv = WSAEnumNetworkEvents(socket_, core_->read_overlapped_.hEvent,
    767                                 &events);
    768   int os_error = 0;
    769   if (rv == SOCKET_ERROR) {
    770     NOTREACHED();
    771     os_error = WSAGetLastError();
    772     result = MapSystemError(os_error);
    773   } else if (events.lNetworkEvents & FD_CONNECT) {
    774     os_error = events.iErrorCode[FD_CONNECT_BIT];
    775     result = MapConnectError(os_error);
    776   } else {
    777     NOTREACHED();
    778     result = ERR_UNEXPECTED;
    779   }
    780 
    781   connect_os_error_ = os_error;
    782   rv = DoConnectLoop(result);
    783   if (rv != ERR_IO_PENDING) {
    784     LogConnectCompletion(rv);
    785     DoReadCallback(rv);
    786   }
    787 }
    788 
    789 void TCPClientSocketWin::DidCompleteRead() {
    790   DCHECK(waiting_read_);
    791   DWORD num_bytes, flags;
    792   BOOL ok = WSAGetOverlappedResult(socket_, &core_->read_overlapped_,
    793                                    &num_bytes, FALSE, &flags);
    794   WSAResetEvent(core_->read_overlapped_.hEvent);
    795   waiting_read_ = false;
    796   core_->read_iobuffer_ = NULL;
    797   if (ok) {
    798     base::StatsCounter read_bytes("tcp.read_bytes");
    799     read_bytes.Add(num_bytes);
    800     if (num_bytes > 0)
    801       use_history_.set_was_used_to_convey_data();
    802     LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_RECEIVED, num_bytes,
    803                     core_->read_buffer_.buf);
    804   }
    805   DoReadCallback(ok ? num_bytes : MapSystemError(WSAGetLastError()));
    806 }
    807 
    808 void TCPClientSocketWin::DidCompleteWrite() {
    809   DCHECK(waiting_write_);
    810 
    811   DWORD num_bytes, flags;
    812   BOOL ok = WSAGetOverlappedResult(socket_, &core_->write_overlapped_,
    813                                    &num_bytes, FALSE, &flags);
    814   WSAResetEvent(core_->write_overlapped_.hEvent);
    815   waiting_write_ = false;
    816   int rv;
    817   if (!ok) {
    818     rv = MapSystemError(WSAGetLastError());
    819   } else {
    820     rv = static_cast<int>(num_bytes);
    821     if (rv > core_->write_buffer_length_ || rv < 0) {
    822       // It seems that some winsock interceptors report that more was written
    823       // than was available. Treat this as an error.  http://crbug.com/27870
    824       LOG(ERROR) << "Detected broken LSP: Asked to write "
    825                  << core_->write_buffer_length_ << " bytes, but " << rv
    826                  << " bytes reported.";
    827       rv = ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
    828     } else {
    829       base::StatsCounter write_bytes("tcp.write_bytes");
    830       write_bytes.Add(num_bytes);
    831       if (num_bytes > 0)
    832         use_history_.set_was_used_to_convey_data();
    833       LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_SENT, num_bytes,
    834                       core_->write_buffer_.buf);
    835     }
    836   }
    837   core_->write_iobuffer_ = NULL;
    838   DoWriteCallback(rv);
    839 }
    840 
    841 }  // namespace net
    842