Home | History | Annotate | Download | only in c-ares
      1 
      2 /* Copyright 1998 by the Massachusetts Institute of Technology.
      3  * Copyright (C) 2004-2010 by Daniel Stenberg
      4  *
      5  * Permission to use, copy, modify, and distribute this
      6  * software and its documentation for any purpose and without
      7  * fee is hereby granted, provided that the above copyright
      8  * notice appear in all copies and that both that copyright
      9  * notice and this permission notice appear in supporting
     10  * documentation, and that the name of M.I.T. not be used in
     11  * advertising or publicity pertaining to distribution of the
     12  * software without specific, written prior permission.
     13  * M.I.T. makes no representations about the suitability of
     14  * this software for any purpose.  It is provided "as is"
     15  * without express or implied warranty.
     16  */
     17 
     18 #include "ares_setup.h"
     19 
     20 #ifdef HAVE_SYS_SOCKET_H
     21 #  include <sys/socket.h>
     22 #endif
     23 #ifdef HAVE_SYS_UIO_H
     24 #  include <sys/uio.h>
     25 #endif
     26 #ifdef HAVE_NETINET_IN_H
     27 #  include <netinet/in.h>
     28 #endif
     29 #ifdef HAVE_NETINET_TCP_H
     30 #  include <netinet/tcp.h>
     31 #endif
     32 #ifdef HAVE_NETDB_H
     33 #  include <netdb.h>
     34 #endif
     35 #ifdef HAVE_ARPA_NAMESER_H
     36 #  include <arpa/nameser.h>
     37 #else
     38 #  include "nameser.h"
     39 #endif
     40 #ifdef HAVE_ARPA_NAMESER_COMPAT_H
     41 #  include <arpa/nameser_compat.h>
     42 #endif
     43 
     44 #ifdef HAVE_SYS_TIME_H
     45 #  include <sys/time.h>
     46 #endif
     47 
     48 #ifdef HAVE_STRINGS_H
     49 #  include <strings.h>
     50 #endif
     51 #ifdef HAVE_UNISTD_H
     52 #  include <unistd.h>
     53 #endif
     54 #ifdef HAVE_SYS_IOCTL_H
     55 #  include <sys/ioctl.h>
     56 #endif
     57 #ifdef NETWARE
     58 #  include <sys/filio.h>
     59 #endif
     60 
     61 #include <assert.h>
     62 #include <string.h>
     63 #include <stdlib.h>
     64 #include <fcntl.h>
     65 #include <time.h>
     66 
     67 #include "ares.h"
     68 #include "ares_dns.h"
     69 #include "ares_nowarn.h"
     70 #include "ares_private.h"
     71 
     72 
     73 static int try_again(int errnum);
     74 static void write_tcp_data(ares_channel channel, fd_set *write_fds,
     75                            ares_socket_t write_fd, struct timeval *now);
     76 static void read_tcp_data(ares_channel channel, fd_set *read_fds,
     77                           ares_socket_t read_fd, struct timeval *now);
     78 static void read_udp_packets(ares_channel channel, fd_set *read_fds,
     79                              ares_socket_t read_fd, struct timeval *now);
     80 static void advance_tcp_send_queue(ares_channel channel, int whichserver,
     81                                    ssize_t num_bytes);
     82 static void process_timeouts(ares_channel channel, struct timeval *now);
     83 static void process_broken_connections(ares_channel channel,
     84                                        struct timeval *now);
     85 static void process_answer(ares_channel channel, unsigned char *abuf,
     86                            int alen, int whichserver, int tcp,
     87                            struct timeval *now);
     88 static void handle_error(ares_channel channel, int whichserver,
     89                          struct timeval *now);
     90 static void skip_server(ares_channel channel, struct query *query,
     91                         int whichserver);
     92 static void next_server(ares_channel channel, struct query *query,
     93                         struct timeval *now);
     94 static int open_tcp_socket(ares_channel channel, struct server_state *server);
     95 static int open_udp_socket(ares_channel channel, struct server_state *server);
     96 static int same_questions(const unsigned char *qbuf, int qlen,
     97                           const unsigned char *abuf, int alen);
     98 static int same_address(struct sockaddr *sa, struct ares_addr *aa);
     99 static void end_query(ares_channel channel, struct query *query, int status,
    100                       unsigned char *abuf, int alen);
    101 
    /* Report whether the instant 'now' has reached or passed 'check'.
     *
     * now:   the current time
     * check: the deadline to compare against
     *
     * Returns 1 when now >= check (timed out), 0 otherwise.
     *
     * The fields are compared directly instead of being subtracted into a
     * 'long', so the result stays correct on platforms where time_t is wider
     * than long.
     */
    int ares__timedout(struct timeval *now,
                       struct timeval *check)
    {
      if(now->tv_sec != check->tv_sec)
        return (now->tv_sec > check->tv_sec);

      /* the full seconds are identical, so the sub second parts decide */
      return (now->tv_usec >= check->tv_usec);
    }
    116 
    /* Add the given number of milliseconds to the time in the first argument.
     *
     * now:       the timeval to adjust in place
     * millisecs: offset to add; a negative value moves the time backwards
     *
     * Always returns 0. On return tv_usec is normalised to the range
     * [0, 1000000), provided it was in that range on entry.
     */
    int ares__timeadd(struct timeval *now,
                      int millisecs)
    {
      now->tv_sec += millisecs/1000;
      now->tv_usec += (millisecs%1000)*1000;

      if(now->tv_usec >= 1000000) {
        /* carry the overflowing microseconds into the seconds field */
        ++(now->tv_sec);
        now->tv_usec -= 1000000;
      }
      else if(now->tv_usec < 0) {
        /* a negative offset can leave the microseconds below zero: borrow a
           full second so the timeval stays normalised */
        --(now->tv_sec);
        now->tv_usec += 1000000;
      }

      return 0;
    }
    131 
    /* Compute how far 'check' lies ahead of 'now', expressed in milliseconds.
     *
     * now:   the reference time
     * check: the (normally future) time to measure to
     *
     * The result is negative when 'check' is earlier than 'now'. The
     * microsecond difference is divided with C integer division, so it is
     * truncated towards zero.
     */
    long ares__timeoffset(struct timeval *now,
                          struct timeval *check)
    {
      long whole_ms = (long)((check->tv_sec - now->tv_sec) * 1000);
      long frac_ms = (long)((check->tv_usec - now->tv_usec) / 1000);

      return whole_ms + frac_ms;
    }
    139 
    140 
    141 /*
    142  * generic process function
    143  */
    144 static void processfds(ares_channel channel,
    145                        fd_set *read_fds, ares_socket_t read_fd,
    146                        fd_set *write_fds, ares_socket_t write_fd)
    147 {
    148   struct timeval now = ares__tvnow();
    149 
    150   write_tcp_data(channel, write_fds, write_fd, &now);
    151   read_tcp_data(channel, read_fds, read_fd, &now);
    152   read_udp_packets(channel, read_fds, read_fd, &now);
    153   process_timeouts(channel, &now);
    154   process_broken_connections(channel, &now);
    155 }
    156 
    157 /* Something interesting happened on the wire, or there was a timeout.
    158  * See what's up and respond accordingly.
    159  */
    160 void ares_process(ares_channel channel, fd_set *read_fds, fd_set *write_fds)
    161 {
    162   processfds(channel, read_fds, ARES_SOCKET_BAD, write_fds, ARES_SOCKET_BAD);
    163 }
    164 
    165 /* Something interesting happened on the wire, or there was a timeout.
    166  * See what's up and respond accordingly.
    167  */
    168 void ares_process_fd(ares_channel channel,
    169                      ares_socket_t read_fd, /* use ARES_SOCKET_BAD or valid
    170                                                file descriptors */
    171                      ares_socket_t write_fd)
    172 {
    173   processfds(channel, NULL, read_fd, NULL, write_fd);
    174 }
    175 
    176 
    /* Tell whether an error number only means "not ready yet, retry later".
     *
     * errnum: the error code to classify
     *
     * Returns 1 for EWOULDBLOCK or EAGAIN (whichever of them the platform
     * defines), 0 for everything else. Both codes are tested because some
     * systems, HP-UX being the motivating case, may report either one.
     */
    static int try_again(int errnum)
    {
    #if !defined EWOULDBLOCK && !defined EAGAIN
    #error "Neither EWOULDBLOCK nor EAGAIN defined"
    #endif

    #ifdef EWOULDBLOCK
      if (errnum == EWOULDBLOCK)
        return 1;
    #endif

    #ifdef EAGAIN
      if (errnum == EAGAIN)
        return 1;
    #endif

      return 0;
    }
    202 
    203 /* If any TCP sockets select true for writing, write out queued data
    204  * we have for them.
    205  */
    206 static void write_tcp_data(ares_channel channel,
    207                            fd_set *write_fds,
    208                            ares_socket_t write_fd,
    209                            struct timeval *now)
    210 {
    211   struct server_state *server;
    212   struct send_request *sendreq;
    213   struct iovec *vec;
    214   int i;
    215   ssize_t scount;
    216   ssize_t wcount;
    217   size_t n;
    218 
    219   if(!write_fds && (write_fd == ARES_SOCKET_BAD))
    220     /* no possible action */
    221     return;
    222 
    223   for (i = 0; i < channel->nservers; i++)
    224     {
    225       /* Make sure server has data to send and is selected in write_fds or
    226          write_fd. */
    227       server = &channel->servers[i];
    228       if (!server->qhead || server->tcp_socket == ARES_SOCKET_BAD ||
    229           server->is_broken)
    230         continue;
    231 
    232       if(write_fds) {
    233         if(!FD_ISSET(server->tcp_socket, write_fds))
    234           continue;
    235       }
    236       else {
    237         if(server->tcp_socket != write_fd)
    238           continue;
    239       }
    240 
    241       if(write_fds)
    242         /* If there's an error and we close this socket, then open
    243          * another with the same fd to talk to another server, then we
    244          * don't want to think that it was the new socket that was
    245          * ready. This is not disastrous, but is likely to result in
    246          * extra system calls and confusion. */
    247         FD_CLR(server->tcp_socket, write_fds);
    248 
    249       /* Count the number of send queue items. */
    250       n = 0;
    251       for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
    252         n++;
    253 
    254       /* Allocate iovecs so we can send all our data at once. */
    255       vec = malloc(n * sizeof(struct iovec));
    256       if (vec)
    257         {
    258           /* Fill in the iovecs and send. */
    259           n = 0;
    260           for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
    261             {
    262               vec[n].iov_base = (char *) sendreq->data;
    263               vec[n].iov_len = sendreq->len;
    264               n++;
    265             }
    266           wcount = (ssize_t)writev(server->tcp_socket, vec, (int)n);
    267           free(vec);
    268           if (wcount < 0)
    269             {
    270               if (!try_again(SOCKERRNO))
    271                   handle_error(channel, i, now);
    272               continue;
    273             }
    274 
    275           /* Advance the send queue by as many bytes as we sent. */
    276           advance_tcp_send_queue(channel, i, wcount);
    277         }
    278       else
    279         {
    280           /* Can't allocate iovecs; just send the first request. */
    281           sendreq = server->qhead;
    282 
    283           scount = swrite(server->tcp_socket, sendreq->data, sendreq->len);
    284           if (scount < 0)
    285             {
    286               if (!try_again(SOCKERRNO))
    287                   handle_error(channel, i, now);
    288               continue;
    289             }
    290 
    291           /* Advance the send queue by as many bytes as we sent. */
    292           advance_tcp_send_queue(channel, i, scount);
    293         }
    294     }
    295 }
    296 
    297 /* Consume the given number of bytes from the head of the TCP send queue. */
    298 static void advance_tcp_send_queue(ares_channel channel, int whichserver,
    299                                    ssize_t num_bytes)
    300 {
    301   struct send_request *sendreq;
    302   struct server_state *server = &channel->servers[whichserver];
    303   while (num_bytes > 0) {
    304     sendreq = server->qhead;
    305     if ((size_t)num_bytes >= sendreq->len) {
    306       num_bytes -= sendreq->len;
    307       server->qhead = sendreq->next;
    308       if (sendreq->data_storage)
    309         free(sendreq->data_storage);
    310       free(sendreq);
    311       if (server->qhead == NULL) {
    312         SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 0);
    313         server->qtail = NULL;
    314 
    315         /* qhead is NULL so we cannot continue this loop */
    316         break;
    317       }
    318     }
    319     else {
    320       sendreq->data += num_bytes;
    321       sendreq->len -= num_bytes;
    322       num_bytes = 0;
    323     }
    324   }
    325 }
    326 
    327 /* If any TCP socket selects true for reading, read some data,
    328  * allocate a buffer if we finish reading the length word, and process
    329  * a packet if we finish reading one.
    330  */
    331 static void read_tcp_data(ares_channel channel, fd_set *read_fds,
    332                           ares_socket_t read_fd, struct timeval *now)
    333 {
    334   struct server_state *server;
    335   int i;
    336   ssize_t count;
    337 
    338   if(!read_fds && (read_fd == ARES_SOCKET_BAD))
    339     /* no possible action */
    340     return;
    341 
    342   for (i = 0; i < channel->nservers; i++)
    343     {
    344       /* Make sure the server has a socket and is selected in read_fds. */
    345       server = &channel->servers[i];
    346       if (server->tcp_socket == ARES_SOCKET_BAD || server->is_broken)
    347         continue;
    348 
    349       if(read_fds) {
    350         if(!FD_ISSET(server->tcp_socket, read_fds))
    351           continue;
    352       }
    353       else {
    354         if(server->tcp_socket != read_fd)
    355           continue;
    356       }
    357 
    358       if(read_fds)
    359         /* If there's an error and we close this socket, then open
    360          * another with the same fd to talk to another server, then we
    361          * don't want to think that it was the new socket that was
    362          * ready. This is not disastrous, but is likely to result in
    363          * extra system calls and confusion. */
    364         FD_CLR(server->tcp_socket, read_fds);
    365 
    366       if (server->tcp_lenbuf_pos != 2)
    367         {
    368           /* We haven't yet read a length word, so read that (or
    369            * what's left to read of it).
    370            */
    371           count = sread(server->tcp_socket,
    372                         server->tcp_lenbuf + server->tcp_lenbuf_pos,
    373                         2 - server->tcp_lenbuf_pos);
    374           if (count <= 0)
    375             {
    376               if (!(count == -1 && try_again(SOCKERRNO)))
    377                   handle_error(channel, i, now);
    378               continue;
    379             }
    380 
    381           server->tcp_lenbuf_pos += (int)count;
    382           if (server->tcp_lenbuf_pos == 2)
    383             {
    384               /* We finished reading the length word.  Decode the
    385                * length and allocate a buffer for the data.
    386                */
    387               server->tcp_length = server->tcp_lenbuf[0] << 8
    388                 | server->tcp_lenbuf[1];
    389               server->tcp_buffer = malloc(server->tcp_length);
    390               if (!server->tcp_buffer)
    391                 handle_error(channel, i, now);
    392               server->tcp_buffer_pos = 0;
    393             }
    394         }
    395       else
    396         {
    397           /* Read data into the allocated buffer. */
    398           count = sread(server->tcp_socket,
    399                         server->tcp_buffer + server->tcp_buffer_pos,
    400                         server->tcp_length - server->tcp_buffer_pos);
    401           if (count <= 0)
    402             {
    403               if (!(count == -1 && try_again(SOCKERRNO)))
    404                   handle_error(channel, i, now);
    405               continue;
    406             }
    407 
    408           server->tcp_buffer_pos += (int)count;
    409           if (server->tcp_buffer_pos == server->tcp_length)
    410             {
    411               /* We finished reading this answer; process it and
    412                * prepare to read another length word.
    413                */
    414               process_answer(channel, server->tcp_buffer, server->tcp_length,
    415                              i, 1, now);
    416           if (server->tcp_buffer)
    417                         free(server->tcp_buffer);
    418               server->tcp_buffer = NULL;
    419               server->tcp_lenbuf_pos = 0;
    420               server->tcp_buffer_pos = 0;
    421             }
    422         }
    423     }
    424 }
    425 
    426 /* If any UDP sockets select true for reading, process them. */
    427 static void read_udp_packets(ares_channel channel, fd_set *read_fds,
    428                              ares_socket_t read_fd, struct timeval *now)
    429 {
    430   struct server_state *server;
    431   int i;
    432   ssize_t count;
    433   unsigned char buf[PACKETSZ + 1];
    434 #ifdef HAVE_RECVFROM
    435   ares_socklen_t fromlen;
    436   union {
    437     struct sockaddr     sa;
    438     struct sockaddr_in  sa4;
    439     struct sockaddr_in6 sa6;
    440   } from;
    441 #endif
    442 
    443   if(!read_fds && (read_fd == ARES_SOCKET_BAD))
    444     /* no possible action */
    445     return;
    446 
    447   for (i = 0; i < channel->nservers; i++)
    448     {
    449       /* Make sure the server has a socket and is selected in read_fds. */
    450       server = &channel->servers[i];
    451 
    452       if (server->udp_socket == ARES_SOCKET_BAD || server->is_broken)
    453         continue;
    454 
    455       if(read_fds) {
    456         if(!FD_ISSET(server->udp_socket, read_fds))
    457           continue;
    458       }
    459       else {
    460         if(server->udp_socket != read_fd)
    461           continue;
    462       }
    463 
    464       if(read_fds)
    465         /* If there's an error and we close this socket, then open
    466          * another with the same fd to talk to another server, then we
    467          * don't want to think that it was the new socket that was
    468          * ready. This is not disastrous, but is likely to result in
    469          * extra system calls and confusion. */
    470         FD_CLR(server->udp_socket, read_fds);
    471 
    472       /* To reduce event loop overhead, read and process as many
    473        * packets as we can. */
    474       do {
    475 #ifdef HAVE_RECVFROM
    476         if (server->addr.family == AF_INET)
    477           fromlen = sizeof(from.sa4);
    478         else
    479           fromlen = sizeof(from.sa6);
    480         count = (ssize_t)recvfrom(server->udp_socket, (void *)buf, sizeof(buf),
    481                                   0, &from.sa, &fromlen);
    482 #else
    483         count = sread(server->udp_socket, buf, sizeof(buf));
    484 #endif
    485         if (count == -1 && try_again(SOCKERRNO))
    486           continue;
    487         else if (count <= 0)
    488           handle_error(channel, i, now);
    489 #ifdef HAVE_RECVFROM
    490         else if (!same_address(&from.sa, &server->addr))
    491           /* The address the response comes from does not match
    492            * the address we sent the request to. Someone may be
    493            * attempting to perform a cache poisoning attack. */
    494           break;
    495 #endif
    496         else
    497           process_answer(channel, buf, (int)count, i, 0, now);
    498        } while (count > 0);
    499     }
    500 }
    501 
    502 /* If any queries have timed out, note the timeout and move them on. */
    503 static void process_timeouts(ares_channel channel, struct timeval *now)
    504 {
    505   time_t t;  /* the time of the timeouts we're processing */
    506   struct query *query;
    507   struct list_node* list_head;
    508   struct list_node* list_node;
    509 
    510   /* Process all the timeouts that have fired since the last time we
    511    * processed timeouts. If things are going well, then we'll have
    512    * hundreds/thousands of queries that fall into future buckets, and
    513    * only a handful of requests that fall into the "now" bucket, so
    514    * this should be quite quick.
    515    */
    516   for (t = channel->last_timeout_processed; t <= now->tv_sec; t++)
    517     {
    518       list_head = &(channel->queries_by_timeout[t % ARES_TIMEOUT_TABLE_SIZE]);
    519       for (list_node = list_head->next; list_node != list_head; )
    520         {
    521           query = list_node->data;
    522           list_node = list_node->next;  /* in case the query gets deleted */
    523           if (query->timeout.tv_sec && ares__timedout(now, &query->timeout))
    524             {
    525               query->error_status = ARES_ETIMEOUT;
    526               ++query->timeouts;
    527               next_server(channel, query, now);
    528             }
    529         }
    530      }
    531   channel->last_timeout_processed = now->tv_sec;
    532 }
    533 
    534 /* Handle an answer from a server. */
    535 static void process_answer(ares_channel channel, unsigned char *abuf,
    536                            int alen, int whichserver, int tcp,
    537                            struct timeval *now)
    538 {
    539   int tc, rcode;
    540   unsigned short id;
    541   struct query *query;
    542   struct list_node* list_head;
    543   struct list_node* list_node;
    544 
    545   /* If there's no room in the answer for a header, we can't do much
    546    * with it. */
    547   if (alen < HFIXEDSZ)
    548     return;
    549 
    550   /* Grab the query ID, truncate bit, and response code from the packet. */
    551   id = DNS_HEADER_QID(abuf);
    552   tc = DNS_HEADER_TC(abuf);
    553   rcode = DNS_HEADER_RCODE(abuf);
    554 
    555   /* Find the query corresponding to this packet. The queries are
    556    * hashed/bucketed by query id, so this lookup should be quick.
    557    * Note that both the query id and the questions must be the same;
    558    * when the query id wraps around we can have multiple outstanding
    559    * queries with the same query id, so we need to check both the id and
    560    * question.
    561    */
    562   query = NULL;
    563   list_head = &(channel->queries_by_qid[id % ARES_QID_TABLE_SIZE]);
    564   for (list_node = list_head->next; list_node != list_head;
    565        list_node = list_node->next)
    566     {
    567       struct query *q = list_node->data;
    568       if ((q->qid == id) && same_questions(q->qbuf, q->qlen, abuf, alen))
    569         {
    570           query = q;
    571           break;
    572         }
    573     }
    574   if (!query)
    575     return;
    576 
    577   /* If we got a truncated UDP packet and are not ignoring truncation,
    578    * don't accept the packet, and switch the query to TCP if we hadn't
    579    * done so already.
    580    */
    581   if ((tc || alen > PACKETSZ) && !tcp && !(channel->flags & ARES_FLAG_IGNTC))
    582     {
    583       if (!query->using_tcp)
    584         {
    585           query->using_tcp = 1;
    586           ares__send_query(channel, query, now);
    587         }
    588       return;
    589     }
    590 
    591   /* Limit alen to PACKETSZ if we aren't using TCP (only relevant if we
    592    * are ignoring truncation.
    593    */
    594   if (alen > PACKETSZ && !tcp)
    595     alen = PACKETSZ;
    596 
    597   /* If we aren't passing through all error packets, discard packets
    598    * with SERVFAIL, NOTIMP, or REFUSED response codes.
    599    */
    600   if (!(channel->flags & ARES_FLAG_NOCHECKRESP))
    601     {
    602       if (rcode == SERVFAIL || rcode == NOTIMP || rcode == REFUSED)
    603         {
    604           skip_server(channel, query, whichserver);
    605           if (query->server == whichserver)
    606             next_server(channel, query, now);
    607           return;
    608         }
    609     }
    610 
    611   end_query(channel, query, ARES_SUCCESS, abuf, alen);
    612 }
    613 
    614 /* Close all the connections that are no longer usable. */
    615 static void process_broken_connections(ares_channel channel,
    616                                        struct timeval *now)
    617 {
    618   int i;
    619   for (i = 0; i < channel->nservers; i++)
    620     {
    621       struct server_state *server = &channel->servers[i];
    622       if (server->is_broken)
    623         {
    624           handle_error(channel, i, now);
    625         }
    626     }
    627 }
    628 
    629 static void handle_error(ares_channel channel, int whichserver,
    630                          struct timeval *now)
    631 {
    632   struct server_state *server;
    633   struct query *query;
    634   struct list_node list_head;
    635   struct list_node* list_node;
    636 
    637   server = &channel->servers[whichserver];
    638 
    639   /* Reset communications with this server. */
    640   ares__close_sockets(channel, server);
    641 
    642   /* Tell all queries talking to this server to move on and not try
    643    * this server again. We steal the current list of queries that were
    644    * in-flight to this server, since when we call next_server this can
    645    * cause the queries to be re-sent to this server, which will
    646    * re-insert these queries in that same server->queries_to_server
    647    * list.
    648    */
    649   ares__init_list_head(&list_head);
    650   ares__swap_lists(&list_head, &(server->queries_to_server));
    651   for (list_node = list_head.next; list_node != &list_head; )
    652     {
    653       query = list_node->data;
    654       list_node = list_node->next;  /* in case the query gets deleted */
    655       assert(query->server == whichserver);
    656       skip_server(channel, query, whichserver);
    657       next_server(channel, query, now);
    658     }
    659   /* Each query should have removed itself from our temporary list as
    660    * it re-sent itself or finished up...
    661    */
    662   assert(ares__is_list_empty(&list_head));
    663 }
    664 
    665 static void skip_server(ares_channel channel, struct query *query,
    666                         int whichserver) {
    667   /* The given server gave us problems with this query, so if we have
    668    * the luxury of using other servers, then let's skip the
    669    * potentially broken server and just use the others. If we only
    670    * have one server and we need to retry then we should just go ahead
    671    * and re-use that server, since it's our only hope; perhaps we
    672    * just got unlucky, and retrying will work (eg, the server timed
    673    * out our TCP connection just as we were sending another request).
    674    */
    675   if (channel->nservers > 1)
    676     {
    677       query->server_info[whichserver].skip_server = 1;
    678     }
    679 }
    680 
    681 static void next_server(ares_channel channel, struct query *query,
    682                         struct timeval *now)
    683 {
    684   /* We need to try each server channel->tries times. We have channel->nservers
    685    * servers to try. In total, we need to do channel->nservers * channel->tries
    686    * attempts. Use query->try to remember how many times we already attempted
    687    * this query. Use modular arithmetic to find the next server to try. */
    688   while (++(query->try_count) < (channel->nservers * channel->tries))
    689     {
    690       struct server_state *server;
    691 
    692       /* Move on to the next server. */
    693       query->server = (query->server + 1) % channel->nservers;
    694       server = &channel->servers[query->server];
    695 
    696       /* We don't want to use this server if (1) we decided this
    697        * connection is broken, and thus about to be closed, (2)
    698        * we've decided to skip this server because of earlier
    699        * errors we encountered, or (3) we already sent this query
    700        * over this exact connection.
    701        */
    702       if (!server->is_broken &&
    703            !query->server_info[query->server].skip_server &&
    704            !(query->using_tcp &&
    705              (query->server_info[query->server].tcp_connection_generation ==
    706               server->tcp_connection_generation)))
    707         {
    708            ares__send_query(channel, query, now);
    709            return;
    710         }
    711 
    712       /* You might think that with TCP we only need one try. However,
    713        * even when using TCP, servers can time-out our connection just
    714        * as we're sending a request, or close our connection because
    715        * they die, or never send us a reply because they get wedged or
    716        * tickle a bug that drops our request.
    717        */
    718     }
    719 
    720   /* If we are here, all attempts to perform query failed. */
    721   end_query(channel, query, query->error_status, NULL, 0);
    722 }
    723 
    724 void ares__send_query(ares_channel channel, struct query *query,
    725                       struct timeval *now)
    726 {
    727   struct send_request *sendreq;
    728   struct server_state *server;
    729   int timeplus;
    730 
    731   server = &channel->servers[query->server];
    732   if (query->using_tcp)
    733     {
    734       /* Make sure the TCP socket for this server is set up and queue
    735        * a send request.
    736        */
    737       if (server->tcp_socket == ARES_SOCKET_BAD)
    738         {
    739           if (open_tcp_socket(channel, server) == -1)
    740             {
    741               skip_server(channel, query, query->server);
    742               next_server(channel, query, now);
    743               return;
    744             }
    745         }
    746       sendreq = calloc(1, sizeof(struct send_request));
    747       if (!sendreq)
    748         {
    749         end_query(channel, query, ARES_ENOMEM, NULL, 0);
    750           return;
    751         }
    752       /* To make the common case fast, we avoid copies by using the
    753        * query's tcpbuf for as long as the query is alive. In the rare
    754        * case where the query ends while it's queued for transmission,
    755        * then we give the sendreq its own copy of the request packet
    756        * and put it in sendreq->data_storage.
    757        */
    758       sendreq->data_storage = NULL;
    759       sendreq->data = query->tcpbuf;
    760       sendreq->len = query->tcplen;
    761       sendreq->owner_query = query;
    762       sendreq->next = NULL;
    763       if (server->qtail)
    764         server->qtail->next = sendreq;
    765       else
    766         {
    767           SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 1);
    768           server->qhead = sendreq;
    769         }
    770       server->qtail = sendreq;
    771       query->server_info[query->server].tcp_connection_generation =
    772         server->tcp_connection_generation;
    773     }
    774   else
    775     {
    776       if (server->udp_socket == ARES_SOCKET_BAD)
    777         {
    778           if (open_udp_socket(channel, server) == -1)
    779             {
    780               skip_server(channel, query, query->server);
    781               next_server(channel, query, now);
    782               return;
    783             }
    784         }
    785       if (swrite(server->udp_socket, query->qbuf, query->qlen) == -1)
    786         {
    787           /* FIXME: Handle EAGAIN here since it likely can happen. */
    788           skip_server(channel, query, query->server);
    789           next_server(channel, query, now);
    790           return;
    791         }
    792     }
    793     timeplus = channel->timeout << (query->try_count / channel->nservers);
    794     timeplus = (timeplus * (9 + (rand () & 7))) / 16;
    795     query->timeout = *now;
    796     ares__timeadd(&query->timeout,
    797                   timeplus);
    798     /* Keep track of queries bucketed by timeout, so we can process
    799      * timeout events quickly.
    800      */
    801     ares__remove_from_list(&(query->queries_by_timeout));
    802     ares__insert_in_list(
    803         &(query->queries_by_timeout),
    804         &(channel->queries_by_timeout[query->timeout.tv_sec %
    805                                       ARES_TIMEOUT_TABLE_SIZE]));
    806 
    807     /* Keep track of queries bucketed by server, so we can process server
    808      * errors quickly.
    809      */
    810     ares__remove_from_list(&(query->queries_to_server));
    811     ares__insert_in_list(&(query->queries_to_server),
    812                          &(server->queries_to_server));
    813 }
    814 
    815 /*
    816  * setsocknonblock sets the given socket to either blocking or non-blocking
    817  * mode based on the 'nonblock' boolean argument. This function is highly
    818  * portable.
    819  */
    820 static int setsocknonblock(ares_socket_t sockfd,    /* operate on this */
    821                     int nonblock   /* TRUE or FALSE */)
    822 {
    823 #if defined(USE_BLOCKING_SOCKETS)
    824 
    825   return 0; /* returns success */
    826 
    827 #elif defined(HAVE_FCNTL_O_NONBLOCK)
    828 
    829   /* most recent unix versions */
    830   int flags;
    831   flags = fcntl(sockfd, F_GETFL, 0);
    832   if (FALSE != nonblock)
    833     return fcntl(sockfd, F_SETFL, flags | O_NONBLOCK);
    834   else
    835     return fcntl(sockfd, F_SETFL, flags & (~O_NONBLOCK));
    836 
    837 #elif defined(HAVE_IOCTL_FIONBIO)
    838 
    839   /* older unix versions */
    840   int flags;
    841   flags = nonblock;
    842   return ioctl(sockfd, FIONBIO, &flags);
    843 
    844 #elif defined(HAVE_IOCTLSOCKET_FIONBIO)
    845 
    846 #ifdef WATT32
    847   char flags;
    848 #else
    849   /* Windows */
    850   unsigned long flags;
    851 #endif
    852   flags = nonblock;
    853   return ioctlsocket(sockfd, FIONBIO, &flags);
    854 
    855 #elif defined(HAVE_IOCTLSOCKET_CAMEL_FIONBIO)
    856 
    857   /* Amiga */
    858   return IoctlSocket(sockfd, FIONBIO, (long)nonblock);
    859 
    860 #elif defined(HAVE_SETSOCKOPT_SO_NONBLOCK)
    861 
    862   /* BeOS */
    863   long b = nonblock ? 1 : 0;
    864   return setsockopt(sockfd, SOL_SOCKET, SO_NONBLOCK, &b, sizeof(b));
    865 
    866 #else
    867 #  error "no non-blocking method was found/used/set"
    868 #endif
    869 }
    870 
    871 static int configure_socket(ares_socket_t s, int family, ares_channel channel)
    872 {
    873   union {
    874     struct sockaddr     sa;
    875     struct sockaddr_in  sa4;
    876     struct sockaddr_in6 sa6;
    877   } local;
    878 
    879   setsocknonblock(s, TRUE);
    880 
    881 #if defined(FD_CLOEXEC) && !defined(MSDOS)
    882   /* Configure the socket fd as close-on-exec. */
    883   if (fcntl(s, F_SETFD, FD_CLOEXEC) == -1)
    884     return -1;
    885 #endif
    886 
    887   /* Set the socket's send and receive buffer sizes. */
    888   if ((channel->socket_send_buffer_size > 0) &&
    889       setsockopt(s, SOL_SOCKET, SO_SNDBUF,
    890                  (void *)&channel->socket_send_buffer_size,
    891                  sizeof(channel->socket_send_buffer_size)) == -1)
    892     return -1;
    893 
    894   if ((channel->socket_receive_buffer_size > 0) &&
    895       setsockopt(s, SOL_SOCKET, SO_RCVBUF,
    896                  (void *)&channel->socket_receive_buffer_size,
    897                  sizeof(channel->socket_receive_buffer_size)) == -1)
    898     return -1;
    899 
    900 #ifdef SO_BINDTODEVICE
    901   if (channel->local_dev_name[0]) {
    902     if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
    903                    channel->local_dev_name, sizeof(channel->local_dev_name))) {
    904       /* Only root can do this, and usually not fatal if it doesn't work, so */
    905       /* just continue on. */
    906     }
    907   }
    908 #endif
    909 
    910   if (family == AF_INET) {
    911     if (channel->local_ip4) {
    912       memset(&local.sa4, 0, sizeof(local.sa4));
    913       local.sa4.sin_family = AF_INET;
    914       local.sa4.sin_addr.s_addr = htonl(channel->local_ip4);
    915       if (bind(s, &local.sa, sizeof(local.sa4)) < 0)
    916         return -1;
    917     }
    918   }
    919   else if (family == AF_INET6) {
    920     if (memcmp(channel->local_ip6, &ares_in6addr_any, sizeof(channel->local_ip6)) != 0) {
    921       memset(&local.sa6, 0, sizeof(local.sa6));
    922       local.sa6.sin6_family = AF_INET6;
    923       memcpy(&local.sa6.sin6_addr, channel->local_ip6, sizeof(channel->local_ip6));
    924       if (bind(s, &local.sa, sizeof(local.sa6)) < 0)
    925         return -1;
    926     }
    927   }
    928 
    929   return 0;
    930 }
    931 
    932 static int open_tcp_socket(ares_channel channel, struct server_state *server)
    933 {
    934   ares_socket_t s;
    935   int opt;
    936   ares_socklen_t salen;
    937   union {
    938     struct sockaddr_in  sa4;
    939     struct sockaddr_in6 sa6;
    940   } saddr;
    941   struct sockaddr *sa;
    942 
    943   switch (server->addr.family)
    944     {
    945       case AF_INET:
    946         sa = (void *)&saddr.sa4;
    947         salen = sizeof(saddr.sa4);
    948         memset(sa, 0, salen);
    949         saddr.sa4.sin_family = AF_INET;
    950         saddr.sa4.sin_port = (unsigned short)(channel->tcp_port & 0xffff);
    951         memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
    952                sizeof(server->addr.addrV4));
    953         break;
    954       case AF_INET6:
    955         sa = (void *)&saddr.sa6;
    956         salen = sizeof(saddr.sa6);
    957         memset(sa, 0, salen);
    958         saddr.sa6.sin6_family = AF_INET6;
    959         saddr.sa6.sin6_port = (unsigned short)(channel->tcp_port & 0xffff);
    960         memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
    961                sizeof(server->addr.addrV6));
    962         break;
    963       default:
    964         return -1;
    965     }
    966 
    967   /* Acquire a socket. */
    968   s = socket(server->addr.family, SOCK_STREAM, 0);
    969   if (s == ARES_SOCKET_BAD)
    970     return -1;
    971 
    972   /* Configure it. */
    973   if (configure_socket(s, server->addr.family, channel) < 0)
    974     {
    975        sclose(s);
    976        return -1;
    977     }
    978 
    979 #ifdef TCP_NODELAY
    980   /*
    981    * Disable the Nagle algorithm (only relevant for TCP sockets, and thus not
    982    * in configure_socket). In general, in DNS lookups we're pretty much
    983    * interested in firing off a single request and then waiting for a reply,
    984    * so batching isn't very interesting.
    985    */
    986   opt = 1;
    987   if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY,
    988                  (void *)&opt, sizeof(opt)) == -1)
    989     {
    990        sclose(s);
    991        return -1;
    992     }
    993 #endif
    994 
    995   /* Connect to the server. */
    996   if (connect(s, sa, salen) == -1)
    997     {
    998       int err = SOCKERRNO;
    999 
   1000       if (err != EINPROGRESS && err != EWOULDBLOCK)
   1001         {
   1002           sclose(s);
   1003           return -1;
   1004         }
   1005     }
   1006 
   1007   if (channel->sock_create_cb)
   1008     {
   1009       int err = channel->sock_create_cb(s, SOCK_STREAM,
   1010                                         channel->sock_create_cb_data);
   1011       if (err < 0)
   1012         {
   1013           sclose(s);
   1014           return err;
   1015         }
   1016     }
   1017 
   1018   SOCK_STATE_CALLBACK(channel, s, 1, 0);
   1019   server->tcp_buffer_pos = 0;
   1020   server->tcp_socket = s;
   1021   server->tcp_connection_generation = ++channel->tcp_connection_generation;
   1022   return 0;
   1023 }
   1024 
   1025 static int open_udp_socket(ares_channel channel, struct server_state *server)
   1026 {
   1027   ares_socket_t s;
   1028   ares_socklen_t salen;
   1029   union {
   1030     struct sockaddr_in  sa4;
   1031     struct sockaddr_in6 sa6;
   1032   } saddr;
   1033   struct sockaddr *sa;
   1034 
   1035   switch (server->addr.family)
   1036     {
   1037       case AF_INET:
   1038         sa = (void *)&saddr.sa4;
   1039         salen = sizeof(saddr.sa4);
   1040         memset(sa, 0, salen);
   1041         saddr.sa4.sin_family = AF_INET;
   1042         saddr.sa4.sin_port = (unsigned short)(channel->udp_port & 0xffff);
   1043         memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
   1044                sizeof(server->addr.addrV4));
   1045         break;
   1046       case AF_INET6:
   1047         sa = (void *)&saddr.sa6;
   1048         salen = sizeof(saddr.sa6);
   1049         memset(sa, 0, salen);
   1050         saddr.sa6.sin6_family = AF_INET6;
   1051         saddr.sa6.sin6_port = (unsigned short)(channel->udp_port & 0xffff);
   1052         memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
   1053                sizeof(server->addr.addrV6));
   1054         break;
   1055       default:
   1056         return -1;
   1057     }
   1058 
   1059   /* Acquire a socket. */
   1060   s = socket(server->addr.family, SOCK_DGRAM, 0);
   1061   if (s == ARES_SOCKET_BAD)
   1062     return -1;
   1063 
   1064   /* Set the socket non-blocking. */
   1065   if (configure_socket(s, server->addr.family, channel) < 0)
   1066     {
   1067        sclose(s);
   1068        return -1;
   1069     }
   1070 
   1071   /* Connect to the server. */
   1072   if (connect(s, sa, salen) == -1)
   1073     {
   1074       int err = SOCKERRNO;
   1075 
   1076       if (err != EINPROGRESS && err != EWOULDBLOCK)
   1077         {
   1078           sclose(s);
   1079           return -1;
   1080         }
   1081     }
   1082 
   1083   if (channel->sock_create_cb)
   1084     {
   1085       int err = channel->sock_create_cb(s, SOCK_DGRAM,
   1086                                         channel->sock_create_cb_data);
   1087       if (err < 0)
   1088         {
   1089           sclose(s);
   1090           return err;
   1091         }
   1092     }
   1093 
   1094   SOCK_STATE_CALLBACK(channel, s, 1, 0);
   1095 
   1096   server->udp_socket = s;
   1097   return 0;
   1098 }
   1099 
   1100 static int same_questions(const unsigned char *qbuf, int qlen,
   1101                           const unsigned char *abuf, int alen)
   1102 {
   1103   struct {
   1104     const unsigned char *p;
   1105     int qdcount;
   1106     char *name;
   1107     long namelen;
   1108     int type;
   1109     int dnsclass;
   1110   } q, a;
   1111   int i, j;
   1112 
   1113   if (qlen < HFIXEDSZ || alen < HFIXEDSZ)
   1114     return 0;
   1115 
   1116   /* Extract qdcount from the request and reply buffers and compare them. */
   1117   q.qdcount = DNS_HEADER_QDCOUNT(qbuf);
   1118   a.qdcount = DNS_HEADER_QDCOUNT(abuf);
   1119   if (q.qdcount != a.qdcount)
   1120     return 0;
   1121 
   1122   /* For each question in qbuf, find it in abuf. */
   1123   q.p = qbuf + HFIXEDSZ;
   1124   for (i = 0; i < q.qdcount; i++)
   1125     {
   1126       /* Decode the question in the query. */
   1127       if (ares_expand_name(q.p, qbuf, qlen, &q.name, &q.namelen)
   1128           != ARES_SUCCESS)
   1129         return 0;
   1130       q.p += q.namelen;
   1131       if (q.p + QFIXEDSZ > qbuf + qlen)
   1132         {
   1133           free(q.name);
   1134           return 0;
   1135         }
   1136       q.type = DNS_QUESTION_TYPE(q.p);
   1137       q.dnsclass = DNS_QUESTION_CLASS(q.p);
   1138       q.p += QFIXEDSZ;
   1139 
   1140       /* Search for this question in the answer. */
   1141       a.p = abuf + HFIXEDSZ;
   1142       for (j = 0; j < a.qdcount; j++)
   1143         {
   1144           /* Decode the question in the answer. */
   1145           if (ares_expand_name(a.p, abuf, alen, &a.name, &a.namelen)
   1146               != ARES_SUCCESS)
   1147             {
   1148               free(q.name);
   1149               return 0;
   1150             }
   1151           a.p += a.namelen;
   1152           if (a.p + QFIXEDSZ > abuf + alen)
   1153             {
   1154               free(q.name);
   1155               free(a.name);
   1156               return 0;
   1157             }
   1158           a.type = DNS_QUESTION_TYPE(a.p);
   1159           a.dnsclass = DNS_QUESTION_CLASS(a.p);
   1160           a.p += QFIXEDSZ;
   1161 
   1162           /* Compare the decoded questions. */
   1163           if (strcasecmp(q.name, a.name) == 0 && q.type == a.type
   1164               && q.dnsclass == a.dnsclass)
   1165             {
   1166               free(a.name);
   1167               break;
   1168             }
   1169           free(a.name);
   1170         }
   1171 
   1172       free(q.name);
   1173       if (j == a.qdcount)
   1174         return 0;
   1175     }
   1176   return 1;
   1177 }
   1178 
   1179 static int same_address(struct sockaddr *sa, struct ares_addr *aa)
   1180 {
   1181   void *addr1;
   1182   void *addr2;
   1183 
   1184   if (sa->sa_family == aa->family)
   1185     {
   1186       switch (aa->family)
   1187         {
   1188           case AF_INET:
   1189             addr1 = &aa->addrV4;
   1190             addr2 = &((struct sockaddr_in *)sa)->sin_addr;
   1191             if (memcmp(addr1, addr2, sizeof(aa->addrV4)) == 0)
   1192               return 1; /* match */
   1193             break;
   1194           case AF_INET6:
   1195             addr1 = &aa->addrV6;
   1196             addr2 = &((struct sockaddr_in6 *)sa)->sin6_addr;
   1197             if (memcmp(addr1, addr2, sizeof(aa->addrV6)) == 0)
   1198               return 1; /* match */
   1199             break;
   1200           default:
   1201             break;
   1202         }
   1203     }
   1204   return 0; /* different */
   1205 }
   1206 
   1207 static void end_query (ares_channel channel, struct query *query, int status,
   1208                        unsigned char *abuf, int alen)
   1209 {
   1210   int i;
   1211 
   1212   /* First we check to see if this query ended while one of our send
   1213    * queues still has pointers to it.
   1214    */
   1215   for (i = 0; i < channel->nservers; i++)
   1216     {
   1217       struct server_state *server = &channel->servers[i];
   1218       struct send_request *sendreq;
   1219       for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
   1220         if (sendreq->owner_query == query)
   1221           {
   1222             sendreq->owner_query = NULL;
   1223             assert(sendreq->data_storage == NULL);
   1224             if (status == ARES_SUCCESS)
   1225               {
   1226                 /* We got a reply for this query, but this queued
   1227                  * sendreq points into this soon-to-be-gone query's
   1228                  * tcpbuf. Probably this means we timed out and queued
   1229                  * the query for retransmission, then received a
   1230                  * response before actually retransmitting. This is
   1231                  * perfectly fine, so we want to keep the connection
   1232                  * running smoothly if we can. But in the worst case
   1233                  * we may have sent only some prefix of the query,
   1234                  * with some suffix of the query left to send. Also,
   1235                  * the buffer may be queued on multiple queues. To
   1236                  * prevent dangling pointers to the query's tcpbuf and
   1237                  * handle these cases, we just give such sendreqs
   1238                  * their own copy of the query packet.
   1239                  */
   1240                sendreq->data_storage = malloc(sendreq->len);
   1241                if (sendreq->data_storage != NULL)
   1242                  {
   1243                    memcpy(sendreq->data_storage, sendreq->data, sendreq->len);
   1244                    sendreq->data = sendreq->data_storage;
   1245                  }
   1246               }
   1247             if ((status != ARES_SUCCESS) || (sendreq->data_storage == NULL))
   1248               {
   1249                 /* We encountered an error (probably a timeout,
   1250                  * suggesting the DNS server we're talking to is
   1251                  * probably unreachable, wedged, or severely
   1252                  * overloaded) or we couldn't copy the request, so
   1253                  * mark the connection as broken. When we get to
   1254                  * process_broken_connections() we'll close the
   1255                  * connection and try to re-send requests to another
   1256                  * server.
   1257                  */
   1258                server->is_broken = 1;
   1259                /* Just to be paranoid, zero out this sendreq... */
   1260                sendreq->data = NULL;
   1261                sendreq->len = 0;
   1262              }
   1263           }
   1264     }
   1265 
   1266   /* Invoke the callback */
   1267   query->callback(query->arg, status, query->timeouts, abuf, alen);
   1268   ares__free_query(query);
   1269 
   1270   /* Simple cleanup policy: if no queries are remaining, close all
   1271    * network sockets unless STAYOPEN is set.
   1272    */
   1273   if (!(channel->flags & ARES_FLAG_STAYOPEN) &&
   1274       ares__is_list_empty(&(channel->all_queries)))
   1275     {
   1276       for (i = 0; i < channel->nservers; i++)
   1277         ares__close_sockets(channel, &channel->servers[i]);
   1278     }
   1279 }
   1280 
   1281 void ares__free_query(struct query *query)
   1282 {
   1283   /* Remove the query from all the lists in which it is linked */
   1284   ares__remove_from_list(&(query->queries_by_qid));
   1285   ares__remove_from_list(&(query->queries_by_timeout));
   1286   ares__remove_from_list(&(query->queries_to_server));
   1287   ares__remove_from_list(&(query->all_queries));
   1288   /* Zero out some important stuff, to help catch bugs */
   1289   query->callback = NULL;
   1290   query->arg = NULL;
   1291   /* Deallocate the memory associated with the query */
   1292   free(query->tcpbuf);
   1293   free(query->server_info);
   1294   free(query);
   1295 }
   1296