1 2 /* Copyright 1998 by the Massachusetts Institute of Technology. 3 * Copyright (C) 2004-2010 by Daniel Stenberg 4 * 5 * Permission to use, copy, modify, and distribute this 6 * software and its documentation for any purpose and without 7 * fee is hereby granted, provided that the above copyright 8 * notice appear in all copies and that both that copyright 9 * notice and this permission notice appear in supporting 10 * documentation, and that the name of M.I.T. not be used in 11 * advertising or publicity pertaining to distribution of the 12 * software without specific, written prior permission. 13 * M.I.T. makes no representations about the suitability of 14 * this software for any purpose. It is provided "as is" 15 * without express or implied warranty. 16 */ 17 18 #include "ares_setup.h" 19 20 #ifdef HAVE_SYS_SOCKET_H 21 # include <sys/socket.h> 22 #endif 23 #ifdef HAVE_SYS_UIO_H 24 # include <sys/uio.h> 25 #endif 26 #ifdef HAVE_NETINET_IN_H 27 # include <netinet/in.h> 28 #endif 29 #ifdef HAVE_NETINET_TCP_H 30 # include <netinet/tcp.h> 31 #endif 32 #ifdef HAVE_NETDB_H 33 # include <netdb.h> 34 #endif 35 #ifdef HAVE_ARPA_NAMESER_H 36 # include <arpa/nameser.h> 37 #else 38 # include "nameser.h" 39 #endif 40 #ifdef HAVE_ARPA_NAMESER_COMPAT_H 41 # include <arpa/nameser_compat.h> 42 #endif 43 44 #ifdef HAVE_SYS_TIME_H 45 # include <sys/time.h> 46 #endif 47 48 #ifdef HAVE_STRINGS_H 49 # include <strings.h> 50 #endif 51 #ifdef HAVE_UNISTD_H 52 # include <unistd.h> 53 #endif 54 #ifdef HAVE_SYS_IOCTL_H 55 # include <sys/ioctl.h> 56 #endif 57 #ifdef NETWARE 58 # include <sys/filio.h> 59 #endif 60 61 #include <assert.h> 62 #include <string.h> 63 #include <stdlib.h> 64 #include <fcntl.h> 65 #include <time.h> 66 67 #include "ares.h" 68 #include "ares_dns.h" 69 #include "ares_nowarn.h" 70 #include "ares_private.h" 71 72 73 static int try_again(int errnum); 74 static void write_tcp_data(ares_channel channel, fd_set *write_fds, 75 ares_socket_t write_fd, struct timeval *now); 76 static void read_tcp_data(ares_channel channel, fd_set *read_fds, 77 ares_socket_t read_fd, struct timeval *now); 78 static void read_udp_packets(ares_channel channel, fd_set *read_fds, 79 ares_socket_t read_fd, struct timeval *now); 80 static void advance_tcp_send_queue(ares_channel channel, int whichserver, 81 ssize_t num_bytes); 82 static void process_timeouts(ares_channel channel, struct timeval *now); 83 static void process_broken_connections(ares_channel channel, 84 struct timeval *now); 85 static void process_answer(ares_channel channel, unsigned char *abuf, 86 int alen, int whichserver, int tcp, 87 struct timeval *now); 88 static void handle_error(ares_channel channel, int whichserver, 89 struct timeval *now); 90 static void skip_server(ares_channel channel, struct query *query, 91 int whichserver); 92 static void next_server(ares_channel channel, struct query *query, 93 struct timeval *now); 94 static int open_tcp_socket(ares_channel channel, struct server_state *server); 95 static int open_udp_socket(ares_channel channel, struct server_state *server); 96 static int same_questions(const unsigned char *qbuf, int qlen, 97 const unsigned char *abuf, int alen); 98 static int same_address(struct sockaddr *sa, struct ares_addr *aa); 99 static void end_query(ares_channel channel, struct query *query, int status, 100 unsigned char *abuf, int alen); 101 102 /* return true if now is exactly check time or later */ 103 int ares__timedout(struct timeval *now, 104 struct timeval *check) 105 { 106 long secs = (now->tv_sec - check->tv_sec); 107 108 if(secs > 0) 109 return 1; /* yes, timed out */ 110 if(secs < 0) 111 return 0; /* nope, not timed out */ 112 113 /* if the full seconds were identical, check the sub second parts */ 114 return (now->tv_usec - check->tv_usec >= 0); 115 } 116 117 /* add the specific number of milliseconds to the time in the first argument */ 118 int ares__timeadd(struct timeval *now, 119 int millisecs) 120 { 121 now->tv_sec += millisecs/1000; 122 now->tv_usec += (millisecs%1000)*1000; 123 124 if(now->tv_usec >= 1000000) { 125 ++(now->tv_sec); 126 now->tv_usec -= 1000000; 127 } 128 129 return 0; 130 } 131 132 /* return time offset between now and (future) check, in milliseconds */ 133 long ares__timeoffset(struct timeval *now, 134 struct timeval *check) 135 { 136 return (check->tv_sec - now->tv_sec)*1000 + 137 (check->tv_usec - now->tv_usec)/1000; 138 } 139 140 141 /* 142 * generic process function 143 */ 144 static void processfds(ares_channel channel, 145 fd_set *read_fds, ares_socket_t read_fd, 146 fd_set *write_fds, ares_socket_t write_fd) 147 { 148 struct timeval now = ares__tvnow(); 149 150 write_tcp_data(channel, write_fds, write_fd, &now); 151 read_tcp_data(channel, read_fds, read_fd, &now); 152 read_udp_packets(channel, read_fds, read_fd, &now); 153 process_timeouts(channel, &now); 154 process_broken_connections(channel, &now); 155 } 156 157 /* Something interesting happened on the wire, or there was a timeout. 158 * See what's up and respond accordingly. 159 */ 160 void ares_process(ares_channel channel, fd_set *read_fds, fd_set *write_fds) 161 { 162 processfds(channel, read_fds, ARES_SOCKET_BAD, write_fds, ARES_SOCKET_BAD); 163 } 164 165 /* Something interesting happened on the wire, or there was a timeout. 166 * See what's up and respond accordingly. 167 */ 168 void ares_process_fd(ares_channel channel, 169 ares_socket_t read_fd, /* use ARES_SOCKET_BAD or valid 170 file descriptors */ 171 ares_socket_t write_fd) 172 { 173 processfds(channel, NULL, read_fd, NULL, write_fd); 174 } 175 176 177 /* Return 1 if the specified error number describes a readiness error, or 0 178 * otherwise. This is mostly for HP-UX, which could return EAGAIN or 179 * EWOULDBLOCK. See this man page 180 * 181 * http://devrsrc1.external.hp.com/STKS/cgi-bin/man2html? 182 * manpage=/usr/share/man/man2.Z/send.2 183 */ 184 static int try_again(int errnum) 185 { 186 #if !defined EWOULDBLOCK && !defined EAGAIN 187 #error "Neither EWOULDBLOCK nor EAGAIN defined" 188 #endif 189 switch (errnum) 190 { 191 #ifdef EWOULDBLOCK 192 case EWOULDBLOCK: 193 return 1; 194 #endif 195 #if defined EAGAIN && EAGAIN != EWOULDBLOCK 196 case EAGAIN: 197 return 1; 198 #endif 199 } 200 return 0; 201 } 202 203 /* If any TCP sockets select true for writing, write out queued data 204 * we have for them. 205 */ 206 static void write_tcp_data(ares_channel channel, 207 fd_set *write_fds, 208 ares_socket_t write_fd, 209 struct timeval *now) 210 { 211 struct server_state *server; 212 struct send_request *sendreq; 213 struct iovec *vec; 214 int i; 215 ssize_t scount; 216 ssize_t wcount; 217 size_t n; 218 219 if(!write_fds && (write_fd == ARES_SOCKET_BAD)) 220 /* no possible action */ 221 return; 222 223 for (i = 0; i < channel->nservers; i++) 224 { 225 /* Make sure server has data to send and is selected in write_fds or 226 write_fd. */ 227 server = &channel->servers[i]; 228 if (!server->qhead || server->tcp_socket == ARES_SOCKET_BAD || 229 server->is_broken) 230 continue; 231 232 if(write_fds) { 233 if(!FD_ISSET(server->tcp_socket, write_fds)) 234 continue; 235 } 236 else { 237 if(server->tcp_socket != write_fd) 238 continue; 239 } 240 241 if(write_fds) 242 /* If there's an error and we close this socket, then open 243 * another with the same fd to talk to another server, then we 244 * don't want to think that it was the new socket that was 245 * ready. This is not disastrous, but is likely to result in 246 * extra system calls and confusion. */ 247 FD_CLR(server->tcp_socket, write_fds); 248 249 /* Count the number of send queue items. */ 250 n = 0; 251 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next) 252 n++; 253 254 /* Allocate iovecs so we can send all our data at once. */ 255 vec = malloc(n * sizeof(struct iovec)); 256 if (vec) 257 { 258 /* Fill in the iovecs and send. */ 259 n = 0; 260 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next) 261 { 262 vec[n].iov_base = (char *) sendreq->data; 263 vec[n].iov_len = sendreq->len; 264 n++; 265 } 266 wcount = (ssize_t)writev(server->tcp_socket, vec, (int)n); 267 free(vec); 268 if (wcount < 0) 269 { 270 if (!try_again(SOCKERRNO)) 271 handle_error(channel, i, now); 272 continue; 273 } 274 275 /* Advance the send queue by as many bytes as we sent. */ 276 advance_tcp_send_queue(channel, i, wcount); 277 } 278 else 279 { 280 /* Can't allocate iovecs; just send the first request. */ 281 sendreq = server->qhead; 282 283 scount = swrite(server->tcp_socket, sendreq->data, sendreq->len); 284 if (scount < 0) 285 { 286 if (!try_again(SOCKERRNO)) 287 handle_error(channel, i, now); 288 continue; 289 } 290 291 /* Advance the send queue by as many bytes as we sent. */ 292 advance_tcp_send_queue(channel, i, scount); 293 } 294 } 295 } 296 297 /* Consume the given number of bytes from the head of the TCP send queue. */ 298 static void advance_tcp_send_queue(ares_channel channel, int whichserver, 299 ssize_t num_bytes) 300 { 301 struct send_request *sendreq; 302 struct server_state *server = &channel->servers[whichserver]; 303 while (num_bytes > 0) { 304 sendreq = server->qhead; 305 if ((size_t)num_bytes >= sendreq->len) { 306 num_bytes -= sendreq->len; 307 server->qhead = sendreq->next; 308 if (sendreq->data_storage) 309 free(sendreq->data_storage); 310 free(sendreq); 311 if (server->qhead == NULL) { 312 SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 0); 313 server->qtail = NULL; 314 315 /* qhead is NULL so we cannot continue this loop */ 316 break; 317 } 318 } 319 else { 320 sendreq->data += num_bytes; 321 sendreq->len -= num_bytes; 322 num_bytes = 0; 323 } 324 } 325 } 326 327 /* If any TCP socket selects true for reading, read some data, 328 * allocate a buffer if we finish reading the length word, and process 329 * a packet if we finish reading one. 330 */ 331 static void read_tcp_data(ares_channel channel, fd_set *read_fds, 332 ares_socket_t read_fd, struct timeval *now) 333 { 334 struct server_state *server; 335 int i; 336 ssize_t count; 337 338 if(!read_fds && (read_fd == ARES_SOCKET_BAD)) 339 /* no possible action */ 340 return; 341 342 for (i = 0; i < channel->nservers; i++) 343 { 344 /* Make sure the server has a socket and is selected in read_fds. */ 345 server = &channel->servers[i]; 346 if (server->tcp_socket == ARES_SOCKET_BAD || server->is_broken) 347 continue; 348 349 if(read_fds) { 350 if(!FD_ISSET(server->tcp_socket, read_fds)) 351 continue; 352 } 353 else { 354 if(server->tcp_socket != read_fd) 355 continue; 356 } 357 358 if(read_fds) 359 /* If there's an error and we close this socket, then open 360 * another with the same fd to talk to another server, then we 361 * don't want to think that it was the new socket that was 362 * ready. This is not disastrous, but is likely to result in 363 * extra system calls and confusion. */ 364 FD_CLR(server->tcp_socket, read_fds); 365 366 if (server->tcp_lenbuf_pos != 2) 367 { 368 /* We haven't yet read a length word, so read that (or 369 * what's left to read of it). 370 */ 371 count = sread(server->tcp_socket, 372 server->tcp_lenbuf + server->tcp_lenbuf_pos, 373 2 - server->tcp_lenbuf_pos); 374 if (count <= 0) 375 { 376 if (!(count == -1 && try_again(SOCKERRNO))) 377 handle_error(channel, i, now); 378 continue; 379 } 380 381 server->tcp_lenbuf_pos += (int)count; 382 if (server->tcp_lenbuf_pos == 2) 383 { 384 /* We finished reading the length word. Decode the 385 * length and allocate a buffer for the data. 386 */ 387 server->tcp_length = server->tcp_lenbuf[0] << 8 388 | server->tcp_lenbuf[1]; 389 server->tcp_buffer = malloc(server->tcp_length); 390 if (!server->tcp_buffer) 391 handle_error(channel, i, now); 392 server->tcp_buffer_pos = 0; 393 } 394 } 395 else 396 { 397 /* Read data into the allocated buffer. */ 398 count = sread(server->tcp_socket, 399 server->tcp_buffer + server->tcp_buffer_pos, 400 server->tcp_length - server->tcp_buffer_pos); 401 if (count <= 0) 402 { 403 if (!(count == -1 && try_again(SOCKERRNO))) 404 handle_error(channel, i, now); 405 continue; 406 } 407 408 server->tcp_buffer_pos += (int)count; 409 if (server->tcp_buffer_pos == server->tcp_length) 410 { 411 /* We finished reading this answer; process it and 412 * prepare to read another length word. 413 */ 414 process_answer(channel, server->tcp_buffer, server->tcp_length, 415 i, 1, now); 416 if (server->tcp_buffer) 417 free(server->tcp_buffer); 418 server->tcp_buffer = NULL; 419 server->tcp_lenbuf_pos = 0; 420 server->tcp_buffer_pos = 0; 421 } 422 } 423 } 424 } 425 426 /* If any UDP sockets select true for reading, process them. */ 427 static void read_udp_packets(ares_channel channel, fd_set *read_fds, 428 ares_socket_t read_fd, struct timeval *now) 429 { 430 struct server_state *server; 431 int i; 432 ssize_t count; 433 unsigned char buf[PACKETSZ + 1]; 434 #ifdef HAVE_RECVFROM 435 ares_socklen_t fromlen; 436 union { 437 struct sockaddr sa; 438 struct sockaddr_in sa4; 439 struct sockaddr_in6 sa6; 440 } from; 441 #endif 442 443 if(!read_fds && (read_fd == ARES_SOCKET_BAD)) 444 /* no possible action */ 445 return; 446 447 for (i = 0; i < channel->nservers; i++) 448 { 449 /* Make sure the server has a socket and is selected in read_fds. */ 450 server = &channel->servers[i]; 451 452 if (server->udp_socket == ARES_SOCKET_BAD || server->is_broken) 453 continue; 454 455 if(read_fds) { 456 if(!FD_ISSET(server->udp_socket, read_fds)) 457 continue; 458 } 459 else { 460 if(server->udp_socket != read_fd) 461 continue; 462 } 463 464 if(read_fds) 465 /* If there's an error and we close this socket, then open 466 * another with the same fd to talk to another server, then we 467 * don't want to think that it was the new socket that was 468 * ready. This is not disastrous, but is likely to result in 469 * extra system calls and confusion. */ 470 FD_CLR(server->udp_socket, read_fds); 471 472 /* To reduce event loop overhead, read and process as many 473 * packets as we can. */ 474 do { 475 #ifdef HAVE_RECVFROM 476 if (server->addr.family == AF_INET) 477 fromlen = sizeof(from.sa4); 478 else 479 fromlen = sizeof(from.sa6); 480 count = (ssize_t)recvfrom(server->udp_socket, (void *)buf, sizeof(buf), 481 0, &from.sa, &fromlen); 482 #else 483 count = sread(server->udp_socket, buf, sizeof(buf)); 484 #endif 485 if (count == -1 && try_again(SOCKERRNO)) 486 continue; 487 else if (count <= 0) 488 handle_error(channel, i, now); 489 #ifdef HAVE_RECVFROM 490 else if (!same_address(&from.sa, &server->addr)) 491 /* The address the response comes from does not match 492 * the address we sent the request to. Someone may be 493 * attempting to perform a cache poisoning attack. */ 494 break; 495 #endif 496 else 497 process_answer(channel, buf, (int)count, i, 0, now); 498 } while (count > 0); 499 } 500 } 501 502 /* If any queries have timed out, note the timeout and move them on. */ 503 static void process_timeouts(ares_channel channel, struct timeval *now) 504 { 505 time_t t; /* the time of the timeouts we're processing */ 506 struct query *query; 507 struct list_node* list_head; 508 struct list_node* list_node; 509 510 /* Process all the timeouts that have fired since the last time we 511 * processed timeouts. If things are going well, then we'll have 512 * hundreds/thousands of queries that fall into future buckets, and 513 * only a handful of requests that fall into the "now" bucket, so 514 * this should be quite quick. 515 */ 516 for (t = channel->last_timeout_processed; t <= now->tv_sec; t++) 517 { 518 list_head = &(channel->queries_by_timeout[t % ARES_TIMEOUT_TABLE_SIZE]); 519 for (list_node = list_head->next; list_node != list_head; ) 520 { 521 query = list_node->data; 522 list_node = list_node->next; /* in case the query gets deleted */ 523 if (query->timeout.tv_sec && ares__timedout(now, &query->timeout)) 524 { 525 query->error_status = ARES_ETIMEOUT; 526 ++query->timeouts; 527 next_server(channel, query, now); 528 } 529 } 530 } 531 channel->last_timeout_processed = now->tv_sec; 532 } 533 534 /* Handle an answer from a server. */ 535 static void process_answer(ares_channel channel, unsigned char *abuf, 536 int alen, int whichserver, int tcp, 537 struct timeval *now) 538 { 539 int tc, rcode; 540 unsigned short id; 541 struct query *query; 542 struct list_node* list_head; 543 struct list_node* list_node; 544 545 /* If there's no room in the answer for a header, we can't do much 546 * with it. */ 547 if (alen < HFIXEDSZ) 548 return; 549 550 /* Grab the query ID, truncate bit, and response code from the packet. */ 551 id = DNS_HEADER_QID(abuf); 552 tc = DNS_HEADER_TC(abuf); 553 rcode = DNS_HEADER_RCODE(abuf); 554 555 /* Find the query corresponding to this packet. The queries are 556 * hashed/bucketed by query id, so this lookup should be quick. 557 * Note that both the query id and the questions must be the same; 558 * when the query id wraps around we can have multiple outstanding 559 * queries with the same query id, so we need to check both the id and 560 * question. 561 */ 562 query = NULL; 563 list_head = &(channel->queries_by_qid[id % ARES_QID_TABLE_SIZE]); 564 for (list_node = list_head->next; list_node != list_head; 565 list_node = list_node->next) 566 { 567 struct query *q = list_node->data; 568 if ((q->qid == id) && same_questions(q->qbuf, q->qlen, abuf, alen)) 569 { 570 query = q; 571 break; 572 } 573 } 574 if (!query) 575 return; 576 577 /* If we got a truncated UDP packet and are not ignoring truncation, 578 * don't accept the packet, and switch the query to TCP if we hadn't 579 * done so already. 580 */ 581 if ((tc || alen > PACKETSZ) && !tcp && !(channel->flags & ARES_FLAG_IGNTC)) 582 { 583 if (!query->using_tcp) 584 { 585 query->using_tcp = 1; 586 ares__send_query(channel, query, now); 587 } 588 return; 589 } 590 591 /* Limit alen to PACKETSZ if we aren't using TCP (only relevant if we 592 * are ignoring truncation. 593 */ 594 if (alen > PACKETSZ && !tcp) 595 alen = PACKETSZ; 596 597 /* If we aren't passing through all error packets, discard packets 598 * with SERVFAIL, NOTIMP, or REFUSED response codes. 599 */ 600 if (!(channel->flags & ARES_FLAG_NOCHECKRESP)) 601 { 602 if (rcode == SERVFAIL || rcode == NOTIMP || rcode == REFUSED) 603 { 604 skip_server(channel, query, whichserver); 605 if (query->server == whichserver) 606 next_server(channel, query, now); 607 return; 608 } 609 } 610 611 end_query(channel, query, ARES_SUCCESS, abuf, alen); 612 } 613 614 /* Close all the connections that are no longer usable. */ 615 static void process_broken_connections(ares_channel channel, 616 struct timeval *now) 617 { 618 int i; 619 for (i = 0; i < channel->nservers; i++) 620 { 621 struct server_state *server = &channel->servers[i]; 622 if (server->is_broken) 623 { 624 handle_error(channel, i, now); 625 } 626 } 627 } 628 629 static void handle_error(ares_channel channel, int whichserver, 630 struct timeval *now) 631 { 632 struct server_state *server; 633 struct query *query; 634 struct list_node list_head; 635 struct list_node* list_node; 636 637 server = &channel->servers[whichserver]; 638 639 /* Reset communications with this server. */ 640 ares__close_sockets(channel, server); 641 642 /* Tell all queries talking to this server to move on and not try 643 * this server again. We steal the current list of queries that were 644 * in-flight to this server, since when we call next_server this can 645 * cause the queries to be re-sent to this server, which will 646 * re-insert these queries in that same server->queries_to_server 647 * list. 648 */ 649 ares__init_list_head(&list_head); 650 ares__swap_lists(&list_head, &(server->queries_to_server)); 651 for (list_node = list_head.next; list_node != &list_head; ) 652 { 653 query = list_node->data; 654 list_node = list_node->next; /* in case the query gets deleted */ 655 assert(query->server == whichserver); 656 skip_server(channel, query, whichserver); 657 next_server(channel, query, now); 658 } 659 /* Each query should have removed itself from our temporary list as 660 * it re-sent itself or finished up... 661 */ 662 assert(ares__is_list_empty(&list_head)); 663 } 664 665 static void skip_server(ares_channel channel, struct query *query, 666 int whichserver) { 667 /* The given server gave us problems with this query, so if we have 668 * the luxury of using other servers, then let's skip the 669 * potentially broken server and just use the others. If we only 670 * have one server and we need to retry then we should just go ahead 671 * and re-use that server, since it's our only hope; perhaps we 672 * just got unlucky, and retrying will work (eg, the server timed 673 * out our TCP connection just as we were sending another request). 674 */ 675 if (channel->nservers > 1) 676 { 677 query->server_info[whichserver].skip_server = 1; 678 } 679 } 680 681 static void next_server(ares_channel channel, struct query *query, 682 struct timeval *now) 683 { 684 /* We need to try each server channel->tries times. We have channel->nservers 685 * servers to try. In total, we need to do channel->nservers * channel->tries 686 * attempts. Use query->try to remember how many times we already attempted 687 * this query. Use modular arithmetic to find the next server to try. */ 688 while (++(query->try_count) < (channel->nservers * channel->tries)) 689 { 690 struct server_state *server; 691 692 /* Move on to the next server. */ 693 query->server = (query->server + 1) % channel->nservers; 694 server = &channel->servers[query->server]; 695 696 /* We don't want to use this server if (1) we decided this 697 * connection is broken, and thus about to be closed, (2) 698 * we've decided to skip this server because of earlier 699 * errors we encountered, or (3) we already sent this query 700 * over this exact connection. 701 */ 702 if (!server->is_broken && 703 !query->server_info[query->server].skip_server && 704 !(query->using_tcp && 705 (query->server_info[query->server].tcp_connection_generation == 706 server->tcp_connection_generation))) 707 { 708 ares__send_query(channel, query, now); 709 return; 710 } 711 712 /* You might think that with TCP we only need one try. However, 713 * even when using TCP, servers can time-out our connection just 714 * as we're sending a request, or close our connection because 715 * they die, or never send us a reply because they get wedged or 716 * tickle a bug that drops our request. 717 */ 718 } 719 720 /* If we are here, all attempts to perform query failed. */ 721 end_query(channel, query, query->error_status, NULL, 0); 722 } 723 724 void ares__send_query(ares_channel channel, struct query *query, 725 struct timeval *now) 726 { 727 struct send_request *sendreq; 728 struct server_state *server; 729 int timeplus; 730 731 server = &channel->servers[query->server]; 732 if (query->using_tcp) 733 { 734 /* Make sure the TCP socket for this server is set up and queue 735 * a send request. 736 */ 737 if (server->tcp_socket == ARES_SOCKET_BAD) 738 { 739 if (open_tcp_socket(channel, server) == -1) 740 { 741 skip_server(channel, query, query->server); 742 next_server(channel, query, now); 743 return; 744 } 745 } 746 sendreq = calloc(1, sizeof(struct send_request)); 747 if (!sendreq) 748 { 749 end_query(channel, query, ARES_ENOMEM, NULL, 0); 750 return; 751 } 752 /* To make the common case fast, we avoid copies by using the 753 * query's tcpbuf for as long as the query is alive. In the rare 754 * case where the query ends while it's queued for transmission, 755 * then we give the sendreq its own copy of the request packet 756 * and put it in sendreq->data_storage. 757 */ 758 sendreq->data_storage = NULL; 759 sendreq->data = query->tcpbuf; 760 sendreq->len = query->tcplen; 761 sendreq->owner_query = query; 762 sendreq->next = NULL; 763 if (server->qtail) 764 server->qtail->next = sendreq; 765 else 766 { 767 SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 1); 768 server->qhead = sendreq; 769 } 770 server->qtail = sendreq; 771 query->server_info[query->server].tcp_connection_generation = 772 server->tcp_connection_generation; 773 } 774 else 775 { 776 if (server->udp_socket == ARES_SOCKET_BAD) 777 { 778 if (open_udp_socket(channel, server) == -1) 779 { 780 skip_server(channel, query, query->server); 781 next_server(channel, query, now); 782 return; 783 } 784 } 785 if (swrite(server->udp_socket, query->qbuf, query->qlen) == -1) 786 { 787 /* FIXME: Handle EAGAIN here since it likely can happen. */ 788 skip_server(channel, query, query->server); 789 next_server(channel, query, now); 790 return; 791 } 792 } 793 timeplus = channel->timeout << (query->try_count / channel->nservers); 794 timeplus = (timeplus * (9 + (rand () & 7))) / 16; 795 query->timeout = *now; 796 ares__timeadd(&query->timeout, 797 timeplus); 798 /* Keep track of queries bucketed by timeout, so we can process 799 * timeout events quickly. 800 */ 801 ares__remove_from_list(&(query->queries_by_timeout)); 802 ares__insert_in_list( 803 &(query->queries_by_timeout), 804 &(channel->queries_by_timeout[query->timeout.tv_sec % 805 ARES_TIMEOUT_TABLE_SIZE])); 806 807 /* Keep track of queries bucketed by server, so we can process server 808 * errors quickly. 809 */ 810 ares__remove_from_list(&(query->queries_to_server)); 811 ares__insert_in_list(&(query->queries_to_server), 812 &(server->queries_to_server)); 813 } 814 815 /* 816 * setsocknonblock sets the given socket to either blocking or non-blocking 817 * mode based on the 'nonblock' boolean argument. This function is highly 818 * portable. 819 */ 820 static int setsocknonblock(ares_socket_t sockfd, /* operate on this */ 821 int nonblock /* TRUE or FALSE */) 822 { 823 #if defined(USE_BLOCKING_SOCKETS) 824 825 return 0; /* returns success */ 826 827 #elif defined(HAVE_FCNTL_O_NONBLOCK) 828 829 /* most recent unix versions */ 830 int flags; 831 flags = fcntl(sockfd, F_GETFL, 0); 832 if (FALSE != nonblock) 833 return fcntl(sockfd, F_SETFL, flags | O_NONBLOCK); 834 else 835 return fcntl(sockfd, F_SETFL, flags & (~O_NONBLOCK)); 836 837 #elif defined(HAVE_IOCTL_FIONBIO) 838 839 /* older unix versions */ 840 int flags; 841 flags = nonblock; 842 return ioctl(sockfd, FIONBIO, &flags); 843 844 #elif defined(HAVE_IOCTLSOCKET_FIONBIO) 845 846 #ifdef WATT32 847 char flags; 848 #else 849 /* Windows */ 850 unsigned long flags; 851 #endif 852 flags = nonblock; 853 return ioctlsocket(sockfd, FIONBIO, &flags); 854 855 #elif defined(HAVE_IOCTLSOCKET_CAMEL_FIONBIO) 856 857 /* Amiga */ 858 return IoctlSocket(sockfd, FIONBIO, (long)nonblock); 859 860 #elif defined(HAVE_SETSOCKOPT_SO_NONBLOCK) 861 862 /* BeOS */ 863 long b = nonblock ? 1 : 0; 864 return setsockopt(sockfd, SOL_SOCKET, SO_NONBLOCK, &b, sizeof(b)); 865 866 #else 867 # error "no non-blocking method was found/used/set" 868 #endif 869 } 870 871 static int configure_socket(ares_socket_t s, int family, ares_channel channel) 872 { 873 union { 874 struct sockaddr sa; 875 struct sockaddr_in sa4; 876 struct sockaddr_in6 sa6; 877 } local; 878 879 setsocknonblock(s, TRUE); 880 881 #if defined(FD_CLOEXEC) && !defined(MSDOS) 882 /* Configure the socket fd as close-on-exec. */ 883 if (fcntl(s, F_SETFD, FD_CLOEXEC) == -1) 884 return -1; 885 #endif 886 887 /* Set the socket's send and receive buffer sizes. */ 888 if ((channel->socket_send_buffer_size > 0) && 889 setsockopt(s, SOL_SOCKET, SO_SNDBUF, 890 (void *)&channel->socket_send_buffer_size, 891 sizeof(channel->socket_send_buffer_size)) == -1) 892 return -1; 893 894 if ((channel->socket_receive_buffer_size > 0) && 895 setsockopt(s, SOL_SOCKET, SO_RCVBUF, 896 (void *)&channel->socket_receive_buffer_size, 897 sizeof(channel->socket_receive_buffer_size)) == -1) 898 return -1; 899 900 #ifdef SO_BINDTODEVICE 901 if (channel->local_dev_name[0]) { 902 if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, 903 channel->local_dev_name, sizeof(channel->local_dev_name))) { 904 /* Only root can do this, and usually not fatal if it doesn't work, so */ 905 /* just continue on. */ 906 } 907 } 908 #endif 909 910 if (family == AF_INET) { 911 if (channel->local_ip4) { 912 memset(&local.sa4, 0, sizeof(local.sa4)); 913 local.sa4.sin_family = AF_INET; 914 local.sa4.sin_addr.s_addr = htonl(channel->local_ip4); 915 if (bind(s, &local.sa, sizeof(local.sa4)) < 0) 916 return -1; 917 } 918 } 919 else if (family == AF_INET6) { 920 if (memcmp(channel->local_ip6, &ares_in6addr_any, sizeof(channel->local_ip6)) != 0) { 921 memset(&local.sa6, 0, sizeof(local.sa6)); 922 local.sa6.sin6_family = AF_INET6; 923 memcpy(&local.sa6.sin6_addr, channel->local_ip6, sizeof(channel->local_ip6)); 924 if (bind(s, &local.sa, sizeof(local.sa6)) < 0) 925 return -1; 926 } 927 } 928 929 return 0; 930 } 931 932 static int open_tcp_socket(ares_channel channel, struct server_state *server) 933 { 934 ares_socket_t s; 935 int opt; 936 ares_socklen_t salen; 937 union { 938 struct sockaddr_in sa4; 939 struct sockaddr_in6 sa6; 940 } saddr; 941 struct sockaddr *sa; 942 943 switch (server->addr.family) 944 { 945 case AF_INET: 946 sa = (void *)&saddr.sa4; 947 salen = sizeof(saddr.sa4); 948 memset(sa, 0, salen); 949 saddr.sa4.sin_family = AF_INET; 950 saddr.sa4.sin_port = (unsigned short)(channel->tcp_port & 0xffff); 951 memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4, 952 sizeof(server->addr.addrV4)); 953 break; 954 case AF_INET6: 955 sa = (void *)&saddr.sa6; 956 salen = sizeof(saddr.sa6); 957 memset(sa, 0, salen); 958 saddr.sa6.sin6_family = AF_INET6; 959 saddr.sa6.sin6_port = (unsigned short)(channel->tcp_port & 0xffff); 960 memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6, 961 sizeof(server->addr.addrV6)); 962 break; 963 default: 964 return -1; 965 } 966 967 /* Acquire a socket. */ 968 s = socket(server->addr.family, SOCK_STREAM, 0); 969 if (s == ARES_SOCKET_BAD) 970 return -1; 971 972 /* Configure it. */ 973 if (configure_socket(s, server->addr.family, channel) < 0) 974 { 975 sclose(s); 976 return -1; 977 } 978 979 #ifdef TCP_NODELAY 980 /* 981 * Disable the Nagle algorithm (only relevant for TCP sockets, and thus not 982 * in configure_socket). In general, in DNS lookups we're pretty much 983 * interested in firing off a single request and then waiting for a reply, 984 * so batching isn't very interesting. 985 */ 986 opt = 1; 987 if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY, 988 (void *)&opt, sizeof(opt)) == -1) 989 { 990 sclose(s); 991 return -1; 992 } 993 #endif 994 995 /* Connect to the server. */ 996 if (connect(s, sa, salen) == -1) 997 { 998 int err = SOCKERRNO; 999 1000 if (err != EINPROGRESS && err != EWOULDBLOCK) 1001 { 1002 sclose(s); 1003 return -1; 1004 } 1005 } 1006 1007 if (channel->sock_create_cb) 1008 { 1009 int err = channel->sock_create_cb(s, SOCK_STREAM, 1010 channel->sock_create_cb_data); 1011 if (err < 0) 1012 { 1013 sclose(s); 1014 return err; 1015 } 1016 } 1017 1018 SOCK_STATE_CALLBACK(channel, s, 1, 0); 1019 server->tcp_buffer_pos = 0; 1020 server->tcp_socket = s; 1021 server->tcp_connection_generation = ++channel->tcp_connection_generation; 1022 return 0; 1023 } 1024 1025 static int open_udp_socket(ares_channel channel, struct server_state *server) 1026 { 1027 ares_socket_t s; 1028 ares_socklen_t salen; 1029 union { 1030 struct sockaddr_in sa4; 1031 struct sockaddr_in6 sa6; 1032 } saddr; 1033 struct sockaddr *sa; 1034 1035 switch (server->addr.family) 1036 { 1037 case AF_INET: 1038 sa = (void *)&saddr.sa4; 1039 salen = sizeof(saddr.sa4); 1040 memset(sa, 0, salen); 1041 saddr.sa4.sin_family = AF_INET; 1042 saddr.sa4.sin_port = (unsigned short)(channel->udp_port & 0xffff); 1043 memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4, 1044 sizeof(server->addr.addrV4)); 1045 break; 1046 case AF_INET6: 1047 sa = (void *)&saddr.sa6; 1048 salen = sizeof(saddr.sa6); 1049 memset(sa, 0, salen); 1050 saddr.sa6.sin6_family = AF_INET6; 1051 saddr.sa6.sin6_port = (unsigned short)(channel->udp_port & 0xffff); 1052 memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6, 1053 sizeof(server->addr.addrV6)); 1054 break; 1055 default: 1056 return -1; 1057 } 1058 1059 /* Acquire a socket. */ 1060 s = socket(server->addr.family, SOCK_DGRAM, 0); 1061 if (s == ARES_SOCKET_BAD) 1062 return -1; 1063 1064 /* Set the socket non-blocking. */ 1065 if (configure_socket(s, server->addr.family, channel) < 0) 1066 { 1067 sclose(s); 1068 return -1; 1069 } 1070 1071 /* Connect to the server. */ 1072 if (connect(s, sa, salen) == -1) 1073 { 1074 int err = SOCKERRNO; 1075 1076 if (err != EINPROGRESS && err != EWOULDBLOCK) 1077 { 1078 sclose(s); 1079 return -1; 1080 } 1081 } 1082 1083 if (channel->sock_create_cb) 1084 { 1085 int err = channel->sock_create_cb(s, SOCK_DGRAM, 1086 channel->sock_create_cb_data); 1087 if (err < 0) 1088 { 1089 sclose(s); 1090 return err; 1091 } 1092 } 1093 1094 SOCK_STATE_CALLBACK(channel, s, 1, 0); 1095 1096 server->udp_socket = s; 1097 return 0; 1098 } 1099 1100 static int same_questions(const unsigned char *qbuf, int qlen, 1101 const unsigned char *abuf, int alen) 1102 { 1103 struct { 1104 const unsigned char *p; 1105 int qdcount; 1106 char *name; 1107 long namelen; 1108 int type; 1109 int dnsclass; 1110 } q, a; 1111 int i, j; 1112 1113 if (qlen < HFIXEDSZ || alen < HFIXEDSZ) 1114 return 0; 1115 1116 /* Extract qdcount from the request and reply buffers and compare them. */ 1117 q.qdcount = DNS_HEADER_QDCOUNT(qbuf); 1118 a.qdcount = DNS_HEADER_QDCOUNT(abuf); 1119 if (q.qdcount != a.qdcount) 1120 return 0; 1121 1122 /* For each question in qbuf, find it in abuf. */ 1123 q.p = qbuf + HFIXEDSZ; 1124 for (i = 0; i < q.qdcount; i++) 1125 { 1126 /* Decode the question in the query. */ 1127 if (ares_expand_name(q.p, qbuf, qlen, &q.name, &q.namelen) 1128 != ARES_SUCCESS) 1129 return 0; 1130 q.p += q.namelen; 1131 if (q.p + QFIXEDSZ > qbuf + qlen) 1132 { 1133 free(q.name); 1134 return 0; 1135 } 1136 q.type = DNS_QUESTION_TYPE(q.p); 1137 q.dnsclass = DNS_QUESTION_CLASS(q.p); 1138 q.p += QFIXEDSZ; 1139 1140 /* Search for this question in the answer. */ 1141 a.p = abuf + HFIXEDSZ; 1142 for (j = 0; j < a.qdcount; j++) 1143 { 1144 /* Decode the question in the answer. */ 1145 if (ares_expand_name(a.p, abuf, alen, &a.name, &a.namelen) 1146 != ARES_SUCCESS) 1147 { 1148 free(q.name); 1149 return 0; 1150 } 1151 a.p += a.namelen; 1152 if (a.p + QFIXEDSZ > abuf + alen) 1153 { 1154 free(q.name); 1155 free(a.name); 1156 return 0; 1157 } 1158 a.type = DNS_QUESTION_TYPE(a.p); 1159 a.dnsclass = DNS_QUESTION_CLASS(a.p); 1160 a.p += QFIXEDSZ; 1161 1162 /* Compare the decoded questions. */ 1163 if (strcasecmp(q.name, a.name) == 0 && q.type == a.type 1164 && q.dnsclass == a.dnsclass) 1165 { 1166 free(a.name); 1167 break; 1168 } 1169 free(a.name); 1170 } 1171 1172 free(q.name); 1173 if (j == a.qdcount) 1174 return 0; 1175 } 1176 return 1; 1177 } 1178 1179 static int same_address(struct sockaddr *sa, struct ares_addr *aa) 1180 { 1181 void *addr1; 1182 void *addr2; 1183 1184 if (sa->sa_family == aa->family) 1185 { 1186 switch (aa->family) 1187 { 1188 case AF_INET: 1189 addr1 = &aa->addrV4; 1190 addr2 = &((struct sockaddr_in *)sa)->sin_addr; 1191 if (memcmp(addr1, addr2, sizeof(aa->addrV4)) == 0) 1192 return 1; /* match */ 1193 break; 1194 case AF_INET6: 1195 addr1 = &aa->addrV6; 1196 addr2 = &((struct sockaddr_in6 *)sa)->sin6_addr; 1197 if (memcmp(addr1, addr2, sizeof(aa->addrV6)) == 0) 1198 return 1; /* match */ 1199 break; 1200 default: 1201 break; 1202 } 1203 } 1204 return 0; /* different */ 1205 } 1206 1207 static void end_query (ares_channel channel, struct query *query, int status, 1208 unsigned char *abuf, int alen) 1209 { 1210 int i; 1211 1212 /* First we check to see if this query ended while one of our send 1213 * queues still has pointers to it. 1214 */ 1215 for (i = 0; i < channel->nservers; i++) 1216 { 1217 struct server_state *server = &channel->servers[i]; 1218 struct send_request *sendreq; 1219 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next) 1220 if (sendreq->owner_query == query) 1221 { 1222 sendreq->owner_query = NULL; 1223 assert(sendreq->data_storage == NULL); 1224 if (status == ARES_SUCCESS) 1225 { 1226 /* We got a reply for this query, but this queued 1227 * sendreq points into this soon-to-be-gone query's 1228 * tcpbuf. Probably this means we timed out and queued 1229 * the query for retransmission, then received a 1230 * response before actually retransmitting. This is 1231 * perfectly fine, so we want to keep the connection 1232 * running smoothly if we can. But in the worst case 1233 * we may have sent only some prefix of the query, 1234 * with some suffix of the query left to send. Also, 1235 * the buffer may be queued on multiple queues. To 1236 * prevent dangling pointers to the query's tcpbuf and 1237 * handle these cases, we just give such sendreqs 1238 * their own copy of the query packet. 1239 */ 1240 sendreq->data_storage = malloc(sendreq->len); 1241 if (sendreq->data_storage != NULL) 1242 { 1243 memcpy(sendreq->data_storage, sendreq->data, sendreq->len); 1244 sendreq->data = sendreq->data_storage; 1245 } 1246 } 1247 if ((status != ARES_SUCCESS) || (sendreq->data_storage == NULL)) 1248 { 1249 /* We encountered an error (probably a timeout, 1250 * suggesting the DNS server we're talking to is 1251 * probably unreachable, wedged, or severely 1252 * overloaded) or we couldn't copy the request, so 1253 * mark the connection as broken. When we get to 1254 * process_broken_connections() we'll close the 1255 * connection and try to re-send requests to another 1256 * server. 1257 */ 1258 server->is_broken = 1; 1259 /* Just to be paranoid, zero out this sendreq... */ 1260 sendreq->data = NULL; 1261 sendreq->len = 0; 1262 } 1263 } 1264 } 1265 1266 /* Invoke the callback */ 1267 query->callback(query->arg, status, query->timeouts, abuf, alen); 1268 ares__free_query(query); 1269 1270 /* Simple cleanup policy: if no queries are remaining, close all 1271 * network sockets unless STAYOPEN is set. 1272 */ 1273 if (!(channel->flags & ARES_FLAG_STAYOPEN) && 1274 ares__is_list_empty(&(channel->all_queries))) 1275 { 1276 for (i = 0; i < channel->nservers; i++) 1277 ares__close_sockets(channel, &channel->servers[i]); 1278 } 1279 } 1280 1281 void ares__free_query(struct query *query) 1282 { 1283 /* Remove the query from all the lists in which it is linked */ 1284 ares__remove_from_list(&(query->queries_by_qid)); 1285 ares__remove_from_list(&(query->queries_by_timeout)); 1286 ares__remove_from_list(&(query->queries_to_server)); 1287 ares__remove_from_list(&(query->all_queries)); 1288 /* Zero out some important stuff, to help catch bugs */ 1289 query->callback = NULL; 1290 query->arg = NULL; 1291 /* Deallocate the memory associated with the query */ 1292 free(query->tcpbuf); 1293 free(query->server_info); 1294 free(query); 1295 } 1296