Home | History | Annotate | Download | only in libevent
      1 /*
      2  * Copyright 2000-2007 Niels Provos <provos (at) citi.umich.edu>
      3  * Copyright 2007-2012 Niels Provos, Nick Mathewson
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. The name of the author may not be used to endorse or promote products
     14  *    derived from this software without specific prior written permission.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 #include "event2/event-config.h"
     28 
     29 #include <stdint.h>
     30 #include <sys/types.h>
     31 #include <sys/resource.h>
     32 #ifdef _EVENT_HAVE_SYS_TIME_H
     33 #include <sys/time.h>
     34 #endif
     35 #include <sys/queue.h>
     36 #include <sys/epoll.h>
     37 #include <signal.h>
     38 #include <limits.h>
     39 #include <stdio.h>
     40 #include <stdlib.h>
     41 #include <string.h>
     42 #include <unistd.h>
     43 #include <errno.h>
     44 #ifdef _EVENT_HAVE_FCNTL_H
     45 #include <fcntl.h>
     46 #endif
     47 
     48 #include "event-internal.h"
     49 #include "evsignal-internal.h"
     50 #include "event2/thread.h"
     51 #include "evthread-internal.h"
     52 #include "log-internal.h"
     53 #include "evmap-internal.h"
     54 #include "changelist-internal.h"
     55 
     56 struct epollop {
     57 	struct epoll_event *events;
     58 	int nevents;
     59 	int epfd;
     60 };
     61 
     62 static void *epoll_init(struct event_base *);
     63 static int epoll_dispatch(struct event_base *, struct timeval *);
     64 static void epoll_dealloc(struct event_base *);
     65 
     66 static const struct eventop epollops_changelist = {
     67 	"epoll (with changelist)",
     68 	epoll_init,
     69 	event_changelist_add,
     70 	event_changelist_del,
     71 	epoll_dispatch,
     72 	epoll_dealloc,
     73 	1, /* need reinit */
     74 	EV_FEATURE_ET|EV_FEATURE_O1,
     75 	EVENT_CHANGELIST_FDINFO_SIZE
     76 };
     77 
     78 
     79 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
     80     short old, short events, void *p);
     81 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
     82     short old, short events, void *p);
     83 
     84 const struct eventop epollops = {
     85 	"epoll",
     86 	epoll_init,
     87 	epoll_nochangelist_add,
     88 	epoll_nochangelist_del,
     89 	epoll_dispatch,
     90 	epoll_dealloc,
     91 	1, /* need reinit */
     92 	EV_FEATURE_ET|EV_FEATURE_O1,
     93 	0
     94 };
     95 
     96 #define INITIAL_NEVENT 32
     97 #define MAX_NEVENT 4096
     98 
     99 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
    100  * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
    101  * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
    102  * largest number of msec we can support here is 2147482.  Let's
    103  * round that down by 47 seconds.
    104  */
    105 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
    106 
    107 static void *
    108 epoll_init(struct event_base *base)
    109 {
    110 	int epfd;
    111 	struct epollop *epollop;
    112 
    113 	/* Initialize the kernel queue.  (The size field is ignored since
    114 	 * 2.6.8.) */
    115 	if ((epfd = epoll_create(32000)) == -1) {
    116 		if (errno != ENOSYS)
    117 			event_warn("epoll_create");
    118 		return (NULL);
    119 	}
    120 
    121 	evutil_make_socket_closeonexec(epfd);
    122 
    123 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
    124 		close(epfd);
    125 		return (NULL);
    126 	}
    127 
    128 	epollop->epfd = epfd;
    129 
    130 	/* Initialize fields */
    131 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
    132 	if (epollop->events == NULL) {
    133 		mm_free(epollop);
    134 		close(epfd);
    135 		return (NULL);
    136 	}
    137 	epollop->nevents = INITIAL_NEVENT;
    138 
    139 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
    140 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
    141 		evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
    142 		base->evsel = &epollops_changelist;
    143 
    144 	evsig_init(base);
    145 
    146 	return (epollop);
    147 }
    148 
    149 static const char *
    150 change_to_string(int change)
    151 {
    152 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
    153 	if (change == EV_CHANGE_ADD) {
    154 		return "add";
    155 	} else if (change == EV_CHANGE_DEL) {
    156 		return "del";
    157 	} else if (change == 0) {
    158 		return "none";
    159 	} else {
    160 		return "???";
    161 	}
    162 }
    163 
    164 static const char *
    165 epoll_op_to_string(int op)
    166 {
    167 	return op == EPOLL_CTL_ADD?"ADD":
    168 	    op == EPOLL_CTL_DEL?"DEL":
    169 	    op == EPOLL_CTL_MOD?"MOD":
    170 	    "???";
    171 }
    172 
    173 static int
    174 epoll_apply_one_change(struct event_base *base,
    175     struct epollop *epollop,
    176     const struct event_change *ch)
    177 {
    178 	struct epoll_event epev;
    179 	int op, events = 0;
    180 
    181 	if (1) {
    182 		/* The logic here is a little tricky.  If we had no events set
    183 		   on the fd before, we need to set op="ADD" and set
    184 		   events=the events we want to add.  If we had any events set
    185 		   on the fd before, and we want any events to remain on the
    186 		   fd, we need to say op="MOD" and set events=the events we
    187 		   want to remain.  But if we want to delete the last event,
    188 		   we say op="DEL" and set events=the remaining events.  What
    189 		   fun!
    190 		*/
    191 
    192 		/* TODO: Turn this into a switch or a table lookup. */
    193 
    194 		if ((ch->read_change & EV_CHANGE_ADD) ||
    195 		    (ch->write_change & EV_CHANGE_ADD)) {
    196 			/* If we are adding anything at all, we'll want to do
    197 			 * either an ADD or a MOD. */
    198 			events = 0;
    199 			op = EPOLL_CTL_ADD;
    200 			if (ch->read_change & EV_CHANGE_ADD) {
    201 				events |= EPOLLIN;
    202 			} else if (ch->read_change & EV_CHANGE_DEL) {
    203 				;
    204 			} else if (ch->old_events & EV_READ) {
    205 				events |= EPOLLIN;
    206 			}
    207 			if (ch->write_change & EV_CHANGE_ADD) {
    208 				events |= EPOLLOUT;
    209 			} else if (ch->write_change & EV_CHANGE_DEL) {
    210 				;
    211 			} else if (ch->old_events & EV_WRITE) {
    212 				events |= EPOLLOUT;
    213 			}
    214 			if ((ch->read_change|ch->write_change) & EV_ET)
    215 				events |= EPOLLET;
    216 
    217 			if (ch->old_events) {
    218 				/* If MOD fails, we retry as an ADD, and if
    219 				 * ADD fails we will retry as a MOD.  So the
    220 				 * only hard part here is to guess which one
    221 				 * will work.  As a heuristic, we'll try
    222 				 * MOD first if we think there were old
    223 				 * events and ADD if we think there were none.
    224 				 *
    225 				 * We can be wrong about the MOD if the file
    226 				 * has in fact been closed and re-opened.
    227 				 *
    228 				 * We can be wrong about the ADD if the
    229 				 * the fd has been re-created with a dup()
    230 				 * of the same file that it was before.
    231 				 */
    232 				op = EPOLL_CTL_MOD;
    233 			}
    234 		} else if ((ch->read_change & EV_CHANGE_DEL) ||
    235 		    (ch->write_change & EV_CHANGE_DEL)) {
    236 			/* If we're deleting anything, we'll want to do a MOD
    237 			 * or a DEL. */
    238 			op = EPOLL_CTL_DEL;
    239 
    240 			if (ch->read_change & EV_CHANGE_DEL) {
    241 				if (ch->write_change & EV_CHANGE_DEL) {
    242 					events = EPOLLIN|EPOLLOUT;
    243 				} else if (ch->old_events & EV_WRITE) {
    244 					events = EPOLLOUT;
    245 					op = EPOLL_CTL_MOD;
    246 				} else {
    247 					events = EPOLLIN;
    248 				}
    249 			} else if (ch->write_change & EV_CHANGE_DEL) {
    250 				if (ch->old_events & EV_READ) {
    251 					events = EPOLLIN;
    252 					op = EPOLL_CTL_MOD;
    253 				} else {
    254 					events = EPOLLOUT;
    255 				}
    256 			}
    257 		}
    258 
    259 		if (!events)
    260 			return 0;
    261 
    262 		memset(&epev, 0, sizeof(epev));
    263 		epev.data.fd = ch->fd;
    264 		epev.events = events;
    265 		if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
    266 			if (op == EPOLL_CTL_MOD && errno == ENOENT) {
    267 				/* If a MOD operation fails with ENOENT, the
    268 				 * fd was probably closed and re-opened.  We
    269 				 * should retry the operation as an ADD.
    270 				 */
    271 				if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
    272 					event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
    273 					    (int)epev.events, ch->fd);
    274 					return -1;
    275 				} else {
    276 					event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
    277 						(int)epev.events,
    278 						ch->fd));
    279 				}
    280 			} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
    281 				/* If an ADD operation fails with EEXIST,
    282 				 * either the operation was redundant (as with a
    283 				 * precautionary add), or we ran into a fun
    284 				 * kernel bug where using dup*() to duplicate the
    285 				 * same file into the same fd gives you the same epitem
    286 				 * rather than a fresh one.  For the second case,
    287 				 * we must retry with MOD. */
    288 				if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
    289 					event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
    290 					    (int)epev.events, ch->fd);
    291 					return -1;
    292 				} else {
    293 					event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
    294 						(int)epev.events,
    295 						ch->fd));
    296 				}
    297 			} else if (op == EPOLL_CTL_DEL &&
    298 			    (errno == ENOENT || errno == EBADF ||
    299 				errno == EPERM)) {
    300 				/* If a delete fails with one of these errors,
    301 				 * that's fine too: we closed the fd before we
    302 				 * got around to calling epoll_dispatch. */
    303 				event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
    304 					(int)epev.events,
    305 					ch->fd,
    306 					strerror(errno)));
    307 			} else {
    308 				event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
    309 				    epoll_op_to_string(op),
    310 				    (int)epev.events,
    311 				    ch->fd,
    312 				    ch->old_events,
    313 				    ch->read_change,
    314 				    change_to_string(ch->read_change),
    315 				    ch->write_change,
    316 				    change_to_string(ch->write_change));
    317 				return -1;
    318 			}
    319 		} else {
    320 			event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
    321 				epoll_op_to_string(op),
    322 				(int)epev.events,
    323 				(int)ch->fd,
    324 				ch->old_events,
    325 				ch->read_change,
    326 				ch->write_change));
    327 		}
    328 	}
    329 	return 0;
    330 }
    331 
    332 static int
    333 epoll_apply_changes(struct event_base *base)
    334 {
    335 	struct event_changelist *changelist = &base->changelist;
    336 	struct epollop *epollop = base->evbase;
    337 	struct event_change *ch;
    338 
    339 	int r = 0;
    340 	int i;
    341 
    342 	for (i = 0; i < changelist->n_changes; ++i) {
    343 		ch = &changelist->changes[i];
    344 		if (epoll_apply_one_change(base, epollop, ch) < 0)
    345 			r = -1;
    346 	}
    347 
    348 	return (r);
    349 }
    350 
    351 static int
    352 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    353     short old, short events, void *p)
    354 {
    355 	struct event_change ch;
    356 	ch.fd = fd;
    357 	ch.old_events = old;
    358 	ch.read_change = ch.write_change = 0;
    359 	if (events & EV_WRITE)
    360 		ch.write_change = EV_CHANGE_ADD |
    361 		    (events & EV_ET);
    362 	if (events & EV_READ)
    363 		ch.read_change = EV_CHANGE_ADD |
    364 		    (events & EV_ET);
    365 
    366 	return epoll_apply_one_change(base, base->evbase, &ch);
    367 }
    368 
    369 static int
    370 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    371     short old, short events, void *p)
    372 {
    373 	struct event_change ch;
    374 	ch.fd = fd;
    375 	ch.old_events = old;
    376 	ch.read_change = ch.write_change = 0;
    377 	if (events & EV_WRITE)
    378 		ch.write_change = EV_CHANGE_DEL;
    379 	if (events & EV_READ)
    380 		ch.read_change = EV_CHANGE_DEL;
    381 
    382 	return epoll_apply_one_change(base, base->evbase, &ch);
    383 }
    384 
    385 static int
    386 epoll_dispatch(struct event_base *base, struct timeval *tv)
    387 {
    388 	struct epollop *epollop = base->evbase;
    389 	struct epoll_event *events = epollop->events;
    390 	int i, res;
    391 	long timeout = -1;
    392 
    393 	if (tv != NULL) {
    394 		timeout = evutil_tv_to_msec(tv);
    395 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
    396 			/* Linux kernels can wait forever if the timeout is
    397 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
    398 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
    399 		}
    400 	}
    401 
    402 	epoll_apply_changes(base);
    403 	event_changelist_remove_all(&base->changelist, base);
    404 
    405 	EVBASE_RELEASE_LOCK(base, th_base_lock);
    406 
    407 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
    408 
    409 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
    410 
    411 	if (res == -1) {
    412 		if (errno != EINTR) {
    413 			event_warn("epoll_wait");
    414 			return (-1);
    415 		}
    416 
    417 		return (0);
    418 	}
    419 
    420 	event_debug(("%s: epoll_wait reports %d", __func__, res));
    421 	EVUTIL_ASSERT(res <= epollop->nevents);
    422 
    423 	for (i = 0; i < res; i++) {
    424 		int what = events[i].events;
    425 		short ev = 0;
    426 
    427 		if (what & (EPOLLHUP|EPOLLERR)) {
    428 			ev = EV_READ | EV_WRITE;
    429 		} else {
    430 			if (what & EPOLLIN)
    431 				ev |= EV_READ;
    432 			if (what & EPOLLOUT)
    433 				ev |= EV_WRITE;
    434 		}
    435 
    436 		if (!ev)
    437 			continue;
    438 
    439 		evmap_io_active(base, events[i].data.fd, ev | EV_ET);
    440 	}
    441 
    442 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
    443 		/* We used all of the event space this time.  We should
    444 		   be ready for more events next time. */
    445 		int new_nevents = epollop->nevents * 2;
    446 		struct epoll_event *new_events;
    447 
    448 		new_events = mm_realloc(epollop->events,
    449 		    new_nevents * sizeof(struct epoll_event));
    450 		if (new_events) {
    451 			epollop->events = new_events;
    452 			epollop->nevents = new_nevents;
    453 		}
    454 	}
    455 
    456 	return (0);
    457 }
    458 
    459 
    460 static void
    461 epoll_dealloc(struct event_base *base)
    462 {
    463 	struct epollop *epollop = base->evbase;
    464 
    465 	evsig_dealloc(base);
    466 	if (epollop->events)
    467 		mm_free(epollop->events);
    468 	if (epollop->epfd >= 0)
    469 		close(epollop->epfd);
    470 
    471 	memset(epollop, 0, sizeof(struct epollop));
    472 	mm_free(epollop);
    473 }
    474