1 /* 2 * Copyright 2000-2007 Niels Provos <provos (at) citi.umich.edu> 3 * Copyright 2007-2012 Niels Provos, Nick Mathewson 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. The name of the author may not be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 #include "event2/event-config.h" 28 29 #include <stdint.h> 30 #include <sys/types.h> 31 #include <sys/resource.h> 32 #ifdef _EVENT_HAVE_SYS_TIME_H 33 #include <sys/time.h> 34 #endif 35 #include <sys/queue.h> 36 #include <sys/epoll.h> 37 #include <signal.h> 38 #include <limits.h> 39 #include <stdio.h> 40 #include <stdlib.h> 41 #include <string.h> 42 #include <unistd.h> 43 #include <errno.h> 44 #ifdef _EVENT_HAVE_FCNTL_H 45 #include <fcntl.h> 46 #endif 47 48 #include "event-internal.h" 49 #include "evsignal-internal.h" 50 #include "event2/thread.h" 51 #include "evthread-internal.h" 52 #include "log-internal.h" 53 #include "evmap-internal.h" 54 #include "changelist-internal.h" 55 56 struct epollop { 57 struct epoll_event *events; 58 int nevents; 59 int epfd; 60 }; 61 62 static void *epoll_init(struct event_base *); 63 static int epoll_dispatch(struct event_base *, struct timeval *); 64 static void epoll_dealloc(struct event_base *); 65 66 static const struct eventop epollops_changelist = { 67 "epoll (with changelist)", 68 epoll_init, 69 event_changelist_add, 70 event_changelist_del, 71 epoll_dispatch, 72 epoll_dealloc, 73 1, /* need reinit */ 74 EV_FEATURE_ET|EV_FEATURE_O1, 75 EVENT_CHANGELIST_FDINFO_SIZE 76 }; 77 78 79 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 80 short old, short events, void *p); 81 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 82 short old, short events, void *p); 83 84 const struct eventop epollops = { 85 "epoll", 86 epoll_init, 87 epoll_nochangelist_add, 88 epoll_nochangelist_del, 89 epoll_dispatch, 90 epoll_dealloc, 91 1, /* need reinit */ 92 EV_FEATURE_ET|EV_FEATURE_O1, 93 0 94 }; 95 96 #define INITIAL_NEVENT 32 97 #define MAX_NEVENT 4096 98 99 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout 100 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be 101 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the 102 * largest number of msec we can support here is 2147482. Let's 103 * round that down by 47 seconds. 104 */ 105 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) 106 107 static void * 108 epoll_init(struct event_base *base) 109 { 110 int epfd; 111 struct epollop *epollop; 112 113 /* Initialize the kernel queue. (The size field is ignored since 114 * 2.6.8.) */ 115 if ((epfd = epoll_create(32000)) == -1) { 116 if (errno != ENOSYS) 117 event_warn("epoll_create"); 118 return (NULL); 119 } 120 121 evutil_make_socket_closeonexec(epfd); 122 123 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) { 124 close(epfd); 125 return (NULL); 126 } 127 128 epollop->epfd = epfd; 129 130 /* Initialize fields */ 131 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event)); 132 if (epollop->events == NULL) { 133 mm_free(epollop); 134 close(epfd); 135 return (NULL); 136 } 137 epollop->nevents = INITIAL_NEVENT; 138 139 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 || 140 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 && 141 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL)) 142 base->evsel = &epollops_changelist; 143 144 evsig_init(base); 145 146 return (epollop); 147 } 148 149 static const char * 150 change_to_string(int change) 151 { 152 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL); 153 if (change == EV_CHANGE_ADD) { 154 return "add"; 155 } else if (change == EV_CHANGE_DEL) { 156 return "del"; 157 } else if (change == 0) { 158 return "none"; 159 } else { 160 return "???"; 161 } 162 } 163 164 static const char * 165 epoll_op_to_string(int op) 166 { 167 return op == EPOLL_CTL_ADD?"ADD": 168 op == EPOLL_CTL_DEL?"DEL": 169 op == EPOLL_CTL_MOD?"MOD": 170 "???"; 171 } 172 173 static int 174 epoll_apply_one_change(struct event_base *base, 175 struct epollop *epollop, 176 const struct event_change *ch) 177 { 178 struct epoll_event epev; 179 int op, events = 0; 180 181 if (1) { 182 /* The logic here is a little tricky. If we had no events set 183 on the fd before, we need to set op="ADD" and set 184 events=the events we want to add. If we had any events set 185 on the fd before, and we want any events to remain on the 186 fd, we need to say op="MOD" and set events=the events we 187 want to remain. But if we want to delete the last event, 188 we say op="DEL" and set events=the remaining events. What 189 fun! 190 */ 191 192 /* TODO: Turn this into a switch or a table lookup. */ 193 194 if ((ch->read_change & EV_CHANGE_ADD) || 195 (ch->write_change & EV_CHANGE_ADD)) { 196 /* If we are adding anything at all, we'll want to do 197 * either an ADD or a MOD. */ 198 events = 0; 199 op = EPOLL_CTL_ADD; 200 if (ch->read_change & EV_CHANGE_ADD) { 201 events |= EPOLLIN; 202 } else if (ch->read_change & EV_CHANGE_DEL) { 203 ; 204 } else if (ch->old_events & EV_READ) { 205 events |= EPOLLIN; 206 } 207 if (ch->write_change & EV_CHANGE_ADD) { 208 events |= EPOLLOUT; 209 } else if (ch->write_change & EV_CHANGE_DEL) { 210 ; 211 } else if (ch->old_events & EV_WRITE) { 212 events |= EPOLLOUT; 213 } 214 if ((ch->read_change|ch->write_change) & EV_ET) 215 events |= EPOLLET; 216 217 if (ch->old_events) { 218 /* If MOD fails, we retry as an ADD, and if 219 * ADD fails we will retry as a MOD. So the 220 * only hard part here is to guess which one 221 * will work. As a heuristic, we'll try 222 * MOD first if we think there were old 223 * events and ADD if we think there were none. 224 * 225 * We can be wrong about the MOD if the file 226 * has in fact been closed and re-opened. 227 * 228 * We can be wrong about the ADD if the 229 * the fd has been re-created with a dup() 230 * of the same file that it was before. 231 */ 232 op = EPOLL_CTL_MOD; 233 } 234 } else if ((ch->read_change & EV_CHANGE_DEL) || 235 (ch->write_change & EV_CHANGE_DEL)) { 236 /* If we're deleting anything, we'll want to do a MOD 237 * or a DEL. */ 238 op = EPOLL_CTL_DEL; 239 240 if (ch->read_change & EV_CHANGE_DEL) { 241 if (ch->write_change & EV_CHANGE_DEL) { 242 events = EPOLLIN|EPOLLOUT; 243 } else if (ch->old_events & EV_WRITE) { 244 events = EPOLLOUT; 245 op = EPOLL_CTL_MOD; 246 } else { 247 events = EPOLLIN; 248 } 249 } else if (ch->write_change & EV_CHANGE_DEL) { 250 if (ch->old_events & EV_READ) { 251 events = EPOLLIN; 252 op = EPOLL_CTL_MOD; 253 } else { 254 events = EPOLLOUT; 255 } 256 } 257 } 258 259 if (!events) 260 return 0; 261 262 memset(&epev, 0, sizeof(epev)); 263 epev.data.fd = ch->fd; 264 epev.events = events; 265 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) { 266 if (op == EPOLL_CTL_MOD && errno == ENOENT) { 267 /* If a MOD operation fails with ENOENT, the 268 * fd was probably closed and re-opened. We 269 * should retry the operation as an ADD. 270 */ 271 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) { 272 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too", 273 (int)epev.events, ch->fd); 274 return -1; 275 } else { 276 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.", 277 (int)epev.events, 278 ch->fd)); 279 } 280 } else if (op == EPOLL_CTL_ADD && errno == EEXIST) { 281 /* If an ADD operation fails with EEXIST, 282 * either the operation was redundant (as with a 283 * precautionary add), or we ran into a fun 284 * kernel bug where using dup*() to duplicate the 285 * same file into the same fd gives you the same epitem 286 * rather than a fresh one. For the second case, 287 * we must retry with MOD. */ 288 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { 289 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", 290 (int)epev.events, ch->fd); 291 return -1; 292 } else { 293 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", 294 (int)epev.events, 295 ch->fd)); 296 } 297 } else if (op == EPOLL_CTL_DEL && 298 (errno == ENOENT || errno == EBADF || 299 errno == EPERM)) { 300 /* If a delete fails with one of these errors, 301 * that's fine too: we closed the fd before we 302 * got around to calling epoll_dispatch. */ 303 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.", 304 (int)epev.events, 305 ch->fd, 306 strerror(errno))); 307 } else { 308 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)", 309 epoll_op_to_string(op), 310 (int)epev.events, 311 ch->fd, 312 ch->old_events, 313 ch->read_change, 314 change_to_string(ch->read_change), 315 ch->write_change, 316 change_to_string(ch->write_change)); 317 return -1; 318 } 319 } else { 320 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]", 321 epoll_op_to_string(op), 322 (int)epev.events, 323 (int)ch->fd, 324 ch->old_events, 325 ch->read_change, 326 ch->write_change)); 327 } 328 } 329 return 0; 330 } 331 332 static int 333 epoll_apply_changes(struct event_base *base) 334 { 335 struct event_changelist *changelist = &base->changelist; 336 struct epollop *epollop = base->evbase; 337 struct event_change *ch; 338 339 int r = 0; 340 int i; 341 342 for (i = 0; i < changelist->n_changes; ++i) { 343 ch = &changelist->changes[i]; 344 if (epoll_apply_one_change(base, epollop, ch) < 0) 345 r = -1; 346 } 347 348 return (r); 349 } 350 351 static int 352 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 353 short old, short events, void *p) 354 { 355 struct event_change ch; 356 ch.fd = fd; 357 ch.old_events = old; 358 ch.read_change = ch.write_change = 0; 359 if (events & EV_WRITE) 360 ch.write_change = EV_CHANGE_ADD | 361 (events & EV_ET); 362 if (events & EV_READ) 363 ch.read_change = EV_CHANGE_ADD | 364 (events & EV_ET); 365 366 return epoll_apply_one_change(base, base->evbase, &ch); 367 } 368 369 static int 370 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 371 short old, short events, void *p) 372 { 373 struct event_change ch; 374 ch.fd = fd; 375 ch.old_events = old; 376 ch.read_change = ch.write_change = 0; 377 if (events & EV_WRITE) 378 ch.write_change = EV_CHANGE_DEL; 379 if (events & EV_READ) 380 ch.read_change = EV_CHANGE_DEL; 381 382 return epoll_apply_one_change(base, base->evbase, &ch); 383 } 384 385 static int 386 epoll_dispatch(struct event_base *base, struct timeval *tv) 387 { 388 struct epollop *epollop = base->evbase; 389 struct epoll_event *events = epollop->events; 390 int i, res; 391 long timeout = -1; 392 393 if (tv != NULL) { 394 timeout = evutil_tv_to_msec(tv); 395 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) { 396 /* Linux kernels can wait forever if the timeout is 397 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */ 398 timeout = MAX_EPOLL_TIMEOUT_MSEC; 399 } 400 } 401 402 epoll_apply_changes(base); 403 event_changelist_remove_all(&base->changelist, base); 404 405 EVBASE_RELEASE_LOCK(base, th_base_lock); 406 407 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); 408 409 EVBASE_ACQUIRE_LOCK(base, th_base_lock); 410 411 if (res == -1) { 412 if (errno != EINTR) { 413 event_warn("epoll_wait"); 414 return (-1); 415 } 416 417 return (0); 418 } 419 420 event_debug(("%s: epoll_wait reports %d", __func__, res)); 421 EVUTIL_ASSERT(res <= epollop->nevents); 422 423 for (i = 0; i < res; i++) { 424 int what = events[i].events; 425 short ev = 0; 426 427 if (what & (EPOLLHUP|EPOLLERR)) { 428 ev = EV_READ | EV_WRITE; 429 } else { 430 if (what & EPOLLIN) 431 ev |= EV_READ; 432 if (what & EPOLLOUT) 433 ev |= EV_WRITE; 434 } 435 436 if (!ev) 437 continue; 438 439 evmap_io_active(base, events[i].data.fd, ev | EV_ET); 440 } 441 442 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) { 443 /* We used all of the event space this time. We should 444 be ready for more events next time. */ 445 int new_nevents = epollop->nevents * 2; 446 struct epoll_event *new_events; 447 448 new_events = mm_realloc(epollop->events, 449 new_nevents * sizeof(struct epoll_event)); 450 if (new_events) { 451 epollop->events = new_events; 452 epollop->nevents = new_nevents; 453 } 454 } 455 456 return (0); 457 } 458 459 460 static void 461 epoll_dealloc(struct event_base *base) 462 { 463 struct epollop *epollop = base->evbase; 464 465 evsig_dealloc(base); 466 if (epollop->events) 467 mm_free(epollop->events); 468 if (epollop->epfd >= 0) 469 close(epollop->epfd); 470 471 memset(epollop, 0, sizeof(struct epollop)); 472 mm_free(epollop); 473 } 474