1 /* 2 * Copyright 2000-2003 Niels Provos <provos (at) citi.umich.edu> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. The name of the author may not be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 #ifdef HAVE_CONFIG_H 28 #include "config.h" 29 #endif 30 31 #include <stdint.h> 32 #include <sys/types.h> 33 #include <sys/resource.h> 34 #ifdef HAVE_SYS_TIME_H 35 #include <sys/time.h> 36 #else 37 #include <sys/_libevent_time.h> 38 #endif 39 #include <sys/queue.h> 40 #include <sys/epoll.h> 41 #include <signal.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include <errno.h> 47 #ifdef HAVE_FCNTL_H 48 #include <fcntl.h> 49 #endif 50 51 #include "event.h" 52 #include "event-internal.h" 53 #include "evsignal.h" 54 #include "log.h" 55 56 /* due to limitations in the epoll interface, we need to keep track of 57 * all file descriptors outself. 58 */ 59 struct evepoll { 60 struct event *evread; 61 struct event *evwrite; 62 }; 63 64 struct epollop { 65 struct evepoll *fds; 66 int nfds; 67 struct epoll_event *events; 68 int nevents; 69 int epfd; 70 }; 71 72 static void *epoll_init (struct event_base *); 73 static int epoll_add (void *, struct event *); 74 static int epoll_del (void *, struct event *); 75 static int epoll_dispatch (struct event_base *, void *, struct timeval *); 76 static void epoll_dealloc (struct event_base *, void *); 77 78 const struct eventop epollops = { 79 "epoll", 80 epoll_init, 81 epoll_add, 82 epoll_del, 83 epoll_dispatch, 84 epoll_dealloc, 85 1 /* need reinit */ 86 }; 87 88 #ifdef HAVE_SETFD 89 #define FD_CLOSEONEXEC(x) do { \ 90 if (fcntl(x, F_SETFD, 1) == -1) \ 91 event_warn("fcntl(%d, F_SETFD)", x); \ 92 } while (0) 93 #else 94 #define FD_CLOSEONEXEC(x) 95 #endif 96 97 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout 98 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be 99 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the 100 * largest number of msec we can support here is 2147482. Let's 101 * round that down by 47 seconds. 102 */ 103 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) 104 105 #define INITIAL_NFILES 32 106 #define INITIAL_NEVENTS 32 107 #define MAX_NEVENTS 4096 108 109 static void * 110 epoll_init(struct event_base *base) 111 { 112 int epfd; 113 struct epollop *epollop; 114 115 /* Disable epollueue when this environment variable is set */ 116 if (evutil_getenv("EVENT_NOEPOLL")) 117 return (NULL); 118 119 /* Initalize the kernel queue */ 120 if ((epfd = epoll_create(32000)) == -1) { 121 if (errno != ENOSYS) 122 event_warn("epoll_create"); 123 return (NULL); 124 } 125 126 FD_CLOSEONEXEC(epfd); 127 128 if (!(epollop = calloc(1, sizeof(struct epollop)))) 129 return (NULL); 130 131 epollop->epfd = epfd; 132 133 /* Initalize fields */ 134 epollop->events = malloc(INITIAL_NEVENTS * sizeof(struct epoll_event)); 135 if (epollop->events == NULL) { 136 free(epollop); 137 return (NULL); 138 } 139 epollop->nevents = INITIAL_NEVENTS; 140 141 epollop->fds = calloc(INITIAL_NFILES, sizeof(struct evepoll)); 142 if (epollop->fds == NULL) { 143 free(epollop->events); 144 free(epollop); 145 return (NULL); 146 } 147 epollop->nfds = INITIAL_NFILES; 148 149 evsignal_init(base); 150 151 return (epollop); 152 } 153 154 static int 155 epoll_recalc(struct event_base *base, void *arg, int max) 156 { 157 struct epollop *epollop = arg; 158 159 if (max >= epollop->nfds) { 160 struct evepoll *fds; 161 int nfds; 162 163 nfds = epollop->nfds; 164 while (nfds <= max) 165 nfds <<= 1; 166 167 fds = realloc(epollop->fds, nfds * sizeof(struct evepoll)); 168 if (fds == NULL) { 169 event_warn("realloc"); 170 return (-1); 171 } 172 epollop->fds = fds; 173 memset(fds + epollop->nfds, 0, 174 (nfds - epollop->nfds) * sizeof(struct evepoll)); 175 epollop->nfds = nfds; 176 } 177 178 return (0); 179 } 180 181 static int 182 epoll_dispatch(struct event_base *base, void *arg, struct timeval *tv) 183 { 184 struct epollop *epollop = arg; 185 struct epoll_event *events = epollop->events; 186 struct evepoll *evep; 187 int i, res, timeout = -1; 188 189 if (tv != NULL) 190 timeout = tv->tv_sec * 1000 + (tv->tv_usec + 999) / 1000; 191 192 if (timeout > MAX_EPOLL_TIMEOUT_MSEC) { 193 /* Linux kernels can wait forever if the timeout is too big; 194 * see comment on MAX_EPOLL_TIMEOUT_MSEC. */ 195 timeout = MAX_EPOLL_TIMEOUT_MSEC; 196 } 197 198 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); 199 200 if (res == -1) { 201 if (errno != EINTR) { 202 event_warn("epoll_wait"); 203 return (-1); 204 } 205 206 evsignal_process(base); 207 return (0); 208 } else if (base->sig.evsignal_caught) { 209 evsignal_process(base); 210 } 211 212 event_debug(("%s: epoll_wait reports %d", __func__, res)); 213 214 for (i = 0; i < res; i++) { 215 int what = events[i].events; 216 struct event *evread = NULL, *evwrite = NULL; 217 int fd = events[i].data.fd; 218 219 if (fd < 0 || fd >= epollop->nfds) 220 continue; 221 evep = &epollop->fds[fd]; 222 223 if (what & (EPOLLHUP|EPOLLERR)) { 224 evread = evep->evread; 225 evwrite = evep->evwrite; 226 } else { 227 if (what & EPOLLIN) { 228 evread = evep->evread; 229 } 230 231 if (what & EPOLLOUT) { 232 evwrite = evep->evwrite; 233 } 234 } 235 236 if (!(evread||evwrite)) 237 continue; 238 239 if (evread != NULL) 240 event_active(evread, EV_READ, 1); 241 if (evwrite != NULL) 242 event_active(evwrite, EV_WRITE, 1); 243 } 244 245 if (res == epollop->nevents && epollop->nevents < MAX_NEVENTS) { 246 /* We used all of the event space this time. We should 247 be ready for more events next time. */ 248 int new_nevents = epollop->nevents * 2; 249 struct epoll_event *new_events; 250 251 new_events = realloc(epollop->events, 252 new_nevents * sizeof(struct epoll_event)); 253 if (new_events) { 254 epollop->events = new_events; 255 epollop->nevents = new_nevents; 256 } 257 } 258 259 return (0); 260 } 261 262 263 static int 264 epoll_add(void *arg, struct event *ev) 265 { 266 struct epollop *epollop = arg; 267 struct epoll_event epev = {0, {0}}; 268 struct evepoll *evep; 269 int fd, op, events; 270 271 if (ev->ev_events & EV_SIGNAL) 272 return (evsignal_add(ev)); 273 274 fd = ev->ev_fd; 275 if (fd >= epollop->nfds) { 276 /* Extent the file descriptor array as necessary */ 277 if (epoll_recalc(ev->ev_base, epollop, fd) == -1) 278 return (-1); 279 } 280 evep = &epollop->fds[fd]; 281 op = EPOLL_CTL_ADD; 282 events = 0; 283 if (evep->evread != NULL) { 284 events |= EPOLLIN; 285 op = EPOLL_CTL_MOD; 286 } 287 if (evep->evwrite != NULL) { 288 events |= EPOLLOUT; 289 op = EPOLL_CTL_MOD; 290 } 291 292 if (ev->ev_events & EV_READ) 293 events |= EPOLLIN; 294 if (ev->ev_events & EV_WRITE) 295 events |= EPOLLOUT; 296 297 epev.data.fd = fd; 298 epev.events = events; 299 if (epoll_ctl(epollop->epfd, op, ev->ev_fd, &epev) == -1) 300 return (-1); 301 302 /* Update events responsible */ 303 if (ev->ev_events & EV_READ) 304 evep->evread = ev; 305 if (ev->ev_events & EV_WRITE) 306 evep->evwrite = ev; 307 308 return (0); 309 } 310 311 static int 312 epoll_del(void *arg, struct event *ev) 313 { 314 struct epollop *epollop = arg; 315 struct epoll_event epev = {0, {0}}; 316 struct evepoll *evep; 317 int fd, events, op; 318 int needwritedelete = 1, needreaddelete = 1; 319 320 if (ev->ev_events & EV_SIGNAL) 321 return (evsignal_del(ev)); 322 323 fd = ev->ev_fd; 324 if (fd >= epollop->nfds) 325 return (0); 326 evep = &epollop->fds[fd]; 327 328 op = EPOLL_CTL_DEL; 329 events = 0; 330 331 if (ev->ev_events & EV_READ) 332 events |= EPOLLIN; 333 if (ev->ev_events & EV_WRITE) 334 events |= EPOLLOUT; 335 336 if ((events & (EPOLLIN|EPOLLOUT)) != (EPOLLIN|EPOLLOUT)) { 337 if ((events & EPOLLIN) && evep->evwrite != NULL) { 338 needwritedelete = 0; 339 events = EPOLLOUT; 340 op = EPOLL_CTL_MOD; 341 } else if ((events & EPOLLOUT) && evep->evread != NULL) { 342 needreaddelete = 0; 343 events = EPOLLIN; 344 op = EPOLL_CTL_MOD; 345 } 346 } 347 348 epev.events = events; 349 epev.data.fd = fd; 350 351 if (needreaddelete) 352 evep->evread = NULL; 353 if (needwritedelete) 354 evep->evwrite = NULL; 355 356 if (epoll_ctl(epollop->epfd, op, fd, &epev) == -1) 357 return (-1); 358 359 return (0); 360 } 361 362 static void 363 epoll_dealloc(struct event_base *base, void *arg) 364 { 365 struct epollop *epollop = arg; 366 367 evsignal_dealloc(base); 368 if (epollop->fds) 369 free(epollop->fds); 370 if (epollop->events) 371 free(epollop->events); 372 if (epollop->epfd >= 0) 373 close(epollop->epfd); 374 375 memset(epollop, 0, sizeof(struct epollop)); 376 free(epollop); 377 } 378