[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] Switch from select() to poll() in xenconsoled's IO loop



In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.

pollfd array is dynamically allocated / reallocated. If the array fails to
expand, we just ignore the incoming fd.

Change from V2:
  * remove unnecessary malloc in initialize_pollfd_arrays
  * use ROUND_UP to get new size of arrays

Change from V3:
  * remove initialize and destroy function for array
  * embedded tracking structure in struct domain, eliminate fd_to_pollfd

Change from V4:
  * make xs_pollfd local to io.c
  * add back the 5 ms fuzz
  * handle POLLERR and POLLHUP
  * treat POLLPRI as error if set in revents
  * abort if xenconsoled's own fds get screwed
  * handle broken tty if tty's fds get screwed

Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
---
 tools/console/daemon/io.c |  189 +++++++++++++++++++++++++++++++--------------
 1 file changed, 131 insertions(+), 58 deletions(-)

diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..8d16cac 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
-#include <sys/select.h>
+#include <poll.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <termios.h>
@@ -69,6 +69,7 @@ static int log_hv_fd = -1;
 static evtchn_port_or_error_t log_hv_evtchn = -1;
 static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
 static xc_evtchn *xce_handle = NULL;
+static struct pollfd *xce_pollfd = NULL;
 
 struct buffer {
        char *data;
@@ -81,7 +82,9 @@ struct buffer {
 struct domain {
        int domid;
        int master_fd;
+       struct pollfd *master_pollfd;
        int slave_fd;
+       struct pollfd *slave_pollfd;
        int log_fd;
        bool is_dead;
        unsigned last_seen;
@@ -92,6 +95,7 @@ struct domain {
        evtchn_port_or_error_t local_port;
        evtchn_port_or_error_t remote_port;
        xc_evtchn *xce_handle;
+       struct pollfd *xce_pollfd;
        struct xencons_interface *interface;
        int event_count;
        long long next_period;
@@ -769,6 +773,17 @@ static int ring_free_bytes(struct domain *dom)
        return (sizeof(intf->in) - space);
 }
 
+static void domain_handle_broken_tty(struct domain *dom, int recreate)
+{
+       domain_close_tty(dom);
+
+       if (recreate) {
+               domain_create_tty(dom);
+       } else {
+               shutdown_domain(dom);
+       }
+}
+
 static void handle_tty_read(struct domain *dom)
 {
        ssize_t len = 0;
@@ -794,13 +809,7 @@ static void handle_tty_read(struct domain *dom)
         * keep the slave open for the duration.
         */
        if (len < 0) {
-               domain_close_tty(dom);
-
-               if (domain_is_valid(dom->domid)) {
-                       domain_create_tty(dom);
-               } else {
-                       shutdown_domain(dom);
-               }
+               domain_handle_broken_tty(dom, domain_is_valid(dom->domid));
        } else if (domain_is_valid(dom->domid)) {
                prod = intf->in_prod;
                for (i = 0; i < len; i++) {
@@ -828,14 +837,7 @@ static void handle_tty_write(struct domain *dom)
        if (len < 1) {
                dolog(LOG_DEBUG, "Write failed on domain %d: %zd, %d\n",
                      dom->domid, len, errno);
-
-               domain_close_tty(dom);
-
-               if (domain_is_valid(dom->domid)) {
-                       domain_create_tty(dom);
-               } else {
-                       shutdown_domain(dom);
-               }
+               domain_handle_broken_tty(dom, domain_is_valid(dom->domid));
        } else {
                buffer_advance(&dom->buffer, len);
        }
@@ -928,9 +930,53 @@ static void handle_log_reload(void)
        }
 }
 
+struct pollfd *xs_pollfd;
+static struct pollfd  *fds;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
+static struct pollfd *set_fds(int fd, short events)
+{
+       struct pollfd *ret;
+       if (current_array_size < nr_fds + 1) {
+               struct pollfd  *new_fds = NULL;
+               unsigned long newsize;
+
+               /* Round up to 2^8 boundary, in practice this just
+                * make newsize larger than current_array_size.
+                */
+               newsize = ROUNDUP(nr_fds + 1, 8);
+
+               new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
+               if (!new_fds)
+                       goto fail;
+               fds = new_fds;
+
+               memset(&fds[0] + current_array_size, 0,
+                      sizeof(struct pollfd) * (newsize-current_array_size));
+               current_array_size = newsize;
+       }
+
+       fds[nr_fds].fd = fd;
+       fds[nr_fds].events = events;
+       ret = &fds[nr_fds];
+       nr_fds++;
+
+       return ret;
+fail:
+       dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+       return NULL;
+}
+
+static void reset_fds(void)
+{
+       nr_fds = 0;
+       memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+}
+
 void handle_io(void)
 {
-       fd_set readfds, writefds;
        int ret;
 
        if (log_hv) {
@@ -959,21 +1005,17 @@ void handle_io(void)
 
        for (;;) {
                struct domain *d, *n;
-               int max_fd = -1;
-               struct timeval timeout;
+               int poll_timeout; /* timeout in milliseconds */
                struct timespec ts;
                long long now, next_timeout = 0;
 
-               FD_ZERO(&readfds);
-               FD_ZERO(&writefds);
+               reset_fds();
 
-               FD_SET(xs_fileno(xs), &readfds);
-               max_fd = MAX(xs_fileno(xs), max_fd);
+               xs_pollfd = set_fds(xs_fileno(xs), POLLIN|POLLPRI);
 
-               if (log_hv) {
-                       FD_SET(xc_evtchn_fd(xce_handle), &readfds);
-                       max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
-               }
+               if (log_hv)
+                       xce_pollfd = set_fds(xc_evtchn_fd(xce_handle),
+                                            POLLIN|POLLPRI);
 
                if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
                        return;
@@ -982,10 +1024,12 @@ void handle_io(void)
                /* Re-calculate any event counter allowances & unblock
                   domains with new allowance */
                for (d = dom_head; d; d = d->next) {
-                       /* Add 5ms of fuzz since select() often returns
-                          a couple of ms sooner than requested. Without
-                          the fuzz we typically do an extra spin in select()
-                          with a 1/2 ms timeout every other iteration */
+                       /* CS 16257:955ee4fa1345 introduces a 5ms fuzz
+                        * for select(), it is not clear poll() has
+                        * similar behavior (returning a couple of ms
+                        * sooner than requested) as well. Just leave
+                        * the fuzz here. Remove it with a separate
+                        * patch if necessary */
                        if ((now+5) > d->next_period) {
                                d->next_period = now + RATE_LIMIT_PERIOD;
                                if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
@@ -1006,75 +1050,101 @@ void handle_io(void)
                                    !d->buffer.max_capacity ||
                                    d->buffer.size < d->buffer.max_capacity) {
                                        int evtchn_fd = 
xc_evtchn_fd(d->xce_handle);
-                                       FD_SET(evtchn_fd, &readfds);
-                                       max_fd = MAX(evtchn_fd, max_fd);
+                                       d->xce_pollfd = set_fds(evtchn_fd,
+                                                               POLLIN|POLLPRI);
                                }
                        }
 
                        if (d->master_fd != -1) {
+                               short events = 0;
                                if (!d->is_dead && ring_free_bytes(d))
-                                       FD_SET(d->master_fd, &readfds);
+                                       events |= POLLIN;
 
                                if (!buffer_empty(&d->buffer))
-                                       FD_SET(d->master_fd, &writefds);
-                               max_fd = MAX(d->master_fd, max_fd);
+                                       events |= POLLOUT;
+
+                               if (events)
+                                       d->master_pollfd =
+                                               set_fds(d->master_fd,
+                                                       events|POLLPRI);
                        }
                }
 
                /* If any domain has been rate limited, we need to work
-                  out what timeout to supply to select */
+                  out what timeout to supply to poll */
                if (next_timeout) {
                        long long duration = (next_timeout - now);
                        if (duration <= 0) /* sanity check */
                                duration = 1;
-                       timeout.tv_sec = duration / 1000;
-                       timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
-                                          * 1000);
+                       poll_timeout = (int)duration;
                }
 
-               ret = select(max_fd + 1, &readfds, &writefds, 0,
-                            next_timeout ? &timeout : NULL);
+               ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
 
                if (log_reload) {
                        handle_log_reload();
                        log_reload = 0;
                }
 
-               /* Abort if select failed, except for EINTR cases
+               /* Abort if poll failed, except for EINTR cases
                   which indicate a possible log reload */
                if (ret == -1) {
                        if (errno == EINTR)
                                continue;
-                       dolog(LOG_ERR, "Failure in select: %d (%s)",
+                       dolog(LOG_ERR, "Failure in poll: %d (%s)",
                              errno, strerror(errno));
                        break;
                }
 
-               if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
-                       handle_hv_logs();
+               if (log_hv && xce_pollfd) {
+                       if (xce_pollfd->revents & ~(POLLIN|POLLOUT|POLLPRI)) {
+                               dolog(LOG_ERR,
+                                     "Failure in poll xce_handle: %d (%s)",
+                                     errno, strerror(errno));
+                               break;
+                       } else if (xce_pollfd->revents & POLLIN)
+                               handle_hv_logs();
+               }
 
                if (ret <= 0)
                        continue;
 
-               if (FD_ISSET(xs_fileno(xs), &readfds))
-                       handle_xs();
+               if (xs_pollfd) {
+                       if (xs_pollfd->revents & ~(POLLIN|POLLOUT|POLLPRI)) {
+                               dolog(LOG_ERR,
+                                     "Failure in poll xs_handle: %d (%s)",
+                                     errno, strerror(errno));
+                               break;
+                       } else if (xs_pollfd->revents & POLLIN)
+                               handle_xs();
+               }
 
                for (d = dom_head; d; d = n) {
                        n = d->next;
                        if (d->event_count < RATE_LIMIT_ALLOWANCE) {
                                if (d->xce_handle != NULL &&
-                                   FD_ISSET(xc_evtchn_fd(d->xce_handle),
-                                            &readfds))
-                                       handle_ring_read(d);
+                                   d->xce_pollfd &&
+                                   !(d->xce_pollfd->revents &
+                                     ~(POLLIN|POLLOUT|POLLPRI)) &&
+                                     (d->xce_pollfd->revents &
+                                      POLLIN))
+                                   handle_ring_read(d);
                        }
 
-                       if (d->master_fd != -1 && FD_ISSET(d->master_fd,
-                                                          &readfds))
-                               handle_tty_read(d);
-
-                       if (d->master_fd != -1 && FD_ISSET(d->master_fd,
-                                                          &writefds))
-                               handle_tty_write(d);
+                       if (d->master_fd != -1 && d->master_pollfd) {
+                               if (d->master_pollfd->revents &
+                                   ~(POLLIN|POLLOUT|POLLPRI))
+                                       domain_handle_broken_tty(d,
+                                                  domain_is_valid(d->domid));
+                               else {
+                                       if (d->master_pollfd->revents &
+                                           POLLIN)
+                                               handle_tty_read(d);
+                                       if (d->master_pollfd->revents &
+                                           POLLOUT)
+                                               handle_tty_write(d);
+                               }
+                       }
 
                        if (d->last_seen != enum_pass)
                                shutdown_domain(d);
@@ -1084,6 +1154,9 @@ void handle_io(void)
                }
        }
 
+       free(fds);
+       current_array_size = 0;
+
  out:
        if (log_hv_fd != -1) {
                close(log_hv_fd);
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.