[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH] Switch from select() to poll() in xenconsoled's IO loop



Just to be clear, this is version 5 of the patch.

Git send-email mysteriously dropped my subject prefix.


Wei. 

On Tue, 2013-01-08 at 11:50 +0000, Wei Liu wrote:
> In Linux select() typically supports up to 1024 file descriptors. This can be
> a problem when user tries to boot up many guests. Switching to poll() has
> minimum impact on existing code and has better scalibility.
> 
> pollfd array is dynamically allocated / reallocated. If the array fails to
> expand, we just ignore the incoming fd.
> 
> Change from V2:
>   * remove unnecessary malloc in initialize_pollfd_arrays
>   * use ROUND_UP to get new size of arrays
> 
> Change from V3:
>   * remove initialize and destroy function for array
>   * embedded tracking structure in struct domain, eliminate fd_to_pollfd
> 
> Change from V4:
>   * make xs_pollfd local to io.c
>   * add back the 5 ms fuzz
>   * handle POLLERR and POLLHUP
>   * treat POLLPRI as error if set in revents
>   * abort if xenconsoled's own fds get screwed
>   * handle broken tty if tty's fds get screwed
> 
> Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
> ---
>  tools/console/daemon/io.c |  189 
> +++++++++++++++++++++++++++++++--------------
>  1 file changed, 131 insertions(+), 58 deletions(-)
> 
> diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
> index 48fe151..8d16cac 100644
> --- a/tools/console/daemon/io.c
> +++ b/tools/console/daemon/io.c
> @@ -28,7 +28,7 @@
>  #include <stdlib.h>
>  #include <errno.h>
>  #include <string.h>
> -#include <sys/select.h>
> +#include <poll.h>
>  #include <fcntl.h>
>  #include <unistd.h>
>  #include <termios.h>
> @@ -69,6 +69,7 @@ static int log_hv_fd = -1;
>  static evtchn_port_or_error_t log_hv_evtchn = -1;
>  static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
>  static xc_evtchn *xce_handle = NULL;
> +static struct pollfd *xce_pollfd = NULL;
>  
>  struct buffer {
>       char *data;
> @@ -81,7 +82,9 @@ struct buffer {
>  struct domain {
>       int domid;
>       int master_fd;
> +     struct pollfd *master_pollfd;
>       int slave_fd;
> +     struct pollfd *slave_pollfd;
>       int log_fd;
>       bool is_dead;
>       unsigned last_seen;
> @@ -92,6 +95,7 @@ struct domain {
>       evtchn_port_or_error_t local_port;
>       evtchn_port_or_error_t remote_port;
>       xc_evtchn *xce_handle;
> +     struct pollfd *xce_pollfd;
>       struct xencons_interface *interface;
>       int event_count;
>       long long next_period;
> @@ -769,6 +773,17 @@ static int ring_free_bytes(struct domain *dom)
>       return (sizeof(intf->in) - space);
>  }
>  
> +static void domain_handle_broken_tty(struct domain *dom, int recreate)
> +{
> +     domain_close_tty(dom);
> +
> +     if (recreate) {
> +             domain_create_tty(dom);
> +     } else {
> +             shutdown_domain(dom);
> +     }
> +}
> +
>  static void handle_tty_read(struct domain *dom)
>  {
>       ssize_t len = 0;
> @@ -794,13 +809,7 @@ static void handle_tty_read(struct domain *dom)
>        * keep the slave open for the duration.
>        */
>       if (len < 0) {
> -             domain_close_tty(dom);
> -
> -             if (domain_is_valid(dom->domid)) {
> -                     domain_create_tty(dom);
> -             } else {
> -                     shutdown_domain(dom);
> -             }
> +             domain_handle_broken_tty(dom, domain_is_valid(dom->domid));
>       } else if (domain_is_valid(dom->domid)) {
>               prod = intf->in_prod;
>               for (i = 0; i < len; i++) {
> @@ -828,14 +837,7 @@ static void handle_tty_write(struct domain *dom)
>       if (len < 1) {
>               dolog(LOG_DEBUG, "Write failed on domain %d: %zd, %d\n",
>                     dom->domid, len, errno);
> -
> -             domain_close_tty(dom);
> -
> -             if (domain_is_valid(dom->domid)) {
> -                     domain_create_tty(dom);
> -             } else {
> -                     shutdown_domain(dom);
> -             }
> +             domain_handle_broken_tty(dom, domain_is_valid(dom->domid));
>       } else {
>               buffer_advance(&dom->buffer, len);
>       }
> @@ -928,9 +930,53 @@ static void handle_log_reload(void)
>       }
>  }
>  
> +struct pollfd *xs_pollfd;
> +static struct pollfd  *fds;
> +static unsigned int current_array_size;
> +static unsigned int nr_fds;
> +
> +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & 
> ~((1UL<<(_w))-1))
> +static struct pollfd *set_fds(int fd, short events)
> +{
> +     struct pollfd *ret;
> +     if (current_array_size < nr_fds + 1) {
> +             struct pollfd  *new_fds = NULL;
> +             unsigned long newsize;
> +
> +             /* Round up to 2^8 boundary, in practice this just
> +              * make newsize larger than current_array_size.
> +              */
> +             newsize = ROUNDUP(nr_fds + 1, 8);
> +
> +             new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
> +             if (!new_fds)
> +                     goto fail;
> +             fds = new_fds;
> +
> +             memset(&fds[0] + current_array_size, 0,
> +                    sizeof(struct pollfd) * (newsize-current_array_size));
> +             current_array_size = newsize;
> +     }
> +
> +     fds[nr_fds].fd = fd;
> +     fds[nr_fds].events = events;
> +     ret = &fds[nr_fds];
> +     nr_fds++;
> +
> +     return ret;
> +fail:
> +     dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
> +     return NULL;
> +}
> +
> +static void reset_fds(void)
> +{
> +     nr_fds = 0;
> +     memset(fds, 0, sizeof(struct pollfd) * current_array_size);
> +}
> +
>  void handle_io(void)
>  {
> -     fd_set readfds, writefds;
>       int ret;
>  
>       if (log_hv) {
> @@ -959,21 +1005,17 @@ void handle_io(void)
>  
>       for (;;) {
>               struct domain *d, *n;
> -             int max_fd = -1;
> -             struct timeval timeout;
> +             int poll_timeout; /* timeout in milliseconds */
>               struct timespec ts;
>               long long now, next_timeout = 0;
>  
> -             FD_ZERO(&readfds);
> -             FD_ZERO(&writefds);
> +             reset_fds();
>  
> -             FD_SET(xs_fileno(xs), &readfds);
> -             max_fd = MAX(xs_fileno(xs), max_fd);
> +             xs_pollfd = set_fds(xs_fileno(xs), POLLIN|POLLPRI);
>  
> -             if (log_hv) {
> -                     FD_SET(xc_evtchn_fd(xce_handle), &readfds);
> -                     max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
> -             }
> +             if (log_hv)
> +                     xce_pollfd = set_fds(xc_evtchn_fd(xce_handle),
> +                                          POLLIN|POLLPRI);
>  
>               if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
>                       return;
> @@ -982,10 +1024,12 @@ void handle_io(void)
>               /* Re-calculate any event counter allowances & unblock
>                  domains with new allowance */
>               for (d = dom_head; d; d = d->next) {
> -                     /* Add 5ms of fuzz since select() often returns
> -                        a couple of ms sooner than requested. Without
> -                        the fuzz we typically do an extra spin in select()
> -                        with a 1/2 ms timeout every other iteration */
> +                     /* CS 16257:955ee4fa1345 introduces a 5ms fuzz
> +                      * for select(), it is not clear poll() has
> +                      * similar behavior (returning a couple of ms
> +                      * sooner than requested) as well. Just leave
> +                      * the fuzz here. Remove it with a separate
> +                      * patch if necessary */
>                       if ((now+5) > d->next_period) {
>                               d->next_period = now + RATE_LIMIT_PERIOD;
>                               if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
> @@ -1006,75 +1050,101 @@ void handle_io(void)
>                                   !d->buffer.max_capacity ||
>                                   d->buffer.size < d->buffer.max_capacity) {
>                                       int evtchn_fd = 
> xc_evtchn_fd(d->xce_handle);
> -                                     FD_SET(evtchn_fd, &readfds);
> -                                     max_fd = MAX(evtchn_fd, max_fd);
> +                                     d->xce_pollfd = set_fds(evtchn_fd,
> +                                                             POLLIN|POLLPRI);
>                               }
>                       }
>  
>                       if (d->master_fd != -1) {
> +                             short events = 0;
>                               if (!d->is_dead && ring_free_bytes(d))
> -                                     FD_SET(d->master_fd, &readfds);
> +                                     events |= POLLIN;
>  
>                               if (!buffer_empty(&d->buffer))
> -                                     FD_SET(d->master_fd, &writefds);
> -                             max_fd = MAX(d->master_fd, max_fd);
> +                                     events |= POLLOUT;
> +
> +                             if (events)
> +                                     d->master_pollfd =
> +                                             set_fds(d->master_fd,
> +                                                     events|POLLPRI);
>                       }
>               }
>  
>               /* If any domain has been rate limited, we need to work
> -                out what timeout to supply to select */
> +                out what timeout to supply to poll */
>               if (next_timeout) {
>                       long long duration = (next_timeout - now);
>                       if (duration <= 0) /* sanity check */
>                               duration = 1;
> -                     timeout.tv_sec = duration / 1000;
> -                     timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
> -                                        * 1000);
> +                     poll_timeout = (int)duration;
>               }
>  
> -             ret = select(max_fd + 1, &readfds, &writefds, 0,
> -                          next_timeout ? &timeout : NULL);
> +             ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
>  
>               if (log_reload) {
>                       handle_log_reload();
>                       log_reload = 0;
>               }
>  
> -             /* Abort if select failed, except for EINTR cases
> +             /* Abort if poll failed, except for EINTR cases
>                  which indicate a possible log reload */
>               if (ret == -1) {
>                       if (errno == EINTR)
>                               continue;
> -                     dolog(LOG_ERR, "Failure in select: %d (%s)",
> +                     dolog(LOG_ERR, "Failure in poll: %d (%s)",
>                             errno, strerror(errno));
>                       break;
>               }
>  
> -             if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
> -                     handle_hv_logs();
> +             if (log_hv && xce_pollfd) {
> +                     if (xce_pollfd->revents & ~(POLLIN|POLLOUT|POLLPRI)) {
> +                             dolog(LOG_ERR,
> +                                   "Failure in poll xce_handle: %d (%s)",
> +                                   errno, strerror(errno));
> +                             break;
> +                     } else if (xce_pollfd->revents & POLLIN)
> +                             handle_hv_logs();
> +             }
>  
>               if (ret <= 0)
>                       continue;
>  
> -             if (FD_ISSET(xs_fileno(xs), &readfds))
> -                     handle_xs();
> +             if (xs_pollfd) {
> +                     if (xs_pollfd->revents & ~(POLLIN|POLLOUT|POLLPRI)) {
> +                             dolog(LOG_ERR,
> +                                   "Failure in poll xs_handle: %d (%s)",
> +                                   errno, strerror(errno));
> +                             break;
> +                     } else if (xs_pollfd->revents & POLLIN)
> +                             handle_xs();
> +             }
>  
>               for (d = dom_head; d; d = n) {
>                       n = d->next;
>                       if (d->event_count < RATE_LIMIT_ALLOWANCE) {
>                               if (d->xce_handle != NULL &&
> -                                 FD_ISSET(xc_evtchn_fd(d->xce_handle),
> -                                          &readfds))
> -                                     handle_ring_read(d);
> +                                 d->xce_pollfd &&
> +                                 !(d->xce_pollfd->revents &
> +                                   ~(POLLIN|POLLOUT|POLLPRI)) &&
> +                                   (d->xce_pollfd->revents &
> +                                    POLLIN))
> +                                 handle_ring_read(d);
>                       }
>  
> -                     if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> -                                                        &readfds))
> -                             handle_tty_read(d);
> -
> -                     if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> -                                                        &writefds))
> -                             handle_tty_write(d);
> +                     if (d->master_fd != -1 && d->master_pollfd) {
> +                             if (d->master_pollfd->revents &
> +                                 ~(POLLIN|POLLOUT|POLLPRI))
> +                                     domain_handle_broken_tty(d,
> +                                                domain_is_valid(d->domid));
> +                             else {
> +                                     if (d->master_pollfd->revents &
> +                                         POLLIN)
> +                                             handle_tty_read(d);
> +                                     if (d->master_pollfd->revents &
> +                                         POLLOUT)
> +                                             handle_tty_write(d);
> +                             }
> +                     }
>  
>                       if (d->last_seen != enum_pass)
>                               shutdown_domain(d);
> @@ -1084,6 +1154,9 @@ void handle_io(void)
>               }
>       }
>  
> +     free(fds);
> +     current_array_size = 0;
> +
>   out:
>       if (log_hv_fd != -1) {
>               close(log_hv_fd);



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.