[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v3 1/2] libxl: Implement the handler to handle unrecoverable AER errors



On 2017-08-08 15:33:01 +0100, Wei Liu wrote:
> On Mon, Aug 07, 2017 at 06:54:56PM -0500, Venu Busireddy wrote:
> > Implement the callback function to handle unrecoverable AER errors, and
> > also the public APIs that can be used to register/unregister the handler.
> > When an AER error occurs, the handler will forcibly remove the erring
> > PCIe device from the guest.
> > 
> > Signed-off-by: Venu Busireddy <venu.busireddy@xxxxxxxxxx>
> > ---
> >  tools/libxl/libxl.h          | 14 +++++++
> >  tools/libxl/libxl_event.h    | 13 +++++++
> >  tools/libxl/libxl_internal.h |  7 ++++
> >  tools/libxl/libxl_pci.c      | 90 
> > ++++++++++++++++++++++++++++++++++++++++++++
> >  4 files changed, 124 insertions(+)
> > 
> > diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
> > index 7cf0f31..c5af0aa 100644
> > --- a/tools/libxl/libxl.h
> > +++ b/tools/libxl/libxl.h
> > @@ -1044,6 +1044,20 @@ void libxl_mac_copy(libxl_ctx *ctx, libxl_mac *dst, 
> > const libxl_mac *src);
> >   */
> >  #define LIBXL_HAVE_QED 1
> >  
> > +/* LIBXL_HAVE_REG_AER_EVENTS_HANDLER
> > + *
> > + * If it is defined, libxl has a library function called
> > + * libxl_reg_aer_events_handler.
> > + */
> > +#define LIBXL_HAVE_REG_AER_EVENTS_HANDLER 1
> > +
> > +/* LIBXL_HAVE_UNREG_AER_EVENTS_HANDLER
> > + *
> > + * If it is defined, libxl has a library function called
> > + * libxl_unreg_aer_events_handler.
> > + */
> > +#define LIBXL_HAVE_UNREG_AER_EVENTS_HANDLER 1
> > +
> 
> You can consolidate both into
> 
> LIBLX_HAVE_AER_EVENTS_HANDLER
> 
> >  typedef char **libxl_string_list;
> >  void libxl_string_list_dispose(libxl_string_list *sl);
> >  int libxl_string_list_length(const libxl_string_list *sl);
> > diff --git a/tools/libxl/libxl_event.h b/tools/libxl/libxl_event.h
> > index 1ea789e..1aea906 100644
> > --- a/tools/libxl/libxl_event.h
> > +++ b/tools/libxl/libxl_event.h
> > @@ -184,6 +184,19 @@ void libxl_evdisable_domain_death(libxl_ctx *ctx, 
> > libxl_evgen_domain_death*);
> >     * may generate only a DEATH event.
> >     */
> >  
> > +typedef struct libxl__aer_watch libxl_aer_watch;
> > +int libxl_reg_aer_events_handler(libxl_ctx *, uint32_t, libxl_aer_watch **)
> > +                        LIBXL_EXTERNAL_CALLERS_ONLY;
> > +  /*
> > +   * Registers a handler to handle the occurrence of unrecoverable AER 
> > errors.
> > +   * This function depends on the calling application running the libxl's
> > +   * internal event loop. Toolstacks that do not use libxl's internal
> > +   * event loop must arrange to have their own event loop created and enter
> > +   * libxl (say, call libxl_event_wait()), to enable the event to be 
> > processed.
> > +   */
> > +void libxl_unreg_aer_events_handler(libxl_ctx *, uint32_t, libxl_aer_watch 
> > *)
> > +                        LIBXL_EXTERNAL_CALLERS_ONLY;
> > +
> >  typedef struct libxl__evgen_disk_eject libxl_evgen_disk_eject;
> >  int libxl_evenable_disk_eject(libxl_ctx *ctx, uint32_t domid, const char 
> > *vdev,
> >                          libxl_ev_user, libxl_evgen_disk_eject **evgen_out);
> > diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
> > index afe6652..2b74286 100644
> > --- a/tools/libxl/libxl_internal.h
> > +++ b/tools/libxl/libxl_internal.h
> > @@ -352,6 +352,13 @@ struct libxl__ev_child {
> >      LIBXL_LIST_ENTRY(struct libxl__ev_child) entry;
> >  };
> >  
> > +/*
> > + * Structure used for AER event handling.
> > + */
> > +struct libxl__aer_watch {
> > +    uint32_t domid;
> > +    libxl__ev_xswatch watch;
> > +};
> >  
> >  /*
> >   * evgen structures, which are the state we use for generating
> > diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
> > index 65ad5e5..feedf27 100644
> > --- a/tools/libxl/libxl_pci.c
> > +++ b/tools/libxl/libxl_pci.c
> > @@ -1678,6 +1678,96 @@ static int libxl_device_pci_compare(libxl_device_pci 
> > *d1,
> >      return COMPARE_PCI(d1, d2);
> >  }
> >  
> > +static void aer_backend_watch_callback(libxl__egc *egc,
> > +                                       libxl__ev_xswatch *watch,
> > +                                       const char *watch_path,
> > +                                       const char *event_path)
> > +{
> > +    EGC_GC;
> > +    libxl_aer_watch *aer_ws = CONTAINER_OF(watch, *aer_ws, watch);
> > +    int rc;
> > +    uint32_t dom, bus, dev, fn;
> > +    uint32_t domid = aer_ws->domid;
> > +    char *p, *path;
> > +    const char *aerFailedSBDF;
> > +    libxl_device_pci pcidev;
> > +
> > +    /* Extract the backend directory. */
> > +    path = libxl__strdup(gc, event_path);
> > +    p = strrchr(path, '/');
> > +    if ((p == NULL) || (strcmp(p, "/aerFailedSBDF") != 0))
> > +        return;
> > +    /* Truncate the string so it points to the backend directory. */
> > +    *p = '\0';
> > +
> > +    /* Fetch the value of the failed PCI device. */
> > +    rc = libxl__xs_read_checked(gc, XBT_NULL,
> > +            GCSPRINTF("%s/aerFailedSBDF", path), &aerFailedSBDF);
> > +    if (rc || !aerFailedSBDF)
> > +        return;
> > +    sscanf(aerFailedSBDF, "%x:%x:%x.%x", &dom, &bus, &dev, &fn);
> > +
> > +    libxl_device_pci_init(&pcidev);
> > +    pcidev_struct_fill(&pcidev, dom, bus, dev, fn, 0);
> > +    /* Forcibly remove the device from the guest */
> > +    rc = libxl__device_pci_remove_common(gc, domid, &pcidev, 1);
> > +    if (rc)
> > +        LOGD(ERROR, domid, " libxl__device_pci_remove_common() failed, 
> > rc=x%x",
> > +                (unsigned int)rc);
> > +
> > +    return;
> > +}
> > +
> > +int libxl_reg_aer_events_handler(libxl_ctx *ctx,
> > +                                 uint32_t domid,
> > +                                 libxl_aer_watch **aer_ws_out)
> 
> Afaict libxl_aer_watch is an opaque type to external caller, so this
> won't work, right?
> 
> > +{
> > +    int rc = 0;
> > +    int dom0_domid;
> 
> uint32_t pciback_domid;
> 
> > +    char *be_path;
> > +    libxl_aer_watch *aer_ws = NULL;
> > +    GC_INIT(ctx);
> > +
> > +    *aer_ws_out = NULL;
> > +
> > +    rc = libxl__get_domid(gc, (uint32_t *)(&dom0_domid));
> > +    if (rc) {
> > +        LOGD(ERROR, domid, " libxl__get_domid() failed, rc = %d", rc);
> > +        goto out;
> > +    }
> > +
> > +    aer_ws = malloc(sizeof(libxl_aer_watch));
> 
> libxl__calloc(NOGC, ...);
> 
> And then you can skip the check and memset.
> 
> > +    if (!aer_ws) {
> > +        rc = ERROR_NOMEM;
> > +        goto out;
> > +    }
> > +    memset(aer_ws, 0, sizeof(libxl_aer_watch));
> > +
> > +    aer_ws->domid = domid;
> > +    be_path = GCSPRINTF("/local/domain/%d/backend/pci/%u/%d/%s",
> > +            dom0_domid, domid, dom0_domid, "aerFailedSBDF");
> 
> Use %u here.
> 
> > +    rc = libxl__ev_xswatch_register(gc, &aer_ws->watch,
> > +            aer_backend_watch_callback, be_path);
> > +    *aer_ws_out = aer_ws;
> > +
> > +out:
> > +    GC_FREE;
> > +    return rc;
> > +}
> > +
> > +void libxl_unreg_aer_events_handler(libxl_ctx *ctx,
> > +                                    uint32_t domid,
> > +                                    libxl_aer_watch *aer_ws)
> > +{
> > +    GC_INIT(ctx);
> > +
> > +    libxl__ev_xswatch_deregister(gc, &aer_ws->watch);
> > +
> > +    free(aer_ws);
> > +    GC_FREE;
> > +    return;
> > +}

I will implement all your above suggestions in v4.

> I think a bigger question is whether you agree with Ian's comments
> regarding API design and whether you have more questions?

Ian suggested that I document the use of the API (about the event loop),
and I believe I addressed it. I don't have any more questions. Just
waiting for Ian's "Ack", or more comments.

Venu



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.