
Re: [Xen-devel] [PATCH net-next RFC 2/5] xen-netback: Change TX path from grant copy to mapping



> -----Original Message-----
> From: xen-devel-bounces@xxxxxxxxxxxxx [mailto:xen-devel-bounces@xxxxxxxxxxxxx] On Behalf Of Zoltan Kiss
> Sent: 30 October 2013 00:50
> To: Ian Campbell; Wei Liu; xen-devel@xxxxxxxxxxxxxxxxxxxx;
> netdev@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Jonathan Davies
> Cc: Zoltan Kiss
> Subject: [Xen-devel] [PATCH net-next RFC 2/5] xen-netback: Change TX path
> from grant copy to mapping
> 
> This patch changes the grant copy on the TX path to grant mapping.
> 
> Signed-off-by: Zoltan Kiss <zoltan.kiss@xxxxxxxxxx>
> ---
>  drivers/net/xen-netback/interface.c |   39 +++++-
>  drivers/net/xen-netback/netback.c   |  241 +++++++++++++----------------------
>  2 files changed, 126 insertions(+), 154 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index f5c3c57..fb16ede 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -336,8 +336,20 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>       vif->pending_prod = MAX_PENDING_REQS;
>       for (i = 0; i < MAX_PENDING_REQS; i++)
>               vif->pending_ring[i] = i;
> -     for (i = 0; i < MAX_PENDING_REQS; i++)
> -             vif->mmap_pages[i] = NULL;
> +     err = alloc_xenballooned_pages(MAX_PENDING_REQS,
> +             vif->mmap_pages,
> +             false);

Since this is a per-vif allocation, is this going to scale?
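
(Back-of-envelope, on my assumption that MAX_PENDING_REQS is still 256:
that pins 256 ballooned pages, i.e. 1MiB of address space, per vif for
the lifetime of the interface, which adds up quickly on a host running
hundreds of vifs.)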

> +     if (err) {
> +             netdev_err(dev, "Could not reserve mmap_pages\n");
> +             return NULL;
> +     }
> +     for (i = 0; i < MAX_PENDING_REQS; i++) {
> +             vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
> +                     { .callback = xenvif_zerocopy_callback,
> +                       .ctx = NULL,
> +                       .desc = i };
> +             vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
> +     }
> 
>       /*
>        * Initialise a dummy MAC address. We choose the numerically
> @@ -481,11 +493,34 @@ void xenvif_disconnect(struct xenvif *vif)
> 
>  void xenvif_free(struct xenvif *vif)
>  {
> +     int i;
> +
>       netif_napi_del(&vif->napi);
> 
>       unregister_netdev(vif->dev);
> 
>       free_netdev(vif->dev);
> 
> +     /* FIXME: This is a workaround for 2 problems:
> +      * - the guest sent a packet just before teardown, and it is still not
> +      *   delivered
> +      * - the stack forgot to notify us that we can give back a slot
> +      * For the first problem we shouldn't do this, as the skb might still
> +      * access that page. I will come up with a more robust solution later.
> +      * The second is definitely a bug, it leaks a slot from the ring
> +      * forever.
> +      * Classic kernel patchset has delayed copy for that, we might want to
> +      * reuse that?

I think you're going to have to. You can't have one guest left at the mercy of
another; if the mapping sticks around for too long then you really need the
copy-aside.
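
Something along these lines might do (a rough sketch only, not the
classic-kernel code: "inflight_skbs" is a hypothetical list of skbs
whose grants are still mapped at teardown, everything else is as in
this patch):

    /* Copy still-granted frags aside so the grants can be returned. */
    static void xenvif_copy_aside_inflight(struct xenvif *vif)
    {
            struct sk_buff *skb;

            skb_queue_walk(&vif->inflight_skbs, skb) {
                    /* skb_copy_ubufs() replaces every zerocopy frag with
                     * a freshly allocated local page and fires the
                     * zerocopy callback, after which the foreign pages
                     * can be unmapped and the ring slots handed back
                     * immediately.
                     */
                    if (skb_copy_ubufs(skb, GFP_KERNEL))
                            netdev_err(vif->dev,
                                       "copy-aside failed; slot leaked\n");
            }
    }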

> +      */
> +     for (i = 0; i < MAX_PENDING_REQS; ++i) {
> +             if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> +                     netdev_err(vif->dev,
> +                             "Page still granted! Index: %x\n", i);
> +                     xenvif_idx_unmap(vif, i);
> +             }
> +     }
> +
> +     free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> +
>       module_put(THIS_MODULE);
>  }
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 10470dc..e544e9f 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -883,10 +883,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
> 
>  }
> 
> -static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
> +static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
>                                              struct sk_buff *skb,
>                                              struct xen_netif_tx_request *txp,
> -                                            struct gnttab_copy *gop)
> +                                            struct gnttab_map_grant_ref *gop)
>  {
>       struct skb_shared_info *shinfo = skb_shinfo(skb);
>       skb_frag_t *frags = shinfo->frags;
> @@ -907,83 +907,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
>       /* Skip first skb fragment if it is on same page as header fragment. */
>       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> 
> -     /* Coalesce tx requests, at this point the packet passed in
> -      * should be <= 64K. Any packets larger than 64K have been
> -      * handled in xenvif_count_requests().
> -      */
> -     for (shinfo->nr_frags = slot = start; slot < nr_slots;
> -          shinfo->nr_frags++) {
> -             struct pending_tx_info *pending_tx_info =
> -                     vif->pending_tx_info;
> -
> -             page = alloc_page(GFP_ATOMIC|__GFP_COLD);
> -             if (!page)
> -                     goto err;
> -
> -             dst_offset = 0;
> -             first = NULL;
> -             while (dst_offset < PAGE_SIZE && slot < nr_slots) {
> -                     gop->flags = GNTCOPY_source_gref;
> -
> -                     gop->source.u.ref = txp->gref;
> -                     gop->source.domid = vif->domid;
> -                     gop->source.offset = txp->offset;
> -
> -                     gop->dest.domid = DOMID_SELF;
> -
> -                     gop->dest.offset = dst_offset;
> -                     gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> -
> -                     if (dst_offset + txp->size > PAGE_SIZE) {
> -                             /* This page can only merge a portion
> -                              * of tx request. Do not increment any
> -                              * pointer / counter here. The txp
> -                              * will be dealt with in future
> -                              * rounds, eventually hitting the
> -                              * `else` branch.
> -                              */
> -                             gop->len = PAGE_SIZE - dst_offset;
> -                             txp->offset += gop->len;
> -                             txp->size -= gop->len;
> -                             dst_offset += gop->len; /* quit loop */
> -                     } else {
> -                             /* This tx request can be merged in the page */
> -                             gop->len = txp->size;
> -                             dst_offset += gop->len;
> -
> +     for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
> +          shinfo->nr_frags++, txp++, gop++) {
>                               index = pending_index(vif->pending_cons++);
> -
>                               pending_idx = vif->pending_ring[index];
> -
> -
>                               memcpy(&pending_tx_info[pending_idx].req, txp,
> -                                    sizeof(*txp));
> -
> -                             /* Poison these fields, corresponding
> -                              * fields for head tx req will be set
> -                              * to correct values after the loop.
> -                              */
> -             vif->mmap_pages[pending_idx] = (void *)(~0UL);
> -                             pending_tx_info[pending_idx].head =
> -                                     INVALID_PENDING_RING_IDX;
> -
> -                             if (!first) {
> -                                     first = &pending_tx_info[pending_idx];
> -                                     start_idx = index;
> -                                     head_idx = pending_idx;
> -                             }
> -
> -                             txp++;
> -                             slot++;
> -                     }
> -
> -                     gop++;
> -             }
> -
> -             first->req.offset = 0;
> -             first->req.size = dst_offset;
> -             first->head = start_idx;
> -             vif->mmap_pages[head_idx] = page;
> -             frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
> +             xenvif_tx_create_gop(vif, pending_idx, txp, gop);
> +             frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
>       }
> 
>       BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
> @@ -1005,9 +934,9 @@ err:
> 
>  static int xenvif_tx_check_gop(struct xenvif *vif,
>                              struct sk_buff *skb,
> -                            struct gnttab_copy **gopp)
> +                            struct gnttab_map_grant_ref **gopp)
>  {
> -     struct gnttab_copy *gop = *gopp;
> +     struct gnttab_map_grant_ref *gop = *gopp;
>       u16 pending_idx = *((u16 *)skb->data);
>       struct skb_shared_info *shinfo = skb_shinfo(skb);
>       struct pending_tx_info *tx_info;
> @@ -1019,6 +948,16 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>       err = gop->status;
>       if (unlikely(err))
>               xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
> +     else {
> +             if (vif->grant_tx_handle[pending_idx] !=
> +                     NETBACK_INVALID_HANDLE) {
> +                     netdev_err(vif->dev,
> +                             "Stale mapped handle! pending_idx %x
> handle %x\n",
> +                             pending_idx, vif-
> >grant_tx_handle[pending_idx]);
> +                     xenvif_fatal_tx_err(vif);
> +             }
> +             vif->grant_tx_handle[pending_idx] = gop->handle;
> +     }
> 
>       /* Skip first skb fragment if it is on same page as header fragment. */
>       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> @@ -1032,18 +971,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>               head = tx_info->head;
> 
>               /* Check error status: if okay then remember grant handle. */
> -             do {
>                       newerr = (++gop)->status;
> -                     if (newerr)
> -                             break;
> -                     peek = vif->pending_ring[pending_index(++head)];
> -             } while (!pending_tx_is_head(vif, peek));
> 
>               if (likely(!newerr)) {
> +                     if (vif->grant_tx_handle[pending_idx] !=
> +                             NETBACK_INVALID_HANDLE) {
> +                             netdev_err(vif->dev,
> +                                     "Stale mapped handle! pending_idx
> %x handle %x\n",
> +                                     pending_idx,
> +                                     vif->grant_tx_handle[pending_idx]);
> +                             xenvif_fatal_tx_err(vif);
> +                     }
> +                     vif->grant_tx_handle[pending_idx] = gop->handle;
>                       /* Had a previous error? Invalidate this fragment. */
> -                     if (unlikely(err))
> +                     if (unlikely(err)) {
> +                             xenvif_idx_unmap(vif, pending_idx);
>                               xenvif_idx_release(vif, pending_idx,
>                                                  XEN_NETIF_RSP_OKAY);
> +                     }
>                       continue;
>               }
> 
> @@ -1056,9 +1001,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
> 
>               /* First error: invalidate header and preceding fragments. */
>               pending_idx = *((u16 *)skb->data);
> +             xenvif_idx_unmap(vif, pending_idx);
>               xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>               for (j = start; j < i; j++) {
>                       pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
> +                     xenvif_idx_unmap(vif, pending_idx);
>                       xenvif_idx_release(vif, pending_idx,
>                                          XEN_NETIF_RSP_OKAY);
>               }
> @@ -1071,7 +1018,8 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>       return err;
>  }
> 
> -static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
> +static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb,
> +             u16 prev_pending_idx)
>  {
>       struct skb_shared_info *shinfo = skb_shinfo(skb);
>       int nr_frags = shinfo->nr_frags;
> @@ -1085,6 +1033,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
> 
>               pending_idx = frag_get_pending_idx(frag);
> 
> +             /* If this is not the first frag, chain it to the previous */
> +             vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
> +             if (pending_idx != prev_pending_idx) {
> +                     vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
> +                             &(vif->pending_tx_info[pending_idx].callback_struct);
> +                     prev_pending_idx = pending_idx;
> +             }
> +
> +
>               txp = &vif->pending_tx_info[pending_idx].req;
>               page = virt_to_page(idx_to_kaddr(vif, pending_idx));
>               __skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
> @@ -1092,10 +1049,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
>               skb->data_len += txp->size;
>               skb->truesize += txp->size;
> 
> -             /* Take an extra reference to offset xenvif_idx_release */
> +             /* Take an extra reference to offset network stack's put_page */
>               get_page(vif->mmap_pages[pending_idx]);
> -             xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>       }
> +     /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
> +      * overlaps with "index", and "mapping" is not set. I think mapping
> +      * should be set. If delivered to local stack, it would drop this
> +      * skb in sk_filter unless the socket has the right to use it.
> +      */
> +     skb->pfmemalloc = false;
>  }
> 
>  static int xenvif_get_extras(struct xenvif *vif,
> @@ -1426,7 +1388,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
> 
>  static unsigned xenvif_tx_build_gops(struct xenvif *vif)
>  {
> -     struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
> +     struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
>       struct sk_buff *skb;
>       int ret;
> 
> @@ -1533,30 +1495,10 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif)
>                       }
>               }
> 
> -             /* XXX could copy straight to head */
> -             page = xenvif_alloc_page(vif, pending_idx);
> -             if (!page) {
> -                     kfree_skb(skb);
> -                     xenvif_tx_err(vif, &txreq, idx);
> -                     break;
> -             }
> -
> -             gop->source.u.ref = txreq.gref;
> -             gop->source.domid = vif->domid;
> -             gop->source.offset = txreq.offset;
> -
> -             gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> -             gop->dest.domid = DOMID_SELF;
> -             gop->dest.offset = txreq.offset;
> -
> -             gop->len = txreq.size;
> -             gop->flags = GNTCOPY_source_gref;
> +             xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);
> 
>               gop++;
> 
> -             memcpy(&vif->pending_tx_info[pending_idx].req,
> -                    &txreq, sizeof(txreq));
> -             vif->pending_tx_info[pending_idx].head = index;
>               *((u16 *)skb->data) = pending_idx;
> 
>               __skb_put(skb, data_len);
> @@ -1585,17 +1527,17 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif)
> 
>               vif->tx.req_cons = idx;
> 
> -             if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
> +             if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
>                       break;
>       }
> 
> -     return gop - vif->tx_copy_ops;
> +     return gop - vif->tx_map_ops;
>  }
> 
> 
>  static int xenvif_tx_submit(struct xenvif *vif, int budget)
>  {
> -     struct gnttab_copy *gop = vif->tx_copy_ops;
> +     struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
>       struct sk_buff *skb;
>       int work_done = 0;
> 
> @@ -1620,14 +1562,25 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
>               memcpy(skb->data,
>                      (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
>                      data_len);
> +             vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
>               if (data_len < txp->size) {
>                       /* Append the packet payload as a fragment. */
>                       txp->offset += data_len;
>                       txp->size -= data_len;
> -             } else {
> +                     skb_shinfo(skb)->destructor_arg =
> +                             &vif->pending_tx_info[pending_idx].callback_struct;
> +             } else if (!skb_shinfo(skb)->nr_frags) {
>                       /* Schedule a response immediately. */
> +                     skb_shinfo(skb)->destructor_arg = NULL;
> +                     xenvif_idx_unmap(vif, pending_idx);
>                       xenvif_idx_release(vif, pending_idx,
>                                          XEN_NETIF_RSP_OKAY);
> +             } else {
> +                     /* FIXME: first request fits linear space, I don't know
> +                      * if any guest would do that, but I think it's possible
> +                      */

The Windows frontend, because it has to parse the packet headers, will coalesce
everything up to the payload into a single frag, so it would be a good idea to
copy this directly into the linear area.
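
To illustrate what I mean (a sketch under my own assumptions, not code
from this patch: the PKT_PROT_LEN threshold, the bail-out to mapping,
and the assumption that the linear area does not cross a page boundary
here are all mine):

    /* Grant-copy a small first request straight into the linear area,
     * so a coalesced-header frag never becomes a mapped frag at all.
     * Returns true if the slot was consumed by the copy.
     */
    static bool xenvif_copy_header(struct xenvif *vif,
                                   struct xen_netif_tx_request *txp,
                                   struct sk_buff *skb)
    {
            struct gnttab_copy copy;

            if (txp->size > PKT_PROT_LEN)
                    return false; /* too big to assume it is all headers */

            copy.flags = GNTCOPY_source_gref;
            copy.source.u.ref = txp->gref;
            copy.source.domid = vif->domid;
            copy.source.offset = txp->offset;
            copy.dest.u.gmfn = virt_to_mfn(skb->data);
            copy.dest.domid = DOMID_SELF;
            copy.dest.offset = offset_in_page(skb->data);
            copy.len = txp->size;

            gnttab_batch_copy(&copy, 1);
            if (copy.status != GNTST_okay)
                    return false;

            __skb_put(skb, txp->size);
            return true;
    }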

> +                     skb_shinfo(skb)->destructor_arg =
> +                             &vif->pending_tx_info[pending_idx].callback_struct;
>               }
> 
>               if (txp->flags & XEN_NETTXF_csum_blank)
> @@ -1635,13 +1588,19 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
>               else if (txp->flags & XEN_NETTXF_data_validated)
>                       skb->ip_summed = CHECKSUM_UNNECESSARY;
> 
> -             xenvif_fill_frags(vif, skb);
> +             xenvif_fill_frags(vif, skb, pending_idx);
> 
>               if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
>                       int target = min_t(int, skb->len, PKT_PROT_LEN);
>                       __pskb_pull_tail(skb, target - skb_headlen(skb));
>               }
> 
> +             /* Set this flag after __pskb_pull_tail, as it can trigger
> +              * skb_copy_ubufs, while we are still in control of the skb
> +              */

You can't be sure that there will be no subsequent pullups. The v6 parsing code 
I added may need to do that on a (hopefully) rare occasion.
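
(For instance, the option walk may need to pull headers that the
frontend left in a frag. If I read skbuff.c right, once
SKBTX_DEV_ZEROCOPY is set, any later pull that reaches pskb_expand_head
will skb_copy_ubufs() the whole packet behind your back, so the copy
path needs to be safe at any point, not just before the flag is set.)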

  Paul

> +             if (skb_shinfo(skb)->destructor_arg)
> +                     skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> +
>               skb->dev      = vif->dev;
>               skb->protocol = eth_type_trans(skb, skb->dev);
>               skb_reset_network_header(skb);
> @@ -1770,17 +1729,25 @@ static inline void xenvif_tx_action_dealloc(struct xenvif *vif)
>  int xenvif_tx_action(struct xenvif *vif, int budget)
>  {
>       unsigned nr_gops;
> -     int work_done;
> +     int work_done, ret;
> 
>       if (unlikely(!tx_work_todo(vif)))
>               return 0;
> 
> +     xenvif_tx_action_dealloc(vif);
> +
>       nr_gops = xenvif_tx_build_gops(vif);
> 
>       if (nr_gops == 0)
>               return 0;
> 
> -     gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
> +     if (nr_gops) {
> +             ret = gnttab_map_refs(vif->tx_map_ops,
> +                     NULL,
> +                     vif->pages_to_gnt,
> +                     nr_gops);
> +             BUG_ON(ret);
> +     }
> 
>       work_done = xenvif_tx_submit(vif, nr_gops);
> 
> @@ -1791,45 +1758,13 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>                              u8 status)
>  {
>       struct pending_tx_info *pending_tx_info;
> -     pending_ring_idx_t head;
> +     pending_ring_idx_t index;
>       u16 peek; /* peek into next tx request */
> 
> -     BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
> -
> -     /* Already complete? */
> -     if (vif->mmap_pages[pending_idx] == NULL)
> -             return;
> -
> -     pending_tx_info = &vif->pending_tx_info[pending_idx];
> -
> -     head = pending_tx_info->head;
> -
> -     BUG_ON(!pending_tx_is_head(vif, head));
> -     BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
> -
> -     do {
> -             pending_ring_idx_t index;
> -             pending_ring_idx_t idx = pending_index(head);
> -             u16 info_idx = vif->pending_ring[idx];
> -
> -             pending_tx_info = &vif->pending_tx_info[info_idx];
> +             pending_tx_info = &vif->pending_tx_info[pending_idx];
>               make_tx_response(vif, &pending_tx_info->req, status);
> -
> -             /* Setting any number other than
> -              * INVALID_PENDING_RING_IDX indicates this slot is
> -              * starting a new packet / ending a previous packet.
> -              */
> -             pending_tx_info->head = 0;
> -
>               index = pending_index(vif->pending_prod++);
> -             vif->pending_ring[index] = vif->pending_ring[info_idx];
> -
> -             peek = vif->pending_ring[pending_index(++head)];
> -
> -     } while (!pending_tx_is_head(vif, peek));
> -
> -     put_page(vif->mmap_pages[pending_idx]);
> -     vif->mmap_pages[pending_idx] = NULL;
> +             vif->pending_ring[index] = pending_idx;
>  }
> 
>  void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
> @@ -1904,6 +1839,8 @@ static inline int rx_work_todo(struct xenvif *vif)
> 
>  static inline int tx_work_todo(struct xenvif *vif)
>  {
> +     if (vif->dealloc_cons != vif->dealloc_prod)
> +             return 1;
> 
>       if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) &&
>           (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel