
Re: [Xen-devel] [PATCH net-next RFC 2/5] xen-netback: Change TX path from grant copy to mapping



> -----Original Message-----
> From: xen-devel-bounces@xxxxxxxxxxxxx [mailto:xen-devel-bounces@xxxxxxxxxxxxx] On Behalf Of Zoltan Kiss
> Sent: 30 October 2013 00:50
> To: Ian Campbell; Wei Liu; xen-devel@xxxxxxxxxxxxxxxxxxxx;
> netdev@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Jonathan Davies
> Cc: Zoltan Kiss
> Subject: [Xen-devel] [PATCH net-next RFC 2/5] xen-netback: Change TX path
> from grant copy to mapping
> 
> This patch changes the grant copy on the TX path to grant mapping.
> 
> Signed-off-by: Zoltan Kiss <zoltan.kiss@xxxxxxxxxx>
> ---
>  drivers/net/xen-netback/interface.c |   39 +++++-
>  drivers/net/xen-netback/netback.c   |  241 +++++++++++++----------------------
>  2 files changed, 126 insertions(+), 154 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index f5c3c57..fb16ede 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -336,8 +336,20 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>       vif->pending_prod = MAX_PENDING_REQS;
>       for (i = 0; i < MAX_PENDING_REQS; i++)
>               vif->pending_ring[i] = i;
> -     for (i = 0; i < MAX_PENDING_REQS; i++)
> -             vif->mmap_pages[i] = NULL;
> +     err = alloc_xenballooned_pages(MAX_PENDING_REQS,
> +             vif->mmap_pages,
> +             false);

Since this is a per-vif allocation, is this going to scale?
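
(Back-of-envelope, on my assumption that MAX_PENDING_REQS is still 256:
that pins 256 ballooned pages, i.e. 1MiB of address space, per vif for
the lifetime of the interface, which adds up quickly on a host running
hundreds of vifs.)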

> +     if (err) {
> +             netdev_err(dev, "Could not reserve mmap_pages\n");
> +             return NULL;
> +     }
> +     for (i = 0; i < MAX_PENDING_REQS; i++) {
> +             vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
> +                     { .callback = xenvif_zerocopy_callback,
> +                       .ctx = NULL,
> +                       .desc = i };
> +             vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
> +     }
> 
>       /*
>        * Initialise a dummy MAC address. We choose the numerically
> @@ -481,11 +493,34 @@ void xenvif_disconnect(struct xenvif *vif)
> 
>  void xenvif_free(struct xenvif *vif)
>  {
> +     int i;
> +
>       netif_napi_del(&vif->napi);
> 
>       unregister_netdev(vif->dev);
> 
>       free_netdev(vif->dev);
> 
> +     /* FIXME: This is a workaround for 2 problems:
> +      * - the guest sent a packet just before teardown, and it is still not
> +      *   delivered
> +      * - the stack forgot to notify us that we can give back a slot
> +      * For the first problem we shouldn't do this, as the skb might still
> +      * access that page. I will come up with a more robust solution later.
> +      * The second is definitely a bug, it leaks a slot from the ring
> +      * forever.
> +      * Classic kernel patchset has delayed copy for that, we might want to
> +      * reuse that?

I think you're going to have to. You can't have one guest left at the mercy of
another; if the mapping sticks around for too long then you really need the
copy-aside.
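
Something along these lines might do (a rough sketch only, not the
classic-kernel code: "inflight_skbs" is a hypothetical list of skbs
whose grants are still mapped at teardown, everything else is as in
this patch):

    /* Copy still-granted frags aside so the grants can be returned. */
    static void xenvif_copy_aside_inflight(struct xenvif *vif)
    {
            struct sk_buff *skb;

            skb_queue_walk(&vif->inflight_skbs, skb) {
                    /* skb_copy_ubufs() replaces every zerocopy frag with
                     * a freshly allocated local page and fires the
                     * zerocopy callback, after which the foreign pages
                     * can be unmapped and the ring slots handed back
                     * immediately.
                     */
                    if (skb_copy_ubufs(skb, GFP_KERNEL))
                            netdev_err(vif->dev,
                                       "copy-aside failed; slot leaked\n");
            }
    }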

> +      */
> +     for (i = 0; i < MAX_PENDING_REQS; ++i) {
> +             if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> +                     netdev_err(vif->dev,
> +                             "Page still granted! Index: %x\n", i);
> +                     xenvif_idx_unmap(vif, i);
> +             }
> +     }
> +
> +     free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> +
>       module_put(THIS_MODULE);
>  }
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 10470dc..e544e9f 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -883,10 +883,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
> 
>  }
> 
> -static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
> +static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
>                                              struct sk_buff *skb,
>                                              struct xen_netif_tx_request *txp,
> -                                            struct gnttab_copy *gop)
> +                                            struct gnttab_map_grant_ref *gop)
>  {
>       struct skb_shared_info *shinfo = skb_shinfo(skb);
>       skb_frag_t *frags = shinfo->frags;
> @@ -907,83 +907,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
>       /* Skip first skb fragment if it is on same page as header fragment. */
>       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> 
> -     /* Coalesce tx requests, at this point the packet passed in
> -      * should be <= 64K. Any packets larger than 64K have been
> -      * handled in xenvif_count_requests().
> -      */
> -     for (shinfo->nr_frags = slot = start; slot < nr_slots;
> -          shinfo->nr_frags++) {
> -             struct pending_tx_info *pending_tx_info =
> -                     vif->pending_tx_info;
> -
> -             page = alloc_page(GFP_ATOMIC|__GFP_COLD);
> -             if (!page)
> -                     goto err;
> -
> -             dst_offset = 0;
> -             first = NULL;
> -             while (dst_offset < PAGE_SIZE && slot < nr_slots) {
> -                     gop->flags = GNTCOPY_source_gref;
> -
> -                     gop->source.u.ref = txp->gref;
> -                     gop->source.domid = vif->domid;
> -                     gop->source.offset = txp->offset;
> -
> -                     gop->dest.domid = DOMID_SELF;
> -
> -                     gop->dest.offset = dst_offset;
> -                     gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> -
> -                     if (dst_offset + txp->size > PAGE_SIZE) {
> -                             /* This page can only merge a portion
> -                              * of tx request. Do not increment any
> -                              * pointer / counter here. The txp
> -                              * will be dealt with in future
> -                              * rounds, eventually hitting the
> -                              * `else` branch.
> -                              */
> -                             gop->len = PAGE_SIZE - dst_offset;
> -                             txp->offset += gop->len;
> -                             txp->size -= gop->len;
> -                             dst_offset += gop->len; /* quit loop */
> -                     } else {
> -                             /* This tx request can be merged in the page */
> -                             gop->len = txp->size;
> -                             dst_offset += gop->len;
> -
> +     for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
> +          shinfo->nr_frags++, txp++, gop++) {
>                               index = pending_index(vif->pending_cons++);
> -
>                               pending_idx = vif->pending_ring[index];
> -
> -
>                               memcpy(&pending_tx_info[pending_idx].req, txp,
> -                                    sizeof(*txp));
> -
> -                             /* Poison these fields, corresponding
> -                              * fields for head tx req will be set
> -                              * to correct values after the loop.
> -                              */
> -             vif->mmap_pages[pending_idx] = (void *)(~0UL);
> -                             pending_tx_info[pending_idx].head =
> -                                     INVALID_PENDING_RING_IDX;
> -
> -                             if (!first) {
> -                                     first = &pending_tx_info[pending_idx];
> -                                     start_idx = index;
> -                                     head_idx = pending_idx;
> -                             }
> -
> -                             txp++;
> -                             slot++;
> -                     }
> -
> -                     gop++;
> -             }
> -
> -             first->req.offset = 0;
> -             first->req.size = dst_offset;
> -             first->head = start_idx;
> -             vif->mmap_pages[head_idx] = page;
> -             frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
> +             xenvif_tx_create_gop(vif, pending_idx, txp, gop);
> +             frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
>       }
> 
>       BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
> @@ -1005,9 +934,9 @@ err:
> 
>  static int xenvif_tx_check_gop(struct xenvif *vif,
>                              struct sk_buff *skb,
> -                            struct gnttab_copy **gopp)
> +                            struct gnttab_map_grant_ref **gopp)
>  {
> -     struct gnttab_copy *gop = *gopp;
> +     struct gnttab_map_grant_ref *gop = *gopp;
>       u16 pending_idx = *((u16 *)skb->data);
>       struct skb_shared_info *shinfo = skb_shinfo(skb);
>       struct pending_tx_info *tx_info;
> @@ -1019,6 +948,16 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>       err = gop->status;
>       if (unlikely(err))
>               xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
> +     else {
> +             if (vif->grant_tx_handle[pending_idx] !=
> +                     NETBACK_INVALID_HANDLE) {
> +                     netdev_err(vif->dev,
> +                             "Stale mapped handle! pending_idx %x
> handle %x\n",
> +                             pending_idx, vif-
> >grant_tx_handle[pending_idx]);
> +                     xenvif_fatal_tx_err(vif);
> +             }
> +             vif->grant_tx_handle[pending_idx] = gop->handle;
> +     }
> 
>       /* Skip first skb fragment if it is on same page as header fragment. */
>       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> @@ -1032,18 +971,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>               head = tx_info->head;
> 
>               /* Check error status: if okay then remember grant handle. */
> -             do {
>                       newerr = (++gop)->status;
> -                     if (newerr)
> -                             break;
> -                     peek = vif->pending_ring[pending_index(++head)];
> -             } while (!pending_tx_is_head(vif, peek));
> 
>               if (likely(!newerr)) {
> +                     if (vif->grant_tx_handle[pending_idx] !=
> +                             NETBACK_INVALID_HANDLE) {
> +                             netdev_err(vif->dev,
> +                                     "Stale mapped handle! pending_idx
> %x handle %x\n",
> +                                     pending_idx,
> +                                     vif->grant_tx_handle[pending_idx]);
> +                             xenvif_fatal_tx_err(vif);
> +                     }
> +                     vif->grant_tx_handle[pending_idx] = gop->handle;
>                       /* Had a previous error? Invalidate this fragment. */
> -                     if (unlikely(err))
> +                     if (unlikely(err)) {
> +                             xenvif_idx_unmap(vif, pending_idx);
>                               xenvif_idx_release(vif, pending_idx,
>                                                  XEN_NETIF_RSP_OKAY);
> +                     }
>                       continue;
>               }
> 
> @@ -1056,9 +1001,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
> 
>               /* First error: invalidate header and preceding fragments. */
>               pending_idx = *((u16 *)skb->data);
> +             xenvif_idx_unmap(vif, pending_idx);
>               xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>               for (j = start; j < i; j++) {
>                       pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
> +                     xenvif_idx_unmap(vif, pending_idx);
>                       xenvif_idx_release(vif, pending_idx,
>                                          XEN_NETIF_RSP_OKAY);
>               }
> @@ -1071,7 +1018,8 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>       return err;
>  }
> 
> -static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
> +static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb,
> +             u16 prev_pending_idx)
>  {
>       struct skb_shared_info *shinfo = skb_shinfo(skb);
>       int nr_frags = shinfo->nr_frags;
> @@ -1085,6 +1033,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
> 
>               pending_idx = frag_get_pending_idx(frag);
> 
> +             /* If this is not the first frag, chain it to the previous */
> +             vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
> +             if (pending_idx != prev_pending_idx) {
> +                     vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
> +                             &(vif->pending_tx_info[pending_idx].callback_struct);
> +                     prev_pending_idx = pending_idx;
> +             }
> +
> +
>               txp = &vif->pending_tx_info[pending_idx].req;
>               page = virt_to_page(idx_to_kaddr(vif, pending_idx));
>               __skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
> @@ -1092,10 +1049,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
>               skb->data_len += txp->size;
>               skb->truesize += txp->size;
> 
> -             /* Take an extra reference to offset xenvif_idx_release */
> +             /* Take an extra reference to offset network stack's put_page */
>               get_page(vif->mmap_pages[pending_idx]);
> -             xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>       }
> +     /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
> +      * overlaps with "index", and "mapping" is not set. I think mapping
> +      * should be set. If delivered to local stack, it would drop this
> +      * skb in sk_filter unless the socket has the right to use it.
> +      */
> +     skb->pfmemalloc = false;
>  }
> 
>  static int xenvif_get_extras(struct xenvif *vif,
> @@ -1426,7 +1388,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
> 
>  static unsigned xenvif_tx_build_gops(struct xenvif *vif)
>  {
> -     struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
> +     struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
>       struct sk_buff *skb;
>       int ret;
> 
> @@ -1533,30 +1495,10 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif)
>                       }
>               }
> 
> -             /* XXX could copy straight to head */
> -             page = xenvif_alloc_page(vif, pending_idx);
> -             if (!page) {
> -                     kfree_skb(skb);
> -                     xenvif_tx_err(vif, &txreq, idx);
> -                     break;
> -             }
> -
> -             gop->source.u.ref = txreq.gref;
> -             gop->source.domid = vif->domid;
> -             gop->source.offset = txreq.offset;
> -
> -             gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> -             gop->dest.domid = DOMID_SELF;
> -             gop->dest.offset = txreq.offset;
> -
> -             gop->len = txreq.size;
> -             gop->flags = GNTCOPY_source_gref;
> +             xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);
> 
>               gop++;
> 
> -             memcpy(&vif->pending_tx_info[pending_idx].req,
> -                    &txreq, sizeof(txreq));
> -             vif->pending_tx_info[pending_idx].head = index;
>               *((u16 *)skb->data) = pending_idx;
> 
>               __skb_put(skb, data_len);
> @@ -1585,17 +1527,17 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif)
> 
>               vif->tx.req_cons = idx;
> 
> -             if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
> +             if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
>                       break;
>       }
> 
> -     return gop - vif->tx_copy_ops;
> +     return gop - vif->tx_map_ops;
>  }
> 
> 
>  static int xenvif_tx_submit(struct xenvif *vif, int budget)
>  {
> -     struct gnttab_copy *gop = vif->tx_copy_ops;
> +     struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
>       struct sk_buff *skb;
>       int work_done = 0;
> 
> @@ -1620,14 +1562,25 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
>               memcpy(skb->data,
>                      (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
>                      data_len);
> +             vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
>               if (data_len < txp->size) {
>                       /* Append the packet payload as a fragment. */
>                       txp->offset += data_len;
>                       txp->size -= data_len;
> -             } else {
> +                     skb_shinfo(skb)->destructor_arg =
> +                             &vif->pending_tx_info[pending_idx].callback_struct;
> +             } else if (!skb_shinfo(skb)->nr_frags) {
>                       /* Schedule a response immediately. */
> +                     skb_shinfo(skb)->destructor_arg = NULL;
> +                     xenvif_idx_unmap(vif, pending_idx);
>                       xenvif_idx_release(vif, pending_idx,
>                                          XEN_NETIF_RSP_OKAY);
> +             } else {
> +                     /* FIXME: first request fits linear space, I don't know
> +                      * if any guest would do that, but I think it's possible
> +                      */

The Windows frontend, because it has to parse the packet headers, will coalesce
everything up to the payload into a single frag, so it would be a good idea to
copy this directly into the linear area.
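
To illustrate what I mean (a sketch under my own assumptions, not code
from this patch: the PKT_PROT_LEN threshold, the bail-out to mapping,
and the assumption that the linear area does not cross a page boundary
here are all mine):

    /* Grant-copy a small first request straight into the linear area,
     * so a coalesced-header frag never becomes a mapped frag at all.
     * Returns true if the slot was consumed by the copy.
     */
    static bool xenvif_copy_header(struct xenvif *vif,
                                   struct xen_netif_tx_request *txp,
                                   struct sk_buff *skb)
    {
            struct gnttab_copy copy;

            if (txp->size > PKT_PROT_LEN)
                    return false; /* too big to assume it is all headers */

            copy.flags = GNTCOPY_source_gref;
            copy.source.u.ref = txp->gref;
            copy.source.domid = vif->domid;
            copy.source.offset = txp->offset;
            copy.dest.u.gmfn = virt_to_mfn(skb->data);
            copy.dest.domid = DOMID_SELF;
            copy.dest.offset = offset_in_page(skb->data);
            copy.len = txp->size;

            gnttab_batch_copy(&copy, 1);
            if (copy.status != GNTST_okay)
                    return false;

            __skb_put(skb, txp->size);
            return true;
    }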

> +                     skb_shinfo(skb)->destructor_arg =
> +                             &vif->pending_tx_info[pending_idx].callback_struct;
>               }
> 
>               if (txp->flags & XEN_NETTXF_csum_blank)
> @@ -1635,13 +1588,19 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
>               else if (txp->flags & XEN_NETTXF_data_validated)
>                       skb->ip_summed = CHECKSUM_UNNECESSARY;
> 
> -             xenvif_fill_frags(vif, skb);
> +             xenvif_fill_frags(vif, skb, pending_idx);
> 
>               if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
>                       int target = min_t(int, skb->len, PKT_PROT_LEN);
>                       __pskb_pull_tail(skb, target - skb_headlen(skb));
>               }
> 
> +             /* Set this flag after __pskb_pull_tail, as it can trigger
> +              * skb_copy_ubufs, while we are still in control of the skb
> +              */

You can't be sure that there will be no subsequent pullups. The v6 parsing code 
I added may need to do that on a (hopefully) rare occasion.
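
(For instance, the option walk may need to pull headers that the
frontend left in a frag. If I read skbuff.c right, once
SKBTX_DEV_ZEROCOPY is set, any later pull that reaches pskb_expand_head
will skb_copy_ubufs() the whole packet behind your back, so the copy
path needs to be safe at any point, not just before the flag is set.)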

  Paul

> +             if (skb_shinfo(skb)->destructor_arg)
> +                     skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> +
>               skb->dev      = vif->dev;
>               skb->protocol = eth_type_trans(skb, skb->dev);
>               skb_reset_network_header(skb);
> @@ -1770,17 +1729,25 @@ static inline void xenvif_tx_action_dealloc(struct xenvif *vif)
>  int xenvif_tx_action(struct xenvif *vif, int budget)
>  {
>       unsigned nr_gops;
> -     int work_done;
> +     int work_done, ret;
> 
>       if (unlikely(!tx_work_todo(vif)))
>               return 0;
> 
> +     xenvif_tx_action_dealloc(vif);
> +
>       nr_gops = xenvif_tx_build_gops(vif);
> 
>       if (nr_gops == 0)
>               return 0;
> 
> -     gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
> +     if (nr_gops) {
> +             ret = gnttab_map_refs(vif->tx_map_ops,
> +                     NULL,
> +                     vif->pages_to_gnt,
> +                     nr_gops);
> +             BUG_ON(ret);
> +     }
> 
>       work_done = xenvif_tx_submit(vif, nr_gops);
> 
> @@ -1791,45 +1758,13 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>                              u8 status)
>  {
>       struct pending_tx_info *pending_tx_info;
> -     pending_ring_idx_t head;
> +     pending_ring_idx_t index;
>       u16 peek; /* peek into next tx request */
> 
> -     BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
> -
> -     /* Already complete? */
> -     if (vif->mmap_pages[pending_idx] == NULL)
> -             return;
> -
> -     pending_tx_info = &vif->pending_tx_info[pending_idx];
> -
> -     head = pending_tx_info->head;
> -
> -     BUG_ON(!pending_tx_is_head(vif, head));
> -     BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
> -
> -     do {
> -             pending_ring_idx_t index;
> -             pending_ring_idx_t idx = pending_index(head);
> -             u16 info_idx = vif->pending_ring[idx];
> -
> -             pending_tx_info = &vif->pending_tx_info[info_idx];
> +             pending_tx_info = &vif->pending_tx_info[pending_idx];
>               make_tx_response(vif, &pending_tx_info->req, status);
> -
> -             /* Setting any number other than
> -              * INVALID_PENDING_RING_IDX indicates this slot is
> -              * starting a new packet / ending a previous packet.
> -              */
> -             pending_tx_info->head = 0;
> -
>               index = pending_index(vif->pending_prod++);
> -             vif->pending_ring[index] = vif->pending_ring[info_idx];
> -
> -             peek = vif->pending_ring[pending_index(++head)];
> -
> -     } while (!pending_tx_is_head(vif, peek));
> -
> -     put_page(vif->mmap_pages[pending_idx]);
> -     vif->mmap_pages[pending_idx] = NULL;
> +             vif->pending_ring[index] = pending_idx;
>  }
> 
>  void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
> @@ -1904,6 +1839,8 @@ static inline int rx_work_todo(struct xenvif *vif)
> 
>  static inline int tx_work_todo(struct xenvif *vif)
>  {
> +     if (vif->dealloc_cons != vif->dealloc_prod)
> +             return 1;
> 
>       if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) &&
>           (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel