[Xen-devel] [PATCH 21/22] NC2 VMQ support.
This only includes the transmit half, because the receiver uses an
unmodified posted buffers mode implementation.

This includes various bits of patches which were

Signed-off-by: Jose Renato Santos <jsantos@xxxxxxxxxx>
Signed-off-by: Mitch Williams <mitch.a.williams@xxxxxxxxx>
Signed-off-by: Steven Smith <steven.smith@xxxxxxxxxx>

All bugs are mine, of course.
---
 drivers/xen/Kconfig                        |    5 +
 drivers/xen/netchannel2/Makefile           |    4 +
 drivers/xen/netchannel2/chan.c             |    7 +-
 drivers/xen/netchannel2/netback2.c         |    9 +
 drivers/xen/netchannel2/netchannel2_core.h |   10 +
 drivers/xen/netchannel2/posted_buffer.h    |   50 ++
 drivers/xen/netchannel2/posted_buffers.c   |   20 +-
 drivers/xen/netchannel2/util.c             |    8 +-
 drivers/xen/netchannel2/vmq.c              |  805 ++++++++++++++++++++++++++++
 drivers/xen/netchannel2/vmq.h              |   58 ++
 drivers/xen/netchannel2/vmq_def.h          |   68 +++
 drivers/xen/netchannel2/xmit_packet.c      |    6 +
 12 files changed, 1029 insertions(+), 21 deletions(-)
 create mode 100644 drivers/xen/netchannel2/posted_buffer.h
 create mode 100644 drivers/xen/netchannel2/vmq.c
 create mode 100644 drivers/xen/netchannel2/vmq.h
 create mode 100644 drivers/xen/netchannel2/vmq_def.h

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index a7e5b5c..a37b0cd 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -234,6 +234,11 @@ config XEN_NETDEV2_FRONTEND
 	depends on XEN_NETCHANNEL2
 	default y
 
+config XEN_NETDEV2_VMQ
+	bool "Net channel 2 support for multi-queue devices"
+	depends on XEN_NETDEV2_BACKEND && NET_VMQ
+	default y
+
 config XEN_NETDEV2_BYPASSABLE
 	bool "Net channel 2 bypassee support"
 	depends on XEN_NETDEV2_BACKEND
diff --git a/drivers/xen/netchannel2/Makefile b/drivers/xen/netchannel2/Makefile
index 11a257e..918d8d8 100644
--- a/drivers/xen/netchannel2/Makefile
+++ b/drivers/xen/netchannel2/Makefile
@@ -12,6 +12,10 @@ ifeq ($(CONFIG_XEN_NETDEV2_FRONTEND),y)
 netchannel2-objs += netfront2.o
 endif
 
+ifeq ($(CONFIG_XEN_NETDEV2_VMQ),y)
+netchannel2-objs += vmq.o
+endif
+
 ifeq ($(CONFIG_XEN_NETDEV2_BYPASSABLE),y)
 netchannel2-objs += bypassee.o
 endif
diff --git a/drivers/xen/netchannel2/chan.c b/drivers/xen/netchannel2/chan.c
index 060b49b..8dad6fe 100644
--- a/drivers/xen/netchannel2/chan.c
+++ b/drivers/xen/netchannel2/chan.c
@@ -13,6 +13,7 @@
 
 #include "netchannel2_endpoint.h"
 #include "netchannel2_core.h"
+#include "vmq.h"
 
 static int process_ring(struct napi_struct *napi, int work_avail);
 
@@ -810,6 +811,8 @@ static int process_ring(struct napi_struct *napi,
 	/* Pick up incoming messages. */
 	work_done = nc2_poll(ncrp, work_avail, &rx_queue);
 
+	do_vmq_work(nc);
+
 	/* Transmit pending packets. */
 	if (!skb_queue_empty(&ncrp->pending_tx_queue)) {
 		skb = __skb_dequeue(&ncrp->pending_tx_queue);
@@ -828,9 +831,11 @@ static int process_ring(struct napi_struct *napi,
 	   This must happen before we flush the rings, since that's when
 	   the PACKET messages will be made visible to the other end.
*/ - if (ncrp == &nc->rings) + if (ncrp == &nc->rings) { flush_hypercall_batcher(&nc->batcher, nc2_posted_on_gntcopy_fail); + vmq_flush_unmap_hypercall(); + } flush_rings(ncrp); diff --git a/drivers/xen/netchannel2/netback2.c b/drivers/xen/netchannel2/netback2.c index 129ef81..eb2a781 100644 --- a/drivers/xen/netchannel2/netback2.c +++ b/drivers/xen/netchannel2/netback2.c @@ -10,8 +10,13 @@ #include "netchannel2_core.h" #include "netchannel2_endpoint.h" #include "netchannel2_uspace.h" +#include "vmq.h" +#ifdef CONFIG_XEN_NETDEV2_VMQ +#define NR_TX_BUFS (VMQ_MAX_BUFFERS+256) +#else #define NR_TX_BUFS 256 +#endif static atomic_t next_handle; /* A list of all currently-live netback2 interfaces. */ @@ -168,6 +173,8 @@ static int attach_to_frontend(struct netback2 *nd) return err; } + nc2_vmq_connect(nc); + /* All done */ nd->attached = 1; @@ -176,6 +183,8 @@ static int attach_to_frontend(struct netback2 *nd) static void nb2_shutdown(struct netchannel2 *nc) { + nc2_vmq_disconnect(nc); + nc2_set_nr_tx_buffers(nc, 0); } diff --git a/drivers/xen/netchannel2/netchannel2_core.h b/drivers/xen/netchannel2/netchannel2_core.h index 1939cbb..8e1657d 100644 --- a/drivers/xen/netchannel2/netchannel2_core.h +++ b/drivers/xen/netchannel2/netchannel2_core.h @@ -7,6 +7,8 @@ #include <linux/skbuff.h> #include <linux/netdevice.h> +#include "vmq_def.h" + /* After we send this number of frags, we request the other end to * notify us when sending the corresponding finish packet message */ #define MAX_MAX_COUNT_FRAGS_NO_EVENT 192 @@ -43,6 +45,9 @@ enum transmit_policy { transmit_policy_grant = transmit_policy_first, transmit_policy_post, transmit_policy_map, +#ifdef CONFIG_XEN_NETDEV2_VMQ + transmit_policy_vmq, +#endif transmit_policy_small, transmit_policy_last = transmit_policy_small }; @@ -437,6 +442,11 @@ struct netchannel2 { struct hypercall_batcher batcher; +#ifdef CONFIG_XEN_NETDEV2_VMQ + /* vmq data for supporting multi-queue devices */ + nc2_vmq_t vmq; +#endif + #ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS struct nc2_auto_bypass auto_bypass; #endif diff --git a/drivers/xen/netchannel2/posted_buffer.h b/drivers/xen/netchannel2/posted_buffer.h new file mode 100644 index 0000000..e249777 --- /dev/null +++ b/drivers/xen/netchannel2/posted_buffer.h @@ -0,0 +1,50 @@ +/* Buffer management related bits, shared between vmq.c and + * posted_buffer.c */ +#ifndef NC2_POSTED_BUFFER_H__ +#define NC2_POSTED_BUFFER_H__ + +/* A buffer which the other end has provided us which we can use to + transmit packets to it. */ +struct nc2_tx_buffer { + struct list_head list; + uint32_t id; /* ID assigned by the remote endpoint. 
*/ + grant_ref_t gref; + uint16_t off_in_page; + uint16_t size; + grant_handle_t grant_handle; +}; + +/* add a buffer to the pending list to be returned to the other end buffer */ +static inline void return_tx_buffer(struct netchannel2 *nc, + struct nc2_tx_buffer *buffer) +{ + list_add(&buffer->list, &nc->pending_tx_buffer_return); +} + +static inline struct nc2_tx_buffer *_get_tx_buffer(struct netchannel2 *nc) +{ + struct nc2_tx_buffer *buffer; + struct list_head *entry = nc->avail_tx_buffers.next; + list_del(entry); + buffer = list_entry(entry, struct nc2_tx_buffer, list); + nc->nr_avail_tx_buffers--; + return buffer; +} + +/* recycle a posted buffer: return it to the list of available buffers */ +static inline void recycle_tx_buffer(struct netchannel2 *nc, + struct nc2_tx_buffer *buffer) +{ + list_add(&buffer->list, &nc->avail_tx_buffers); + nc->nr_avail_tx_buffers++; +} + +/* add a buffer slot to list of unused buffer slots after it has been + * returned to other end */ +static inline void free_tx_buffer(struct netchannel2 *nc, + struct nc2_tx_buffer *buffer) +{ + list_add(&buffer->list, &nc->unused_tx_buffer_slots); +} + +#endif /* !NC2_POSTED_BUFFER_H__ */ diff --git a/drivers/xen/netchannel2/posted_buffers.c b/drivers/xen/netchannel2/posted_buffers.c index 96de7da..9fb7570 100644 --- a/drivers/xen/netchannel2/posted_buffers.c +++ b/drivers/xen/netchannel2/posted_buffers.c @@ -9,6 +9,7 @@ #include <xen/live_maps.h> #include "netchannel2_endpoint.h" #include "netchannel2_core.h" +#include "posted_buffer.h" #define POSTED_BUFFER_SIZE PAGE_SIZE @@ -350,17 +351,6 @@ void nc2_handle_set_nr_posted_buffers(struct netchannel2 *nc, /* -------------------------- Transmit ------------------------------- */ -/* A buffer which the other end has provided us which we can use to - transmit packets to it. */ -struct nc2_tx_buffer { - struct list_head list; - uint32_t id; /* ID assigned by the remote endpoint. */ - grant_ref_t gref; - uint16_t off_in_page; - uint16_t size; - grant_handle_t grant_handle; -}; - /* A representation of a packet which is halfway through being prepared for transmission. */ struct post_packet_plan { @@ -373,14 +363,6 @@ struct post_packet_plan { volatile struct netchannel2_fragment *output_frag; }; -/* add a buffer slot to list of unused buffer slots after it has been - * returned to other end */ -static void free_tx_buffer(struct netchannel2 *nc, - struct nc2_tx_buffer *buffer) -{ - list_add(&buffer->list, &nc->unused_tx_buffer_slots); -} - /* A grant copy failed while we were transmitting a packet. That indicates that the *receiving* domain gave us a bad RX buffer. 
We're too late to send them an error, so there isn't really diff --git a/drivers/xen/netchannel2/util.c b/drivers/xen/netchannel2/util.c index 79d9f09..1d96256 100644 --- a/drivers/xen/netchannel2/util.c +++ b/drivers/xen/netchannel2/util.c @@ -34,7 +34,13 @@ int allocate_txp_slot(struct netchannel2_ring_pair *ncrp, static void nc2_free_skb(struct netchannel2 *nc, struct sk_buff *skb) { - dev_kfree_skb(skb); +#ifdef CONFIG_XEN_NETDEV2_VMQ + nc2_vmq_t *vmq = &nc->vmq; + if (get_skb_overlay(skb)->policy == transmit_policy_vmq) + skb_queue_tail(&vmq->dealloc_queue, skb); + else +#endif + dev_kfree_skb(skb); } void release_txp_slot(struct netchannel2_ring_pair *ncrp, diff --git a/drivers/xen/netchannel2/vmq.c b/drivers/xen/netchannel2/vmq.c new file mode 100644 index 0000000..e36962b --- /dev/null +++ b/drivers/xen/netchannel2/vmq.c @@ -0,0 +1,805 @@ +/***************************************************************************** + * vmq.c + * + * Support multi-queue network devices. + * + * Copyright (c) 2008, Kaushik Kumar Ram, Rice University. + * Copyright (c) 2008, Jose Renato Santos, Hewlett-Packard Co. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ +/* This only implements the transmit half of the method; receive is + * handled by posted_buffers.c */ +#include <linux/kernel.h> +#include <linux/netvmq.h> +#include <linux/skbuff.h> +#include <xen/xenbus.h> +#include <xen/balloon.h> +#include "netchannel2_core.h" + +#include "posted_buffer.h" +#include "vmq.h" + +/* state of device queue when operating in vmq mode */ +#define VMQ_QUEUE_DISABLED 0 +#define VMQ_QUEUE_STARTING 1 +#define VMQ_QUEUE_ENABLED 2 +#define VMQ_QUEUE_CLOSING 3 + +#define VMQ_MAX_UNMAP_OPS 256 +struct vmq_unmap_grants { + unsigned n; + gnttab_unmap_grant_ref_t gop[VMQ_MAX_UNMAP_OPS]; +}; +typedef struct vmq_unmap_grants vmq_unmap_grants_t; + +vmq_unmap_grants_t vmq_unmap_grants; + +static inline void vmq_flush_unmap_grants(void) +{ + if (vmq_unmap_grants.n == 0) + return; + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + vmq_unmap_grants.gop, + vmq_unmap_grants.n)) + BUG(); + vmq_unmap_grants.n = 0; +} + +static inline gnttab_unmap_grant_ref_t *vmq_next_unmap_gop(void) +{ + if (vmq_unmap_grants.n == VMQ_MAX_UNMAP_OPS) + vmq_flush_unmap_grants(); + return &vmq_unmap_grants.gop[vmq_unmap_grants.n++]; +} + +void vmq_flush_unmap_hypercall(void) +{ + vmq_flush_unmap_grants(); +} + +static inline unsigned long vmq_idx_to_pfn(nc2_vmq_t *vmq, unsigned int idx) +{ + return page_to_pfn(vmq->pages[idx]); +} + +static inline unsigned long vmq_idx_to_kaddr(nc2_vmq_t *vmq, unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(vmq_idx_to_pfn(vmq, idx)); +} + +/* get vmq idx from page struct */ +static long nc2_vmq_page_index(struct page *page) +{ + nc2_vmq_buf_t *vmq_buf; + vmq_buf = (nc2_vmq_buf_t *)page->mapping; + return vmq_buf - vmq_buf->nc->vmq.buffer; +} + +/* Read a physical device name from xenstore and + * returns a pointer to the associated net_device structure. + * Returns NULL on error. 
*/ +static struct net_device *read_pdev(struct xenbus_device *dev) +{ + char *pdevstr; + struct net_device *pdev = NULL; + + pdevstr = xenbus_read(XBT_NIL, dev->nodename, "pdev", NULL); + if (IS_ERR(pdevstr)) + return NULL; + + if (pdevstr) + pdev = dev_get_by_name(&init_net, pdevstr); + + kfree(pdevstr); + + return pdev; +} + +static void nc2_vmq_page_release(struct page *page, unsigned int order) +{ + printk(KERN_CRIT "%s: ERROR: Unexpected release of netchannel2 vmq page", + __func__); + BUG_ON(1); +} + +static inline int nc2_vmq_is_disabled(struct netchannel2 *nc) +{ + return nc->vmq.vmq_state == VMQ_QUEUE_DISABLED; +} + +static inline int nc2_vmq_is_starting(struct netchannel2 *nc) +{ + return nc->vmq.vmq_state == VMQ_QUEUE_STARTING; +} + +static inline int nc2_vmq_is_enabled(struct netchannel2 *nc) +{ + return nc->vmq.vmq_state == VMQ_QUEUE_ENABLED; +} + +static inline int nc2_vmq_is_closing(struct netchannel2 *nc) +{ + return nc->vmq.vmq_state == VMQ_QUEUE_CLOSING; +} + +static inline void nc2_vmq_enable(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + vmq_get(vmq); + vmq_enable_queue(vmq->pdev, vmq->vmq_id); + vmq->vmq_state = VMQ_QUEUE_ENABLED; +} + +void nc2_vmq_disconnect(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + + if (nc2_vmq_is_enabled(nc)) { + vmq_disable_queue(vmq->pdev, vmq->vmq_id); + vmq_free_queue(vmq->pdev, vmq->vmq_id); + vmq->vmq_state = VMQ_QUEUE_CLOSING; + /* wait until all buffers have been returned by dev driver */ + wait_event(vmq->waiting_to_free, + atomic_read(&vmq->refcnt) == 0); + return; + } + + if (nc2_vmq_is_starting(nc)) { + vmq_free_queue(vmq->pdev, vmq->vmq_id); + vmq->vmq_state = VMQ_QUEUE_CLOSING; + return; + } + +} + + +static void nc2_vmq_end_map_buffers(gnttab_map_grant_ref_t *mop, int count, + struct netchannel2 *nc, u16 *alloc_idx) +{ + int i, err; + u16 idx; + unsigned int prod; + nc2_vmq_t *vmq = &nc->vmq; + + prod = vmq->mapped_pages_prod; + + for (i = 0; i < count; i++) { + idx = alloc_idx[i]; + + /* Check error status */ + err = mop->status; + if (likely(!err)) { + set_phys_to_machine( + __pa(vmq_idx_to_kaddr(vmq, idx)) + >> PAGE_SHIFT, + FOREIGN_FRAME(mop->dev_bus_addr + >> PAGE_SHIFT)); + /* Store the handle */ + vmq->buffer[idx].buf->grant_handle = mop->handle; + + /* Add it to the mapped pages list */ + vmq->mapped_pages[VMQ_IDX_MASK(prod++)] = idx; + mop++; + continue; + } + + /* Error mapping page: return posted buffer to other end. + * TODO: We might need an error field on the return buffer + * message */ + return_tx_buffer(nc, vmq->buffer[idx].buf); + + /* Add the page back to the free list */ + vmq->unmapped_pages[VMQ_IDX_MASK(vmq->unmapped_pages_prod++)] + = idx; + + mop++; + } + + smp_wmb(); + vmq->mapped_pages_prod = prod; + + return; +} + +/* Map guest buffers and place them in the mapped buffers list. The mapped + * pages in this list are used when allocating a skb (vmq_alloc_skb()). + */ +static void nc2_vmq_map_buffers(struct netchannel2 *nc) +{ + u16 idx; + int count = 0; + unsigned int cons; + int nbufs; + int buf_avail; + struct nc2_tx_buffer *buf; + struct nc2_vmq *vmq = &nc->vmq; + int n_mapped = nr_vmq_bufs(nc); + + + /* + * Putting hundreds of bytes on the stack is considered rude. + * Static works because a tasklet can only be on one CPU at any time. 
+ */ + static gnttab_map_grant_ref_t rx_map_ops[VMQ_MAX_BUFFERS]; + static u16 alloc_idx[VMQ_MAX_BUFFERS]; + + /* If there is at least VMQ_MIN_BUFFERS buffers, no work to do */ + if (n_mapped >= VMQ_MIN_BUFFERS) + return; + + /* Try to get VMQ_MAX_BUFFERS mapped buffers, if there are + sufficient buffers posted by the other end */ + nbufs = VMQ_MAX_BUFFERS - n_mapped; + buf_avail = nc->nr_avail_tx_buffers; + if (nbufs > buf_avail) + nbufs = buf_avail; + + /* Xen cannot handle more than 512 grant ops in a single hypercall */ + if (nbufs > 512) + nbufs = 512; + + /* give up if there are no buffers available */ + if (nbufs <= 0) + return; + + /* Note that we *should* have free pages to consume here + * and no checks are needed. + */ + cons = vmq->unmapped_pages_cons; + + while (count < nbufs) { + idx = vmq->unmapped_pages[VMQ_IDX_MASK(cons++)]; + buf = vmq->buffer[idx].buf = _get_tx_buffer(nc); + /* Setup grant map operation */ + gnttab_set_map_op(&rx_map_ops[count], + vmq_idx_to_kaddr(vmq, idx), + GNTMAP_host_map, + buf->gref, + nc->rings.otherend_id); + alloc_idx[count] = idx; + count++; + } + + vmq->unmapped_pages_cons = cons; + + /* Map all the pages */ + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + rx_map_ops, nbufs)); + + /* Finalize buffer mapping after checking if the grant operations + succeeded */ + nc2_vmq_end_map_buffers(rx_map_ops, nbufs, nc, alloc_idx); + + vmq->nbufs += nbufs; +} + +static void nc2_vmq_unmap_buf(struct netchannel2 *nc, + unsigned int idx, int recycle) +{ + nc2_vmq_t *vmq = &nc->vmq; + unsigned long pfn; + gnttab_unmap_grant_ref_t *gop; + unsigned prod; + + pfn = vmq_idx_to_pfn(vmq, idx); + /* Already unmapped? */ + if (!phys_to_machine_mapping_valid(pfn)) + return; + + gop = vmq_next_unmap_gop(); + gnttab_set_unmap_op(gop, vmq_idx_to_kaddr(vmq, idx), + GNTMAP_host_map, + vmq->buffer[idx].buf->grant_handle); + + vmq->nbufs--; + + set_phys_to_machine(__pa(vmq_idx_to_kaddr(vmq, idx)) >> + PAGE_SHIFT, + INVALID_P2M_ENTRY); + /* Ready for next use. 
*/ + gnttab_reset_grant_page(vmq->pages[idx]); + /* Add the page back to the unmapped list */ + prod = vmq->unmapped_pages_prod; + vmq->unmapped_pages[VMQ_IDX_MASK(prod++)] = idx; + if (recycle) + recycle_tx_buffer(nc, vmq->buffer[idx].buf); + else + free_tx_buffer(nc, vmq->buffer[idx].buf); + smp_wmb(); + vmq->unmapped_pages_prod = prod; +} + +static void nc2_vmq_free_mapped_bufs(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + unsigned int idx; + unsigned prod, cons; + + /* The queue should be disabled before this function is called */ + BUG_ON(vmq->vmq_state == VMQ_QUEUE_ENABLED); + + cons = vmq->mapped_pages_cons; + prod = vmq->mapped_pages_prod; + smp_rmb(); + + while (cons != prod) { + idx = vmq->mapped_pages[VMQ_IDX_MASK(cons++)]; + nc2_vmq_unmap_buf(nc, idx, 1); + } + + vmq_flush_unmap_grants(); + + vmq->mapped_pages_cons = cons; + +} + +static void nc2_vmq_free_skb(struct sk_buff *skb) +{ + struct netchannel2 *nc; + nc2_vmq_t *vmq; + unsigned int idx; + int nr_frags, i; + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frags = shinfo->frags; + + nc = netdev_priv(skb->dev); + vmq = &nc->vmq; + + nr_frags = shinfo->nr_frags; + for (i = 0; i < nr_frags; i++) { + idx = nc2_vmq_page_index(frags[i].page); + nc2_vmq_unmap_buf(nc, idx, 1); + } + + vmq_flush_unmap_grants(); + + shinfo->frag_list = NULL; + shinfo->nr_frags = 0; + + /* Add the skb back to the free pool */ + skb_queue_tail(&vmq->free_skb_list, skb); +} + +/* Initialize the free socket buffer list */ +static int vmq_init_free_skb_list(int n, struct sk_buff_head *free_skb_list) +{ + int i; + struct sk_buff *skb; + + skb_queue_head_init(free_skb_list); + + for (i = 0; i < n; i++) { + skb = alloc_skb(VMQ_SKB_SIZE, GFP_ATOMIC); + if (!skb) { + printk("Netchannel2 vmq: Failed to allocate socket " + "buffer %d (max=%d)\n", i, (int)n); + goto error; + } + skb_queue_tail(free_skb_list, skb); + } + + return 0; +error: + /* Free all the allocated buffers and return Error */ + while (!skb_queue_empty(free_skb_list)) + kfree_skb(skb_dequeue(free_skb_list)); + + return -1; +} + +/* Initialize vmq. 
Return 1 if vmq is used and 0 otherwise */ +int nc2_vmq_connect(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + struct page *page; + int q_id; + int size; + int i; + + vmq->vmq_mode = 0; + vmq->pdev = read_pdev(nc->xenbus_device); + + /* cannot use vmq mode if physical device not found */ + if (!vmq->pdev) + return 0; + + /* Allocate a RX queue */ + q_id = vmq_alloc_queue(vmq->pdev, VMQ_TYPE_RX); + if (q_id < 0) + /* Allocation failed, cannot use multi-queue */ + goto free_pdev; + + vmq->vmq_id = q_id; + + /* Set the size of the queue */ + size = vmq_get_maxsize(vmq->pdev); + if (size > VMQ_QUEUE_SIZE) + size = VMQ_QUEUE_SIZE; + if (vmq_set_size(vmq->pdev, q_id, size) < 0) { + /* Failure, free up the queue and return error */ + printk(KERN_ERR "%s: could not set queue size on net device\n", + __func__); + goto free_queue; + } + vmq->vmq_size = size; + + /* Set the mac address of the queue */ + if (vmq_set_mac(vmq->pdev, q_id, nc->rings.remote_mac) < 0) { + /* Failure, free up the queue and return error */ + printk(KERN_ERR "%s: could not set MAC address for net device queue\n", + __func__); + goto free_queue; + } + + vmq->pages = alloc_empty_pages_and_pagevec(VMQ_MAX_BUFFERS); + if (vmq->pages == NULL) { + printk(KERN_ERR "%s: out of memory\n", __func__); + goto free_queue; + } + + skb_queue_head_init(&vmq->dealloc_queue); + skb_queue_head_init(&vmq->rx_queue); + + if (vmq_init_free_skb_list(VMQ_MAX_BUFFERS, + &vmq->free_skb_list)) { + printk(KERN_ERR "%s: Could not allocate free socket buffers", + __func__); + goto free_pagevec; + } + + for (i = 0; i < VMQ_MAX_BUFFERS; i++) { + vmq->buffer[i].nc = nc; + page = vmq->pages[i]; + SetPageForeign(page, nc2_vmq_page_release); + page->mapping = (void *)&vmq->buffer[i]; + vmq->unmapped_pages[i] = i; + } + + vmq->unmapped_pages_prod = VMQ_MAX_BUFFERS; + vmq->unmapped_pages_cons = 0; + + vmq->mapped_pages_prod = 0; + vmq->mapped_pages_cons = 0; + + vmq->nbufs = 0; + vmq->vmq_mode = 1; + + /* Store the pointer to netchannel2 device in pdev */ + BUG_ON((vmq->pdev->vmq == NULL) || (vmq->pdev->vmq->queue == NULL)); + vmq->pdev->vmq->queue[q_id].guest = (void *)nc->net_device; + + atomic_set(&vmq->refcnt, 0); + init_waitqueue_head(&vmq->waiting_to_free); + + printk(KERN_INFO "Netchannel2 using vmq mode for guest %d\n", + nc->xenbus_device->otherend_id); + + vmq->vmq_state = VMQ_QUEUE_STARTING; + + return 1; /* Success */ + + +free_pagevec: + free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS); +free_queue: + vmq_free_queue(vmq->pdev, vmq->vmq_id); +free_pdev: + dev_put(vmq->pdev); + vmq->pdev = NULL; + return 0; +} + +void nc2_vmq_shutdown(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + int i; + + if (!vmq->vmq_mode) + return; + + /* All posted bufs should have been returned */ + BUG_ON(nr_vmq_bufs(nc) != nr_vmq_mapped_bufs(nc)); + + /* free the mapped bufs */ + nc2_vmq_free_mapped_bufs(nc); + + /* Free the vmq pages */ + if (vmq->pages) { + for (i = 0; i < VMQ_MAX_BUFFERS; i++) { + if (PageForeign(vmq->pages[i])) + ClearPageForeign(vmq->pages[i]); + vmq->pages[i]->mapping = NULL; + } + free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS); + vmq->pages = NULL; + } + + while (!skb_queue_empty(&vmq->free_skb_list)) { + /* Free the socket buffer pool */ + kfree_skb(skb_dequeue(&vmq->free_skb_list)); + } + vmq->vmq_state = VMQ_QUEUE_DISABLED; + vmq->vmq_mode = 0; + + if (vmq->pdev) { + dev_put(vmq->pdev); + vmq->pdev = NULL; + } + + vmq_put(vmq); +} + +static int prepare_xmit_allocate_vmq(struct netchannel2 *nc, + struct 
sk_buff *skb) +{ + unsigned msg_size; + + msg_size = get_transmitted_packet_msg_size(skb); + if (!nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size)) + return -1; + return 0; +} + +void do_vmq_work(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + struct sk_buff *skb; + unsigned long flags; + + /* if not in vmq mode do nothing */ + if (!nc2_in_vmq_mode(nc)) + return; + + /* Map guest buffers for dedicated NIC RX queue if needed */ + if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) { + nc2_vmq_map_buffers(nc); + /* We delay enabling the queue until we have enough + posted buffers. Check if it is time to enable it */ + if (nc2_vmq_is_starting(nc) && + (nr_vmq_bufs(nc) >= VMQ_MIN_BUFFERS)) { + nc2_vmq_enable(nc); + } + } + + /* free vmq skb's returned by the physical device driver */ + while (!skb_queue_empty(&nc->vmq.dealloc_queue)) + nc2_vmq_free_skb(skb_dequeue(&nc->vmq.dealloc_queue)); + + /* complete vmq closing after all packets returned by physical + * device driver */ + + if (nc2_vmq_is_closing(nc) && + (nr_vmq_bufs(nc) == nr_vmq_mapped_bufs(nc))) { + nc->vmq.vmq_state = VMQ_QUEUE_DISABLED; + nc2_vmq_shutdown(nc); + } + + spin_lock_irqsave(&vmq->rx_queue.lock, flags); + while (!skb_queue_empty(&vmq->rx_queue)) { + skb = __skb_dequeue(&nc->vmq.rx_queue); + if (prepare_xmit_allocate_vmq(nc, skb) < 0) { + __skb_queue_head(&vmq->rx_queue, skb); + spin_unlock_irqrestore(&vmq->rx_queue.lock, flags); + return; + } + __skb_queue_tail(&nc->rings.pending_tx_queue, skb); + } + spin_unlock_irqrestore(&vmq->rx_queue.lock, flags); +} + +/* Return the netchannel2 device corresponding to the given queue in pdev */ +static inline struct net_device *nc2_vmq_queue_to_vif(struct net_device *pdev, + int queue_id) +{ + net_vmq_t *n_vmq; + vmq_queue_t *vmq_q; + + n_vmq = pdev->vmq; + BUG_ON(n_vmq == NULL); + vmq_q = &n_vmq->queue[queue_id]; + BUG_ON(vmq_q == NULL); + + return (struct net_device *)vmq_q->guest; +} + +/* Handle incoming vmq packet */ +int vmq_netif_rx(struct sk_buff *skb, int queue_id) +{ + struct skb_cb_overlay *skb_co = get_skb_overlay(skb); + struct net_device *dev; + struct netchannel2 *nc; + nc2_vmq_t *vmq; + + memset(skb_co, 0, sizeof(*skb_co)); + + skb_co->nr_fragments = skb_shinfo(skb)->nr_frags; + skb_co->type = NC2_PACKET_TYPE_pre_posted; + skb_co->policy = transmit_policy_vmq; + + /* get the netchannel2 interface corresponding to this queue */ + dev = nc2_vmq_queue_to_vif(skb->dev, queue_id); + nc = netdev_priv(dev); + vmq = &nc->vmq; + + /* replace source dev with destination dev */ + skb->dev = dev; + /* add skb to rx_queue */ + skb_queue_tail(&vmq->rx_queue, skb); + + /* Trigger thread excution to procees new packets */ + nc2_kick(&nc->rings); + + return 0; +} +EXPORT_SYMBOL(vmq_netif_rx); + + +/* Allocate a socket buffer from the free list, get a guest posted + * buffer, attach it to the skb, and return it. 
+ */ +struct sk_buff *vmq_alloc_skb(struct net_device *netdevice, int queue_id, + unsigned int length) +{ + struct sk_buff *skb; + struct netchannel2 *nc; + nc2_vmq_t *vmq; + unsigned int idx; + int nr_bufs, i; + unsigned int cons; + unsigned int prod; + + /* get the netchannel2 interface corresponding to this queue */ + nc = netdev_priv(nc2_vmq_queue_to_vif(netdevice, queue_id)); + + vmq = &nc->vmq; + + /* Get a free buffer from the pool */ + if (skb_queue_empty(&vmq->free_skb_list)) { + /* No buffers to allocate */ + return NULL; + } + + + skb = skb_dequeue(&vmq->free_skb_list); + BUG_ON(skb == NULL); + + nr_bufs = VMQ_NUM_BUFFERS(length); + + cons = vmq->mapped_pages_cons; + prod = vmq->mapped_pages_prod; + smp_rmb(); + + if (nr_bufs > (prod - cons)) + /* Not enough mapped buffers in the pool */ + goto kick_nc2; + + if (nr_bufs > MAX_SKB_FRAGS) + goto error; + + for (i = 0; i < nr_bufs; i++) { + idx = vmq->mapped_pages[VMQ_IDX_MASK(cons)]; + /* FIX ME: This can be simplified */ + skb_shinfo(skb)->frags[i].page = + virt_to_page(vmq_idx_to_kaddr(vmq, idx)); + skb_shinfo(skb)->frags[i].page_offset = 0; + skb_shinfo(skb)->frags[i].size = PAGE_SIZE; + skb_shinfo(skb)->nr_frags++; + skb->dev = netdevice; + cons++; + } + + vmq->mapped_pages_cons = cons; + + /* if number of buffers get low run tasklet to map more buffers */ + if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) + nc2_kick(&nc->rings); + + return skb; + +kick_nc2: + /* kick netchannel2 interface to get any recently posted buffers */ + nc2_kick(&nc->rings); +error: + /* Add the skb back to the free pool */ + skb_queue_tail(&vmq->free_skb_list, skb); + return NULL; +} +EXPORT_SYMBOL(vmq_alloc_skb); + +/* Detach the guest pages and free the socket buffer */ +void vmq_free_skb(struct sk_buff *skb, int queue_id) +{ + struct net_device *dev; + struct netchannel2 *nc; + nc2_vmq_t *vmq; + + /* get the netchannel2 interface corresponding to this queue */ + dev = nc2_vmq_queue_to_vif(skb->dev, queue_id); + + nc = netdev_priv(dev); + vmq = &nc->vmq; + + /* Add skb to the dealloc queue */ + skb->dev = dev; + skb_queue_tail(&vmq->dealloc_queue, skb); + + /* kick netchannel2 interface */ + nc2_kick(&nc->rings); + +} +EXPORT_SYMBOL(vmq_free_skb); + +int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb) +{ + int nr_frags; + long idx; + nc2_vmq_t *vmq = &nc->vmq; + + nr_frags = skb_shinfo(skb)->nr_frags; + if (vmq->vmq_mode && nr_frags && + PageForeign(skb_shinfo(skb)->frags[0].page)) { + idx = nc2_vmq_page_index(skb_shinfo(skb)->frags[0].page); + if ((idx >= 0) && (idx < VMQ_MAX_BUFFERS)) + return 1; + } + + return 0; +} + +/* Prepare to transmit a vmq packet */ +void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb, + volatile void *msg_buf) +{ + volatile struct netchannel2_msg_packet *msg = msg_buf; + volatile struct netchannel2_fragment *out_frag; + nc2_vmq_t *vmq = &nc->vmq; + skb_frag_t *frag; + struct nc2_tx_buffer *txbuf; + int nr_frags; + unsigned int idx; + unsigned x; + + nr_frags = skb_shinfo(skb)->nr_frags; + for (x = 0; x < nr_frags; x++) { + frag = &skb_shinfo(skb)->frags[x]; + out_frag = &msg->frags[x]; + + idx = nc2_vmq_page_index(frag->page); + txbuf = vmq->buffer[idx].buf; + out_frag->pre_post.id = txbuf->id; + out_frag->off = frag->page_offset; + out_frag->size = frag->size; + /* TODO: need to batch unmap grants */ + nc2_vmq_unmap_buf(nc, idx, 0); + } + + /* Avoid unmapping frags grants when skb is freed later */ + /* by nc2_vmq_free_skb() */ + skb_shinfo(skb)->nr_frags = 0; +} + diff --git a/drivers/xen/netchannel2/vmq.h 
b/drivers/xen/netchannel2/vmq.h new file mode 100644 index 0000000..fa1cc8a --- /dev/null +++ b/drivers/xen/netchannel2/vmq.h @@ -0,0 +1,58 @@ +#ifndef VMQ_H__ +#define VMQ_H__ + +#include "netchannel2_core.h" + +#ifdef CONFIG_XEN_NETDEV2_VMQ + +int nc2_vmq_connect(struct netchannel2 *nc); +void nc2_vmq_disconnect(struct netchannel2 *nc); +void do_vmq_work(struct netchannel2 *nc); +int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb); +void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb, + volatile void *msg); +void vmq_flush_unmap_hypercall(void); + +#define vmq_get(_b) \ + atomic_inc(&(_b)->refcnt); + +#define vmq_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) { \ + wake_up(&(_b)->waiting_to_free); \ + } \ + } while (0) + +static inline int nr_vmq_mapped_bufs(struct netchannel2 *nc) +{ + return nc->vmq.mapped_pages_prod - + nc->vmq.mapped_pages_cons; +} + +static inline int nr_vmq_bufs(struct netchannel2 *nc) +{ + return nc->vmq.nbufs; +} + +static inline int nc2_in_vmq_mode(struct netchannel2 *nc) +{ + return nc->vmq.vmq_mode; +} + +#else +static inline int nc2_vmq_connect(struct netchannel2 *nc) +{ + return 0; +} +static inline void nc2_vmq_disconnect(struct netchannel2 *nc) +{ +} +static inline void do_vmq_work(struct netchannel2 *nc) +{ +} +static inline void vmq_flush_unmap_hypercall(void) +{ +} +#endif /* CONFIG_XEN_NETDEV2_VMQ */ + +#endif /* !VMQ_H__ */ diff --git a/drivers/xen/netchannel2/vmq_def.h b/drivers/xen/netchannel2/vmq_def.h new file mode 100644 index 0000000..60f1ccb --- /dev/null +++ b/drivers/xen/netchannel2/vmq_def.h @@ -0,0 +1,68 @@ +#ifndef VMQ_DEF_H__ +#define VMQ_DEF_H__ + + +/* size of HW queue in VMQ device */ +#define VMQ_QUEUE_SIZE 1024 + +/* Mimimum amount of buffers needed for VMQ + * This is the lower water mark that triggers mapping more guest buffers + * Should be larger than the queue size to allow for in flight packets + */ +#define VMQ_MIN_BUFFERS 1920 + +/* Maximum amount of posted buffers which are reserved for VMQ + * Should be less than MAX_POSTED_BUFFERS. For now, the difference can be used + * for intra-node guest to guest traffic. When we map guest buffers we try to + * have VMQ_MAX_BUFFERS mapped. 
The difference (VMQ_MAX_BUFFERS-VMQ_MIN_BUFFERS) + * helps batch multiple grant map operattions + * VMQ_QUEUE_SIZE < VMQ_MIN_BUFFER < VMQ_MAX_BUFFER < MAX_POSTED_BUFFERS + * VMQ_MAX_BUFFERS must be a power of 2 + */ +#define VMQ_MAX_BUFFERS 2048 + +/* skb size is zero since packet data uses fragments */ +#define VMQ_SKB_SIZE 0 + +#define VMQ_NUM_BUFFERS(len) ((len + PAGE_SIZE - 1) / PAGE_SIZE) + +#define VMQ_IDX_MASK(_i) ((_i)&(VMQ_MAX_BUFFERS-1)) + +typedef struct nc2_vmq_buf { + struct nc2_tx_buffer *buf; + struct netchannel2 *nc; +} nc2_vmq_buf_t; + +typedef struct nc2_vmq { + struct net_device *pdev; /* Pointer to physical device */ + int vmq_mode; /* indicate if vif is in vmq mode */ + struct page **pages; /* pages for mapping guest RX bufs */ + struct sk_buff_head free_skb_list; /* Free socket buffer pool */ + struct sk_buff_head dealloc_queue; /* list of skb's to be free */ + struct sk_buff_head rx_queue; /* list of received packets */ + + /* guest mapped buffers */ + nc2_vmq_buf_t buffer[VMQ_MAX_BUFFERS]; + + /* Ring with free pages available for mapping guest RX buffers */ + u16 unmapped_pages[VMQ_MAX_BUFFERS]; + unsigned int unmapped_pages_prod; + unsigned int unmapped_pages_cons; + + /* Ring of mapped RX pages avaialable for vmq device */ + u16 mapped_pages[VMQ_MAX_BUFFERS]; + unsigned int mapped_pages_prod; + unsigned int mapped_pages_cons; + + unsigned int nbufs; /* number of vmq buffers: posted to */ + /* HW queue or available to be posted */ + int vmq_id; /* Queue id */ + int vmq_size; /* Queue size */ + int vmq_state; /* queue stste */ + + atomic_t refcnt; + wait_queue_head_t waiting_to_free; + +} nc2_vmq_t; + +#endif /* !VMQ_DEF_H__ */ diff --git a/drivers/xen/netchannel2/xmit_packet.c b/drivers/xen/netchannel2/xmit_packet.c index 1a879aa..09827fc 100644 --- a/drivers/xen/netchannel2/xmit_packet.c +++ b/drivers/xen/netchannel2/xmit_packet.c @@ -3,6 +3,7 @@ #include <linux/kernel.h> #include <linux/version.h> #include "netchannel2_core.h" +#include "vmq.h" /* You don't normally want to transmit in posted buffers mode, because grant mode is usually faster, but it's sometimes useful for testing @@ -189,6 +190,11 @@ int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp, set_offload_flags(skb, msg); switch (skb_co->policy) { +#ifdef CONFIG_XEN_NETDEV2_VMQ + case transmit_policy_vmq: + xmit_vmq(nc, skb, msg); + break; +#endif case transmit_policy_small: /* Nothing to do */ break; -- 1.6.3.1 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
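For readers who don't have the NET_VMQ patches to hand: the driver-facing surface this patch exports is vmq_alloc_skb(), vmq_netif_rx() and vmq_free_skb(). The sketch below is illustrative only and is not part of the patch; it assumes those prototypes come from <linux/netvmq.h>, and the hw_rx_ring handling around them is invented purely for the example.

/* Illustrative sketch, not part of this patch: roughly how a
 * multi-queue NIC driver might drive the symbols vmq.c exports.
 * The hw_rx_ring list stands in for a real hardware RX queue. */

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/netvmq.h>	/* assumed home of the vmq_* prototypes */

static void example_vmq_refill(struct net_device *pdev, int queue_id,
			       struct sk_buff_head *hw_rx_ring)
{
	struct sk_buff *skb;

	/* vmq_alloc_skb() hands back an skb whose frags are grant
	 * mappings of guest RX buffers; NULL means netchannel2 has no
	 * mapped buffers available right now. */
	while ((skb = vmq_alloc_skb(pdev, queue_id, PAGE_SIZE)) != NULL)
		skb_queue_tail(hw_rx_ring, skb);
}

static void example_vmq_rx_complete(struct sk_buff *skb, int queue_id)
{
	/* Once the NIC has filled the buffer, deliver it; netchannel2
	 * forwards the pre-posted buffer IDs to the guest instead of
	 * copying the packet data. */
	vmq_netif_rx(skb, queue_id);
}

static void example_vmq_drain(int queue_id, struct sk_buff_head *hw_rx_ring)
{
	struct sk_buff *skb;

	/* On queue teardown, return unused buffers so their grant
	 * mappings can be torn down. */
	while ((skb = skb_dequeue(hw_rx_ring)) != NULL)
		vmq_free_skb(skb, queue_id);
}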