[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC PATCH V3 15/16] netfront: multi page ring support.



On Mon, 2012-01-30 at 21:39 +0000, Konrad Rzeszutek Wilk wrote:
> On Mon, Jan 30, 2012 at 02:45:33PM +0000, Wei Liu wrote:
> > Use DMA API to allocate ring pages, because we need to get machine
> > contiguous memory.
> 
> >
> > Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
> > ---
> >  drivers/net/xen-netfront.c |  258 
> > ++++++++++++++++++++++++++++++++------------
> >  1 files changed, 187 insertions(+), 71 deletions(-)
> >
> > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > index 01f589d..32ec212 100644
> > --- a/drivers/net/xen-netfront.c
> > +++ b/drivers/net/xen-netfront.c
> > @@ -66,9 +66,18 @@ struct netfront_cb {
> >
> >  #define GRANT_INVALID_REF    0
> >
> > -#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
> > -#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
> > -#define TX_MAX_TARGET min_t(int, NET_TX_RING_SIZE, 256)
> > +#define XENNET_MAX_RING_PAGE_ORDER 2
> > +#define XENNET_MAX_RING_PAGES      (1U << XENNET_MAX_RING_PAGE_ORDER)
> > +
> > +#define NET_TX_RING_SIZE(_nr_pages)                                  \
> > +     __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE * (_nr_pages))
> > +#define NET_RX_RING_SIZE(_nr_pages)                                  \
> > +     __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE * (_nr_pages))
> > +
> > +#define XENNET_MAX_TX_RING_SIZE NET_TX_RING_SIZE(XENNET_MAX_RING_PAGES)
> > +#define XENNET_MAX_RX_RING_SIZE NET_RX_RING_SIZE(XENNET_MAX_RING_PAGES)
> > +
> > +#define TX_MAX_TARGET XENNET_MAX_TX_RING_SIZE
> >
> >  struct netfront_stats {
> >       u64                     rx_packets;
> > @@ -84,12 +93,20 @@ struct netfront_info {
> >
> >       struct napi_struct napi;
> >
> > +     /* Statistics */
> > +     struct netfront_stats __percpu *stats;
> > +
> > +     unsigned long rx_gso_checksum_fixup;
> > +
> >       unsigned int evtchn;
> >       struct xenbus_device *xbdev;
> >
> >       spinlock_t   tx_lock;
> >       struct xen_netif_tx_front_ring tx;
> > -     int tx_ring_ref;
> > +     dma_addr_t tx_ring_dma_handle;
> > +     int tx_ring_ref[XENNET_MAX_RING_PAGES];
> > +     int tx_ring_page_order;
> > +     int tx_ring_pages;
> >
> >       /*
> >        * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
> > @@ -103,36 +120,34 @@ struct netfront_info {
> >       union skb_entry {
> >               struct sk_buff *skb;
> >               unsigned long link;
> > -     } tx_skbs[NET_TX_RING_SIZE];
> > +     } tx_skbs[XENNET_MAX_TX_RING_SIZE];
> >       grant_ref_t gref_tx_head;
> > -     grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
> > +     grant_ref_t grant_tx_ref[XENNET_MAX_TX_RING_SIZE];
> >       unsigned tx_skb_freelist;
> >
> >       spinlock_t   rx_lock ____cacheline_aligned_in_smp;
> >       struct xen_netif_rx_front_ring rx;
> > -     int rx_ring_ref;
> > +     dma_addr_t rx_ring_dma_handle;
> > +     int rx_ring_ref[XENNET_MAX_RING_PAGES];
> > +     int rx_ring_page_order;
> > +     int rx_ring_pages;
> >
> >       /* Receive-ring batched refills. */
> >  #define RX_MIN_TARGET 8
> >  #define RX_DFL_MIN_TARGET 64
> > -#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
> > +#define RX_MAX_TARGET XENNET_MAX_RX_RING_SIZE
> >       unsigned rx_min_target, rx_max_target, rx_target;
> >       struct sk_buff_head rx_batch;
> >
> >       struct timer_list rx_refill_timer;
> >
> > -     struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
> > +     struct sk_buff *rx_skbs[XENNET_MAX_RX_RING_SIZE];
> >       grant_ref_t gref_rx_head;
> > -     grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
> > -
> > -     unsigned long rx_pfn_array[NET_RX_RING_SIZE];
> > -     struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
> > -     struct mmu_update rx_mmu[NET_RX_RING_SIZE];
> > -
> > -     /* Statistics */
> > -     struct netfront_stats __percpu *stats;
> > +     grant_ref_t grant_rx_ref[XENNET_MAX_RX_RING_SIZE];
> >
> > -     unsigned long rx_gso_checksum_fixup;
> > +     unsigned long rx_pfn_array[XENNET_MAX_RX_RING_SIZE];
> > +     struct multicall_entry rx_mcl[XENNET_MAX_RX_RING_SIZE+1];
> > +     struct mmu_update rx_mmu[XENNET_MAX_RX_RING_SIZE];
> >  };
> >
> >  struct netfront_rx_info {
> > @@ -170,15 +185,15 @@ static unsigned short get_id_from_freelist(unsigned 
> > *head,
> >       return id;
> >  }
> >
> > -static int xennet_rxidx(RING_IDX idx)
> > +static int xennet_rxidx(RING_IDX idx, struct netfront_info *info)
> >  {
> > -     return idx & (NET_RX_RING_SIZE - 1);
> > +     return idx & (NET_RX_RING_SIZE(info->rx_ring_pages) - 1);
> >  }
> >
> >  static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
> >                                        RING_IDX ri)
> >  {
> > -     int i = xennet_rxidx(ri);
> > +     int i = xennet_rxidx(ri, np);
> >       struct sk_buff *skb = np->rx_skbs[i];
> >       np->rx_skbs[i] = NULL;
> >       return skb;
> > @@ -187,7 +202,7 @@ static struct sk_buff *xennet_get_rx_skb(struct 
> > netfront_info *np,
> >  static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
> >                                           RING_IDX ri)
> >  {
> > -     int i = xennet_rxidx(ri);
> > +     int i = xennet_rxidx(ri, np);
> >       grant_ref_t ref = np->grant_rx_ref[i];
> >       np->grant_rx_ref[i] = GRANT_INVALID_REF;
> >       return ref;
> > @@ -300,7 +315,7 @@ no_skb:
> >
> >               skb->dev = dev;
> >
> > -             id = xennet_rxidx(req_prod + i);
> > +             id = xennet_rxidx(req_prod + i, np);
> >
> >               BUG_ON(np->rx_skbs[id]);
> >               np->rx_skbs[id] = skb;
> > @@ -596,7 +611,7 @@ static int xennet_close(struct net_device *dev)
> >  static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff 
> > *skb,
> >                               grant_ref_t ref)
> >  {
> > -     int new = xennet_rxidx(np->rx.req_prod_pvt);
> > +     int new = xennet_rxidx(np->rx.req_prod_pvt, np);
> >
> >       BUG_ON(np->rx_skbs[new]);
> >       np->rx_skbs[new] = skb;
> > @@ -1089,7 +1104,7 @@ static void xennet_release_tx_bufs(struct 
> > netfront_info *np)
> >       struct sk_buff *skb;
> >       int i;
> >
> > -     for (i = 0; i < NET_TX_RING_SIZE; i++) {
> > +     for (i = 0; i < NET_TX_RING_SIZE(np->tx_ring_pages); i++) {
> >               /* Skip over entries which are actually freelist references */
> >               if (skb_entry_is_link(&np->tx_skbs[i]))
> >                       continue;
> > @@ -1123,7 +1138,7 @@ static void xennet_release_rx_bufs(struct 
> > netfront_info *np)
> >
> >       spin_lock_bh(&np->rx_lock);
> >
> > -     for (id = 0; id < NET_RX_RING_SIZE; id++) {
> > +     for (id = 0; id < NET_RX_RING_SIZE(np->rx_ring_pages); id++) {
> >               ref = np->grant_rx_ref[id];
> >               if (ref == GRANT_INVALID_REF) {
> >                       unused++;
> > @@ -1305,13 +1320,13 @@ static struct net_device * __devinit 
> > xennet_create_dev(struct xenbus_device *dev
> >
> >       /* Initialise tx_skbs as a free chain containing every entry. */
> >       np->tx_skb_freelist = 0;
> > -     for (i = 0; i < NET_TX_RING_SIZE; i++) {
> > +     for (i = 0; i < XENNET_MAX_TX_RING_SIZE; i++) {
> >               skb_entry_set_link(&np->tx_skbs[i], i+1);
> >               np->grant_tx_ref[i] = GRANT_INVALID_REF;
> >       }
> >
> >       /* Clear out rx_skbs */
> > -     for (i = 0; i < NET_RX_RING_SIZE; i++) {
> > +     for (i = 0; i < XENNET_MAX_RX_RING_SIZE; i++) {
> >               np->rx_skbs[i] = NULL;
> >               np->grant_rx_ref[i] = GRANT_INVALID_REF;
> >       }
> > @@ -1409,15 +1424,11 @@ static int __devinit netfront_probe(struct 
> > xenbus_device *dev,
> >       return err;
> >  }
> >
> > -static void xennet_end_access(int ref, void *page)
> > -{
> > -     /* This frees the page as a side-effect */
> > -     if (ref != GRANT_INVALID_REF)
> > -             gnttab_end_foreign_access(ref, 0, (unsigned long)page);
> > -}
> > -
> >  static void xennet_disconnect_backend(struct netfront_info *info)
> >  {
> > +     int i;
> > +     struct xenbus_device *dev = info->xbdev;
> > +
> >       /* Stop old i/f to prevent errors whilst we rebuild the state. */
> >       spin_lock_bh(&info->rx_lock);
> >       spin_lock_irq(&info->tx_lock);
> > @@ -1429,12 +1440,24 @@ static void xennet_disconnect_backend(struct 
> > netfront_info *info)
> >               unbind_from_irqhandler(info->netdev->irq, info->netdev);
> >       info->evtchn = info->netdev->irq = 0;
> >
> > -     /* End access and free the pages */
> > -     xennet_end_access(info->tx_ring_ref, info->tx.sring);
> > -     xennet_end_access(info->rx_ring_ref, info->rx.sring);
> > +     for (i = 0; i < info->tx_ring_pages; i++) {
> > +             int ref = info->tx_ring_ref[i];
> > +             gnttab_end_foreign_access_ref(ref, 0);
> > +             info->tx_ring_ref[i] = GRANT_INVALID_REF;
> > +     }
> > +     dma_free_coherent(NULL, PAGE_SIZE * info->tx_ring_pages,
> > +                       (void *)info->tx.sring,
> > +                       info->tx_ring_dma_handle);
> > +
> > +     for (i = 0; i < info->rx_ring_pages; i++) {
> > +             int ref = info->rx_ring_ref[i];
> > +             gnttab_end_foreign_access_ref(ref, 0);
> > +             info->rx_ring_ref[i] = GRANT_INVALID_REF;
> > +     }
> > +     dma_free_coherent(NULL, PAGE_SIZE * info->rx_ring_pages,
> > +                       (void *)info->rx.sring,
> > +                       info->rx_ring_dma_handle);
> >
> > -     info->tx_ring_ref = GRANT_INVALID_REF;
> > -     info->rx_ring_ref = GRANT_INVALID_REF;
> >       info->tx.sring = NULL;
> >       info->rx.sring = NULL;
> >  }
> > @@ -1483,9 +1506,13 @@ static int setup_netfront(struct xenbus_device *dev, 
> > struct netfront_info *info)
> >       struct xen_netif_rx_sring *rxs;
> >       int err;
> >       struct net_device *netdev = info->netdev;
> > +     unsigned int max_tx_ring_page_order, max_rx_ring_page_order;
> > +     int i, j;
> >
> > -     info->tx_ring_ref = GRANT_INVALID_REF;
> > -     info->rx_ring_ref = GRANT_INVALID_REF;
> > +     for (i = 0; i < XENNET_MAX_RING_PAGES; i++) {
> > +             info->tx_ring_ref[i] = GRANT_INVALID_REF;
> > +             info->rx_ring_ref[i] = GRANT_INVALID_REF;
> > +     }
> >       info->rx.sring = NULL;
> >       info->tx.sring = NULL;
> >       netdev->irq = 0;
> > @@ -1496,50 +1523,105 @@ static int setup_netfront(struct xenbus_device 
> > *dev, struct netfront_info *info)
> >               goto fail;
> >       }
> >
> > -     txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | 
> > __GFP_HIGH);
> > +     err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> > +                        "max-tx-ring-page-order", "%u",
> > +                        &max_tx_ring_page_order);
> > +     if (err < 0) {
> > +             info->tx_ring_page_order = 0;
> > +             dev_info(&dev->dev, "single tx ring\n");
> > +     } else {
> > +             info->tx_ring_page_order = max_tx_ring_page_order;
> > +             dev_info(&dev->dev, "multi page tx ring, order = %d\n",
> > +                      max_tx_ring_page_order);
> > +     }
> > +     info->tx_ring_pages = (1U << info->tx_ring_page_order);
> > +
> > +     txs = (struct xen_netif_tx_sring *)
> > +             dma_alloc_coherent(NULL, PAGE_SIZE * info->tx_ring_pages,
> > +                                &info->tx_ring_dma_handle,
> > +                                __GFP_ZERO | GFP_NOIO | __GFP_HIGH);
> 
> Hm, so I see you are using 'NULL', which is a big no-no (the API docs say that).
> But the other reason why it is a no-no is b/c this way the generic DMA 
> engine has no
> clue whether you are OK getting pages under 4GB or above it (so 64-bit 
> support).
> 
> If you don't supply a 'dev' it will assume 4GB. But when you run this as a
> pure PV guest that won't matter in the slightest b/c there is no DMA code in 
> action
> (well, there is dma_alloc_coherent - which looking at the code would NULL it 
> seems).
> 
> Anyhow, if you get to have more than 4GB in the guest or do PCI passthrough 
> and use
> 'iommu=soft' - at which point the Xen SWIOTLB will kick in and you will end up 
> 'swizzling'
> the pages to be under 4GB. That can be fixed if you declare a 'fake' device 
> where you set
> the coherent_dma_mask to DMA_BIT_MASK(64).
> 

This seems to be a reasonable solution. I could not set netfront's DMA
mask, that's why I used a NULL device. And, how do I create a 'fake'
device?

> But if you boot the guest under HVM, then it will use the generic SWIOTLB 
> code, which
> won't guarantee the pages to be "machine" contiguous but will be "guest 
> machine"
> contiguous. Is that sufficient for this?
> 

For HVM, this is sufficient.

> How did you test this? Did you supply iommu=soft to your guest or boot it
> with more than 4GB?
> 

I haven't tested a guest with more than 4GB RAM.


Wei.




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.