diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index c47b9fa..d214c67 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -44,7 +44,7 @@ } while (0) #elif defined(__x86_64__) #define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0) -#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) +#define get_xen_guest_handle(val, hnd) do { val = (hnd); } while (0) #endif #endif diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index bd7a398..f510ee0 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include #include @@ -53,6 +55,7 @@ #include #include +#include #include #include @@ -107,7 +110,7 @@ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; #endif /* List of ballooned pages, threaded through the mem_map array. */ -static LIST_HEAD(ballooned_pages); +static struct list_head ballooned_pages[MAX_NUMNODES]; /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); @@ -160,13 +163,14 @@ static unsigned long shrink_frame(unsigned long nr_pages) /* balloon_append: add the given page to the balloon. */ static void balloon_append(struct page *page) { + int node = page_to_nid(page); /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { - list_add_tail(&page->lru, &ballooned_pages); + list_add_tail(&page->lru, &ballooned_pages[node]); balloon_stats.balloon_high++; dec_totalhigh_pages(); } else { - list_add(&page->lru, &ballooned_pages); + list_add(&page->lru, &ballooned_pages[node]); balloon_stats.balloon_low++; } @@ -174,14 +178,14 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(int node) { struct page *page; - if (list_empty(&ballooned_pages)) + if (list_empty(&ballooned_pages[node])) return NULL; - page = list_entry(ballooned_pages.next, struct page, lru); + page = list_entry(ballooned_pages[node].next, struct page, lru); list_del(&page->lru); if (PageHighMem(page)) { @@ -196,17 +200,17 @@ static struct page *balloon_retrieve(void) return page; } -static struct page *balloon_first_page(void) +static struct page *balloon_first_page(int node) { - if (list_empty(&ballooned_pages)) + if (list_empty(&ballooned_pages[node])) return NULL; - return list_entry(ballooned_pages.next, struct page, lru); + return list_entry(ballooned_pages[node].next, struct page, lru); } -static struct page *balloon_next_page(struct page *page) +static struct page *balloon_next_page(int node, struct page *page) { struct list_head *next = page->lru.next; - if (next == &ballooned_pages) + if (next == &ballooned_pages[node]) return NULL; return list_entry(next, struct page, lru); } @@ -228,13 +232,26 @@ static unsigned long current_target(void) return target; } -static int increase_reservation(unsigned long nr_pages) +static inline unsigned int xenmemf_vnode_to_mnode(int vnode) +{ +#ifdef CONFIG_XEN_NUMA_GUEST + extern struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout; + int mnid; + mnid = HYPERVISOR_pv_numa_layout.vnode_data[vnode].mnode_id; + return XENMEMF_exact_node(mnid); +#else + return 0; +#endif +} + +static int __increase_node_reservation(int node, unsigned long nr_pages) { unsigned long pfn, mfn, i, j, flags; struct page *page; - long rc; + long rc = 0; + struct xen_memory_reservation reservation = { - .mem_flags = 0, + .mem_flags = xenmemf_vnode_to_mnode(node), .domid = DOMID_SELF }; @@ -243,13 +260,15 @@ static int increase_reservation(unsigned long nr_pages) spin_lock_irqsave(&xen_reservation_lock, flags); - page = balloon_first_page(); - for (i = 0; i < nr_pages; 
i++) { - BUG_ON(page == NULL); + if (!(page = balloon_first_page(node))) + goto out; + + for (i = 0; page && i ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + node = next_node(node, node_online_map); + if (node == MAX_NUMNODES) + node = first_node(node_online_map); + + rc = __increase_node_reservation(node, nr_pages); + + return rc; +} + static int decrease_reservation(unsigned long nr_pages) { unsigned long pfn, lpfn, mfn, i, j, flags; @@ -302,6 +338,9 @@ static int decrease_reservation(unsigned long nr_pages) int need_sleep = 0; int discontig, discontig_free; int ret; + + static int node; + struct xen_memory_reservation reservation = { .mem_flags = 0, .domid = DOMID_SELF @@ -311,7 +350,7 @@ static int decrease_reservation(unsigned long nr_pages) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) { + if (!(page = alloc_pages_node(node, GFP_BALLOON, balloon_order))) { nr_pages = i; need_sleep = 1; break; @@ -366,9 +405,15 @@ static int decrease_reservation(unsigned long nr_pages) spin_unlock_irqrestore(&xen_reservation_lock, flags); + /* balloon from all nodes. */ + node = next_node(node, node_online_map); + if (node == MAX_NUMNODES) + node = first_node(node_online_map); + return need_sleep; } +static void nodemem_distribution(void); /* * We avoid multiple worker processes conflicting via the balloon mutex. * We may of course race updates of the target counts (which are protected @@ -400,6 +445,7 @@ static void balloon_process(struct work_struct *work) mod_timer(&balloon_timer, jiffies + HZ); mutex_unlock(&balloon_mutex); + nodemem_distribution(); } /* Resets the Xen limit, sets new target, and kicks off processing. 
*/ @@ -453,6 +499,7 @@ static int __init balloon_init(void) { unsigned long pfn; struct page *page; + int node; if (!xen_pv_domain()) return -ENODEV; @@ -460,6 +507,9 @@ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n", balloon_order); + for_each_node(node) + INIT_LIST_HEAD(&ballooned_pages[node]); + balloon_npages = 1 << balloon_order; balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order; @@ -745,4 +795,113 @@ static int register_balloon(struct sys_device *sysdev) return error; } +/************************************************************************/ +/* NUMA Guest memory distribution stats */ +#ifdef CONFIG_XEN_NUMA_GUEST + +#define MEMNODE_BUFSIZE (PAGE_SIZE) +#define INVALID_NID (-1) +static int8_t memnode_buf[MEMNODE_BUFSIZE]; +static struct xenmem_numa_op __xen_numa_memop; +#define ___memnode (__xen_numa_memop.u.mnodemap) +static int xen_memnodemap_initialized; + +static inline int xen_mfn_to_nid(unsigned long mfn) +{ + uint8_t *memnode_map; + unsigned long addr; + + addr = mfn << PAGE_SHIFT; + if ((addr >> ___memnode.shift) >= ___memnode.mapsize) + return INVALID_NID; + get_xen_guest_handle(memnode_map, ___memnode.map); + return memnode_map[addr >> ___memnode.shift]; +} + +static inline int xen_memnodemap(void) +{ + int rc; + + printk(KERN_INFO "xen_memnodemap called\n"); + + __xen_numa_memop.cmd = XENMEM_machine_nodemap; + ___memnode.bufsize = MEMNODE_BUFSIZE; + memset(memnode_buf, 0xFF, MEMNODE_BUFSIZE); + set_xen_guest_handle(___memnode.map, memnode_buf); + + if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &__xen_numa_memop))) { + xen_memnodemap_initialized = 0; + printk("XENMEM_memnode_map failed\n"); + } else { + xen_memnodemap_initialized = 1; + printk("XENMEM_memnode_map done\n"); + } + + return rc; +} + +unsigned int node_match_counts[MAX_NUMNODES][MAX_NUMNODES]; + +static void nodemem_distribution(void) +{ + int gnid, mnid; + unsigned int oob_mfns, invalid_p2ms; + + if 
(!xen_memnodemap_initialized && xen_memnodemap()) + return; + + printk(KERN_INFO "Domain nodemem distribution :\n"); + if (xen_feature(XENFEAT_auto_translated_physmap)) + { + printk(KERN_INFO "Enlightened ballooning disabled (auto_translated)\n"); + return; + } + + for_each_node(gnid) + for_each_node(mnid) + node_match_counts[gnid][mnid] = 0; + + oob_mfns = 0; + invalid_p2ms = 0; + + for_each_online_node(gnid) + { + unsigned long pfn, mfn, start_pfn, end_pfn; + start_pfn = node_start_pfn(gnid); + end_pfn = node_end_pfn(gnid); + printk(KERN_INFO "vnode[%d] : start(%lX), end(%lX)\n", + gnid, start_pfn, end_pfn); + for(pfn=start_pfn;pfn