[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH] xen, xen-sparse: modify spinlocks to use directed yield
The following patch creates a new hypercall, do_confer(), which allows a vcpu to yield to another vcpu that is not currently running. Also included in the patch is a modification to the x86 spinlock code to use the new hcall, modeled on the ppc64 implementation, which has an identical hcall. When a vcpu acquires a spin or write lock, the vcpu id of the holder is recorded in the lock. When a different vcpu attempts to acquire the contested lock, the spinlock code will yield its timeslice to the lock holder if the holder is not currently running, rather than just spinning until the lock holder's next timeslice. There are a couple of things not done with this patch. First, I wasn't able to determine if there was a guaranteed, scheduler-independent way of switching to another vcpu immediately. Currently, the implementation calls domain_wake() on the vcpu that is holding the lock, and then yields. This has two issues: 1) there is no guarantee that the scheduler will pick the woken domain to run next; 2) it is wrong IMHO to have the woken domain run for more than the remainder of the caller's slice, as this would be preferential to lock-holder vcpus. Ideally I would like a way to donate the remainder of the current vcpu's slice to a target vcpu, and a SCHED_OP that allows the marking of a vcpu as high priority that the various schedulers can implement in whatever way makes sense. Second, there is a conflict in Linux between CONFIG_PREEMPT and the spinlock code that yields the vcpu to the lock holder. That is, when CONFIG_PREEMPT is enabled, the actual spinlock code that is used never calls into the code that invokes the __spin_yield() function, which utilizes the confer hcall. I believe that the behavior is intentional, as it doesn't make sense to encourage the lock to be broken (what CONFIG_PREEMPT spinlocks do) and then yield to a vcpu.
I am currently investigating the performance difference between CONFIG_PREEMPT and directed yielding in spinlocks and should have some numbers shortly to show the performance trade-off between these two options. A trivial but important note: for some reason, the shared_info->shproc value got reset to 0 at some point after its initial allocation, where I set it to 1. I've not tracked down why that is the case, but once that is fixed, the SHARED_PROCESSOR define in spinlock.h will change to check if shproc == 1. Comments and questions requested. -- Ryan Harper Software Engineer; Linux Technology Center IBM Corp., Austin, Tx (512) 838-9253 T/L: 678-9253 ryanh@xxxxxxxxxx diffstat output: linux-2.6.11-xen-sparse/arch/i386/lib/Makefile | 11 linux-2.6.11-xen-sparse/arch/i386/lib/locks.c | 76 +++++ linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32 | 4 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S | 2 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h | 16 + linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h | 140 +++++++--- xen/arch/x86/domain.c | 2 xen/arch/x86/x86_32/entry.S | 1 xen/common/domain.c | 1 xen/common/schedule.c | 69 ++++ xen/include/public/xen.h | 11 xen/include/xen/sched.h | 9 12 files changed, 302 insertions(+), 40 deletions(-) Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx> --- diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c confer/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c --- b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c 1969-12-31 18:00:00.000000000 -0600 +++ confer/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c 2005-05-20 10:37:58.300767080 -0500 @@ -0,0 +1,76 @@ +/* + * Spin and read/write lock operations.
+ * + * Copyright (C) 2001-2004 Paul Mackerras <paulus@xxxxxxxxxx>, IBM + * Copyright (C) 2001 Anton Blanchard <anton@xxxxxxxxxx>, IBM + * Copyright (C) 2002 Dave Engebretsen <engebret@xxxxxxxxxx>, IBM + * Rework to support virtual processors + * Copyright (C) 2005 Ryan Harper <ryanh@xxxxxxxxxx>, IBM + * Rework for Xen on x86 + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/stringify.h> +#include <asm/hypercall.h> +#include <asm/processor.h> + +/* waiting for a spinlock... */ +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) +void __spin_yield(spinlock_t *lock) +{ + unsigned int lock_value, holder_cpu, yield_count; + shared_info_t *s = HYPERVISOR_shared_info; + + lock_value = lock->slock; + if (lock_value == 1) + return; + holder_cpu = lock->cpu; + BUG_ON(holder_cpu >= NR_CPUS); + yield_count = s->vcpu_data[holder_cpu].yield_count; + if ((yield_count & 1) == 0) + return; /* virtual cpu is currently running */ + rmb(); + if (lock->slock != lock_value) + return; /* something has changed */ + HYPERVISOR_confer(holder_cpu, yield_count); +} + +void __rw_yield(rwlock_t *rw) +{ + unsigned int lock_value, holder_cpu, yield_count; + shared_info_t *s = HYPERVISOR_shared_info; + + lock_value = rw->lock; + if (lock_value == RW_LOCK_BIAS) + return; + holder_cpu = rw->cpu; + BUG_ON(holder_cpu >= NR_CPUS); + yield_count = s->vcpu_data[holder_cpu].yield_count; + if ((yield_count & 1) == 0) + return; /* virtual cpu is currently running */ + rmb(); + if (rw->lock != lock_value) + return; /* something has changed */ + HYPERVISOR_confer(holder_cpu, yield_count); +} + +void spin_unlock_wait(spinlock_t *lock) +{ + while (spin_is_locked(lock)) { + 
cpu_relax(); + if (SHARED_PROCESSOR) + __spin_yield(lock); + } + cpu_relax(); +} +EXPORT_SYMBOL(spin_unlock_wait); +#endif diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile confer/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile --- b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile 1969-12-31 18:00:00.000000000 -0600 +++ confer/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile 2005-05-20 10:37:58.301766928 -0500 @@ -0,0 +1,11 @@ +# +# Makefile for i386-specific library files.. +# + + +lib-y = checksum.o delay.o usercopy.o getuser.o memcpy.o strstr.o \ + bitops.o + +lib-$(CONFIG_X86_USE_3DNOW) += mmx.o +lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_XEN) += locks.o diff -urN b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32 confer/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32 --- b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32 2005-05-20 10:39:36.826788848 -0500 +++ confer/linux-2.6.11-xen-sparse/arch/xen/configs/xenU-smp_defconfig_x86_32 2005-05-20 10:37:58.303766624 -0500 @@ -117,8 +117,8 @@ CONFIG_SMP=y CONFIG_NR_CPUS=8 # CONFIG_SCHED_SMT is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_BKL=y +# CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_BKL is not set CONFIG_X86_CPUID=y # diff -urN b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S confer/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S --- b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S 2005-05-19 22:20:32.000000000 -0500 +++ confer/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S 2005-05-20 10:37:58.304766472 -0500 @@ -80,7 +80,7 @@ #define evtchn_upcall_pending /* 0 */ #define evtchn_upcall_mask 1 -#define sizeof_vcpu_shift 3 +#define sizeof_vcpu_shift 4 #ifdef CONFIG_SMP #define preempt_disable(reg) incl TI_preempt_count(reg) diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h --- 
b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h 2005-05-19 22:20:32.000000000 -0500 +++ confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h 2005-05-20 10:37:58.306766168 -0500 @@ -517,4 +517,20 @@ return ret; } +static inline int +HYPERVISOR_confer( + unsigned int vcpu, unsigned int yield_count) +{ + int ret; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_confer), "1" (vcpu), "2" (yield_count) + : "memory"); + + return ret; +} + #endif /* __HYPERCALL_H__ */ diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h --- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h 2005-05-19 22:20:14.000000000 -0500 +++ confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h 2005-05-20 10:37:58.307766016 -0500 @@ -22,10 +22,36 @@ #ifdef CONFIG_PREEMPT unsigned int break_lock; #endif +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + unsigned int cpu; +#endif } spinlock_t; #define SPINLOCK_MAGIC 0xdead4ead +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. 
+ */ +typedef struct { + volatile unsigned int lock; +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned magic; +#endif +#ifdef CONFIG_PREEMPT + unsigned int break_lock; +#endif +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + unsigned int cpu; +#endif +} rwlock_t; + #ifdef CONFIG_DEBUG_SPINLOCK #define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC #else @@ -44,7 +70,20 @@ */ #define spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) <= 0) +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) +#include <linux/smp.h> +#define SPINLOCK_CPU (smp_processor_id()) +/* We only yield to the hypervisor if we are in shared processor mode */ +#define SHARED_PROCESSOR (HYPERVISOR_shared_info->shproc == 0) +extern void __spin_yield(spinlock_t *lock); +extern void __rw_yield(rwlock_t *rw); +extern void spin_unlock_wait(spinlock_t *lock); +#else +#define __spin_yield(x) barrier() +#define __rw_yield(x) barrier() +#define SHARED_PROCESSOR 0 #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) +#endif #define spin_lock_string \ "\n1:\t" \ @@ -125,6 +164,9 @@ "xchgb %b0,%1" :"=q" (oldval), "=m" (lock->slock) :"0" (0) : "memory"); +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + lock->cpu = SPINLOCK_CPU; +#endif return oldval > 0; } @@ -136,43 +178,55 @@ BUG(); } #endif - __asm__ __volatile__( - spin_lock_string - :"=m" (lock->slock) : : "memory"); +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + while (1) { + if ( likely(_raw_spin_trylock(lock)) ) + break; + do { + cpu_relax(); + if (SHARED_PROCESSOR) + __spin_yield(lock); + } while (likely(spin_is_locked(lock))); + cpu_relax(); + } +#else + __asm__ __volatile__( + spin_lock_string + :"=m" (lock->slock) : : "memory"); +#endif } static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags) { +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + unsigned long flags_dis; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if (unlikely(lock->magic != SPINLOCK_MAGIC)) { printk("eip: %p\n", __builtin_return_address(0)); BUG(); } #endif - 
__asm__ __volatile__( - spin_lock_string_flags - :"=m" (lock->slock) : "r" (flags) : "memory"); -} - -/* - * Read-write spinlocks, allowing multiple readers - * but only one writer. - * - * NOTE! it is quite common to have readers in interrupts - * but no interrupt writers. For those circumstances we - * can "mix" irq-safe locks - any writer needs to get a - * irq-safe write-lock, but readers can get non-irqsafe - * read-locks. - */ -typedef struct { - volatile unsigned int lock; -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned magic; -#endif -#ifdef CONFIG_PREEMPT - unsigned int break_lock; +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + while (1) { + if ( likely(_raw_spin_trylock(lock)) ) + break; + local_save_flags(flags_dis); + local_irq_restore(flags); + do { + cpu_relax(); + if (SHARED_PROCESSOR) + __spin_yield(lock); + } while (likely(spin_is_locked(lock))); + cpu_relax(); + local_irq_restore(flags_dis); + } +#else + __asm__ __volatile__( + spin_lock_string_flags + :"=m" (lock->slock) : "r" (flags) : "memory"); #endif -} rwlock_t; +} #define RWLOCK_MAGIC 0xdeaf1eed @@ -198,6 +252,18 @@ */ #define write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) +static inline int _raw_write_trylock(rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + if (atomic_sub_and_test(RW_LOCK_BIAS, count)) { +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + lock->cpu = SPINLOCK_CPU; +#endif + return 1; + } + atomic_add(RW_LOCK_BIAS, count); + return 0; +} /* * On x86, we implement read-write locks as a 32-bit counter * with the high bit (sign) being the "contended" bit. 
@@ -222,7 +288,20 @@ #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(rw->magic != RWLOCK_MAGIC); #endif +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) + while (1) { + if ( likely(_raw_write_trylock(rw)) ) + break; + do { + cpu_relax(); + if (SHARED_PROCESSOR) + __rw_yield(rw); + } while ( likely(!write_can_lock(rw))); + cpu_relax(); + } +#else __build_write_lock(rw, "__write_lock_failed"); +#endif } #define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") @@ -238,13 +317,6 @@ return 0; } -static inline int _raw_write_trylock(rwlock_t *lock) -{ - atomic_t *count = (atomic_t *)lock; - if (atomic_sub_and_test(RW_LOCK_BIAS, count)) - return 1; - atomic_add(RW_LOCK_BIAS, count); - return 0; -} + #endif /* __ASM_SPINLOCK_H */ diff -urN b/xen/arch/x86/domain.c confer/xen/arch/x86/domain.c --- b/xen/arch/x86/domain.c 2005-05-19 22:20:28.000000000 -0500 +++ confer/xen/arch/x86/domain.c 2005-05-20 10:38:29.187071648 -0500 @@ -253,6 +253,8 @@ memset(d->shared_info, 0, PAGE_SIZE); ed->vcpu_info = &d->shared_info->vcpu_data[ed->vcpu_id]; ed->cpumap = CPUMAP_RUNANYWHERE; + /* default vcpus to sharing physical cpus */ + d->shared_info->shproc = 1; SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d); machine_to_phys_mapping[virt_to_phys(d->shared_info) >> PAGE_SHIFT] = INVALID_M2P_ENTRY; diff -urN b/xen/arch/x86/x86_32/entry.S confer/xen/arch/x86/x86_32/entry.S --- b/xen/arch/x86/x86_32/entry.S 2005-05-19 22:20:33.000000000 -0500 +++ confer/xen/arch/x86/x86_32/entry.S 2005-05-20 10:37:58.353759024 -0500 @@ -749,6 +749,7 @@ .long do_boot_vcpu .long do_ni_hypercall /* 25 */ .long do_mmuext_op + .long do_confer .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr diff -urN b/xen/common/domain.c confer/xen/common/domain.c --- b/xen/common/domain.c 2005-05-19 22:20:15.000000000 -0500 +++ confer/xen/common/domain.c 2005-05-20 10:37:58.354758872 -0500 @@ -289,6 +289,7 @@ atomic_set(&ed->pausecnt, 0); ed->cpumap = CPUMAP_RUNANYWHERE; + 
set_bit(_VCPUF_canconfer, &ed->vcpu_flags); memcpy(&ed->arch, &idle0_exec_domain.arch, sizeof(ed->arch)); diff -urN b/xen/common/schedule.c confer/xen/common/schedule.c --- b/xen/common/schedule.c 2005-05-19 22:20:30.000000000 -0500 +++ confer/xen/common/schedule.c 2005-05-20 10:45:41.493351104 -0500 @@ -224,6 +224,11 @@ spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags); if ( likely(domain_runnable(ed)) ) { + /* mark current's confer state */ + if ( test_bit(_VCPUF_conferring, &current->vcpu_flags) ) { + clear_bit(_VCPUF_conferring, &current->vcpu_flags); + set_bit(_VCPUF_conferred, &current->vcpu_flags); + } SCHED_OP(wake, ed); #ifdef WAKE_HISTO ed->wokenup = NOW(); @@ -273,6 +278,54 @@ return 0; } +/* Confer control to another vcpu */ +long do_confer(unsigned int vcpu, unsigned int yield_count) +{ + struct domain *d = current->domain; + + /* Validate CONFER prereqs: + * - vcpu is within bounds + * - vcpu is a valid in this domain + * - current has not already conferred its slice to vcpu + * - vcpu is not already running + * - designated vcpu's yield_count matches value from call + * + * of all are ok, then set conferred value and enter scheduler + */ + + if (vcpu > MAX_VIRT_CPUS) + return 0; + + if (d->exec_domain[vcpu] == NULL) + return 0; + + if (!test_bit(_VCPUF_canconfer, &current->vcpu_flags)) + return 0; + + /* even counts indicate a running vcpu, odd is preempted/conferred */ + /* don't confer if holder is currently running */ + if ((d->exec_domain[vcpu]->vcpu_info->yield_count & 1) == 0) + return 0; + + if (d->exec_domain[vcpu]->vcpu_info->yield_count != yield_count) + return 0; + + /* + * set current's state to conferring, wake target + */ + clear_bit(_VCPUF_canconfer, &current->vcpu_flags); + set_bit(_VCPUF_conferring, &current->vcpu_flags); + domain_wake(d->exec_domain[vcpu]); + + /* request scheduling for woken domain */ + raise_softirq(SCHEDULE_SOFTIRQ); + + /* give up my timeslice */ + do_yield(); + + return 0; +} + /* * Demultiplex scheduler-related hypercalls.
*/ @@ -441,7 +494,15 @@ r_time = next_slice.time; next = next_slice.task; - + + /* + * always clear conferred state so this vcpu can confer during its slice + * since it can confer, clear all other confer state + */ + set_bit(_VCPUF_canconfer, &next->vcpu_flags); + clear_bit(_VCPUF_conferring, &next->vcpu_flags); + clear_bit(_VCPUF_conferred, &next->vcpu_flags); + schedule_data[cpu].curr = next; next->lastschd = now; @@ -455,6 +516,12 @@ spin_unlock_irq(&schedule_data[cpu].schedule_lock); + /* bump vcpu yield_count when controlling domain is not-idle */ + if ( !is_idle_task(prev->domain) ) + prev->vcpu_info->yield_count++; + if ( !is_idle_task(next->domain) ) + next->vcpu_info->yield_count++; + if ( unlikely(prev == next) ) { #ifdef ADV_SCHED_HISTO adv_sched_hist_to_stop(cpu); diff -urN b/xen/include/public/xen.h confer/xen/include/public/xen.h --- b/xen/include/public/xen.h 2005-05-19 22:20:11.000000000 -0500 +++ confer/xen/include/public/xen.h 2005-05-20 10:37:58.368756744 -0500 @@ -58,6 +58,7 @@ #define __HYPERVISOR_boot_vcpu 24 #define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ #define __HYPERVISOR_mmuext_op 26 +#define __HYPERVISOR_confer 27 /* * MULTICALLS @@ -334,8 +335,11 @@ u8 evtchn_upcall_mask; /* 1 */ u8 pad0, pad1; u32 evtchn_pending_sel; /* 4 */ - arch_vcpu_info_t arch; /* 8 */ -} PACKED vcpu_info_t; /* 8 + arch */ + /* Even when vcpu is running, Odd when it is preempted/conferred */ + u32 yield_count; /* 8 */ + u32 pad2; /* 12 */ + arch_vcpu_info_t arch; /* 16 */ +} PACKED vcpu_info_t; /* 16 + arch */ /* * Xen/kernel shared data -- pointer provided in start_info. @@ -347,6 +351,9 @@ u32 n_vcpu; + /* set if domains' vcpus share physical cpus */ + int shproc; + /* * A domain can have up to 1024 "event channels" on which it can send * and receive asynchronous event notifications. 
There are three classes diff -urN b/xen/include/xen/sched.h confer/xen/include/xen/sched.h --- b/xen/include/xen/sched.h 2005-05-19 22:20:07.000000000 -0500 +++ confer/xen/include/xen/sched.h 2005-05-20 10:37:58.378755224 -0500 @@ -358,6 +358,15 @@ /* Initialization completed. */ #define _VCPUF_initialised 8 #define VCPUF_initialised (1UL<<_VCPUF_initialised) + /* Able to give time slice to another vcpu */ +#define _VCPUF_canconfer 9 +#define VCPUF_canconfer (1UL<<_VCPUF_canconfer) + /* Currently giving time slice to another vcpu */ +#define _VCPUF_conferring 10 +#define VCPUF_conferring (1UL<<_VCPUF_conferring) + /* Already given time slice to another vcpu */ +#define _VCPUF_conferred 11 +#define VCPUF_conferred (1UL<<_VCPUF_conferred) /* * Per-domain flags (domain_flags). _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |