Re: [RFC PATCH 1/1] xen: Use a global mapping for runstate
On Thu, May 28, 2020 at 04:25:31PM +0100, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>
> This patch modifies runstate handling to map the area given by the
> guest inside Xen during the hypercall.
> This removes the guest virtual to physical conversion during context
> switches, which fixes the bug and improves performance by avoiding
> page-table walks on every context switch.
>
> --
> In its current state, this patch only works on Arm and needs to be
> fixed on x86 (see the #error in domain.c for the missing
> get_page_from_gva).
>
> Signed-off-by: Bertrand Marquis <bertrand.marquis@xxxxxxx>
> ---
> xen/arch/arm/domain.c | 32 +++++++++-------
> xen/arch/x86/domain.c | 51 ++++++++++++++-----------
> xen/common/domain.c | 84 ++++++++++++++++++++++++++++++++++-------
> xen/include/xen/sched.h | 11 ++++--
> 4 files changed, 124 insertions(+), 54 deletions(-)
>
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 31169326b2..799b0e0103 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
> /* Update per-VCPU guest runstate shared memory area (if registered). */
> static void update_runstate_area(struct vcpu *v)
> {
> - void __user *guest_handle = NULL;
> - struct vcpu_runstate_info runstate;
> + struct vcpu_runstate_info *runstate;
>
> - if ( guest_handle_is_null(runstate_guest(v)) )
> + /* XXX why do we accept not to block here */
> + if ( !spin_trylock(&v->runstate_guest_lock) )
IMO the runstate is not a crucial piece of information, so it's
better to keep the context switch fast.
> return;
>
> - memcpy(&runstate, &v->runstate, sizeof(runstate));
> + runstate = runstate_guest(v);
> +
> + if (runstate == NULL)
In general we don't explicitly check for NULL, and you could write
this as:

    if ( runstate ) ...

Note the added spaces between the parentheses and the condition. I
would also likely check runstate_guest(v) directly and assign to
runstate afterwards if it's set.
> + {
> + spin_unlock(&v->runstate_guest_lock);
> + return;
> + }
>
> if ( VM_ASSIST(v->domain, runstate_update_flag) )
> {
> - guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> - guest_handle--;
> - runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> - __raw_copy_to_guest(guest_handle,
> - (void *)(&runstate.state_entry_time + 1) - 1, 1);
> + runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> smp_wmb();
> + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> }
>
> - __copy_to_guest(runstate_guest(v), &runstate, 1);
> + memcpy(runstate, &v->runstate, sizeof(v->runstate));
>
> - if ( guest_handle )
> + if ( VM_ASSIST(v->domain, runstate_update_flag) )
> {
> - runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> + runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> smp_wmb();
I think you need the barrier before clearing XEN_RUNSTATE_UPDATE in
the guest version of the runstate info, to make sure writes are not
re-ordered, and hence that the guest cannot observe the flag as
cleared before the full runstate write has been committed.
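I.e. something like the following (just a sketch, with the barrier
placed between the copy and the clearing of the flag):

    memcpy(runstate, &v->runstate, sizeof(v->runstate));
    smp_wmb();
    runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
    v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;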
> - __raw_copy_to_guest(guest_handle,
> - (void *)(&runstate.state_entry_time + 1) - 1, 1);
> + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> }
> +
> + spin_unlock(&v->runstate_guest_lock);
> }
>
> static void schedule_tail(struct vcpu *prev)
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 6327ba0790..10c6ceb561 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1642,57 +1642,62 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
> /* Update per-VCPU guest runstate shared memory area (if registered). */
> bool update_runstate_area(struct vcpu *v)
> {
> - bool rc;
> struct guest_memory_policy policy = { .nested_guest_mode = false };
> - void __user *guest_handle = NULL;
> - struct vcpu_runstate_info runstate;
> + struct vcpu_runstate_info *runstate;
>
> - if ( guest_handle_is_null(runstate_guest(v)) )
> + /* XXX: should we return false ? */
> + if ( !spin_trylock(&v->runstate_guest_lock) )
> return true;
>
> - update_guest_memory_policy(v, &policy);
> + runstate = runstate_guest(v);
>
> - memcpy(&runstate, &v->runstate, sizeof(runstate));
> + if (runstate == NULL)
> + {
> + spin_unlock(&v->runstate_guest_lock);
> + return true;
> + }
> +
> + update_guest_memory_policy(v, &policy);
>
> if ( VM_ASSIST(v->domain, runstate_update_flag) )
> {
> - guest_handle = has_32bit_shinfo(v->domain)
> - ? &v->runstate_guest.compat.p->state_entry_time + 1
> - : &v->runstate_guest.native.p->state_entry_time + 1;
> - guest_handle--;
> - runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> - __raw_copy_to_guest(guest_handle,
> - (void *)(&runstate.state_entry_time + 1) - 1, 1);
> + runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> smp_wmb();
> + if (has_32bit_shinfo(v->domain))
> + v->runstate_guest.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
> + else
> + v->runstate_guest.native->state_entry_time |= XEN_RUNSTATE_UPDATE;
I'm confused here, isn't runstate == v->runstate_guest.native at this
point?
I think you want to update v->runstate.state_entry_time here?
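I.e. mirror what the Arm version above does (sketch):

    runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
    smp_wmb();
    v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;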
> }
>
> if ( has_32bit_shinfo(v->domain) )
> {
> struct compat_vcpu_runstate_info info;
> -
> XLAT_vcpu_runstate_info(&info, &runstate);
> - __copy_to_guest(v->runstate_guest.compat, &info, 1);
> - rc = true;
> + memcpy(v->runstate_guest.compat, &info, 1);
> }
> else
> - rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
> - sizeof(runstate);
> + memcpy(runstate, &v->runstate, sizeof(v->runstate));
>
> - if ( guest_handle )
> + if ( VM_ASSIST(v->domain, runstate_update_flag) )
> {
> - runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> + runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> smp_wmb();
> - __raw_copy_to_guest(guest_handle,
> - (void *)(&runstate.state_entry_time + 1) - 1, 1);
> + if (has_32bit_shinfo(v->domain))
> + v->runstate_guest.compat->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> + else
> + v->runstate_guest.native->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
Same comment here related to the usage of runstate_guest instead of
runstate.
> }
>
> + spin_unlock(&v->runstate_guest_lock);
> +
> update_guest_memory_policy(v, &policy);
>
> - return rc;
> + return true;
> }
>
> static void _update_runstate_area(struct vcpu *v)
> {
> + /* XXX: this should be removed */
> if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
> !(v->arch.flags & TF_kernel_mode) )
> v->arch.pv.need_update_runstate_area = 1;
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7cc9526139..acc6f90ba3 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -161,6 +161,7 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
> v->dirty_cpu = VCPU_CPU_CLEAN;
>
> spin_lock_init(&v->virq_lock);
> + spin_lock_init(&v->runstate_guest_lock);
>
> tasklet_init(&v->continue_hypercall_tasklet, NULL, NULL);
>
> @@ -691,6 +692,66 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
> return 0;
> }
>
> +static void unmap_runstate_area(struct vcpu *v, unsigned int lock)
lock wants to be a bool here.
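I.e.:

    static void unmap_runstate_area(struct vcpu *v, bool lock)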
> +{
> + mfn_t mfn;
> +
> + if ( ! runstate_guest(v) )
> + return;
I think you must check runstate_guest with the lock taken?
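Something like (sketch):

    if ( lock )
        spin_lock(&v->runstate_guest_lock);

    if ( runstate_guest(v) )
    {
        /* ... unmap and drop the page reference ... */
    }

    if ( lock )
        spin_unlock(&v->runstate_guest_lock);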
> +
> + if (lock)
> + spin_lock(&v->runstate_guest_lock);
> +
> + mfn = domain_page_map_to_mfn(runstate_guest(v));
> +
> + unmap_domain_page_global((void *)
> + ((unsigned long)v->runstate_guest &
> + PAGE_MASK));
> +
> + put_page_and_type(mfn_to_page(mfn));
> + runstate_guest(v) = NULL;
> +
> + if (lock)
> + spin_unlock(&v->runstate_guest_lock);
> +}
> +
> +static int map_runstate_area(struct vcpu *v,
> + struct vcpu_register_runstate_memory_area *area)
> +{
> + unsigned long offset = area->addr.p & ~PAGE_MASK;
> + void *mapping;
> + struct page_info *page;
> + size_t size = sizeof(struct vcpu_runstate_info);
> +
> + ASSERT(runstate_guest(v) == NULL);
> +
> + /* do not allow an area crossing 2 pages */
> + if ( offset > (PAGE_SIZE - size) )
> + return -EINVAL;
I'm afraid this is not suitable: Linux will BUG if
VCPUOP_register_runstate_memory_area returns an error, and current
Linux code doesn't check that the area doesn't cross a page
boundary. You will need to take a reference to both pages in that
case.
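Something along these lines maybe (rough, untested sketch, assuming
vmap() can be used to get a contiguous mapping of the one or two
pages the area touches):

    struct page_info *pg[2];
    mfn_t mfn[2];
    unsigned int i, nr = 1 + (offset > PAGE_SIZE - size);

    for ( i = 0; i < nr; i++ )
    {
        pg[i] = get_page_from_gva(v, (area->addr.p & PAGE_MASK) +
                                  i * PAGE_SIZE, GV2M_WRITE);
        /* error handling omitted: drop any references taken so far */
        mfn[i] = page_to_mfn(pg[i]);
    }

    mapping = vmap(mfn, nr);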
> +
> +#ifdef CONFIG_ARM
> + page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
> +#else
> + /* XXX how to solve this one ? */
We have hvm_translate_get_page, which seems similar; I will need to
look into this.
> +#error get_page_from_gva is not available on other archs
> +#endif
> + if ( !page )
> + return -EINVAL;
> +
> + mapping = __map_domain_page_global(page);
> +
> + if ( mapping == NULL )
> + {
> + put_page_and_type(page);
> + return -ENOMEM;
> + }
> +
> + runstate_guest(v) = (struct vcpu_runstate_info *)
> + ((unsigned long)mapping + offset);
There's no need to cast to unsigned long, you can just do pointer
arithmetic on the void * directly. That should also get rid of the
cast to vcpu_runstate_info I think.
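I.e.:

    runstate_guest(v) = mapping + offset;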
> +
> + return 0;
> +}
> +
> int domain_kill(struct domain *d)
> {
> int rc = 0;
> @@ -727,7 +788,10 @@ int domain_kill(struct domain *d)
> if ( cpupool_move_domain(d, cpupool0) )
> return -ERESTART;
> for_each_vcpu ( d, v )
> + {
> + unmap_runstate_area(v, 0);
Why is it not appropriate to hold the lock here? It might not be
technically needed, but taking it should still work.
Thanks, Roger.