RE: [PATCH v5 1/3] VMX: use a single, global APIC access page
> From: Jan Beulich <jbeulich@xxxxxxxx>
> Sent: Friday, April 23, 2021 6:53 PM
>
> The address of this page is used by the CPU only to recognize when to
> access the virtual APIC page instead. No accesses would ever go to this
> page. It only needs to be present in the (CPU) page tables so that
> address translation will produce its address as result for respective
> accesses.
>
> By making this page global, we also eliminate the need to refcount it,
> or to assign it to any domain in the first place.
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx>
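
For readers less familiar with the mechanism the description refers to: the
CPU compares guest-physical accesses against the address in APIC_ACCESS_ADDR
and, on a match, redirects them to the virtual APIC page. A minimal sketch of
the per-vCPU VMCS wiring, assuming the usual Xen VMCS field names (the
APIC_ACCESS_ADDR write itself is not visible in the quoted hunks below):

    vmx_vmcs_enter(v);
    /* Per-vCPU page the CPU actually reads/writes on virtualized accesses. */
    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vcpu_vlapic(v)->regs_page));
    /* Address used only for the match; the page itself is never accessed,
     * which is why one global page can serve all domains. */
    __vmwrite(APIC_ACCESS_ADDR, mfn_to_maddr(apic_access_mfn));
    vmx_vmcs_exit(v);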
> ---
> v5: Init apic_access_mfn to INVALID_MFN. Move assignment out of if()
> condition. Introduce page_suppress_refcounting() and
> page_refcounting_suppressed().
> v4: Set PGC_extra on the page. Make shadow mode work.
> v3: Split p2m insertion change to a separate patch.
> v2: Avoid insertion when !has_vlapic(). Split off change to
> p2m_get_iommu_flags().
> ---
> I did further consider not allocating any real page at all, but just
> using the address of some unpopulated space (which would require
> announcing this page as reserved to Dom0, so it wouldn't put any PCI
> MMIO BARs there). But I thought this would be too controversial, because
> of the possible risks associated with this.
>
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -66,8 +66,7 @@ boolean_param("force-ept", opt_force_ept
> static void vmx_ctxt_switch_from(struct vcpu *v);
> static void vmx_ctxt_switch_to(struct vcpu *v);
>
> -static int vmx_alloc_vlapic_mapping(struct domain *d);
> -static void vmx_free_vlapic_mapping(struct domain *d);
> +static int alloc_vlapic_mapping(void);
> static void vmx_install_vlapic_mapping(struct vcpu *v);
> static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
> unsigned int flags);
> @@ -78,6 +77,8 @@ static int vmx_msr_read_intercept(unsign
> static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
> static void vmx_invlpg(struct vcpu *v, unsigned long linear);
>
> +static mfn_t __read_mostly apic_access_mfn = INVALID_MFN_INITIALIZER;
> +
> /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */
> #define PI_CSW_FROM (1u << 0)
> #define PI_CSW_TO (1u << 1)
> @@ -401,7 +402,6 @@ static int vmx_domain_initialise(struct
> .to = vmx_ctxt_switch_to,
> .tail = vmx_do_resume,
> };
> - int rc;
>
> d->arch.ctxt_switch = &csw;
>
> @@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct
> */
> d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp;
>
> - if ( !has_vlapic(d) )
> - return 0;
> -
> - if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
> - return rc;
> -
> return 0;
> }
>
> -static void vmx_domain_relinquish_resources(struct domain *d)
> +static void domain_creation_finished(struct domain *d)
> {
> - if ( !has_vlapic(d) )
> + gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE);
> + uint8_t ipat;
> +
> + if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, INVALID_MFN) )
> return;
>
> - vmx_free_vlapic_mapping(d);
> -}
> + ASSERT(epte_get_entry_emt(d, gfn_x(gfn), apic_access_mfn, 0, &ipat,
> + true) == MTRR_TYPE_WRBACK);
> + ASSERT(ipat);
>
> -static void domain_creation_finished(struct domain *d)
> -{
> - if ( has_vlapic(d) && !mfn_eq(d->arch.hvm.vmx.apic_access_mfn, _mfn(0)) &&
> - set_mmio_p2m_entry(d, gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE),
> - d->arch.hvm.vmx.apic_access_mfn, PAGE_ORDER_4K) )
> + if ( set_mmio_p2m_entry(d, gfn, apic_access_mfn, PAGE_ORDER_4K) )
> domain_crash(d);
> }
>
> @@ -2415,7 +2409,6 @@ static struct hvm_function_table __initd
> .cpu_up_prepare = vmx_cpu_up_prepare,
> .cpu_dead = vmx_cpu_dead,
> .domain_initialise = vmx_domain_initialise,
> - .domain_relinquish_resources = vmx_domain_relinquish_resources,
> .domain_creation_finished = domain_creation_finished,
> .vcpu_initialise = vmx_vcpu_initialise,
> .vcpu_destroy = vmx_vcpu_destroy,
> @@ -2662,7 +2655,7 @@ const struct hvm_function_table * __init
> {
> set_in_cr4(X86_CR4_VMXE);
>
> - if ( vmx_vmcs_init() )
> + if ( vmx_vmcs_init() || alloc_vlapic_mapping() )
> {
> printk("VMX: failed to initialise.\n");
> return NULL;
> @@ -3224,7 +3217,7 @@ gp_fault:
> return X86EMUL_EXCEPTION;
> }
>
> -static int vmx_alloc_vlapic_mapping(struct domain *d)
> +static int __init alloc_vlapic_mapping(void)
> {
> struct page_info *pg;
> mfn_t mfn;
> @@ -3232,52 +3225,34 @@ static int vmx_alloc_vlapic_mapping(stru
> if ( !cpu_has_vmx_virtualize_apic_accesses )
> return 0;
>
> - pg = alloc_domheap_page(d, MEMF_no_refcount);
> + pg = alloc_domheap_page(NULL, 0);
> if ( !pg )
> return -ENOMEM;
>
> - if ( !get_page_and_type(pg, d, PGT_writable_page) )
> - {
> - /*
> - * The domain can't possibly know about this page yet, so failure
> - * here is a clear indication of something fishy going on.
> - */
> - domain_crash(d);
> - return -ENODATA;
> - }
> + /*
> + * Signal to shadow code that this page cannot be refcounted. This also
> + * makes epte_get_entry_emt() recognize this page as "special".
> + */
> + page_suppress_refcounting(pg);
>
> mfn = page_to_mfn(pg);
> clear_domain_page(mfn);
> - d->arch.hvm.vmx.apic_access_mfn = mfn;
> + apic_access_mfn = mfn;
>
> return 0;
> }
>
> -static void vmx_free_vlapic_mapping(struct domain *d)
> -{
> - mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn;
> -
> - d->arch.hvm.vmx.apic_access_mfn = _mfn(0);
> - if ( !mfn_eq(mfn, _mfn(0)) )
> - {
> - struct page_info *pg = mfn_to_page(mfn);
> -
> - put_page_alloc_ref(pg);
> - put_page_and_type(pg);
> - }
> -}
> -
> static void vmx_install_vlapic_mapping(struct vcpu *v)
> {
> paddr_t virt_page_ma, apic_page_ma;
>
> - if ( mfn_eq(v->domain->arch.hvm.vmx.apic_access_mfn, _mfn(0)) )
> + if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, INVALID_MFN) )
> return;
>
> ASSERT(cpu_has_vmx_virtualize_apic_accesses);
>
> virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
> - apic_page_ma = mfn_to_maddr(v->domain->arch.hvm.vmx.apic_access_mfn);
> + apic_page_ma = mfn_to_maddr(apic_access_mfn);
>
> vmx_vmcs_enter(v);
> __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
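
The "special" recognition referred to in the comment in alloc_vlapic_mapping()
presumably comes from is_special_page(); a sketch of its shape in current Xen
(an assumption on my side, not part of this patch):

    static inline bool is_special_page(const struct page_info *page)
    {
        return is_xen_heap_page(page) || (page->count_info & PGC_extra);
    }

epte_get_entry_emt() treats such pages as WB with ipat forced, which is
exactly what the two ASSERT()s in domain_creation_finished() above verify.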
> --- a/xen/arch/x86/mm/shadow/set.c
> +++ b/xen/arch/x86/mm/shadow/set.c
> @@ -94,6 +94,15 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
> ASSERT(!sh_l1e_is_magic(sl1e));
> ASSERT(shadow_mode_refcounts(d));
>
> + /*
> + * Check whether refcounting is suppressed on this page. For example,
> + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually
> + * get accessed, and hence there's no need to refcount it.
> + */
> + mfn = shadow_l1e_get_mfn(sl1e);
> + if ( mfn_valid(mfn) && page_refcounting_suppressed(mfn_to_page(mfn)) )
> + return 0;
> +
> res = get_page_from_l1e(sl1e, d, d);
>
> /*
> --- a/xen/arch/x86/mm/shadow/types.h
> +++ b/xen/arch/x86/mm/shadow/types.h
> @@ -276,9 +276,16 @@ int shadow_set_l4e(struct domain *d, sha
> static void inline
> shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
> {
> + mfn_t mfn;
> +
> if ( !shadow_mode_refcounts(d) )
> return;
>
> + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
> + /* See the respective comment in shadow_get_page_from_l1e(). */
> + page_refcounting_suppressed(mfn_to_page(mfn)) )
> + return;
> +
> put_page_from_l1e(sl1e, d);
> }
>
> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -58,7 +58,6 @@ struct ept_data {
> #define _VMX_DOMAIN_PML_ENABLED 0
> #define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
> struct vmx_domain {
> - mfn_t apic_access_mfn;
> /* VMX_DOMAIN_* */
> unsigned int status;
>
> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -82,7 +82,7 @@
> #define PGC_state_offlined PG_mask(2, 9)
> #define PGC_state_free PG_mask(3, 9)
> #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
> -/* Page is not reference counted */
> +/* Page is not reference counted (see below for caveats) */
> #define _PGC_extra PG_shift(10)
> #define PGC_extra PG_mask(1, 10)
>
> @@ -374,6 +374,24 @@ void zap_ro_mpt(mfn_t mfn);
>
> bool is_iomem_page(mfn_t mfn);
>
> +/*
> + * Pages with no owner which may get passed to functions wanting to
> + * refcount them can be marked PGC_extra to bypass this refcounting (which
> + * would fail due to the lack of an owner).
> + *
> + * (For pages with owner PGC_extra has different meaning.)
> + */
> +static inline void page_suppress_refcounting(struct page_info *pg)
> +{
> + ASSERT(!page_get_owner(pg));
> + pg->count_info |= PGC_extra;
> +}
> +
> +static inline bool page_refcounting_suppressed(const struct page_info *pg)
> +{
> + return !page_get_owner(pg) && (pg->count_info & PGC_extra);
> +}
> +
> struct platform_bad_page {
> unsigned long mfn;
> unsigned int order;
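
To restate the intended usage pattern of the two new helpers as a stand-alone
sketch (the function name here is made up for illustration):

    /* Allocate an ownerless page which may later be handed to code that
     * would normally try to refcount it. */
    static struct page_info *alloc_unrefcounted_page(void)
    {
        struct page_info *pg = alloc_domheap_page(NULL, 0); /* no owner */

        if ( pg )
            page_suppress_refcounting(pg); /* sets PGC_extra */

        return pg;
    }

Consumers on refcounting paths then skip such pages, mirroring the shadow
code changes above:

    if ( mfn_valid(mfn) && page_refcounting_suppressed(mfn_to_page(mfn)) )
        return 0; /* neither get_page*() nor put_page*() needed */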