Re: [PATCH v2] x86/hvm: Add Kconfig option to disable nested virtualization
On Fri, Feb 06, 2026 at 01:05:54PM -0800, Stefano Stabellini wrote:
> Introduce CONFIG_NESTED_VIRT (default n) to allow nested
> virtualization support to be disabled at build time.
> This is useful for embedded or safety-focused deployments where
> nested virtualization is not needed, reducing code size and attack
> surface.
>
> When CONFIG_NESTED_VIRT=n, the following source files are excluded:
> - arch/x86/hvm/nestedhvm.c
> - arch/x86/hvm/svm/nestedsvm.c
> - arch/x86/hvm/vmx/vvmx.c
> - arch/x86/mm/nested.c
> - arch/x86/mm/hap/nested_hap.c
> - arch/x86/mm/hap/nested_ept.c
>
> Add inline stubs where needed in headers.
>
> No functional change when CONFIG_NESTED_VIRT=y.
>
> Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxx>
> ---
> Changes in v2:
> - add ASSERT_UNREACHABLE
> - change default to N and remove EXPERT
> - don't compile nested_hap if not VMX
> - add XEN_SYSCTL_PHYSCAP_nestedhvm
> - add IS_ENABLED check in hvm_nested_virt_supported
> ---
> xen/arch/x86/hvm/Kconfig | 10 ++++
> xen/arch/x86/hvm/Makefile | 2 +-
> xen/arch/x86/hvm/svm/Makefile | 2 +-
> xen/arch/x86/hvm/svm/nestedhvm.h | 60 +++++++++++++++++++++--
> xen/arch/x86/hvm/svm/svm.c | 6 +++
> xen/arch/x86/hvm/vmx/Makefile | 2 +-
> xen/arch/x86/hvm/vmx/vmx.c | 10 +++-
> xen/arch/x86/include/asm/hvm/hvm.h | 2 +-
> xen/arch/x86/include/asm/hvm/nestedhvm.h | 52 ++++++++++++++++----
> xen/arch/x86/include/asm/hvm/vmx/vvmx.h | 62 ++++++++++++++++++++++++
> xen/arch/x86/mm/Makefile | 2 +-
> xen/arch/x86/mm/hap/Makefile | 4 +-
> xen/arch/x86/mm/p2m.h | 6 +++
> xen/arch/x86/sysctl.c | 2 +
> xen/include/public/sysctl.h | 4 +-
> 15 files changed, 202 insertions(+), 24 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/Kconfig b/xen/arch/x86/hvm/Kconfig
> index f32bf5cbb7..133f19a063 100644
> --- a/xen/arch/x86/hvm/Kconfig
> +++ b/xen/arch/x86/hvm/Kconfig
> @@ -92,4 +92,14 @@ config MEM_SHARING
> bool "Xen memory sharing support (UNSUPPORTED)" if UNSUPPORTED
> depends on INTEL_VMX
>
> +config NESTED_VIRT
> + bool "Nested virtualization support"
> + depends on AMD_SVM || INTEL_VMX
> + default n
> + help
> + Enable nested virtualization, allowing guests to run their own
> + hypervisors. This requires hardware support.
> +
> + If unsure, say N.
> +
> endif
> diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
> index f34fb03934..b8a0a68624 100644
> --- a/xen/arch/x86/hvm/Makefile
> +++ b/xen/arch/x86/hvm/Makefile
> @@ -18,7 +18,7 @@ obj-y += irq.o
> obj-y += mmio.o
> obj-$(CONFIG_VM_EVENT) += monitor.o
> obj-y += mtrr.o
> -obj-y += nestedhvm.o
> +obj-$(CONFIG_NESTED_VIRT) += nestedhvm.o
> obj-y += pmtimer.o
> obj-y += quirks.o
> obj-y += rtc.o
> diff --git a/xen/arch/x86/hvm/svm/Makefile b/xen/arch/x86/hvm/svm/Makefile
> index 8a072cafd5..92418e3444 100644
> --- a/xen/arch/x86/hvm/svm/Makefile
> +++ b/xen/arch/x86/hvm/svm/Makefile
> @@ -2,6 +2,6 @@ obj-y += asid.o
> obj-y += emulate.o
> obj-bin-y += entry.o
> obj-y += intr.o
> -obj-y += nestedsvm.o
> +obj-$(CONFIG_NESTED_VIRT) += nestedsvm.o
> obj-y += svm.o
> obj-y += vmcb.o
> diff --git a/xen/arch/x86/hvm/svm/nestedhvm.h b/xen/arch/x86/hvm/svm/nestedhvm.h
> index 9bfed5ffd7..ed1aa847e5 100644
> --- a/xen/arch/x86/hvm/svm/nestedhvm.h
> +++ b/xen/arch/x86/hvm/svm/nestedhvm.h
> @@ -26,6 +26,13 @@
> #define nsvm_efer_svm_enabled(v) \
> (!!((v)->arch.hvm.guest_efer & EFER_SVME))
I wonder whether you also want to force this one to return false if
NESTED_VIRT is disabled. That would likely lead to more DCE by the
compiler:

    #define nsvm_efer_svm_enabled(v) \
        (IS_ENABLED(CONFIG_NESTED_VIRT) && \
         !!((v)->arch.hvm.guest_efer & EFER_SVME))
>
> +#define NSVM_INTR_NOTHANDLED 3
> +#define NSVM_INTR_NOTINTERCEPTED 2
> +#define NSVM_INTR_FORCEVMEXIT 1
> +#define NSVM_INTR_MASKED 0
> +
> +#ifdef CONFIG_NESTED_VIRT
> +
> int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr);
> void nestedsvm_vmexit_defer(struct vcpu *v,
> uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2);
> @@ -57,13 +64,56 @@ int cf_check nsvm_hap_walk_L1_p2m(
> struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order,
> uint8_t *p2m_acc, struct npfec npfec);
>
> -#define NSVM_INTR_NOTHANDLED 3
> -#define NSVM_INTR_NOTINTERCEPTED 2
> -#define NSVM_INTR_FORCEVMEXIT 1
> -#define NSVM_INTR_MASKED 0
> -
> int nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack);
>
> +#else /* !CONFIG_NESTED_VIRT */
> +
> +static inline int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr)
> +{
> + ASSERT_UNREACHABLE();
> + return -EOPNOTSUPP;
I think this is guest-reachable code even when nested virt is
build-time disabled:
VMEXIT_VMRUN -> svm_vmexit_do_vmrun() -> nestedsvm_vmcb_map().
I think you rely on nsvm_efer_svm_enabled() always returning false, as
all calls to nestedsvm_vmcb_map() are gated on nsvm_efer_svm_enabled()
returning true.
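Roughly (paraphrasing the svm.c call site, not quoting it verbatim):

    static void svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
                                    struct vcpu *v, uint64_t vmcbaddr)
    {
        /* Relies on nsvm_efer_svm_enabled() being false with nested
         * virt build-time disabled. */
        if ( !nsvm_efer_svm_enabled(v) )
        {
            hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
            return;
        }

        if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
        ...
    }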
> +}
> +static inline void nestedsvm_vmexit_defer(struct vcpu *v,
> + uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline enum nestedhvm_vmexits nestedsvm_vmexit_n2n1(struct vcpu *v,
> + struct cpu_user_regs *regs)
> +{
> + ASSERT_UNREACHABLE();
> + return NESTEDHVM_VMEXIT_ERROR;
> +}
> +static inline enum nestedhvm_vmexits nestedsvm_check_intercepts(struct vcpu *v,
> + struct cpu_user_regs *regs, uint64_t exitcode)
> +{
> + ASSERT_UNREACHABLE();
> + return NESTEDHVM_VMEXIT_ERROR;
For the above two you possibly want to return
NESTEDHVM_VMEXIT_FATALERROR so that the L1 guest is killed. This
should be unreachable code when nested virt is build-time disabled, and
hence it's safer to kill the guest, rather than attempting to report
an error to L1.
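I.e. (sketch):

    static inline enum nestedhvm_vmexits nestedsvm_check_intercepts(
        struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode)
    {
        ASSERT_UNREACHABLE();
        /* Kill the guest rather than trying to report an error to L1. */
        return NESTEDHVM_VMEXIT_FATALERROR;
    }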
> +}
> +static inline void svm_nested_features_on_efer_update(struct vcpu *v)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline void svm_vmexit_do_clgi(struct cpu_user_regs *regs,
> + struct vcpu *v)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline void svm_vmexit_do_stgi(struct cpu_user_regs *regs,
> + struct vcpu *v)
> +{
> + ASSERT_UNREACHABLE();
> +}
Aren't those last two guest reachable if it tries to execute an STGI
or CLGI instruction? I see the svm exit handler calling straight into
svm_vmexit_do_{st,cl}gi() as a result of VMEXIT_{ST,CL}GI vmexit.
> +static inline bool nestedsvm_gif_isset(struct vcpu *v) { return true; }
I think this one above is really unreachable when `nestedsvm.c` is not
built, and hence you could even remove the handler here.
> +static inline int nestedsvm_vcpu_interrupt(struct vcpu *v,
> + const struct hvm_intack intack)
> +{
> + ASSERT_UNREACHABLE();
> + return NSVM_INTR_NOTINTERCEPTED;
> +}
> +
> +#endif /* CONFIG_NESTED_VIRT */
> +
> #endif /* __X86_HVM_SVM_NESTEDHVM_PRIV_H__ */
>
> /*
> diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
> index 18ba837738..0234b57afb 100644
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -46,6 +46,10 @@
>
> void noreturn svm_asm_do_resume(void);
>
> +#ifndef CONFIG_NESTED_VIRT
> +void asmlinkage nsvm_vcpu_switch(void) { }
> +#endif
Instead of adding the ifdefs here, you could maybe add them to
svm/entry.S and elide the call to nsvm_vcpu_switch() altogether?
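I.e. something like (sketch of the relevant entry.S hunk):

    #ifdef CONFIG_NESTED_VIRT
            call nsvm_vcpu_switch
    #endif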
> +
> u32 svm_feature_flags;
>
> /*
> @@ -2465,6 +2469,7 @@ static struct hvm_function_table __initdata_cf_clobber svm_function_table = {
> .set_rdtsc_exiting = svm_set_rdtsc_exiting,
> .get_insn_bytes = svm_get_insn_bytes,
>
> +#ifdef CONFIG_NESTED_VIRT
> .nhvm_vcpu_initialise = nsvm_vcpu_initialise,
> .nhvm_vcpu_destroy = nsvm_vcpu_destroy,
> .nhvm_vcpu_reset = nsvm_vcpu_reset,
> @@ -2474,6 +2479,7 @@ static struct hvm_function_table __initdata_cf_clobber svm_function_table = {
> .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
> .nhvm_intr_blocked = nsvm_intr_blocked,
> .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m,
> +#endif
>
> .get_reg = svm_get_reg,
> .set_reg = svm_set_reg,
> diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
> index 04a29ce59d..902564b3e2 100644
> --- a/xen/arch/x86/hvm/vmx/Makefile
> +++ b/xen/arch/x86/hvm/vmx/Makefile
> @@ -3,4 +3,4 @@ obj-y += intr.o
> obj-y += realmode.o
> obj-y += vmcs.o
> obj-y += vmx.o
> -obj-y += vvmx.o
> +obj-$(CONFIG_NESTED_VIRT) += vvmx.o
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 82c55f49ae..252f27322b 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -55,6 +55,10 @@
> #include <public/hvm/save.h>
> #include <public/sched.h>
>
> +#ifndef CONFIG_NESTED_VIRT
> +void asmlinkage nvmx_switch_guest(void) { }
> +#endif
Similar to my remark above, you could likely move the ifdef to the
assembly call site.
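E.g. (sketch):

    #ifdef CONFIG_NESTED_VIRT
            call nvmx_switch_guest
    #endif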
> static bool __initdata opt_force_ept;
> boolean_param("force-ept", opt_force_ept);
>
> @@ -2033,7 +2037,7 @@ static void nvmx_enqueue_n2_exceptions(struct vcpu *v,
> nvmx->intr.intr_info, nvmx->intr.error_code);
> }
>
> -static int cf_check nvmx_vmexit_event(
> +static int cf_check __maybe_unused nvmx_vmexit_event(
Maybe you can move this one to the vvmx.c file? It seems like an odd
one to have defined in vmx.c.
> struct vcpu *v, const struct x86_event *event)
> {
> nvmx_enqueue_n2_exceptions(v, event->vector, event->error_code,
> @@ -2933,6 +2937,7 @@ static struct hvm_function_table __initdata_cf_clobber vmx_function_table = {
> .handle_cd = vmx_handle_cd,
> .set_info_guest = vmx_set_info_guest,
> .set_rdtsc_exiting = vmx_set_rdtsc_exiting,
> +#ifdef CONFIG_NESTED_VIRT
> .nhvm_vcpu_initialise = nvmx_vcpu_initialise,
> .nhvm_vcpu_destroy = nvmx_vcpu_destroy,
> .nhvm_vcpu_reset = nvmx_vcpu_reset,
> @@ -2942,8 +2947,9 @@ static struct hvm_function_table __initdata_cf_clobber vmx_function_table = {
> .nhvm_vcpu_vmexit_event = nvmx_vmexit_event,
> .nhvm_intr_blocked = nvmx_intr_blocked,
> .nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources,
> - .update_vlapic_mode = vmx_vlapic_msr_changed,
> .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
> +#endif
> + .update_vlapic_mode = vmx_vlapic_msr_changed,
> #ifdef CONFIG_VM_EVENT
> .enable_msr_interception = vmx_enable_msr_interception,
> #endif
> diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h
> index 7d9774df59..536a38b450 100644
> --- a/xen/arch/x86/include/asm/hvm/hvm.h
> +++ b/xen/arch/x86/include/asm/hvm/hvm.h
> @@ -711,7 +711,7 @@ static inline bool hvm_altp2m_supported(void)
> /* Returns true if we have the minimum hardware requirements for nested virt */
> static inline bool hvm_nested_virt_supported(void)
> {
> - return hvm_funcs.caps.nested_virt;
> + return IS_ENABLED(CONFIG_NESTED_VIRT) && hvm_funcs.caps.nested_virt;
> }
>
> #ifdef CONFIG_ALTP2M
> diff --git a/xen/arch/x86/include/asm/hvm/nestedhvm.h b/xen/arch/x86/include/asm/hvm/nestedhvm.h
> index ea2c1bc328..e18d59e0eb 100644
> --- a/xen/arch/x86/include/asm/hvm/nestedhvm.h
> +++ b/xen/arch/x86/include/asm/hvm/nestedhvm.h
> @@ -25,9 +25,21 @@ enum nestedhvm_vmexits {
> /* Nested HVM on/off per domain */
> static inline bool nestedhvm_enabled(const struct domain *d)
> {
> - return IS_ENABLED(CONFIG_HVM) && (d->options & XEN_DOMCTL_CDF_nested_virt);
> + return IS_ENABLED(CONFIG_NESTED_VIRT) &&
> + (d->options & XEN_DOMCTL_CDF_nested_virt);
> }
>
> +/* Nested paging */
> +#define NESTEDHVM_PAGEFAULT_DONE 0
> +#define NESTEDHVM_PAGEFAULT_INJECT 1
> +#define NESTEDHVM_PAGEFAULT_L1_ERROR 2
> +#define NESTEDHVM_PAGEFAULT_L0_ERROR 3
> +#define NESTEDHVM_PAGEFAULT_MMIO 4
> +#define NESTEDHVM_PAGEFAULT_RETRY 5
> +#define NESTEDHVM_PAGEFAULT_DIRECT_MMIO 6
> +
> +#ifdef CONFIG_NESTED_VIRT
> +
> /* Nested VCPU */
> int nestedhvm_vcpu_initialise(struct vcpu *v);
> void nestedhvm_vcpu_destroy(struct vcpu *v);
> @@ -38,14 +50,6 @@ bool nestedhvm_vcpu_in_guestmode(struct vcpu *v);
> #define nestedhvm_vcpu_exit_guestmode(v) \
> vcpu_nestedhvm(v).nv_guestmode = 0
>
> -/* Nested paging */
> -#define NESTEDHVM_PAGEFAULT_DONE 0
> -#define NESTEDHVM_PAGEFAULT_INJECT 1
> -#define NESTEDHVM_PAGEFAULT_L1_ERROR 2
> -#define NESTEDHVM_PAGEFAULT_L0_ERROR 3
> -#define NESTEDHVM_PAGEFAULT_MMIO 4
> -#define NESTEDHVM_PAGEFAULT_RETRY 5
> -#define NESTEDHVM_PAGEFAULT_DIRECT_MMIO 6
> int nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t *L2_gpa,
> struct npfec npfec);
>
> unsigned long *nestedhvm_vcpu_iomap_get(bool ioport_80, bool ioport_ed);
>
> void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m);
>
> +#else /* !CONFIG_NESTED_VIRT */
> +
> +static inline int nestedhvm_vcpu_initialise(struct vcpu *v)
> +{
> + ASSERT_UNREACHABLE();
> + return -EOPNOTSUPP;
> +}
> +static inline void nestedhvm_vcpu_destroy(struct vcpu *v) { }
> +static inline void nestedhvm_vcpu_reset(struct vcpu *v)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline bool nestedhvm_vcpu_in_guestmode(struct vcpu *v) { return false; }
> +static inline int nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t *L2_gpa,
> + struct npfec npfec)
> +{
> + ASSERT_UNREACHABLE();
> + return NESTEDHVM_PAGEFAULT_L0_ERROR;
> +}
> +#define nestedhvm_vcpu_enter_guestmode(v) do { ASSERT_UNREACHABLE(); } while (0)
> +#define nestedhvm_vcpu_exit_guestmode(v) do { ASSERT_UNREACHABLE(); } while (0)
> +#define nestedhvm_paging_mode_hap(v) false
> +#define nestedhvm_vmswitch_in_progress(v) false
Why are those defines instead of static inlines?
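E.g. (sketch; parameter constness is a guess, to be matched with the
callers):

    static inline bool nestedhvm_paging_mode_hap(const struct vcpu *v)
    {
        return false;
    }

    static inline bool nestedhvm_vmswitch_in_progress(const struct vcpu *v)
    {
        return false;
    }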
> +static inline void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +
> +#endif /* CONFIG_NESTED_VIRT */
> +
> static inline bool nestedhvm_is_n2(struct vcpu *v)
> {
> if ( !nestedhvm_enabled(v->domain) ||
> diff --git a/xen/arch/x86/include/asm/hvm/vmx/vvmx.h b/xen/arch/x86/include/asm/hvm/vmx/vvmx.h
> index da10d3fa96..ad56cdf01e 100644
> --- a/xen/arch/x86/include/asm/hvm/vmx/vvmx.h
> +++ b/xen/arch/x86/include/asm/hvm/vmx/vvmx.h
> @@ -73,6 +73,8 @@ union vmx_inst_info {
> u32 word;
> };
>
> +#ifdef CONFIG_NESTED_VIRT
> +
> int cf_check nvmx_vcpu_initialise(struct vcpu *v);
> void cf_check nvmx_vcpu_destroy(struct vcpu *v);
> int cf_check nvmx_vcpu_reset(struct vcpu *v);
> @@ -199,5 +201,65 @@ int nept_translate_l2ga(struct vcpu *v, paddr_t l2ga,
> uint64_t *exit_qual, uint32_t *exit_reason);
> int nvmx_cpu_up_prepare(unsigned int cpu);
> void nvmx_cpu_dead(unsigned int cpu);
> +
> +#else /* !CONFIG_NESTED_VIRT */
> +
> +static inline void nvmx_update_exec_control(struct vcpu *v, u32 value)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline void nvmx_update_secondary_exec_control(struct vcpu *v,
> + unsigned long value)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline void nvmx_update_exception_bitmap(struct vcpu *v,
> + unsigned long value)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline u64 nvmx_get_tsc_offset(struct vcpu *v)
> +{
> + ASSERT_UNREACHABLE();
> + return 0;
> +}
> +static inline void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline bool nvmx_intercepts_exception(struct vcpu *v, unsigned int vector,
> + int error_code)
> +{
> + ASSERT_UNREACHABLE();
> + return false;
> +}
> +static inline int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
> + unsigned int exit_reason)
> +{
> + ASSERT_UNREACHABLE();
> + return 0;
> +}
> +static inline void nvmx_idtv_handling(void)
> +{
> + ASSERT_UNREACHABLE();
> +}
> +static inline int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
> +{
> + ASSERT_UNREACHABLE();
> + return 0;
> +}
I think this function is reachable even when nested virt is not
enabled:
vmx_msr_read_intercept() -> case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_VMFUNC ->
nvmx_msr_read_intercept()
I'm also confused about why the function returns 0 instead of an error
when !nestedhvm_enabled(). We should probably make it return -ENODEV
when nested virt is not available or enabled.
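I.e. something like (sketch; the exact return value needs to match how
vmx_msr_read_intercept() consumes it):

    static inline int nvmx_msr_read_intercept(unsigned int msr,
                                              u64 *msr_content)
    {
        /* Reachable on MSR_IA32_VMX_* reads, hence no ASSERT here. */
        return -ENODEV;
    }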
> +static inline int nvmx_handle_vmx_insn(struct cpu_user_regs *regs,
> + unsigned int exit_reason)
> +{
> + ASSERT_UNREACHABLE();
> + return 0;
> +}
Same here, I think this is likely reachable from vmx_vmexit_handler(),
and shouldn't assert?
It should also do something like:
hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
return X86EMUL_EXCEPTION;
So it mimics what the function itself does when !nestedhvm_enabled().
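I.e. (sketch):

    static inline int nvmx_handle_vmx_insn(struct cpu_user_regs *regs,
                                           unsigned int exit_reason)
    {
        /* VMX instructions raise #UD when nested virt is unavailable. */
        hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
        return X86EMUL_EXCEPTION;
    }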
> +static inline int nvmx_cpu_up_prepare(unsigned int cpu) { return 0; }
> +static inline void nvmx_cpu_dead(unsigned int cpu) { }
> +
> +#define get_vvmcs(vcpu, encoding) 0
This likely wants to be a static inline and have an
ASSERT_UNREACHABLE()? From a quick look it seems like call sites do
check that the vCPU is in guest mode before attempting to fetch a
field from the nested VMCS.
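Something like (sketch; the exact prototype should be checked against
the macro's users):

    static inline uint64_t get_vvmcs(const struct vcpu *v,
                                     unsigned int encoding)
    {
        ASSERT_UNREACHABLE();
        return 0;
    }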
Thanks, Roger.