[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2] x86/hvm: Advertise and support extended destination IDs for MSI/IO-APIC



x2APIC guests with more than 128 vCPUs have APIC IDs above 255, but MSI
addresses and IO-APIC RTEs only provide an 8-bit destination field.
Without extended destination ID support, Linux limits the maximum usable
APIC ID to 255, refusing to bring up vCPUs beyond that limit. So,
advertise XEN_HVM_CPUID_EXT_DEST_ID in the HVM hypervisor CPUID leaf,
signalling that guests may use MSI address bits 11:5 and IO-APIC RTE
bits 55:49 as additional high destination ID bits. This expands the
destination ID from 8 to 15 bits.

Signed-off-by: Julian Vetter <julian.vetter@xxxxxxxxxx>
---
Changes in v2:
- Replaced macro by bit field entries for vioapic
- Added migration check function to vioapic state
- Check if XEN_HVM_CPUID_EXT_DEST_ID is announced by destination XEN,
  if any RTE has ext_dest_id bits set
- Replaced bit-mask + shift value by MASK_EXTR()
- Rephrased comments based on reviewer suggestions
---
 xen/arch/x86/cpuid.c                   |  9 +++++
 xen/arch/x86/hvm/irq.c                 | 11 +++++-
 xen/arch/x86/hvm/vioapic.c             | 50 ++++++++++++++++++++++++--
 xen/arch/x86/hvm/vmsi.c                |  8 +++--
 xen/arch/x86/include/asm/hvm/hvm.h     |  4 +--
 xen/arch/x86/include/asm/msi.h         |  2 ++
 xen/drivers/passthrough/x86/hvm.c      |  8 ++---
 xen/include/public/arch-x86/hvm/save.h |  4 ++-
 xen/include/public/domctl.h            | 16 +++++----
 9 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index d85be20d86..fb17c71d74 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -148,6 +148,15 @@ static void cpuid_hypervisor_leaves(const struct vcpu *v, 
uint32_t leaf,
         res->a |= XEN_HVM_CPUID_DOMID_PRESENT;
         res->c = d->domain_id;
 
+        /*
+         * Advertise extended destination ID support. This allows guests to use
+         * bits 11:5 of the MSI address and bits 55:49 of the IO-APIC RTE for
+         * additional destination ID bits, expanding the addressable APIC ID
+         * range from 8 to 15 bits. This is required for x2APIC guests with
+         * APIC IDs > 255.
+         */
+        res->a |= XEN_HVM_CPUID_EXT_DEST_ID;
+
         /*
          * Per-vCPU event channel upcalls are implemented and work
          * correctly with PIRQs routed over event channels.
diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index 5f64361113..b036be2d01 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -374,7 +374,16 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 
isa_irq)
 int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data)
 {
     uint32_t tmp = (uint32_t) addr;
-    uint8_t  dest = (tmp & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+    /*
+     * Standard MSI destination address bits 19:12 (8 bits).
+     * Extended MSI destination address bits 11:5 (7 more bits).
+     *
+     * As XEN_HVM_CPUID_EXT_DEST_ID is advertised, the guest may use bits 11:5
+     * for high destination ID bits, expanding to 15 bits total. Guests unaware
+     * of this feature set these bits to 0, so this is backwards-compatible.
+     */
+    uint32_t dest = (MASK_EXTR(tmp, MSI_ADDR_EXT_DEST_ID_MASK) << 8) |
+                     MASK_EXTR(tmp, MSI_ADDR_DEST_ID_MASK);
     uint8_t  dest_mode = !!(tmp & MSI_ADDR_DESTMODE_MASK);
     uint8_t  delivery_mode = (data & MSI_DATA_DELIVERY_MODE_MASK)
         >> MSI_DATA_DELIVERY_MODE_SHIFT;
diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
index 7c725f9e47..249291a25d 100644
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -31,7 +31,9 @@
 #include <xen/errno.h>
 #include <xen/sched.h>
 #include <xen/nospec.h>
+#include <public/arch-x86/cpuid.h>
 #include <public/hvm/ioreq.h>
+#include <asm/cpuid.h>
 #include <asm/hvm/io.h>
 #include <asm/hvm/vlapic.h>
 #include <asm/hvm/support.h>
@@ -411,7 +413,8 @@ static void ioapic_inj_irq(
 
 static void vioapic_deliver(struct hvm_vioapic *vioapic, unsigned int pin)
 {
-    uint16_t dest = vioapic->redirtbl[pin].fields.dest_id;
+    uint32_t dest = ((uint32_t)vioapic->redirtbl[pin].fields.ext_dest_id << 8) 
|
+                    vioapic->redirtbl[pin].fields.dest_id;
     uint8_t dest_mode = vioapic->redirtbl[pin].fields.dest_mode;
     uint8_t delivery_mode = vioapic->redirtbl[pin].fields.delivery_mode;
     uint8_t vector = vioapic->redirtbl[pin].fields.vector;
@@ -594,6 +597,49 @@ int vioapic_get_trigger_mode(const struct domain *d, 
unsigned int gsi)
     return vioapic->redirtbl[pin].fields.trig_mode;
 }
 
+static int cf_check ioapic_check(const struct domain *d, hvm_domain_context_t 
*h)
+{
+    const HVM_SAVE_TYPE(IOAPIC) *s = hvm_get_entry(IOAPIC, h);
+    unsigned int i;
+
+    if ( !s )
+        return -ENODATA;
+
+    for ( i = 0; i < ARRAY_SIZE(s->redirtbl); i++ )
+    {
+        struct cpuid_leaf res;
+
+        /* ext_dest_id bits not set in RTE so continue */
+        if ( !s->redirtbl[i].fields.ext_dest_id )
+            continue;
+
+        /*
+         * An RTE in the saved state has ext_dest_id bits set, verify the
+         * destination XEN advertises XEN_HVM_CPUID_EXT_DEST_ID to the guest.
+         * If not interrupt routing to APIC IDs > 255 would be broken after
+         * restore -> -EINVAL!
+         */
+        guest_cpuid(d->vcpu[0], 0x40000004, 0, &res);
+        if ( !(res.a & XEN_HVM_CPUID_EXT_DEST_ID) )
+        {
+            printk(XENLOG_G_ERR "HVM restore: dom%d IO-APIC RTE %u has "
+                                "extended destination ID bits set but "
+                                "XEN_HVM_CPUID_EXT_DEST_ID is not 
advertised\n",
+                                d->domain_id, i);
+            return -EINVAL;
+        }
+
+        /*
+         * Found an RTE with ext_dest bits set, but the destination XEN also
+         * correctly announces XEN_HVM_CPUID_EXT_DEST_ID
+         * -> All good, no need to check remaining RTEs.
+         */
+        break;
+    }
+
+    return 0;
+}
+
 static int cf_check ioapic_save(struct vcpu *v, hvm_domain_context_t *h)
 {
     const struct domain *d = v->domain;
@@ -630,7 +676,7 @@ static int cf_check ioapic_load(struct domain *d, 
hvm_domain_context_t *h)
     return 0;
 }
 
-HVM_REGISTER_SAVE_RESTORE(IOAPIC, ioapic_save, NULL, ioapic_load, 1,
+HVM_REGISTER_SAVE_RESTORE(IOAPIC, ioapic_save, ioapic_check, ioapic_load, 1,
                           HVMSR_PER_DOM);
 
 void vioapic_reset(struct domain *d)
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 27b1f089e2..36ea898ac7 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -66,7 +66,7 @@ static void vmsi_inj_irq(
 
 int vmsi_deliver(
     struct domain *d, int vector,
-    uint8_t dest, uint8_t dest_mode,
+    uint32_t dest, uint8_t dest_mode,
     uint8_t delivery_mode, uint8_t trig_mode)
 {
     struct vlapic *target;
@@ -109,7 +109,7 @@ void vmsi_deliver_pirq(struct domain *d, const struct 
hvm_pirq_dpci *pirq_dpci)
 {
     uint32_t flags = pirq_dpci->gmsi.gflags;
     int vector = pirq_dpci->gmsi.gvec;
-    uint8_t dest = (uint8_t)flags;
+    uint32_t dest = XEN_DOMCTL_VMSI_X86_FULL_DEST(flags);
     bool dest_mode = flags & XEN_DOMCTL_VMSI_X86_DM_MASK;
     uint8_t delivery_mode = MASK_EXTR(flags, XEN_DOMCTL_VMSI_X86_DELIV_MASK);
     bool trig_mode = flags & XEN_DOMCTL_VMSI_X86_TRIG_MASK;
@@ -125,7 +125,7 @@ void vmsi_deliver_pirq(struct domain *d, const struct 
hvm_pirq_dpci *pirq_dpci)
 }
 
 /* Return value, -1 : multi-dests, non-negative value: dest_vcpu_id */
-int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode)
+int hvm_girq_dest_2_vcpu_id(struct domain *d, uint32_t dest, uint8_t dest_mode)
 {
     int dest_vcpu_id = -1, w = 0;
     struct vcpu *v;
@@ -802,6 +802,8 @@ static unsigned int msi_gflags(uint16_t data, uint64_t 
addr, bool masked)
      */
     return MASK_INSR(MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK),
                      XEN_DOMCTL_VMSI_X86_DEST_ID_MASK) |
+           MASK_INSR(MASK_EXTR(addr, MSI_ADDR_EXT_DEST_ID_MASK),
+                     XEN_DOMCTL_VMSI_X86_EXT_DEST_ID_MASK) |
            MASK_INSR(MASK_EXTR(addr, MSI_ADDR_REDIRECTION_MASK),
                      XEN_DOMCTL_VMSI_X86_RH_MASK) |
            MASK_INSR(MASK_EXTR(addr, MSI_ADDR_DESTMODE_MASK),
diff --git a/xen/arch/x86/include/asm/hvm/hvm.h 
b/xen/arch/x86/include/asm/hvm/hvm.h
index 7d9774df59..11256d5e67 100644
--- a/xen/arch/x86/include/asm/hvm/hvm.h
+++ b/xen/arch/x86/include/asm/hvm/hvm.h
@@ -295,11 +295,11 @@ uint64_t hvm_get_guest_time_fixed(const struct vcpu *v, 
uint64_t at_tsc);
 
 int vmsi_deliver(
     struct domain *d, int vector,
-    uint8_t dest, uint8_t dest_mode,
+    uint32_t dest, uint8_t dest_mode,
     uint8_t delivery_mode, uint8_t trig_mode);
 struct hvm_pirq_dpci;
 void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci 
*pirq_dpci);
-int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode);
+int hvm_girq_dest_2_vcpu_id(struct domain *d, uint32_t dest, uint8_t 
dest_mode);
 
 enum hvm_intblk
 hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack);
diff --git a/xen/arch/x86/include/asm/msi.h b/xen/arch/x86/include/asm/msi.h
index 00059d4a3a..711b026616 100644
--- a/xen/arch/x86/include/asm/msi.h
+++ b/xen/arch/x86/include/asm/msi.h
@@ -54,6 +54,8 @@
 #define         MSI_ADDR_DEST_ID_MASK          0x00ff000
 #define  MSI_ADDR_DEST_ID(dest)                (((dest) << 
MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK)
 
+#define MSI_ADDR_EXT_DEST_ID_MASK      0x0000fe0
+
 /* MAX fixed pages reserved for mapping MSIX tables. */
 #define FIX_MSIX_MAX_PAGES              512
 
diff --git a/xen/drivers/passthrough/x86/hvm.c 
b/xen/drivers/passthrough/x86/hvm.c
index b73bb55055..9c3c8d28d6 100644
--- a/xen/drivers/passthrough/x86/hvm.c
+++ b/xen/drivers/passthrough/x86/hvm.c
@@ -281,7 +281,7 @@ int pt_irq_create_bind(
     {
     case PT_IRQ_TYPE_MSI:
     {
-        uint8_t dest, delivery_mode;
+        uint32_t dest, delivery_mode;
         bool dest_mode;
         int dest_vcpu_id;
         const struct vcpu *vcpu;
@@ -357,8 +357,7 @@ int pt_irq_create_bind(
             }
         }
         /* Calculate dest_vcpu_id for MSI-type pirq migration. */
-        dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
-                         XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
+        dest = XEN_DOMCTL_VMSI_X86_FULL_DEST(pirq_dpci->gmsi.gflags);
         dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
         delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
                                   XEN_DOMCTL_VMSI_X86_DELIV_MASK);
@@ -807,8 +806,7 @@ static int cf_check _hvm_dpci_msi_eoi(
     if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
          (pirq_dpci->gmsi.gvec == vector) )
     {
-        unsigned int dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
-                                      XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
+        unsigned int dest = 
XEN_DOMCTL_VMSI_X86_FULL_DEST(pirq_dpci->gmsi.gflags);
         bool dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
 
         if ( vlapic_match_dest(vcpu_vlapic(current), NULL, 0, dest,
diff --git a/xen/include/public/arch-x86/hvm/save.h 
b/xen/include/public/arch-x86/hvm/save.h
index 9c4bfc7ebd..d903ee53da 100644
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -359,7 +359,9 @@ union vioapic_redir_entry
         uint8_t trig_mode:1;
         uint8_t mask:1;
         uint8_t reserve:7;
-        uint8_t reserved[4];
+        uint8_t reserved[3];
+        uint8_t ext_dest_rsvd:1;
+        uint8_t ext_dest_id:7;
         uint8_t dest_id;
     } fields;
 };
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 8f6708c0a7..007bcf0f09 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -615,12 +615,13 @@ struct xen_domctl_bind_pt_irq {
         struct {
             uint8_t gvec;
             uint32_t gflags;
-#define XEN_DOMCTL_VMSI_X86_DEST_ID_MASK 0x0000ff
-#define XEN_DOMCTL_VMSI_X86_RH_MASK      0x000100
-#define XEN_DOMCTL_VMSI_X86_DM_MASK      0x000200
-#define XEN_DOMCTL_VMSI_X86_DELIV_MASK   0x007000
-#define XEN_DOMCTL_VMSI_X86_TRIG_MASK    0x008000
-#define XEN_DOMCTL_VMSI_X86_UNMASKED     0x010000
+#define XEN_DOMCTL_VMSI_X86_DEST_ID_MASK        0x0000ff
+#define XEN_DOMCTL_VMSI_X86_RH_MASK             0x000100
+#define XEN_DOMCTL_VMSI_X86_DM_MASK             0x000200
+#define XEN_DOMCTL_VMSI_X86_DELIV_MASK          0x007000
+#define XEN_DOMCTL_VMSI_X86_TRIG_MASK           0x008000
+#define XEN_DOMCTL_VMSI_X86_UNMASKED            0x010000
+#define XEN_DOMCTL_VMSI_X86_EXT_DEST_ID_MASK    0xfe0000
 
             uint64_aligned_t gtable;
         } msi;
@@ -630,6 +631,9 @@ struct xen_domctl_bind_pt_irq {
     } u;
 };
 
+#define XEN_DOMCTL_VMSI_X86_FULL_DEST(gflags) \
+        (MASK_EXTR((gflags), XEN_DOMCTL_VMSI_X86_DEST_ID_MASK) | \
+        (MASK_EXTR((gflags), XEN_DOMCTL_VMSI_X86_EXT_DEST_ID_MASK) << 8))
 
 /* Bind machine I/O address range -> HVM address range. */
 /* XEN_DOMCTL_memory_mapping */
-- 
2.51.0



--
Julian Vetter | Vates Hypervisor & Kernel Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.