AMD IOMMU: allocate IRTE entries instead of using a static mapping For multi-vector MSI, where we surely don't want to have to allocate contiguous vectors, and where we do want to be able to set the affinities of the individual vectors separately, we need to drop the use of the tuple of vector and delivery mode to determine the IRTE to use, and instead allocate IRTEs (which imo should have been done from the beginning). Signed-off-by: Jan Beulich --- One thing I surely need confirmation on is whether this BUG_ON(get_ivrs_mappings(iommu->seg)[req_id].intremap_table != get_ivrs_mappings(iommu->seg)[alias_id].intremap_table); in update_intremap_entry_from_msi_msg() is valid. If it isn't, it's not clear to me how to properly set up things for affected devices, as we would need an identical index allocated for two different remap table instances (which can hardly be expected to work out well). --- a/xen/drivers/passthrough/amd/iommu_acpi.c +++ b/xen/drivers/passthrough/amd/iommu_acpi.c @@ -72,12 +72,15 @@ static void __init add_ivrs_mapping_entr /* allocate per-device interrupt remapping table */ if ( amd_iommu_perdev_intremap ) ivrs_mappings[alias_id].intremap_table = - amd_iommu_alloc_intremap_table(); + amd_iommu_alloc_intremap_table( + &ivrs_mappings[alias_id].intremap_inuse); else { if ( shared_intremap_table == NULL ) - shared_intremap_table = amd_iommu_alloc_intremap_table(); + shared_intremap_table = amd_iommu_alloc_intremap_table( + &shared_intremap_inuse); ivrs_mappings[alias_id].intremap_table = shared_intremap_table; + ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse; } } /* assgin iommu hardware */ @@ -671,7 +674,7 @@ static u16 __init parse_ivhd_device_spec if ( IO_APIC_ID(apic) != special->handle ) continue; - if ( ioapic_sbdf[special->handle].pin_setup ) + if ( ioapic_sbdf[special->handle].pin_2_idx ) { if ( ioapic_sbdf[special->handle].bdf == bdf && ioapic_sbdf[special->handle].seg == seg ) @@ -691,14 +694,16 @@ static u16 __init parse_ivhd_device_spec 
ioapic_sbdf[special->handle].bdf = bdf; ioapic_sbdf[special->handle].seg = seg; - ioapic_sbdf[special->handle].pin_setup = xzalloc_array( - unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic])); + ioapic_sbdf[special->handle].pin_2_idx = xmalloc_array( + u16, nr_ioapic_entries[apic]); if ( nr_ioapic_entries[apic] && - !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup ) + !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) { printk(XENLOG_ERR "IVHD Error: Out of memory\n"); return 0; } + memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1, + nr_ioapic_entries[apic]); } break; } @@ -926,7 +931,7 @@ static int __init parse_ivrs_table(struc for ( apic = 0; !error && iommu_intremap && apic < nr_ioapics; ++apic ) { if ( !nr_ioapic_entries[apic] || - ioapic_sbdf[IO_APIC_ID(apic)].pin_setup ) + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) continue; printk(XENLOG_ERR "IVHD Error: no information for IO-APIC %#x\n", @@ -935,13 +940,15 @@ static int __init parse_ivrs_table(struc error = -ENXIO; else { - ioapic_sbdf[IO_APIC_ID(apic)].pin_setup = xzalloc_array( - unsigned long, BITS_TO_LONGS(nr_ioapic_entries[apic])); - if ( !ioapic_sbdf[IO_APIC_ID(apic)].pin_setup ) + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx = xmalloc_array( + u16, nr_ioapic_entries[apic]); + if ( !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) { printk(XENLOG_ERR "IVHD Error: Out of memory\n"); error = -ENOMEM; } + memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1, + nr_ioapic_entries[apic]); } } --- a/xen/drivers/passthrough/amd/iommu_intr.c +++ b/xen/drivers/passthrough/amd/iommu_intr.c @@ -31,6 +31,7 @@ struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS]; struct hpet_sbdf hpet_sbdf; void *shared_intremap_table; +unsigned long *shared_intremap_inuse; static DEFINE_SPINLOCK(shared_intremap_lock); static spinlock_t* get_intremap_lock(int seg, int req_id) @@ -46,30 +47,31 @@ static int get_intremap_requestor_id(int return get_ivrs_mappings(seg)[bdf].dte_requestor_id; } -static int get_intremap_offset(u8 vector, u8 dm) +static unsigned int 
alloc_intremap_entry(int seg, int bdf) { - int offset = 0; - offset = (dm << INT_REMAP_INDEX_DM_SHIFT) & INT_REMAP_INDEX_DM_MASK; - offset |= (vector << INT_REMAP_INDEX_VECTOR_SHIFT ) & - INT_REMAP_INDEX_VECTOR_MASK; - return offset; + unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse; + unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES); + + if ( slot < INTREMAP_ENTRIES ) + __set_bit(slot, inuse); + return slot; } -static u8 *get_intremap_entry(int seg, int bdf, int offset) +static u32 *get_intremap_entry(int seg, int bdf, int offset) { - u8 *table; + u32 *table = get_ivrs_mappings(seg)[bdf].intremap_table; - table = (u8*)get_ivrs_mappings(seg)[bdf].intremap_table; ASSERT( (table != NULL) && (offset < INTREMAP_ENTRIES) ); - return (u8*) (table + offset); + return table + offset; } static void free_intremap_entry(int seg, int bdf, int offset) { - u32* entry; - entry = (u32*)get_intremap_entry(seg, bdf, offset); + u32 *entry = get_intremap_entry(seg, bdf, offset); + memset(entry, 0, sizeof(u32)); + __clear_bit(offset, get_ivrs_mappings(seg)[bdf].intremap_inuse); } static void update_intremap_entry(u32* entry, u8 vector, u8 int_type, @@ -98,18 +100,30 @@ static void update_intremap_entry(u32* e INT_REMAP_ENTRY_VECTOR_SHIFT, entry); } -static void update_intremap_entry_from_ioapic( +static inline int get_rte_index(const struct IO_APIC_route_entry *rte) +{ + return rte->vector | (rte->delivery_mode << 8); +} + +static inline void set_rte_index(struct IO_APIC_route_entry *rte, int offset) +{ + rte->vector = (u8)offset; + rte->delivery_mode = offset >> 8; +} + +static int update_intremap_entry_from_ioapic( int bdf, struct amd_iommu *iommu, - const struct IO_APIC_route_entry *rte, - const struct IO_APIC_route_entry *old_rte) + struct IO_APIC_route_entry *rte, + bool_t lo_update, + u16 *index) { unsigned long flags; u32* entry; u8 delivery_mode, dest, vector, dest_mode; int req_id; spinlock_t *lock; - int offset; + unsigned int offset; req_id 
= get_intremap_requestor_id(iommu->seg, bdf); lock = get_intremap_lock(iommu->seg, req_id); @@ -121,16 +135,35 @@ static void update_intremap_entry_from_i spin_lock_irqsave(lock, flags); - offset = get_intremap_offset(vector, delivery_mode); - if ( old_rte ) + offset = *index; + if ( offset >= INTREMAP_ENTRIES ) { - int old_offset = get_intremap_offset(old_rte->vector, - old_rte->delivery_mode); + offset = alloc_intremap_entry(iommu->seg, req_id); + if ( offset >= INTREMAP_ENTRIES ) + { + spin_unlock_irqrestore(lock, flags); + rte->mask = 1; + return -ENOSPC; + } + *index = offset; + lo_update = 1; + } - if ( offset != old_offset ) - free_intremap_entry(iommu->seg, bdf, old_offset); + entry = get_intremap_entry(iommu->seg, req_id, offset); + if ( !lo_update ) + { + /* + * Low half of incoming RTE is already in remapped format, + * so need to recover vector and delivery mode from IRTE. + */ + ASSERT(get_rte_index(rte) == offset); + vector = get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_VECTOR_MASK, + INT_REMAP_ENTRY_VECTOR_SHIFT); + delivery_mode = get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_INTTYPE_MASK, + INT_REMAP_ENTRY_INTTYPE_SHIFT); } - entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); @@ -141,6 +174,10 @@ static void update_intremap_entry_from_i amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } + + set_rte_index(rte, offset); + + return 0; } int __init amd_iommu_setup_ioapic_remapping(void) @@ -153,7 +190,7 @@ int __init amd_iommu_setup_ioapic_remapp u16 seg, bdf, req_id; struct amd_iommu *iommu; spinlock_t *lock; - int offset; + unsigned int offset; /* Read ioapic entries and update interrupt remapping table accordingly */ for ( apic = 0; apic < nr_ioapics; apic++ ) @@ -184,19 +221,23 @@ int __init amd_iommu_setup_ioapic_remapp dest = rte.dest.logical.logical_dest; spin_lock_irqsave(lock, flags); 
- offset = get_intremap_offset(vector, delivery_mode); - entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset); + offset = alloc_intremap_entry(seg, req_id); + BUG_ON(offset >= INTREMAP_ENTRIES); + entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); + set_rte_index(&rte, offset); + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin] = offset; + __ioapic_write_entry(apic, pin, 1, rte); + if ( iommu->enabled ) { spin_lock_irqsave(&iommu->lock, flags); amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } - set_bit(pin, ioapic_sbdf[IO_APIC_ID(apic)].pin_setup); } } return 0; @@ -209,7 +250,7 @@ void amd_iommu_ioapic_update_ire( struct IO_APIC_route_entry new_rte = { 0 }; unsigned int rte_lo = (reg & 1) ? reg - 1 : reg; unsigned int pin = (reg - 0x10) / 2; - int saved_mask, seg, bdf; + int saved_mask, seg, bdf, rc; struct amd_iommu *iommu; if ( !iommu_intremap ) @@ -247,7 +288,7 @@ void amd_iommu_ioapic_update_ire( } if ( new_rte.mask && - !test_bit(pin, ioapic_sbdf[IO_APIC_ID(apic)].pin_setup) ) + ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin] >= INTREMAP_ENTRIES ) { ASSERT(saved_mask); __io_apic_write(apic, reg, value); @@ -262,14 +303,19 @@ void amd_iommu_ioapic_update_ire( } /* Update interrupt remapping entry */ - update_intremap_entry_from_ioapic( - bdf, iommu, &new_rte, - test_and_set_bit(pin, - ioapic_sbdf[IO_APIC_ID(apic)].pin_setup) ? &old_rte - : NULL); + rc = update_intremap_entry_from_ioapic( + bdf, iommu, &new_rte, reg == rte_lo, + &ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin]); - /* Forward write access to IO-APIC RTE */ - __io_apic_write(apic, reg, value); + __io_apic_write(apic, reg, ((u32 *)&new_rte)[reg != rte_lo]); + + if ( rc ) + { + /* Keep the entry masked. 
*/ + printk(XENLOG_ERR "Remapping IO-APIC %#x pin %u failed (%d)\n", + IO_APIC_ID(apic), pin, rc); + return; + } /* For lower bits access, return directly to avoid double writes */ if ( reg == rte_lo ) @@ -283,16 +329,41 @@ void amd_iommu_ioapic_update_ire( } } -static void update_intremap_entry_from_msi_msg( +unsigned int amd_iommu_read_ioapic_from_ire( + unsigned int apic, unsigned int reg) +{ + unsigned int val = __io_apic_read(apic, reg); + + if ( !(reg & 1) ) + { + unsigned int offset = val & (INTREMAP_ENTRIES - 1); + u16 bdf = ioapic_sbdf[IO_APIC_ID(apic)].bdf; + u16 seg = ioapic_sbdf[IO_APIC_ID(apic)].seg; + u16 req_id = get_intremap_requestor_id(seg, bdf); + const u32 *entry = get_intremap_entry(seg, req_id, offset); + + val &= ~(INTREMAP_ENTRIES - 1); + val |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_INTTYPE_MASK, + INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; + val |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_VECTOR_MASK, + INT_REMAP_ENTRY_VECTOR_SHIFT); + } + + return val; +} + +static int update_intremap_entry_from_msi_msg( struct amd_iommu *iommu, u16 bdf, - int *remap_index, const struct msi_msg *msg) + int *remap_index, const struct msi_msg *msg, u32 *data) { unsigned long flags; u32* entry; u16 req_id, alias_id; u8 delivery_mode, dest, vector, dest_mode; spinlock_t *lock; - int offset; + unsigned int offset; req_id = get_dma_requestor_id(iommu->seg, bdf); alias_id = get_intremap_requestor_id(iommu->seg, bdf); @@ -303,15 +374,6 @@ static void update_intremap_entry_from_m spin_lock_irqsave(lock, flags); free_intremap_entry(iommu->seg, req_id, *remap_index); spin_unlock_irqrestore(lock, flags); - - if ( ( req_id != alias_id ) && - get_ivrs_mappings(iommu->seg)[alias_id].intremap_table != NULL ) - { - lock = get_intremap_lock(iommu->seg, alias_id); - spin_lock_irqsave(lock, flags); - free_intremap_entry(iommu->seg, alias_id, *remap_index); - spin_unlock_irqrestore(lock, flags); - } goto done; } @@ -322,16 +384,24 @@ static void 
update_intremap_entry_from_m delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK; dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff; - offset = get_intremap_offset(vector, delivery_mode); - if ( *remap_index < 0) + offset = *remap_index; + if ( offset >= INTREMAP_ENTRIES ) + { + offset = alloc_intremap_entry(iommu->seg, bdf); + if ( offset >= INTREMAP_ENTRIES ) + { + spin_unlock_irqrestore(lock, flags); + return -ENOSPC; + } *remap_index = offset; - else - BUG_ON(*remap_index != offset); + } - entry = (u32*)get_intremap_entry(iommu->seg, req_id, offset); + entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); + *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset; + /* * In some special cases, a pci-e device(e.g SATA controller in IDE mode) * will use alias id to index interrupt remapping table. 
@@ -343,10 +413,8 @@ static void update_intremap_entry_from_m if ( ( req_id != alias_id ) && get_ivrs_mappings(iommu->seg)[alias_id].intremap_table != NULL ) { - spin_lock_irqsave(lock, flags); - entry = (u32*)get_intremap_entry(iommu->seg, alias_id, offset); - update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); - spin_unlock_irqrestore(lock, flags); + BUG_ON(get_ivrs_mappings(iommu->seg)[req_id].intremap_table != + get_ivrs_mappings(iommu->seg)[alias_id].intremap_table); } done: @@ -358,19 +426,22 @@ done: amd_iommu_flush_intremap(iommu, alias_id); spin_unlock_irqrestore(&iommu->lock, flags); } + + return 0; } static struct amd_iommu *_find_iommu_for_device(int seg, int bdf) { - struct amd_iommu *iommu = find_iommu_for_device(seg, bdf); - - if ( iommu ) - return iommu; + struct amd_iommu *iommu; list_for_each_entry ( iommu, &amd_iommu_head, list ) if ( iommu->seg == seg && iommu->bdf == bdf ) return NULL; + iommu = find_iommu_for_device(seg, bdf); + if ( iommu ) + return iommu; + AMD_IOMMU_DEBUG("No IOMMU for MSI dev = %04x:%02x:%02x.%u\n", seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); return ERR_PTR(-EINVAL); @@ -380,8 +451,9 @@ int amd_iommu_msi_msg_update_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { struct pci_dev *pdev = msi_desc->dev; - int bdf, seg; + int bdf, seg, rc; struct amd_iommu *iommu; + u32 data; bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; seg = pdev ? 
pdev->seg : hpet_sbdf.seg; @@ -390,11 +462,12 @@ int amd_iommu_msi_msg_update_ire( if ( IS_ERR_OR_NULL(iommu) ) return PTR_ERR(iommu); - if ( msi_desc->remap_index >= 0 ) + if ( msi_desc->remap_index >= 0 && !msg ) { do { update_intremap_entry_from_msi_msg(iommu, bdf, - &msi_desc->remap_index, NULL); + &msi_desc->remap_index, + NULL, NULL); if ( !pdev || !pdev->phantom_stride ) break; bdf += pdev->phantom_stride; @@ -409,19 +482,39 @@ int amd_iommu_msi_msg_update_ire( return 0; do { - update_intremap_entry_from_msi_msg(iommu, bdf, &msi_desc->remap_index, - msg); - if ( !pdev || !pdev->phantom_stride ) + rc = update_intremap_entry_from_msi_msg(iommu, bdf, + &msi_desc->remap_index, + msg, &data); + if ( rc || !pdev || !pdev->phantom_stride ) break; bdf += pdev->phantom_stride; } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); - return 0; + msg->data = data; + return rc; } void amd_iommu_read_msi_from_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { + unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1); + const struct pci_dev *pdev = msi_desc->dev; + u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; + u16 seg = pdev ? 
pdev->seg : hpet_sbdf.seg; + const u32 *entry; + + if ( IS_ERR_OR_NULL(_find_iommu_for_device(seg, bdf)) ) + return; + + entry = get_intremap_entry(seg, get_dma_requestor_id(seg, bdf), offset); + + msg->data &= ~(INTREMAP_ENTRIES - 1); + msg->data |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_INTTYPE_MASK, + INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; + msg->data |= get_field_from_reg_u32(*entry, + INT_REMAP_ENTRY_VECTOR_MASK, + INT_REMAP_ENTRY_VECTOR_SHIFT); } int __init amd_iommu_free_intremap_table( @@ -438,12 +531,14 @@ int __init amd_iommu_free_intremap_table return 0; } -void* __init amd_iommu_alloc_intremap_table(void) +void* __init amd_iommu_alloc_intremap_table(unsigned long **inuse_map) { void *tb; tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER); BUG_ON(tb == NULL); memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER)); + *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES)); + BUG_ON(*inuse_map == NULL); return tb; } --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -622,7 +622,7 @@ const struct iommu_ops amd_iommu_ops = { .get_device_group_id = amd_iommu_group_id, .update_ire_from_apic = amd_iommu_ioapic_update_ire, .update_ire_from_msi = amd_iommu_msi_msg_update_ire, - .read_apic_from_ire = __io_apic_read, + .read_apic_from_ire = amd_iommu_read_ioapic_from_ire, .read_msi_from_ire = amd_iommu_read_msi_from_ire, .setup_hpet_msi = amd_setup_hpet_msi, .suspend = amd_iommu_suspend, --- a/xen/include/asm-x86/amd-iommu.h +++ b/xen/include/asm-x86/amd-iommu.h @@ -119,6 +119,7 @@ struct ivrs_mappings { /* per device interrupt remapping table */ void *intremap_table; + unsigned long *intremap_inuse; spinlock_t intremap_lock; /* ivhd device data settings */ --- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h @@ -458,10 +458,6 @@ #define MAX_AMD_IOMMUS 32 /* interrupt remapping table */ -#define INT_REMAP_INDEX_DM_MASK 0x1C00 -#define 
INT_REMAP_INDEX_DM_SHIFT 10 -#define INT_REMAP_INDEX_VECTOR_MASK 0x3FC -#define INT_REMAP_INDEX_VECTOR_SHIFT 2 #define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001 #define INT_REMAP_ENTRY_REMAPEN_SHIFT 0 #define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002 --- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h @@ -89,10 +89,12 @@ struct amd_iommu *find_iommu_for_device( /* interrupt remapping */ int amd_iommu_setup_ioapic_remapping(void); -void *amd_iommu_alloc_intremap_table(void); +void *amd_iommu_alloc_intremap_table(unsigned long **); int amd_iommu_free_intremap_table(u16 seg, struct ivrs_mappings *); void amd_iommu_ioapic_update_ire( unsigned int apic, unsigned int reg, unsigned int value); +unsigned int amd_iommu_read_ioapic_from_ire( + unsigned int apic, unsigned int reg); int amd_iommu_msi_msg_update_ire( struct msi_desc *msi_desc, struct msi_msg *msg); void amd_iommu_read_msi_from_ire( @@ -101,15 +103,17 @@ int amd_setup_hpet_msi(struct msi_desc * extern struct ioapic_sbdf { u16 bdf, seg; - unsigned long *pin_setup; + u16 *pin_2_idx; } ioapic_sbdf[MAX_IO_APICS]; -extern void *shared_intremap_table; extern struct hpet_sbdf { u16 bdf, seg, id; struct amd_iommu *iommu; } hpet_sbdf; +extern void *shared_intremap_table; +extern unsigned long *shared_intremap_inuse; + /* power management support */ void amd_iommu_resume(void); void amd_iommu_suspend(void);