[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH, v2] IOMMU: don't immediately disable bus mastering on faults



At 16:05 +0000 on 07 Nov (1352304352), Jan Beulich wrote:
> Instead, give the owning domain at least a small opportunity of fixing
> things up, and allow for rare faults to not bring down the device at
> all. The number of faults tolerated within a given time period (all
> numbers are made up with no specific rationale) is higher for Dom0 than
> for DomU-s.
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

Looks good to me, though I don't think dom0 should have any special
treatment here.  I'd be inclined to set the threshold to 10 for
everyone. 

Tim.

> --- a/xen/drivers/passthrough/amd/iommu_init.c
> +++ b/xen/drivers/passthrough/amd/iommu_init.c
> @@ -566,7 +566,7 @@ static hw_irq_controller iommu_maskable_
>  
>  static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
>  {
> -    u16 domain_id, device_id, bdf, cword, flags;
> +    u16 domain_id, device_id, bdf, flags;
>      u32 code;
>      u64 *addr;
>      int count = 0;
> @@ -620,18 +620,10 @@ static void parse_event_log_entry(struct
>                 "fault address = %#"PRIx64", flags = %#x\n",
>                 event_str[code-1], domain_id, device_id, *addr, flags);
>  
> -        /* Tell the device to stop DMAing; we can't rely on the guest to
> -         * control it for us. */
>          for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
>              if ( get_dma_requestor_id(iommu->seg, bdf) == device_id )
> -            {
> -                cword = pci_conf_read16(iommu->seg, PCI_BUS(bdf),
> -                                        PCI_SLOT(bdf), PCI_FUNC(bdf),
> -                                        PCI_COMMAND);
> -                pci_conf_write16(iommu->seg, PCI_BUS(bdf), PCI_SLOT(bdf),
> -                                 PCI_FUNC(bdf), PCI_COMMAND, 
> -                                 cword & ~PCI_COMMAND_MASTER);
> -            }
> +                pci_check_disable_device(iommu->seg, PCI_BUS(bdf),
> +                                         PCI_DEVFN2(bdf));
>      }
>      else
>      {
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -215,6 +215,7 @@ static int device_assigned(u16 seg, u8 b
>  static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
> +    struct pci_dev *pdev;
>      int rc = 0;
>  
>      if ( !iommu_enabled || !hd->platform_ops )
> @@ -228,6 +229,13 @@ static int assign_device(struct domain *
>          return -EXDEV;
>  
>      spin_lock(&pcidevs_lock);
> +    pdev = pci_get_pdev(seg, bus, devfn);
> +    if ( pdev )
> +    {
> +        pdev->fault.threshold = PT_FAULT_THRESHOLD_UNPRIV;
> +        pdev->fault.count     = 0;
> +    }
> +
>      if ( (rc = hd->platform_ops->assign_device(d, seg, bus, devfn)) )
>          goto done;
>  
> @@ -239,6 +247,8 @@ static int assign_device(struct domain *
>          goto done;
>      }
>  done:
> +    if ( pdev && pdev->domain != d )
> +        pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV;
>      spin_unlock(&pcidevs_lock);
>      return rc;
>  }
> @@ -379,6 +389,9 @@ int deassign_device(struct domain *d, u1
>          return ret;
>      }
>  
> +    pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV;
> +    pdev->fault.count     = 0;
> +
>      if ( !has_arch_pdevs(d) && need_iommu(d) )
>      {
>          d->need_iommu = 0;
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -137,6 +137,7 @@ static struct pci_dev *alloc_pdev(struct
>      *((u8*) &pdev->bus) = bus;
>      *((u8*) &pdev->devfn) = devfn;
>      pdev->domain = NULL;
> +    pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV;
>      INIT_LIST_HEAD(&pdev->msi_list);
>      list_add(&pdev->alldevs_list, &pseg->alldevs_list);
>      spin_lock_init(&pdev->msix_table_lock);
> @@ -672,6 +673,36 @@ int __init pci_device_detect(u16 seg, u8
>      return 1;
>  }
>  
> +void pci_check_disable_device(u16 seg, u8 bus, u8 devfn)
> +{
> +    struct pci_dev *pdev;
> +    s_time_t now = NOW();
> +    u16 cword;
> +
> +    spin_lock(&pcidevs_lock);
> +    pdev = pci_get_pdev(seg, bus, devfn);
> +    if ( pdev )
> +    {
> +        if ( now < pdev->fault.time ||
> +             now - pdev->fault.time > MILLISECS(10) )
> +            pdev->fault.count >>= 1;
> +        pdev->fault.time = now;
> +        if ( ++pdev->fault.count < pdev->fault.threshold )
> +            pdev = NULL;
> +    }
> +    spin_unlock(&pcidevs_lock);
> +
> +    if ( !pdev )
> +        return;
> +
> +    /* Tell the device to stop DMAing; we can't rely on the guest to
> +     * control it for us. */
> +    cword = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> +                            PCI_COMMAND);
> +    pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> +                     PCI_COMMAND, cword & ~PCI_COMMAND_MASTER);
> +}
> +
>  /*
>   * scan pci devices to add all existed PCI devices to alldevs_list,
>   * and setup pci hierarchy in array bus2bridge.
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -917,7 +917,7 @@ static void __do_iommu_page_fault(struct
>      while (1)
>      {
>          u8 fault_reason;
> -        u16 source_id, cword;
> +        u16 source_id;
>          u32 data;
>          u64 guest_addr;
>          int type;
> @@ -950,14 +950,8 @@ static void __do_iommu_page_fault(struct
>          iommu_page_fault_do_one(iommu, type, fault_reason,
>                                  source_id, guest_addr);
>  
> -        /* Tell the device to stop DMAing; we can't rely on the guest to
> -         * control it for us. */
> -        cword = pci_conf_read16(iommu->intel->drhd->segment,
> -                                PCI_BUS(source_id), PCI_SLOT(source_id),
> -                                PCI_FUNC(source_id), PCI_COMMAND);
> -        pci_conf_write16(iommu->intel->drhd->segment, PCI_BUS(source_id),
> -                         PCI_SLOT(source_id), PCI_FUNC(source_id),
> -                         PCI_COMMAND, cword & ~PCI_COMMAND_MASTER);
> +        pci_check_disable_device(iommu->intel->drhd->segment,
> +                                 PCI_BUS(source_id), PCI_DEVFN2(source_id));
>  
>          fault_index++;
>          if ( fault_index > cap_num_fault_regs(iommu->cap) )
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -64,6 +64,13 @@ struct pci_dev {
>      const u8 devfn;
>      struct pci_dev_info info;
>      struct arch_pci_dev arch;
> +    struct {
> +        s_time_t time;
> +        unsigned int threshold;
> +#define PT_FAULT_THRESHOLD_PRIV  10
> +#define PT_FAULT_THRESHOLD_UNPRIV 3
> +        unsigned int count;
> +    } fault;
>      u64 vf_rlen[6];
>  };
>  
> @@ -107,6 +114,7 @@ int pci_hide_device(int bus, int devfn);
>  struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
>  struct pci_dev *pci_get_pdev_by_domain(
>      struct domain *, int seg, int bus, int devfn);
> +void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
>  
>  uint8_t pci_conf_read8(
>      unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func,
> 
> 

> IOMMU: don't immediately disable bus mastering on faults
> 
> Instead, give the owning domain at least a small opportunity of fixing
> things up, and allow for rare faults to not bring down the device at
> all. The number of faults tolerated within a given time period (all
> numbers are made up with no specific rationale) is higher for Dom0 than
> for DomU-s.
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
> --- a/xen/drivers/passthrough/amd/iommu_init.c
> +++ b/xen/drivers/passthrough/amd/iommu_init.c
> @@ -566,7 +566,7 @@ static hw_irq_controller iommu_maskable_
>  
>  static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
>  {
> -    u16 domain_id, device_id, bdf, cword, flags;
> +    u16 domain_id, device_id, bdf, flags;
>      u32 code;
>      u64 *addr;
>      int count = 0;
> @@ -620,18 +620,10 @@ static void parse_event_log_entry(struct
>                 "fault address = %#"PRIx64", flags = %#x\n",
>                 event_str[code-1], domain_id, device_id, *addr, flags);
>  
> -        /* Tell the device to stop DMAing; we can't rely on the guest to
> -         * control it for us. */
>          for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
>              if ( get_dma_requestor_id(iommu->seg, bdf) == device_id )
> -            {
> -                cword = pci_conf_read16(iommu->seg, PCI_BUS(bdf),
> -                                        PCI_SLOT(bdf), PCI_FUNC(bdf),
> -                                        PCI_COMMAND);
> -                pci_conf_write16(iommu->seg, PCI_BUS(bdf), PCI_SLOT(bdf),
> -                                 PCI_FUNC(bdf), PCI_COMMAND, 
> -                                 cword & ~PCI_COMMAND_MASTER);
> -            }
> +                pci_check_disable_device(iommu->seg, PCI_BUS(bdf),
> +                                         PCI_DEVFN2(bdf));
>      }
>      else
>      {
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -215,6 +215,7 @@ static int device_assigned(u16 seg, u8 b
>  static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
> +    struct pci_dev *pdev;
>      int rc = 0;
>  
>      if ( !iommu_enabled || !hd->platform_ops )
> @@ -228,6 +229,13 @@ static int assign_device(struct domain *
>          return -EXDEV;
>  
>      spin_lock(&pcidevs_lock);
> +    pdev = pci_get_pdev(seg, bus, devfn);
> +    if ( pdev )
> +    {
> +        pdev->fault.threshold = PT_FAULT_THRESHOLD_UNPRIV;
> +        pdev->fault.count     = 0;
> +    }
> +
>      if ( (rc = hd->platform_ops->assign_device(d, seg, bus, devfn)) )
>          goto done;
>  
> @@ -239,6 +247,8 @@ static int assign_device(struct domain *
>          goto done;
>      }
>  done:
> +    if ( pdev && pdev->domain != d )
> +        pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV;
>      spin_unlock(&pcidevs_lock);
>      return rc;
>  }
> @@ -379,6 +389,9 @@ int deassign_device(struct domain *d, u1
>          return ret;
>      }
>  
> +    pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV;
> +    pdev->fault.count     = 0;
> +
>      if ( !has_arch_pdevs(d) && need_iommu(d) )
>      {
>          d->need_iommu = 0;
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -137,6 +137,7 @@ static struct pci_dev *alloc_pdev(struct
>      *((u8*) &pdev->bus) = bus;
>      *((u8*) &pdev->devfn) = devfn;
>      pdev->domain = NULL;
> +    pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV;
>      INIT_LIST_HEAD(&pdev->msi_list);
>      list_add(&pdev->alldevs_list, &pseg->alldevs_list);
>      spin_lock_init(&pdev->msix_table_lock);
> @@ -672,6 +673,36 @@ int __init pci_device_detect(u16 seg, u8
>      return 1;
>  }
>  
> +void pci_check_disable_device(u16 seg, u8 bus, u8 devfn)
> +{
> +    struct pci_dev *pdev;
> +    s_time_t now = NOW();
> +    u16 cword;
> +
> +    spin_lock(&pcidevs_lock);
> +    pdev = pci_get_pdev(seg, bus, devfn);
> +    if ( pdev )
> +    {
> +        if ( now < pdev->fault.time ||
> +             now - pdev->fault.time > MILLISECS(10) )
> +            pdev->fault.count >>= 1;
> +        pdev->fault.time = now;
> +        if ( ++pdev->fault.count < pdev->fault.threshold )
> +            pdev = NULL;
> +    }
> +    spin_unlock(&pcidevs_lock);
> +
> +    if ( !pdev )
> +        return;
> +
> +    /* Tell the device to stop DMAing; we can't rely on the guest to
> +     * control it for us. */
> +    cword = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> +                            PCI_COMMAND);
> +    pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> +                     PCI_COMMAND, cword & ~PCI_COMMAND_MASTER);
> +}
> +
>  /*
>   * scan pci devices to add all existed PCI devices to alldevs_list,
>   * and setup pci hierarchy in array bus2bridge.
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -917,7 +917,7 @@ static void __do_iommu_page_fault(struct
>      while (1)
>      {
>          u8 fault_reason;
> -        u16 source_id, cword;
> +        u16 source_id;
>          u32 data;
>          u64 guest_addr;
>          int type;
> @@ -950,14 +950,8 @@ static void __do_iommu_page_fault(struct
>          iommu_page_fault_do_one(iommu, type, fault_reason,
>                                  source_id, guest_addr);
>  
> -        /* Tell the device to stop DMAing; we can't rely on the guest to
> -         * control it for us. */
> -        cword = pci_conf_read16(iommu->intel->drhd->segment,
> -                                PCI_BUS(source_id), PCI_SLOT(source_id),
> -                                PCI_FUNC(source_id), PCI_COMMAND);
> -        pci_conf_write16(iommu->intel->drhd->segment, PCI_BUS(source_id),
> -                         PCI_SLOT(source_id), PCI_FUNC(source_id),
> -                         PCI_COMMAND, cword & ~PCI_COMMAND_MASTER);
> +        pci_check_disable_device(iommu->intel->drhd->segment,
> +                                 PCI_BUS(source_id), PCI_DEVFN2(source_id));
>  
>          fault_index++;
>          if ( fault_index > cap_num_fault_regs(iommu->cap) )
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -64,6 +64,13 @@ struct pci_dev {
>      const u8 devfn;
>      struct pci_dev_info info;
>      struct arch_pci_dev arch;
> +    struct {
> +        s_time_t time;
> +        unsigned int threshold;
> +#define PT_FAULT_THRESHOLD_PRIV  10
> +#define PT_FAULT_THRESHOLD_UNPRIV 3
> +        unsigned int count;
> +    } fault;
>      u64 vf_rlen[6];
>  };
>  
> @@ -107,6 +114,7 @@ int pci_hide_device(int bus, int devfn);
>  struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
>  struct pci_dev *pci_get_pdev_by_domain(
>      struct domain *, int seg, int bus, int devfn);
> +void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
>  
>  uint8_t pci_conf_read8(
>      unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func,


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.