[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v5 4/7] xen/x86: use invpcid for flushing the TLB



On 06/04/18 15:40, Andrew Cooper wrote:
> On 06/04/18 08:52, Juergen Gross wrote:
>> If possible use the INVPCID instruction for flushing the TLB instead of
>> toggling cr4.pge for that purpose.
>>
>> While at it remove the dependency on cr4.pge being required for mtrr
>> loading, as this will be required later anyway.
>>
>> Add a command line option "invpcid" for controlling the use of
>> INVPCID (default to true).
>>
>> Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
>> ---
>> V5:
>> - use pre_flush() as an initializer in do_tlb_flush() (Jan Beulich)
>> - introduce boolean use_invpcid instead of clearing X86_FEATURE_INVPCID
>>   (Jan Beulich)
>>
>> V4:
>> - option "invpcid" instead of "noinvpcid" (Jan Beulich)
>>
>> V3:
>> - new patch
>> ---
>>  docs/misc/xen-command-line.markdown | 10 ++++++++++
>>  xen/arch/x86/cpu/mtrr/generic.c     | 37 
>> ++++++++++++++++++++++++++-----------
>>  xen/arch/x86/flushtlb.c             | 29 +++++++++++++++++++----------
>>  xen/arch/x86/setup.c                |  8 ++++++++
>>  xen/include/asm-x86/invpcid.h       |  2 ++
>>  5 files changed, 65 insertions(+), 21 deletions(-)
>>
>> diff --git a/docs/misc/xen-command-line.markdown 
>> b/docs/misc/xen-command-line.markdown
>> index 79be9a6ba5..5f6ae654ad 100644
>> --- a/docs/misc/xen-command-line.markdown
>> +++ b/docs/misc/xen-command-line.markdown
>> @@ -1380,6 +1380,16 @@ Because responsibility for APIC setup is shared 
>> between Xen and the
>>  domain 0 kernel this option is automatically propagated to the domain
>>  0 command line.
>>  
>> +### invpcid (x86)
>> +> `= <boolean>`
>> +
>> +> Default: `true`
>> +
>> +Control using the INVPCID instruction for flushing TLB entries.
>> +This should only be used in case of known issues on the current platform
>> +with that instruction. Disabling INVPCID will normally result in a slightly
>> +degraded performance.
> 
> How about:
> 
> By default, Xen will use the INVPCID instruction for TLB management if
> it is available.  This option can be used to cause Xen to fall back to
> older mechanisms, which are generally slower.
> 
> ?

Works for me.

> We are not aware of any systems with INVPCID problems, but "for testing
> purposes" is also a valid reason to use this option.

Okay, I'll change it.

> 
> 
>> +
>>  ### noirqbalance
>>  > `= <boolean>`
>>  
>> diff --git a/xen/arch/x86/cpu/mtrr/generic.c 
>> b/xen/arch/x86/cpu/mtrr/generic.c
>> index e9c0e5e059..705855e753 100644
>> --- a/xen/arch/x86/cpu/mtrr/generic.c
>> +++ b/xen/arch/x86/cpu/mtrr/generic.c
>> @@ -5,6 +5,7 @@
>>  #include <xen/mm.h>
>>  #include <xen/stdbool.h>
>>  #include <asm/flushtlb.h>
>> +#include <asm/invpcid.h>
>>  #include <asm/io.h>
>>  #include <asm/mtrr.h>
>>  #include <asm/msr.h>
>> @@ -400,8 +401,10 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
>>   * has been called.
>>   */
>>  
>> -static void prepare_set(void)
>> +static bool prepare_set(void)
>>  {
>> +    unsigned long cr4;
>> +
>>      /*  Note that this is not ideal, since the cache is only 
>> flushed/disabled
>>         for this CPU while the MTRRs are changed, but changing this requires
>>         more invasive changes to the way the kernel boots  */
>> @@ -412,18 +415,24 @@ static void prepare_set(void)
>>      write_cr0(read_cr0() | X86_CR0_CD);
>>      wbinvd();
>>  
>> -    /*  TLB flushing here relies on Xen always using CR4.PGE. */
>> -    BUILD_BUG_ON(!(XEN_MINIMAL_CR4 & X86_CR4_PGE));
>> -    write_cr4(read_cr4() & ~X86_CR4_PGE);
>> +    cr4 = read_cr4();
>> +    if (cr4 & X86_CR4_PGE)
>> +            write_cr4(cr4 & ~X86_CR4_PGE);
>> +    else if (use_invpcid)
>> +            invpcid_flush_all();
>> +    else
>> +            asm volatile( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" );
> 
> We could really do with gaining a static inline wrapper for write_cr3(),
> because at the very least, the memory clobber going missing would be a
> BadThing(tm).  Sadly that name is already taken in Xen, and behaves
> differently to (all?) other OS's example.
> 
> There are currently only 3 callers of write_cr3().  How about a prereq
> patch renaming write_cr3() to switch_cr3() (as switch_cr3() logically
> implies the TLB clock activities), and a new thin write_cr3() wrapper?

Sure.

> 
>>  
>>      /*  Save MTRR state */
>>      rdmsrl(MSR_MTRRdefType, deftype);
>>  
>>      /*  Disable MTRRs, and set the default type to uncached  */
>>      mtrr_wrmsr(MSR_MTRRdefType, deftype & ~0xcff);
>> +
>> +    return cr4 & X86_CR4_PGE;
>>  }
>>  
>> -static void post_set(void)
>> +static void post_set(bool pge)
>>  {
>>      /* Intel (P6) standard MTRRs */
>>      mtrr_wrmsr(MSR_MTRRdefType, deftype);
>> @@ -432,7 +441,12 @@ static void post_set(void)
>>      write_cr0(read_cr0() & ~X86_CR0_CD);
>>  
>>      /*  Reenable CR4.PGE (also flushes the TLB) */
>> -    write_cr4(read_cr4() | X86_CR4_PGE);
>> +    if (pge)
>> +            write_cr4(read_cr4() | X86_CR4_PGE);
>> +    else if (use_invpcid)
>> +            invpcid_flush_all();
>> +    else
>> +            asm volatile( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" );
>>  
>>      spin_unlock(&set_atomicity_lock);
>>  }
>> @@ -441,14 +455,15 @@ static void generic_set_all(void)
>>  {
>>      unsigned long mask, count;
>>      unsigned long flags;
>> +    bool pge;
>>  
>>      local_irq_save(flags);
>> -    prepare_set();
>> +    pge = prepare_set();
>>  
>>      /* Actually set the state */
>>      mask = set_mtrr_state();
>>  
>> -    post_set();
>> +    post_set(pge);
>>      local_irq_restore(flags);
>>  
>>      /*  Use the atomic bitops to update the global mask  */
>> @@ -457,7 +472,6 @@ static void generic_set_all(void)
>>                      set_bit(count, &smp_changes_mask);
>>              mask >>= 1;
>>      }
>> -    
>>  }
>>  
>>  static void generic_set_mtrr(unsigned int reg, unsigned long base,
>> @@ -474,11 +488,12 @@ static void generic_set_mtrr(unsigned int reg, 
>> unsigned long base,
>>  {
>>      unsigned long flags;
>>      struct mtrr_var_range *vr;
>> +    bool pge;
>>  
>>      vr = &mtrr_state.var_ranges[reg];
>>  
>>      local_irq_save(flags);
>> -    prepare_set();
>> +    pge = prepare_set();
>>  
>>      if (size == 0) {
>>              /* The invalid bit is kept in the mask, so we simply clear the
>> @@ -499,7 +514,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned 
>> long base,
>>              mtrr_wrmsr(MSR_IA32_MTRR_PHYSMASK(reg), vr->mask);
>>      }
>>  
>> -    post_set();
>> +    post_set(pge);
>>      local_irq_restore(flags);
>>  }
>>  
>> diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
>> index 38cedf3b22..f793b70696 100644
>> --- a/xen/arch/x86/flushtlb.c
>> +++ b/xen/arch/x86/flushtlb.c
>> @@ -10,6 +10,7 @@
>>  #include <xen/sched.h>
>>  #include <xen/softirq.h>
>>  #include <asm/flushtlb.h>
>> +#include <asm/invpcid.h>
>>  #include <asm/page.h>
>>  
>>  /* Debug builds: Wrap frequently to stress-test the wrap logic. */
>> @@ -71,6 +72,23 @@ static void post_flush(u32 t)
>>      this_cpu(tlbflush_time) = t;
>>  }
>>  
>> +static void do_tlb_flush(void)
>> +{
>> +    u32 t = pre_flush();
>> +
>> +    if ( use_invpcid )
>> +        invpcid_flush_all();
>> +    else
>> +    {
>> +        unsigned long cr4 = read_cr4();
>> +
>> +        write_cr4(cr4 ^ X86_CR4_PGE);
>> +        write_cr4(cr4);
>> +    }
>> +
>> +    post_flush(t);
>> +}
>> +
>>  void write_cr3(unsigned long cr3)
>>  {
>>      unsigned long flags, cr4;
>> @@ -118,16 +136,7 @@ unsigned int flush_area_local(const void *va, unsigned 
>> int flags)
>>                             : : "m" (*(const char *)(va)) : "memory" );
>>          }
>>          else
>> -        {
>> -            u32 t = pre_flush();
>> -            unsigned long cr4 = read_cr4();
>> -
>> -            write_cr4(cr4 & ~X86_CR4_PGE);
>> -            barrier();
>> -            write_cr4(cr4);
>> -
>> -            post_flush(t);
>> -        }
>> +            do_tlb_flush();
>>      }
>>  
>>      if ( flags & FLUSH_CACHE )
>> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
>> index e83d3b4f07..0a27402e4d 100644
>> --- a/xen/arch/x86/setup.c
>> +++ b/xen/arch/x86/setup.c
>> @@ -63,6 +63,11 @@ boolean_param("nosmp", opt_nosmp);
>>  static unsigned int __initdata max_cpus;
>>  integer_param("maxcpus", max_cpus);
>>  
>> +/* opt_invpcid: If false, don't use INVPCID instruction even if available. 
>> */
>> +static bool __initdata opt_invpcid = true;
>> +boolean_param("invpcid", opt_invpcid);
>> +bool use_invpcid;
> 
> __read_mostly.

Yes.


Juergen

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.