[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [patch 35/37] x86/smpboot: Support parallel startup of secondary CPUs



On Fri, Apr 14, 2023 at 7:45 PM Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
>
> From: David Woodhouse <dwmw@xxxxxxxxxxxx>
>
> Rework the real-mode startup code to allow for APs to be brought up in
> parallel. This is in two parts:
>
>  1. Introduce a bit-spinlock to prevent them from all using the real
>     mode stack at the same time.
>
>  2. Avoid needing to use the global smpboot_control variable to pass
>     each AP its CPU number.
>
> To achieve the latter, export the cpuid_to_apicid[] array so that each
> AP can find its own CPU number by searching therein based on its APIC ID.
>
> Introduce flags in the top bits of smpboot_control which indicate methods
> by which an AP should find its CPU number. For a serialized bringup, the
> CPU number is explicitly passed in the low bits of smpboot_control as
> before. For parallel mode there are flags directing the AP to find its APIC
> ID in CPUID leaf 0x0b or 1x1f (for X2APIC mode) or CPUID leaf 0x01 where 8
> bits are sufficient, then perform the cpuid_to_apicid[] lookup with that.
>
> Aside from the fact that APs will now look up their CPU number via the
> newly-exported cpuid_to_apicid[] table, there is no behavioural change
> intended, since the parallel bootup has not yet been enabled.
>
> [ tglx: Initial proof of concept patch with bitlock and APIC ID lookup ]
> [ dwmw2: Rework and testing, commit message, CPUID 0x1 and CPU0 support ]
> [ seanc: Fix stray override of initial_gs in common_cpu_up() ]
> [ Oleksandr Natalenko: reported suspend/resume issue fixed in
>   x86_acpi_suspend_lowlevel ]
>
> Co-developed-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Co-developed-by: Brian Gerst <brgerst@xxxxxxxxx>
> Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx>
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> Signed-off-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
>  arch/x86/include/asm/apic.h          |    2
>  arch/x86/include/asm/realmode.h      |    3 +
>  arch/x86/include/asm/smp.h           |    8 +++
>  arch/x86/kernel/acpi/sleep.c         |    9 +++
>  arch/x86/kernel/apic/apic.c          |    2
>  arch/x86/kernel/head_64.S            |   79 
> ++++++++++++++++++++++++++++++++++-
>  arch/x86/kernel/smpboot.c            |    5 --
>  arch/x86/realmode/init.c             |    3 +
>  arch/x86/realmode/rm/trampoline_64.S |   27 +++++++++--
>  9 files changed, 125 insertions(+), 13 deletions(-)
>
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -55,6 +55,8 @@ extern int local_apic_timer_c2_ok;
>  extern int disable_apic;
>  extern unsigned int lapic_timer_period;
>
> +extern int cpuid_to_apicid[];
> +
>  extern enum apic_intr_mode_id apic_intr_mode;
>  enum apic_intr_mode_id {
>         APIC_PIC,
> --- a/arch/x86/include/asm/realmode.h
> +++ b/arch/x86/include/asm/realmode.h
> @@ -52,6 +52,7 @@ struct trampoline_header {
>         u64 efer;
>         u32 cr4;
>         u32 flags;
> +       u32 lock;
>  #endif
>  };
>
> @@ -64,6 +65,8 @@ extern unsigned long initial_stack;
>  extern unsigned long initial_vc_handler;
>  #endif
>
> +extern u32 *trampoline_lock;
> +
>  extern unsigned char real_mode_blob[];
>  extern unsigned char real_mode_relocs[];
>
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -198,4 +198,12 @@ extern unsigned int smpboot_control;
>
>  #endif /* !__ASSEMBLY__ */
>
> +/* Control bits for startup_64 */
> +#define STARTUP_APICID_CPUID_1F 0x80000000
> +#define STARTUP_APICID_CPUID_0B 0x40000000
> +#define STARTUP_APICID_CPUID_01 0x20000000
> +
> +/* Top 8 bits are reserved for control */
> +#define STARTUP_PARALLEL_MASK  0xFF000000
> +
>  #endif /* _ASM_X86_SMP_H */
> --- a/arch/x86/kernel/acpi/sleep.c
> +++ b/arch/x86/kernel/acpi/sleep.c
> @@ -16,6 +16,7 @@
>  #include <asm/cacheflush.h>
>  #include <asm/realmode.h>
>  #include <asm/hypervisor.h>
> +#include <asm/smp.h>
>
>  #include <linux/ftrace.h>
>  #include "../../realmode/rm/wakeup.h"
> @@ -127,7 +128,13 @@ int x86_acpi_suspend_lowlevel(void)
>          * value is in the actual %rsp register.
>          */
>         current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
> -       smpboot_control = smp_processor_id();
> +       /*
> +        * Ensure the CPU knows which one it is when it comes back, if
> +        * it isn't in parallel mode and expected to work that out for
> +        * itself.
> +        */
> +       if (!(smpboot_control & STARTUP_PARALLEL_MASK))
> +               smpboot_control = smp_processor_id();
>  #endif
>         initial_code = (unsigned long)wakeup_long64;
>         saved_magic = 0x123456789abcdef0L;
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -2377,7 +2377,7 @@ static int nr_logical_cpuids = 1;
>  /*
>   * Used to store mapping between logical CPU IDs and APIC IDs.
>   */
> -static int cpuid_to_apicid[] = {
> +int cpuid_to_apicid[] = {
>         [0 ... NR_CPUS - 1] = -1,
>  };
>
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -25,6 +25,7 @@
>  #include <asm/export.h>
>  #include <asm/nospec-branch.h>
>  #include <asm/fixmap.h>
> +#include <asm/smp.h>
>
>  /*
>   * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
> @@ -234,8 +235,70 @@ SYM_INNER_LABEL(secondary_startup_64_no_
>         ANNOTATE_NOENDBR // above
>
>  #ifdef CONFIG_SMP
> +       /*
> +        * For parallel boot, the APIC ID is retrieved from CPUID, and then
> +        * used to look up the CPU number.  For booting a single CPU, the
> +        * CPU number is encoded in smpboot_control.
> +        *
> +        * Bit 31       STARTUP_APICID_CPUID_1F flag (use CPUID 0x1f)
> +        * Bit 30       STARTUP_APICID_CPUID_0B flag (use CPUID 0x0b)
> +        * Bit 29       STARTUP_APICID_CPUID_01 flag (use CPUID 0x01)
> +        * Bit 0-23     CPU# if STARTUP_APICID_CPUID_xx flags are not set
> +        */
>         movl    smpboot_control(%rip), %ecx
> +       testl   $STARTUP_APICID_CPUID_1F, %ecx
> +       jnz     .Luse_cpuid_1f
> +       testl   $STARTUP_APICID_CPUID_0B, %ecx
> +       jnz     .Luse_cpuid_0b
> +       testl   $STARTUP_APICID_CPUID_01, %ecx
> +       jnz     .Luse_cpuid_01
> +       andl    $(~STARTUP_PARALLEL_MASK), %ecx
> +       jmp     .Lsetup_cpu
> +
> +.Luse_cpuid_01:
> +       mov     $0x01, %eax
> +       cpuid
> +       mov     %ebx, %edx
> +       shr     $24, %edx
> +       jmp     .Lsetup_AP
> +
> +.Luse_cpuid_0b:
> +       mov     $0x0B, %eax
> +       xorl    %ecx, %ecx
> +       cpuid
> +       jmp     .Lsetup_AP
> +
> +.Luse_cpuid_1f:
> +       mov     $0x1f, %eax
> +       xorl    %ecx, %ecx
> +       cpuid
>
> +.Lsetup_AP:
> +       /* EDX contains the APIC ID of the current CPU */
> +       xorq    %rcx, %rcx
> +       leaq    cpuid_to_apicid(%rip), %rbx
> +
> +.Lfind_cpunr:
> +       cmpl    (%rbx,%rcx,4), %edx
> +       jz      .Lsetup_cpu
> +       inc     %ecx
> +#ifdef CONFIG_FORCE_NR_CPUS
> +       cmpl    $NR_CPUS, %ecx
> +#else
> +       cmpl    nr_cpu_ids(%rip), %ecx
> +#endif
> +       jb      .Lfind_cpunr
> +
> +       /*  APIC ID not found in the table. Drop the trampoline lock and 
> bail. */
> +       movq    trampoline_lock(%rip), %rax
> +       lock
> +       btrl    $0, (%rax)
> +
> +1:     cli
> +       hlt
> +       jmp     1b
> +
> +.Lsetup_cpu:
>         /* Get the per cpu offset for the given CPU# which is in ECX */
>         movq    __per_cpu_offset(,%rcx,8), %rdx
>  #else
> @@ -248,10 +311,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_
>          *
>          * RDX contains the per-cpu offset
>          */
> -       movq    pcpu_hot + X86_current_task(%rdx), %rax
> -       movq    TASK_threadsp(%rax), %rsp
> +       movq    pcpu_hot + X86_top_of_stack(%rdx), %rsp

Switching to using pcpu_hot.top_of_stack is ok, but it's not
completely equivalent.  top_of_stack points to the end of the pt_regs
structure, while the kernel stack starts below pt_regs even for kernel
threads.  So you need to subtract PTREGS_SIZE from the stack pointer
after this.

This change should also be a separate patch.

--
Brian Gerst



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.