x86/x2apic: properly implement cluster mode

So far, cluster mode was just an alternative implementation of physical
mode: allowing only single CPU interrupt targets, and sending IPIs to
each target CPU separately. Take advantage of what cluster mode really
can do in that regard.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -19,6 +19,7 @@
 
 #include <xen/config.h>
 #include <xen/init.h>
+#include <xen/cpu.h>
 #include <xen/cpumask.h>
 #include <asm/apicdef.h>
 #include <asm/genapic.h>
@@ -33,6 +34,14 @@ static bool_t __initdata x2apic_phys; /*
 boolean_param("x2apic_phys", x2apic_phys);
 
 static DEFINE_PER_CPU_READ_MOSTLY(u32, cpu_2_logical_apicid);
+static DEFINE_PER_CPU_READ_MOSTLY(cpumask_t *, cluster_cpus);
+static cpumask_t *cluster_cpus_spare;
+static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);
+
+static inline u32 x2apic_cluster(unsigned int cpu)
+{
+    return per_cpu(cpu_2_logical_apicid, cpu) >> 16;
+}
 
 static void init_apic_ldr_x2apic_phys(void)
 {
@@ -40,20 +49,53 @@ static void init_apic_ldr_x2apic_phys(vo
 
 static void init_apic_ldr_x2apic_cluster(void)
 {
-    this_cpu(cpu_2_logical_apicid) = apic_read(APIC_LDR);
+    unsigned int cpu, this_cpu = smp_processor_id();
+
+    per_cpu(cpu_2_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+    if ( per_cpu(cluster_cpus, this_cpu) )
+    {
+        ASSERT(cpumask_test_cpu(this_cpu, per_cpu(cluster_cpus, this_cpu)));
+        return;
+    }
+
+    per_cpu(cluster_cpus, this_cpu) = cluster_cpus_spare;
+    for_each_online_cpu ( cpu )
+    {
+        if ( this_cpu == cpu || x2apic_cluster(this_cpu) != x2apic_cluster(cpu) )
+            continue;
+        per_cpu(cluster_cpus, this_cpu) = per_cpu(cluster_cpus, cpu);
+        break;
+    }
+    if ( per_cpu(cluster_cpus, this_cpu) == cluster_cpus_spare )
+        cluster_cpus_spare = NULL;
+
+    cpumask_set_cpu(this_cpu, per_cpu(cluster_cpus, this_cpu));
 }
 
 static void __init clustered_apic_check_x2apic(void)
 {
 }
 
+static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
+{
+    return per_cpu(cluster_cpus, cpu);
+}
+
 static unsigned int cpu_mask_to_apicid_x2apic_cluster(const cpumask_t *cpumask)
 {
-    return per_cpu(cpu_2_logical_apicid, cpumask_first(cpumask));
+    unsigned int cpu = cpumask_first(cpumask);
+    unsigned int dest = per_cpu(cpu_2_logical_apicid, cpu);
+    const cpumask_t *cluster_cpus = per_cpu(cluster_cpus, cpu);
+
+    for_each_cpu ( cpu, cluster_cpus )
+        if ( cpumask_test_cpu(cpu, cpumask) )
+            dest |= per_cpu(cpu_2_logical_apicid, cpu);
+
+    return dest;
 }
 
-static void __send_IPI_mask_x2apic(
-    const cpumask_t *cpumask, int vector, unsigned int dest_mode)
+static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector)
 {
     unsigned int cpu;
     unsigned long flags;
@@ -77,23 +119,48 @@ static void __send_IPI_mask_x2apic(
     {
         if ( !cpu_online(cpu) || (cpu == smp_processor_id()) )
            continue;
-        msr_content = (dest_mode == APIC_DEST_PHYSICAL)
-            ? cpu_physical_id(cpu) : per_cpu(cpu_2_logical_apicid, cpu);
-        msr_content = (msr_content << 32) | APIC_DM_FIXED | dest_mode | vector;
+        msr_content = cpu_physical_id(cpu);
+        msr_content = (msr_content << 32) | APIC_DM_FIXED |
+                      APIC_DEST_PHYSICAL | vector;
         apic_wrmsr(APIC_ICR, msr_content);
     }
 
     local_irq_restore(flags);
 }
 
-static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector)
-{
-    __send_IPI_mask_x2apic(cpumask, vector, APIC_DEST_PHYSICAL);
-}
-
 static void send_IPI_mask_x2apic_cluster(const cpumask_t *cpumask, int vector)
 {
-    __send_IPI_mask_x2apic(cpumask, vector, APIC_DEST_LOGICAL);
+    unsigned int cpu = smp_processor_id();
+    cpumask_t *ipimask = per_cpu(scratch_mask, cpu);
+    const cpumask_t *cluster_cpus;
+    unsigned long flags;
+
+    mb(); /* See above for an explanation. */
+
+    local_irq_save(flags);
+
+    cpumask_andnot(ipimask, &cpu_online_map, cpumask_of(cpu));
+
+    for ( cpumask_and(ipimask, cpumask, ipimask); !cpumask_empty(ipimask);
+          cpumask_andnot(ipimask, ipimask, cluster_cpus) )
+    {
+        uint64_t msr_content = 0;
+
+        cluster_cpus = per_cpu(cluster_cpus, cpumask_first(ipimask));
+        for_each_cpu ( cpu, cluster_cpus )
+        {
+            if ( !cpumask_test_cpu(cpu, ipimask) )
+                continue;
+            msr_content |= per_cpu(cpu_2_logical_apicid, cpu);
+        }
+
+        BUG_ON(!msr_content);
+        msr_content = (msr_content << 32) | APIC_DM_FIXED |
+                      APIC_DEST_LOGICAL | vector;
+        apic_wrmsr(APIC_ICR, msr_content);
+    }
+
+    local_irq_restore(flags);
 }
 
 static const struct genapic apic_x2apic_phys = {
@@ -116,15 +183,61 @@ static const struct genapic apic_x2apic_
     .init_apic_ldr = init_apic_ldr_x2apic_cluster,
     .clustered_apic_check = clustered_apic_check_x2apic,
     .target_cpus = target_cpus_all,
-    .vector_allocation_cpumask = vector_allocation_cpumask_phys,
+    .vector_allocation_cpumask = vector_allocation_cpumask_x2apic_cluster,
     .cpu_mask_to_apicid = cpu_mask_to_apicid_x2apic_cluster,
     .send_IPI_mask = send_IPI_mask_x2apic_cluster,
     .send_IPI_self = send_IPI_self_x2apic
 };
 
+static int update_clusterinfo(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+    int err = 0;
+
+    switch ( action ) {
+    case CPU_UP_PREPARE:
+        per_cpu(cpu_2_logical_apicid, cpu) = BAD_APICID;
+        if ( !cluster_cpus_spare )
+            cluster_cpus_spare = xzalloc(cpumask_t);
+        if ( !cluster_cpus_spare ||
+             !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+            err = -ENOMEM;
+        break;
+    case CPU_UP_CANCELED:
+    case CPU_DEAD:
+        if ( per_cpu(cluster_cpus, cpu) )
+        {
+            cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
+            if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
+                xfree(per_cpu(cluster_cpus, cpu));
+            per_cpu(cluster_cpus, cpu) = NULL;
+        }
+        free_cpumask_var(per_cpu(scratch_mask, cpu));
+        break;
+    }
+
+    return !err ? NOTIFY_DONE : notifier_from_errno(err);
+}
+
+static struct notifier_block x2apic_cpu_nfb = {
+    .notifier_call = update_clusterinfo
+};
+
 const struct genapic *__init apic_x2apic_probe(void)
 {
-    return x2apic_phys ? &apic_x2apic_phys : &apic_x2apic_cluster;
+    if ( x2apic_phys )
+        return &apic_x2apic_phys;
+
+    if ( !this_cpu(cluster_cpus) )
+    {
+        update_clusterinfo(NULL, CPU_UP_PREPARE,
+                           (void *)(long)smp_processor_id());
+        init_apic_ldr_x2apic_cluster();
+        register_cpu_notifier(&x2apic_cpu_nfb);
+    }
+
+    return &apic_x2apic_cluster;
 }
 
 void __init check_x2apic_preenabled(void)