
[Xen-devel] [PATCH] x86: make GDT per-CPU



The major issue with supporting a significantly larger number of physical
CPUs appears to be the use of per-CPU GDT entries: at present, x86-64 can
support only up to 126 CPUs (with code changes to also use the top-most
GDT page, that would become 254). Rather than taking incremental steps
here, this patch converts the GDT itself to be per-CPU, so the limitations
in that respect go away entirely.
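
The patch also reserves a GDT slot whose descriptor has its segment limit
set to the owning CPU's number, so the CPU ID can be recovered with a
single LSL instruction instead of being derived from a per-CPU TSS
selector. As a minimal illustration (the helper name is invented for this
example; the asm statement matches the one the patch uses in
do_double_fault()):

    /* Read the CPU number back out of the per-CPU GDT slot: LSL loads
     * the segment limit of the given selector, and the limit of this
     * descriptor is set to the CPU's ID when its GDT is created. */
    static inline unsigned int cpu_id_from_gdt(void)
    {
        unsigned int cpu;
        asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
        return cpu;
    }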

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

Index: 2008-09-19/xen/arch/x86/boot/wakeup.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/boot/wakeup.S  2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/boot/wakeup.S       2008-09-19 13:56:36.000000000 +0200
@@ -168,7 +168,7 @@ wakeup_32:
         .word   0,0,0
 lgdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
         
 wakeup_64:
         lgdt    lgdt_descr(%rip)
Index: 2008-09-19/xen/arch/x86/boot/x86_32.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/boot/x86_32.S  2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/boot/x86_32.S       2008-09-19 13:56:36.000000000 +0200
@@ -78,7 +78,7 @@ idt_descr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .long   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .long   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
 
         .align 32
@@ -94,7 +94,7 @@ ENTRY(idle_pg_table)
 #define GUEST_DESC(d)                                                   \
         .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,                \
               ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00cf9a000000ffff     /* 0xe008 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 4.00GB data at 0x0 */
@@ -102,4 +102,6 @@ ENTRY(gdt_table)
         GUEST_DESC(0x00c0b200)       /* 0xe021 ring 1 3.xxGB data at 0x0 */
         GUEST_DESC(0x00c0fa00)       /* 0xe02b ring 3 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0f200)       /* 0xe033 ring 3 3.xxGB data at 0x0 */
+        .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
         .align PAGE_SIZE,0
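
A note on the new trailing descriptor: 0x0000910000000000 decodes to a
present, DPL-0, byte-granular data segment with all base and limit bits
clear; the CPU bring-up code later stamps the CPU number into the
descriptor's low word, i.e. limit bits 15:0. Sketched with Xen's
desc_struct layout, where .a is the descriptor's low 32 bits (this mirrors
the smpboot.c hunk further down, it is not additional code):

    /* Stamp the CPU ID into limit[15:0]; IDs must fit in 16 bits, as
     * bits 31:16 of .a are base[15:0] and have to remain zero. */
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;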
Index: 2008-09-19/xen/arch/x86/boot/x86_64.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/boot/x86_64.S  2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/boot/x86_64.S       2008-09-19 13:56:36.000000000 +0200
@@ -85,7 +85,7 @@ multiboot_ptr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .word   0,0,0
 idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
         .quad   cpu0_stack
 
         .align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -105,11 +105,13 @@ ENTRY(gdt_table)
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
 
         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the            */
 /*     (compatibility) machine->physical mapping table lives there.       */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -118,4 +120,6 @@ ENTRY(compat_gdt_table)
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
         .align PAGE_SIZE, 0
Index: 2008-09-19/xen/arch/x86/cpu/common.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/cpu/common.c   2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/cpu/common.c        2008-09-19 13:56:36.000000000 +0200
@@ -575,6 +575,9 @@ void __cpuinit cpu_init(void)
        if (cpu_has_pat)
                wrmsrl(MSR_IA32_CR_PAT, host_pat);
 
+       /* Install correct page table. */
+       write_ptbase(current);
+
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(current);
        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
@@ -605,9 +608,6 @@ void __cpuinit cpu_init(void)
 #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
        CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
-
-       /* Install correct page table. */
-       write_ptbase(current);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
Index: 2008-09-19/xen/arch/x86/domain.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/domain.c       2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/domain.c    2008-09-19 13:57:28.000000000 +0200
@@ -211,7 +211,6 @@ static inline int may_switch_mode(struct
 
 int switch_native(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -223,12 +222,8 @@ int switch_native(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
         if (d->vcpu[vcpuid])
             release_compat_l4(d->vcpu[vcpuid]);
     }
@@ -238,7 +233,6 @@ int switch_native(struct domain *d)
 
 int switch_compat(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -250,15 +244,11 @@ int switch_compat(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
         if ( (d->vcpu[vcpuid] != NULL) &&
              (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
             goto undo_and_fail;
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
 
     domain_set_alloc_bitsize(d);
@@ -267,13 +257,10 @@ int switch_compat(struct domain *d)
 
  undo_and_fail:
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     while ( vcpuid-- != 0 )
     {
         if ( d->vcpu[vcpuid] != NULL )
             release_compat_l4(d->vcpu[vcpuid]);
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
     return -ENOMEM;
 }
@@ -322,7 +309,12 @@ int vcpu_initialise(struct vcpu *v)
         if ( is_idle_domain(d) )
         {
             v->arch.schedule_tail = continue_idle_domain;
-            v->arch.cr3           = __pa(idle_pg_table);
+            if ( v->vcpu_id )
+                v->arch.cr3 = d->vcpu[0]->arch.cr3;
+            else if ( !*idle_vcpu )
+                v->arch.cr3 = __pa(idle_pg_table);
+            else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
+                return -ENOMEM;
         }
 
         v->arch.guest_context.ctrlreg[4] =
@@ -349,8 +341,7 @@ int arch_domain_create(struct domain *d,
 #ifdef __x86_64__
     struct page_info *pg;
 #endif
-    l1_pgentry_t gdt_l1e;
-    int i, vcpuid, pdpt_order, paging_initialised = 0;
+    int i, pdpt_order, paging_initialised = 0;
     int rc = -ENOMEM;
 
     d->arch.hvm_domain.hap_enabled =
@@ -369,18 +360,6 @@ int arch_domain_create(struct domain *d,
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
 
-    /*
-     * Map Xen segments into every VCPU's GDT, irrespective of whether every
-     * VCPU will actually be used. This avoids an NMI race during context
-     * switch: if we take an interrupt after switching CR3 but before switching
-     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
-     * try to load CS from an invalid table.
-     */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
-    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
 #if defined(__i386__)
 
     mapcache_domain_init(d);
@@ -1193,9 +1172,12 @@ static void paravirt_ctxt_switch_to(stru
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
-    unsigned int          cpu = smp_processor_id();
+    unsigned int          i, cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
+    struct desc_struct   *gdt;
+    struct page_info     *page;
+    struct desc_ptr       gdt_desc;
 
     ASSERT(p != n);
     ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1221,14 +1203,30 @@ static void __context_switch(void)
         cpu_set(cpu, n->domain->domain_dirty_cpumask);
     cpu_set(cpu, n->vcpu_dirty_cpumask);
 
+    gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+                                  per_cpu(compat_gdt_table, cpu);
+    page = virt_to_page(gdt);
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+    {
+        l1e_write(n->domain->arch.mm_perdomain_pt +
+                  (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                  FIRST_RESERVED_GDT_PAGE + i,
+                  l1e_from_page(page + i, __PAGE_HYPERVISOR));
+    }
+
+    if ( p->vcpu_id != n->vcpu_id )
+    {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    }
+
     write_ptbase(n);
 
     if ( p->vcpu_id != n->vcpu_id )
     {
-        char gdt_load[10];
-        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
-        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+        gdt_desc.base = GDT_VIRT_START(n);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
     }
 
     if ( p->domain != n->domain )
@@ -1279,8 +1277,6 @@ void context_switch(struct vcpu *prev, s
             uint64_t efer = read_efer();
             if ( !(efer & EFER_SCE) )
                 write_efer(efer | EFER_SCE);
-            flush_tlb_one_local(GDT_VIRT_START(next) +
-                                FIRST_RESERVED_GDT_BYTE);
         }
 #endif
 
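
Two ordering details in the domain.c and cpu/common.c changes are worth
spelling out: the GDT is now reached through per-domain mappings, so
cpu_init() must switch page tables before executing lgdt, and
__context_switch() keeps a valid GDT installed at every instant by
loading the descriptor table on both sides of the CR3 switch. In outline
(a restatement of the hunks above, not new logic):

    /*
     * 1. Remap n's (native or compat) per-CPU GDT pages into n's
     *    per-domain area (the l1e_write() loop).
     * 2. If the VCPU ID changes, lgdt the GDT via its direct mapping,
     *    which stays valid across the page-table switch.
     * 3. write_ptbase(n) -- switch to n's page tables.
     * 4. lgdt the per-domain alias at GDT_VIRT_START(n), so descriptor
     *    accesses once again go through the per-domain mapping.
     */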
Index: 2008-09-19/xen/arch/x86/domain_build.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/domain_build.c 2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/domain_build.c      2008-09-19 13:56:36.000000000 +0200
@@ -314,24 +314,11 @@ int __init construct_dom0(
 #if defined(__x86_64__)
     if ( compat32 )
     {
-        l1_pgentry_t gdt_l1e;
-
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
         v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
 
         if ( nr_pages != (unsigned int)nr_pages )
             nr_pages = UINT_MAX;
-
-        /*
-         * Map compatibility Xen segments into every VCPU's GDT. See
-         * arch_domain_create() for further comments.
-         */
-        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
-                                PAGE_HYPERVISOR);
-        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
-                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-        flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
     }
 #endif
 
Index: 2008-09-19/xen/arch/x86/hvm/vmx/vmcs.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/hvm/vmx/vmcs.c 2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/hvm/vmx/vmcs.c      2008-09-19 13:56:36.000000000 +0200
@@ -446,7 +446,7 @@ static void vmx_set_host_env(struct vcpu
 
     __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
 
-    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
+    __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
     __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
 
     __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
Index: 2008-09-19/xen/arch/x86/setup.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/setup.c        2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/setup.c     2008-09-19 13:56:36.000000000 +0200
@@ -115,6 +115,12 @@ extern void early_cpu_init(void);
 extern void vesa_init(void);
 extern void vesa_mtrr_init(void);
 
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+    = boot_cpu_compat_gdt_table;
+#endif
+
 struct tss_struct init_tss[NR_CPUS];
 
 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
@@ -224,6 +230,7 @@ static void __init percpu_init_areas(voi
 static void __init init_idle_domain(void)
 {
     struct domain *idle_domain;
+    unsigned int i;
 
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
@@ -236,6 +243,12 @@ static void __init init_idle_domain(void
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
+
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+        idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
+            l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
+                          __PAGE_HYPERVISOR);
+
 }
 
 static void __init srat_detect_node(int cpu)
@@ -443,7 +456,6 @@ void __init __start_xen(unsigned long mb
     parse_video_info();
 
     set_current((struct vcpu *)0xfffff000); /* debug sanity */
-    idle_vcpu[0] = current;
     set_processor_id(0); /* needed early, for smp_processor_id() */
     if ( cpu_has_efer )
         rdmsrl(MSR_EFER, this_cpu(efer));
Index: 2008-09-19/xen/arch/x86/smpboot.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/smpboot.c      2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/smpboot.c   2008-09-19 13:57:46.000000000 +0200
@@ -836,10 +836,15 @@ static int __devinit do_boot_cpu(int api
  */
 {
        unsigned long boot_error;
+       unsigned int i;
        int timeout;
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
        struct vcpu *v;
+       struct desc_struct *gdt;
+#ifdef __x86_64__
+        struct page_info *page;
+#endif
 
        /*
         * Save current MTRR state in case it was changed since early boot
@@ -865,6 +870,37 @@ static int __devinit do_boot_cpu(int api
        /* Debug build: detect stack overflow by setting up a guard page. */
        memguard_guard_stack(stack_start.esp);
 
+       gdt = per_cpu(gdt_table, cpu);
+       if (gdt == boot_cpu_gdt_table) {
+               i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+               page = alloc_domheap_pages(NULL, i,
+                                          MEMF_node(cpu_to_node(cpu)));
+               per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+               memcpy(gdt, boot_cpu_compat_gdt_table,
+                      NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+               gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+               page = alloc_domheap_pages(NULL, i,
+                                          MEMF_node(cpu_to_node(cpu)));
+               per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+               per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+#endif
+               memcpy(gdt, boot_cpu_gdt_table,
+                      NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+               BUILD_BUG_ON(NR_CPUS > 0x10000);
+               gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+       }
+
+       for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+               v->domain->arch.mm_perdomain_pt
+                       [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                        FIRST_RESERVED_GDT_PAGE + i]
+                       = l1e_from_page(virt_to_page(gdt) + i,
+                                       __PAGE_HYPERVISOR);
+
        /*
         * This grunge runs the startup process for
         * the targeted processor.
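
The BUILD_BUG_ON(NR_CPUS > 0x10000) above is not arbitrary: the CPU ID is
stored via the descriptor's .a field, whose low word is limit[15:0] and
whose high word is base[15:0], so (illustratively):

    /*
     *   gdt[...].a = cpu;   // .a == base[15:0] : limit[15:0]
     *   cpu <= 0xffff  =>  limit[15:0] == cpu, base stays zero
     *   cpu >  0xffff  =>  excess bits would corrupt base[15:0]
     */

Note also that the GDT pages are allocated with MEMF_node(cpu_to_node(cpu)),
keeping each CPU's GDT node-local on NUMA systems.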
Index: 2008-09-19/xen/arch/x86/traps.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/traps.c        2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/traps.c     2008-09-19 13:58:14.000000000 +0200
@@ -2965,13 +2965,13 @@ void set_intr_gate(unsigned int n, void 
 void set_tss_desc(unsigned int n, void *addr)
 {
     _set_tssldt_desc(
-        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);
Index: 2008-09-19/xen/arch/x86/x86_32/mm.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_32/mm.c    2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_32/mm.c 2008-09-19 13:56:36.000000000 +0200
@@ -132,6 +132,30 @@ void __init setup_idle_pagetable(void)
                                 __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    unsigned int i;
+    struct domain *d = v->domain;
+    l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
+    l2_pgentry_t *l2_table = alloc_xenheap_page();
+
+    if ( !l2_table )
+        return 0;
+
+    memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
+    l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
+        l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
+
+    copy_page(l2_table, idle_pg_table_l2 +
+              l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
+    for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
+        l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+                          __PAGE_HYPERVISOR);
+
+    return __pa(l3_table);
+}
+
 void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
 {
     int i;
@@ -186,7 +210,7 @@ void __init subarch_init_memory(void)
     {
         /* Guest kernel runs in ring 0, not ring 1. */
         struct desc_struct *d;
-        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
         d[0].b &= ~_SEGMENT_DPL;
         d[1].b &= ~_SEGMENT_DPL;
     }
Index: 2008-09-19/xen/arch/x86/x86_32/supervisor_mode_kernel.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_32/supervisor_mode_kernel.S        2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_32/supervisor_mode_kernel.S     2008-09-19 13:56:36.000000000 +0200
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
         # %gs:%esi now points to the guest stack before the
         # interrupt/exception occured.
 
-        /*
-         * Reverse the __TSS macro, giving us the CPU number.
-         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
-         */
-        str   %ecx
-        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
-        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
-        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
-        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+        movl  $PER_CPU_GDT_ENTRY*8,%ecx
+        lsll  %ecx,%ecx
+        shll  $7,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx
 
         # Load Xen stack from TSS.
         movw  TSS_ss0(%ecx),%ax
Index: 2008-09-19/xen/arch/x86/x86_32/traps.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_32/traps.c 2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_32/traps.c      2008-09-19 13:56:36.000000000 +0200
@@ -194,13 +194,15 @@ static unsigned char doublefault_stack[D
 
 asmlinkage void do_double_fault(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
-    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+    struct tss_struct *tss;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     tss = &init_tss[cpu];
     printk("*** DOUBLE FAULT ***\n");
@@ -325,7 +327,7 @@ void __devinit subarch_percpu_traps_init
     tss->eflags = 2;
     tss->bitmap = IOBMP_INVALID_OFFSET;
     _set_tssldt_desc(
-        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        boot_cpu_gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)tss, 235, 9);
 
     set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
Index: 2008-09-19/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_64/mm.c    2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_64/mm.c 2008-09-19 13:56:36.000000000 +0200
@@ -21,6 +21,7 @@
 #include <xen/lib.h>
 #include <xen/init.h>
 #include <xen/mm.h>
+#include <xen/numa.h>
 #include <xen/sched.h>
 #include <xen/guest_access.h>
 #include <asm/current.h>
@@ -206,6 +207,24 @@ void __init setup_idle_pagetable(void)
                   __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct page_info *page = alloc_domheap_page(NULL,
+                                                MEMF_node(vcpu_to_node(v)));
+    l4_pgentry_t *l4_table = page_to_virt(page);
+
+    if ( !page )
+        return 0;
+
+    copy_page(l4_table, idle_pg_table);
+    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
+                      __PAGE_HYPERVISOR);
+
+    return __pa(l4_table);
+}
+
 void __init zap_low_mappings(void)
 {
     BUG_ON(num_online_cpus() != 1);
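
The new clone_idle_pagetable() implementations (this one and its x86-32
counterpart above) exist because the idle domain's per-domain area now
carries per-CPU GDT mappings: secondary idle VCPUs can no longer all run
on idle_pg_table, so each gets its own top-level table (see
vcpu_initialise() in the domain.c hunk). In outline (restating the two
bodies, not new logic):

    /*
     * clone_idle_pagetable(v):
     *   1. allocate a fresh top-level table (an l4 here; on x86-32, an
     *      l2 hung off the VCPU's PAE l3 cache);
     *   2. copy the idle page table into it;
     *   3. repoint the PERDOMAIN_VIRT_START slot at the idle domain's
     *      own perdomain page-table pages;
     *   4. return the physical address, to be used as v->arch.cr3.
     */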
Index: 2008-09-19/xen/arch/x86/x86_64/traps.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_64/traps.c 2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_64/traps.c      2008-09-19 13:56:36.000000000 +0200
@@ -213,15 +213,14 @@ void show_page_walk(unsigned long addr)
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
-    unsigned int cpu, tr;
-
-    asm volatile ( "str %0" : "=r" (tr) );
-    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     printk("*** DOUBLE FAULT ***\n");
     print_xen_info();
Index: 2008-09-19/xen/include/asm-x86/desc.h
===================================================================
--- 2008-09-19.orig/xen/include/asm-x86/desc.h  2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/include/asm-x86/desc.h       2008-09-19 13:56:36.000000000 +0200
@@ -34,11 +34,9 @@
 #define FLAT_COMPAT_USER_CS   FLAT_COMPAT_RING3_CS
 #define FLAT_COMPAT_USER_SS   FLAT_COMPAT_RING3_SS
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
 
 #elif defined(__i386__)
 
@@ -51,17 +49,15 @@
 
 #define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
-
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
 
 #endif
 
 #ifndef __ASSEMBLY__
 
-#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (TSS_ENTRY<<3) )
 
 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -205,11 +201,19 @@ do {                                    
 
 #endif
 
-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+       unsigned short limit;
+       unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
 #ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
 #endif
 
 extern void set_intr_gate(unsigned int irq, void * addr);
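
With each CPU owning a full GDT, the TSS and LDT slots become fixed
indices in place of the per-CPU __TSS(n)/__LDT(n) arrays. The spacing
differs between the architectures because TSS and LDT descriptors occupy
16 bytes (two slots) in long mode but 8 bytes (one slot) on i386; the
resulting layouts are (illustrative arithmetic only):

    /* x86-64: TSS_ENTRY         = FIRST_RESERVED_GDT_ENTRY + 8
     *         LDT_ENTRY         = TSS_ENTRY + 2   (16-byte descriptor)
     *         PER_CPU_GDT_ENTRY = LDT_ENTRY + 2   (16-byte descriptor)
     * i386:   TSS_ENTRY         = FIRST_RESERVED_GDT_ENTRY + 8
     *         LDT_ENTRY         = TSS_ENTRY + 1   (8-byte descriptor)
     *         PER_CPU_GDT_ENTRY = LDT_ENTRY + 1
     * Every CPU can then use the same selectors, e.g. TSS_ENTRY << 3
     * for load_TR(). */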
Index: 2008-09-19/xen/include/asm-x86/ldt.h
===================================================================
--- 2008-09-19.orig/xen/include/asm-x86/ldt.h   2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/include/asm-x86/ldt.h        2008-09-19 13:56:36.000000000 +0200
@@ -6,7 +6,6 @@
 
 static inline void load_LDT(struct vcpu *v)
 {
-    unsigned int cpu;
     struct desc_struct *desc;
     unsigned long ents;
 
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu 
     }
     else
     {
-        cpu = smp_processor_id();
-        desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
-               + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+        desc = (!is_pv_32on64_vcpu(v)
+                ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+               + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
         _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
-        __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+        __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
     }
 }
 
Index: 2008-09-19/xen/include/asm-x86/page.h
===================================================================
--- 2008-09-19.orig/xen/include/asm-x86/page.h  2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/include/asm-x86/page.h       2008-09-19 13:56:36.000000000 +0200
@@ -278,6 +278,7 @@ extern unsigned int   m2p_compat_vstart;
 #endif
 void paging_init(void);
 void setup_idle_pagetable(void);
+unsigned long clone_idle_pagetable(struct vcpu *);
 #endif /* !defined(__ASSEMBLY__) */
 
 #define _PAGE_PRESENT  0x001U


