[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [Patch 3/6] Xen/MCE: vMCE emulation



Xen/MCE: vMCE emulation

This patch provides virtual MCE support to guests. It emulates a simple
and clean MCE MSR interface for the guest, faking capabilities where
needed and masking capabilities that are unnecessary:
1. Providing a well-defined MCG_CAP to the guest, filtering out unnecessary
caps and exposing only the caps the guest needs;
2. Disabling MCG_CTL to avoid model-specific behavior;
3. Sticking MCi_CTL at all 1's for the guest to avoid model-specific behavior;
4. Exposing the CMCI capability but never actually injecting CMCI into the
guest, so that the guest does not poll periodically;
5. Masking the MSCOD field of MCi_STATUS to hide model-specific error codes;
6. Keeping natural semantics by using per-vcpu instead of per-domain variables;
7. Using bank1 and reserving bank0 to work around the 'bank0 quirk' of some
very old processors;
8. Cleaning up some vMCE# injection logic which was shared by Intel and AMD
but is useless under the new vMCE implementation;
9. Keeping compatibility with the old Xen version which has been backported to
SLES11 SP2, so that an old vMCE is not blocked when migrating to the new vMCE;

Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>

diff -r 585fe3e5a608 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Thu Jul 19 21:42:55 2012 +0800
@@ -55,8 +55,8 @@
 u64 mce_cap_init(void);
 extern unsigned int firstbank;
 
-int intel_mce_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val);
-int intel_mce_wrmsr(struct vcpu *, uint32_t msr, uint64_t val);
+void intel_vmce_mci_ctl2_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val);
+void intel_vmce_mci_ctl2_wrmsr(struct vcpu *, uint32_t msr, uint64_t val);
 
 struct mcinfo_extended *intel_get_extended_msrs(
     struct mcinfo_global *mig, struct mc_info *mi);
@@ -168,13 +168,12 @@
 int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
         uint64_t gstatus);
 int inject_vmce(struct domain *d);
-int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global);
 
 static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
 {
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
          msr >= MSR_IA32_MC0_CTL2 &&
-         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
+         msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) )
           return 1;
     return 0;
 }
@@ -182,7 +181,7 @@
 static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr)
 {
     if ( (msr >= MSR_IA32_MC0_CTL &&
-          msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) ||
+          msr < MSR_IA32_MCx_CTL(v->arch.vmce.mcg_cap & MCG_CAP_COUNT)) ||
          mce_vendor_bank_msr(v, msr) )
         return 1;
     return 0;
diff -r 585fe3e5a608 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Jul 19 21:42:55 2012 +0800
@@ -1396,33 +1396,26 @@
 }
 
 /* intel specific MCA MSR */
-int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+void intel_vmce_mci_ctl2_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
 {
-    int ret = 0;
+    int bank = msr - MSR_IA32_MC0_CTL2;
 
-    if ( msr >= MSR_IA32_MC0_CTL2 &&
-         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
+    if ( (bank >= 0) && (bank < GUEST_BANK_NUM) )
     {
-        mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
-                 "Guest should not write this MSR!\n");
-         ret = 1;
+        v->arch.vmce.bank[bank].mci_ctl2 = val;
+        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %"PRIx64"\n",
+                   bank, val);
     }
-
-    return ret;
 }
 
-int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+void intel_vmce_mci_ctl2_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
 {
-    int ret = 0;
+    int bank = msr - MSR_IA32_MC0_CTL2;
 
-    if ( msr >= MSR_IA32_MC0_CTL2 &&
-         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
+    if ( (bank >= 0) && (bank < GUEST_BANK_NUM) )
     {
-        mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
-                 "Guest should not read this MSR!\n");
-        ret = 1;
+        *val = v->arch.vmce.bank[bank].mci_ctl2;
+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL2 0x%"PRIx64"\n",
+                   bank, *val);
     }
-
-    return ret;
 }
-
diff -r 585fe3e5a608 xen/arch/x86/cpu/mcheck/vmce.c
--- a/xen/arch/x86/cpu/mcheck/vmce.c    Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/arch/x86/cpu/mcheck/vmce.c    Thu Jul 19 21:42:55 2012 +0800
@@ -1,5 +1,22 @@
 /*
- * vmce.c - virtual MCE support
+ * vmce.c - provide software emulated vMCE support to guest
+ *
+ * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
+ * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <xen/init.h>
@@ -19,44 +36,24 @@
 #include "mce.h"
 #include "x86_mca.h"
 
-/*
- * Emulalte 2 banks for guest
- * Bank0: reserved for 'bank0 quirk' occur at some very old processors:
- *   1). Intel cpu whose family-model value < 06-1A;
- *   2). AMD K7
- * Bank1: used to transfer error info to guest
- */
-#define GUEST_BANK_NUM 2
-#define GUEST_MCG_CAP (MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM)
-
-#define dom_vmce(x)   ((x)->arch.vmca_msrs)
-
-int vmce_init_msr(struct domain *d)
-{
-    dom_vmce(d) = xmalloc(struct domain_mca_msrs);
-    if ( !dom_vmce(d) )
-        return -ENOMEM;
-
-    dom_vmce(d)->mcg_status = 0x0;
-    dom_vmce(d)->nr_injection = 0;
-
-    INIT_LIST_HEAD(&dom_vmce(d)->impact_header);
-    spin_lock_init(&dom_vmce(d)->lock);
-
-    return 0;
-}
-
-void vmce_destroy_msr(struct domain *d)
-{
-    if ( !dom_vmce(d) )
-        return;
-    xfree(dom_vmce(d));
-    dom_vmce(d) = NULL;
-}
-
 void vmce_init_vcpu(struct vcpu *v)
 {
-    v->arch.mcg_cap = GUEST_MCG_CAP;
+    int i;
+
+    /* global MCA MSRs init */
+    v->arch.vmce.mcg_cap = GUEST_MCG_CAP;
+    v->arch.vmce.mcg_status = 0;
+
+    /* per-bank MCA MSRs init */
+    for ( i = 0; i < GUEST_BANK_NUM; i++ )
+    {
+        v->arch.vmce.bank[i].mci_status = 0;
+        v->arch.vmce.bank[i].mci_addr = 0;
+        v->arch.vmce.bank[i].mci_misc = 0;
+        v->arch.vmce.bank[i].mci_ctl2 = 0;
+    }
+
+    spin_lock_init(&v->arch.vmce.lock);
 }
 
 int vmce_restore_vcpu(struct vcpu *v, uint64_t caps)
@@ -70,16 +67,18 @@
         return -EPERM;
     }
 
-    v->arch.mcg_cap = caps;
+    v->arch.vmce.mcg_cap = caps;
     return 0;
 }
 
-static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+/*
+ * For historical reasons, the bank number may be greater than GUEST_BANK_NUM
+ * when migrating from the old vMCE version to the new vMCE.
+ */
+static int bank_mce_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
 {
     int ret = 1;
     unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
-    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
-    struct bank_entry *entry;
 
     *val = 0;
 
@@ -92,53 +91,40 @@
                    bank, *val);
         break;
     case MSR_IA32_MC0_STATUS:
-        /* Only error bank is read. Non-error banks simply return. */
-        if ( !list_empty(&vmce->impact_header) )
+        if ( bank < GUEST_BANK_NUM )
         {
-            entry = list_entry(vmce->impact_header.next,
-                               struct bank_entry, list);
-            if ( entry->bank == bank )
-            {
-                *val = entry->mci_status;
+            *val = v->arch.vmce.bank[bank].mci_status;
+            if ( *val )
                 mce_printk(MCE_VERBOSE,
-                           "MCE: rd MC%u_STATUS in vMCE# context "
-                           "value 0x%"PRIx64"\n", bank, *val);
-            }
+                           "MCE: rdmsr MC%u_STATUS in vMCE# context "
+                           "0x%"PRIx64"\n", bank, *val);
         }
         break;
     case MSR_IA32_MC0_ADDR:
-        if ( !list_empty(&vmce->impact_header) )
+        if ( bank < GUEST_BANK_NUM )
         {
-            entry = list_entry(vmce->impact_header.next,
-                               struct bank_entry, list);
-            if ( entry->bank == bank )
-            {
-                *val = entry->mci_addr;
+            *val = v->arch.vmce.bank[bank].mci_addr;
+            if ( *val )
                 mce_printk(MCE_VERBOSE,
                            "MCE: rdmsr MC%u_ADDR in vMCE# context "
                            "0x%"PRIx64"\n", bank, *val);
-            }
         }
         break;
     case MSR_IA32_MC0_MISC:
-        if ( !list_empty(&vmce->impact_header) )
+        if ( bank < GUEST_BANK_NUM )
         {
-            entry = list_entry(vmce->impact_header.next,
-                               struct bank_entry, list);
-            if ( entry->bank == bank )
-            {
-                *val = entry->mci_misc;
+            *val = v->arch.vmce.bank[bank].mci_misc;
+            if ( *val )
                 mce_printk(MCE_VERBOSE,
-                           "MCE: rd MC%u_MISC in vMCE# context "
+                           "MCE: rdmsr MC%u_MISC in vMCE# context "
                            "0x%"PRIx64"\n", bank, *val);
-            }
         }
         break;
     default:
         switch ( boot_cpu_data.x86_vendor )
         {
         case X86_VENDOR_INTEL:
-            ret = intel_mce_rdmsr(v, msr, val);
+            intel_vmce_mci_ctl2_rdmsr(v, msr, val);
             break;
         default:
             ret = 0;
@@ -157,29 +143,27 @@
  */
 int vmce_rdmsr(uint32_t msr, uint64_t *val)
 {
-    const struct vcpu *cur = current;
-    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
+    struct vcpu *cur = current;
     int ret = 1;
 
     *val = 0;
 
-    spin_lock(&vmce->lock);
+    spin_lock(&cur->arch.vmce.lock);
 
     switch ( msr )
     {
     case MSR_IA32_MCG_STATUS:
-        *val = vmce->mcg_status;
+        *val = cur->arch.vmce.mcg_status;
         if (*val)
             mce_printk(MCE_VERBOSE,
                        "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
         break;
     case MSR_IA32_MCG_CAP:
-        *val = cur->arch.mcg_cap;
-        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
-                   *val);
+        *val = cur->arch.vmce.mcg_cap;
+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n", *val);
         break;
     case MSR_IA32_MCG_CTL:
-        if ( cur->arch.mcg_cap & MCG_CTL_P )
+        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
         {
             *val = ~0UL;
             mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", *val);
@@ -195,24 +179,19 @@
         break;
     }
 
-    spin_unlock(&vmce->lock);
+    spin_unlock(&cur->arch.vmce.lock);
+
     return ret;
 }
 
+/*
+ * For historical reasons, the bank number may be greater than GUEST_BANK_NUM
+ * when migrating from the old vMCE version to the new vMCE.
+ */
 static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val)
 {
     int ret = 1;
     unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
-    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
-    struct bank_entry *entry = NULL;
-
-    /* Give the first entry of the list, it corresponds to current
-     * vMCE# injection. When vMCE# is finished processing by the
-     * the guest, this node will be deleted.
-     * Only error bank is written. Non-error banks simply return.
-     */
-    if ( !list_empty(&vmce->impact_header) )
-        entry = list_entry(vmce->impact_header.next, struct bank_entry, list);
 
     switch ( msr & (MSR_IA32_MC0_CTL | 3) )
     {
@@ -223,56 +202,52 @@
          */
         break;
     case MSR_IA32_MC0_STATUS:
-        if ( entry && (entry->bank == bank) )
+        if ( val )
         {
-            entry->mci_status = val;
-            mce_printk(MCE_VERBOSE,
-                       "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
+            mce_printk(MCE_QUIET,
+                       "MCE: wr MC%u_STATUS w/ non-zero cause #GP\n", bank);
+            ret = -1;
+        }
+        if ( bank < GUEST_BANK_NUM )
+        {
+            v->arch.vmce.bank[bank].mci_status = val;
+            mce_printk(MCE_VERBOSE, "MCE: wr MC%u_STATUS %"PRIx64"\n",
                        bank, val);
         }
-        else
-            mce_printk(MCE_VERBOSE,
-                       "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
         break;
     case MSR_IA32_MC0_ADDR:
-        if ( !~val )
+        if ( val )
         {
             mce_printk(MCE_QUIET,
-                       "MCE: wr MC%u_ADDR with all 1s will cause #GP\n", bank);
+                       "MCE: wr MC%u_ADDR w/ non-zero cause #GP\n", bank);
             ret = -1;
         }
-        else if ( entry && (entry->bank == bank) )
+        else if ( bank < GUEST_BANK_NUM )
         {
-            entry->mci_addr = val;
-            mce_printk(MCE_VERBOSE,
-                       "MCE: wr MC%u_ADDR %"PRIx64" in vMCE#\n", bank, val);
-        }
-        else
+            v->arch.vmce.bank[bank].mci_addr = val;
             mce_printk(MCE_VERBOSE,
                        "MCE: wr MC%u_ADDR %"PRIx64"\n", bank, val);
+        }
         break;
     case MSR_IA32_MC0_MISC:
-        if ( !~val )
+        if ( val )
         {
             mce_printk(MCE_QUIET,
-                       "MCE: wr MC%u_MISC with all 1s will cause #GP\n", bank);
+                       "MCE: wr MC%u_MISC w/ non-zero cause #GP\n", bank);
             ret = -1;
         }
-        else if ( entry && (entry->bank == bank) )
+        else if ( bank < GUEST_BANK_NUM )
         {
-            entry->mci_misc = val;
-            mce_printk(MCE_VERBOSE,
-                       "MCE: wr MC%u_MISC %"PRIx64" in vMCE#\n", bank, val);
-        }
-        else
+            v->arch.vmce.bank[bank].mci_misc = val;
             mce_printk(MCE_VERBOSE,
                        "MCE: wr MC%u_MISC %"PRIx64"\n", bank, val);
+        }
         break;
     default:
         switch ( boot_cpu_data.x86_vendor )
         {
         case X86_VENDOR_INTEL:
-            ret = intel_mce_wrmsr(v, msr, val);
+            intel_vmce_mci_ctl2_wrmsr(v, msr, val);
             break;
         default:
             ret = 0;
@@ -292,57 +267,38 @@
 int vmce_wrmsr(u32 msr, u64 val)
 {
     struct vcpu *cur = current;
-    struct bank_entry *entry = NULL;
-    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
     int ret = 1;
 
-    spin_lock(&vmce->lock);
+    spin_lock(&cur->arch.vmce.lock);
 
     switch ( msr )
     {
     case MSR_IA32_MCG_CTL:
-        if ( !(cur->arch.mcg_cap & MCG_CTL_P) )
+        /* If MCG_CTL exist then stick to all 1's. If not exist then GP# */
+        if ( !(cur->arch.vmce.mcg_cap & MCG_CTL_P) )
         {
             mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n");
             ret = -1;
         }
         break;
     case MSR_IA32_MCG_STATUS:
-        vmce->mcg_status = val;
+        cur->arch.vmce.mcg_status = val;
         mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
-        /* For HVM guest, this is the point for deleting vMCE injection node */
-        if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) )
-        {
-            vmce->nr_injection--; /* Should be 0 */
-            if ( !list_empty(&vmce->impact_header) )
-            {
-                entry = list_entry(vmce->impact_header.next,
-                                   struct bank_entry, list);
-                if ( entry->mci_status & MCi_STATUS_VAL )
-                    mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
-                               "been cleared before write MCG_STATUS MSR\n");
-
-                mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
-                           "Node, nr_injection %u\n",
-                           vmce->nr_injection);
-                list_del(&entry->list);
-                xfree(entry);
-            }
-            else
-                mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
-                           " last injection Node, something Wrong!\n");
-        }
         break;
     case MSR_IA32_MCG_CAP:
-        mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
-        ret = -1;
+        /*
+         * According to Intel SDM, IA32_MCG_CAP is a read-only register,
+         * the effect of writing to the IA32_MCG_CAP is undefined. Here we
+         * treat writing as 'write not change'. Guest would not surprise.
+         */
+        mce_printk(MCE_QUIET, "MCE: MCG_CAP is read only and write not change\n");
         break;
     default:
         ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
         break;
     }
 
-    spin_unlock(&vmce->lock);
+    spin_unlock(&cur->arch.vmce.lock);
     return ret;
 }
 
@@ -353,7 +309,7 @@
 
     for_each_vcpu( d, v ) {
         struct hvm_vmce_vcpu ctxt = {
-            .caps = v->arch.mcg_cap
+            .caps = v->arch.vmce.mcg_cap
         };
 
         err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
@@ -433,93 +389,38 @@
     return 0;
 }
 
-/* This node list records errors impacting a domain. when one
- * MCE# happens, one error bank impacts a domain. This error node
- * will be inserted to the tail of the per_dom data for vMCE# MSR
- * virtualization. When one vMCE# injection is finished processing
- * processed by guest, the corresponding node will be deleted.
- * This node list is for GUEST vMCE# MSRS virtualization.
- */
-static struct bank_entry* alloc_bank_entry(void)
+int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
+                   uint64_t gstatus)
 {
-    struct bank_entry *entry;
+    struct vcpu *v = d->vcpu[0];
 
-    entry = xzalloc(struct bank_entry);
-    if ( entry == NULL )
-    {
-        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
-        return NULL;
-    }
-
-    INIT_LIST_HEAD(&entry->list);
-    return entry;
-}
-
-/* Fill error bank info for #vMCE injection and GUEST vMCE#
- * MSR virtualization data
- * 1) Log down how many nr_injections of the impacted.
- * 2) Copy MCE# error bank to impacted DOM node list,
- *    for vMCE# MSRs virtualization
- */
-int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
-                   uint64_t gstatus) {
-    struct bank_entry *entry;
-
-    /* This error bank impacts one domain, we need to fill domain related
-     * data for vMCE MSRs virtualization and vMCE# injection */
     if ( mc_bank->mc_domid != (uint16_t)~0 )
     {
-        /* For HVM guest, Only when first vMCE is consumed by HVM guest
-         * successfully, will we generete another node and inject another vMCE.
-         */
-        if ( d->is_hvm && (dom_vmce(d)->nr_injection > 0) )
+        if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
         {
-            mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
+            mce_printk(MCE_QUIET, "MCE: guest has not handled previous"
                        " vMCE yet!\n");
             return -1;
         }
 
-        entry = alloc_bank_entry();
-        if ( entry == NULL )
-            return -1;
+        spin_lock(&v->arch.vmce.lock);
 
-        entry->mci_status = mc_bank->mc_status;
-        entry->mci_addr = mc_bank->mc_addr;
-        entry->mci_misc = mc_bank->mc_misc;
-        entry->bank = mc_bank->mc_bank;
+        v->arch.vmce.mcg_status = gstatus;
+        /*
+         * 1. Skip BANK0 to avoid 'bank0 quirk' of old processors
+         * 2. Filter MCi_STATUS MSCOD model specific error code to guest
+         */
+        v->arch.vmce.bank[BANK1].mci_status = mc_bank->mc_status &
+                                              MCi_STATUS_MSCOD_MASK;
+        v->arch.vmce.bank[BANK1].mci_addr = mc_bank->mc_addr;
+        v->arch.vmce.bank[BANK1].mci_misc = mc_bank->mc_misc;
 
-        spin_lock(&dom_vmce(d)->lock);
-        /* New error Node, insert to the tail of the per_dom data */
-        list_add_tail(&entry->list, &dom_vmce(d)->impact_header);
-        /* Fill MSR global status */
-        dom_vmce(d)->mcg_status = gstatus;
-        /* New node impact the domain, need another vMCE# injection*/
-        dom_vmce(d)->nr_injection++;
-        spin_unlock(&dom_vmce(d)->lock);
-
-        mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
-                   "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
-                   mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
-                   mc_bank->mc_domid);
+        spin_unlock(&v->arch.vmce.lock);
     }
 
     return 0;
 }
 
-#if 0 /* currently unused */
-int vmce_domain_inject(
-    struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global)
-{
-    int ret;
-
-    ret = fill_vmsr_data(bank, d, global->mc_gstatus);
-    if ( ret < 0 )
-        return ret;
-
-    return inject_vmce(d);
-}
-#endif
-
 static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d)
 {
     struct vcpu *v;
diff -r 585fe3e5a608 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/arch/x86/domain.c     Thu Jul 19 21:42:55 2012 +0800
@@ -597,9 +597,6 @@
 
         if ( (rc = iommu_domain_init(d)) != 0 )
             goto fail;
-
-        /* For Guest vMCE MSRs virtualization */
-        vmce_init_msr(d);
     }
 
     if ( is_hvm_domain(d) )
@@ -627,7 +624,6 @@
 
  fail:
     d->is_dying = DOMDYING_dead;
-    vmce_destroy_msr(d);
     cleanup_domain_irq_mapping(d);
     free_xenheap_page(d->shared_info);
     if ( paging_initialised )
@@ -656,7 +652,6 @@
     else
         xfree(d->arch.pv_domain.e820);
 
-    vmce_destroy_msr(d);
     free_domain_pirqs(d);
     if ( !is_idle_domain(d) )
         iommu_domain_destroy(d);
diff -r 585fe3e5a608 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/arch/x86/domctl.c     Thu Jul 19 21:42:55 2012 +0800
@@ -1023,7 +1023,7 @@
                 evc->syscall32_callback_eip    = 0;
                 evc->syscall32_disables_events = 0;
             }
-            evc->mcg_cap = v->arch.mcg_cap;
+            evc->mcg_cap = v->arch.vmce.mcg_cap;
         }
         else
         {
diff -r 585fe3e5a608 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/arch/x86/traps.c      Thu Jul 19 21:42:55 2012 +0800
@@ -3227,50 +3227,6 @@
                 break;
     ASSERT(trap <= VCPU_TRAP_LAST);
 
-    /* inject vMCE to PV_Guest including DOM0. */
-    if ( trap == VCPU_TRAP_MCE )
-    {
-        gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n");
-        if ( curr->vcpu_id == 0 )
-        {
-            struct domain *d = curr->domain;
-
-            if ( !d->arch.vmca_msrs->nr_injection )
-            {
-                printk(XENLOG_WARNING "MCE: ret from vMCE#, "
-                       "no injection node\n");
-                goto end;
-            }
-
-            d->arch.vmca_msrs->nr_injection--;
-            if ( !list_empty(&d->arch.vmca_msrs->impact_header) )
-            {
-                struct bank_entry *entry;
-
-                entry = list_entry(d->arch.vmca_msrs->impact_header.next,
-                                   struct bank_entry, list);
-                gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n");
-                list_del(&entry->list);
-            }
-            else
-                printk(XENLOG_ERR "MCE: didn't found last injection node\n");
-
-            /* further injection */
-            if ( d->arch.vmca_msrs->nr_injection > 0 &&
-                 guest_has_trap_callback(d, 0, TRAP_machine_check) &&
-                 !test_and_set_bool(curr->mce_pending) )
-            {
-                int cpu = smp_processor_id();
-
-                cpumask_copy(curr->cpu_affinity_tmp, curr->cpu_affinity);
-                printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n",
-                       cpu, curr->processor);
-                vcpu_set_affinity(curr, cpumask_of(cpu));
-            }
-        }
-    }
-
-end:
     /* Restore previous asynchronous exception mask. */
     curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
 }
diff -r 585fe3e5a608 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/include/asm-x86/domain.h      Thu Jul 19 21:42:55 2012 +0800
@@ -309,9 +309,6 @@
 
     struct PITState vpit;
 
-    /* For Guest vMCA handling */
-    struct domain_mca_msrs *vmca_msrs;
-
     /* TSC management (emulation, pv, scaling, stats) */
     int tsc_mode;            /* see include/asm-x86/time.h */
     bool_t vtsc;             /* tsc is emulated (may change after migrate) */
@@ -490,7 +487,7 @@
      * and thus should be saved/restored. */
     bool_t nonlazy_xstate_used;
 
-    uint64_t mcg_cap;
+    struct vmce vmce;
     
     struct paging_vcpu paging;
 
diff -r 585fe3e5a608 xen/include/asm-x86/mce.h
--- a/xen/include/asm-x86/mce.h Thu Jul 19 21:38:15 2012 +0800
+++ b/xen/include/asm-x86/mce.h Thu Jul 19 21:42:55 2012 +0800
@@ -3,28 +3,46 @@
 #ifndef _XEN_X86_MCE_H
 #define _XEN_X86_MCE_H
 
-/* This entry is for recording bank nodes for the impacted domain,
- * put into impact_header list. */
-struct bank_entry {
-    struct list_head list;
-    uint16_t bank;
+/*
+ * Emulate 2 banks for guest
+ * Bank0: reserved for 'bank0 quirk' occur at some very old processors:
+ *   1). Intel cpu whose family-model value < 06-1A;
+ *   2). AMD K7
+ * Bank1: used to transfer error info to guest
+ */
+#define BANK0 0
+#define BANK1 1
+#define GUEST_BANK_NUM 2
+
+/*
+ * MCG_SER_P:  software error recovery supported
+ * MCG_TES_P:  to avoid MCi_status bit56:53 model specific
+ * MCG_CMCI_P: expose CMCI capability but never really inject it to guest,
+ *             for sake of performance since guest not polling periodically
+ */
+#define GUEST_MCG_CAP (MCG_SER_P | MCG_TES_P | MCG_CMCI_P | GUEST_BANK_NUM)
+
+/* Filter MSCOD model specific error code to guest */
+#define MCi_STATUS_MSCOD_MASK (~(0x0ffffUL << 16))
+
+/* No mci_ctl since it stick all 1's */
+struct vmce_bank {
     uint64_t mci_status;
     uint64_t mci_addr;
     uint64_t mci_misc;
+    uint64_t mci_ctl2;
 };
 
-struct domain_mca_msrs
-{
-    /* Guest should not change below values after DOM boot up */
+/* No mcg_ctl since it not expose to guest */
+struct vmce {
+    uint64_t mcg_cap;
     uint64_t mcg_status;
-    uint16_t nr_injection;
-    struct list_head impact_header;
+    struct vmce_bank bank[GUEST_BANK_NUM];
+
     spinlock_t lock;
 };
 
 /* Guest vMCE MSRs virtualization */
-extern int vmce_init_msr(struct domain *d);
-extern void vmce_destroy_msr(struct domain *d);
 extern void vmce_init_vcpu(struct vcpu *);
 extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps);
 extern int vmce_wrmsr(uint32_t msr, uint64_t val);

Attachment: 3_vmce_emulation.patch
Description: 3_vmce_emulation.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.