[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH for-4.5] x86/stack: Avoid peeking into unmapped guard pages when dumping Xens stack



Currently, Xens stack tracing and dumping of its own stacks will always
attempt to continue to the top of the primary stack.  While this is fine for
99% of cases, it is incorrect when the stack pointer starts on an IST stack.

In particular, the stack analysis functions will wander up from the IST
stacks, through the syscall trampolines and then onto the primary stack.  If
MEMORY_GUARD is enabled, this will cause a pagefault when attempting to read
from the guard page.  Being an unhandled hypervisor fault, the pagefault
handler will then attempt to dump the stacks, and fall over the same problem.

This change introduces more finegrained knowledge of the cpu stack layouts,
and introduces different boundaries for whether the stack pointer is on an IST
stack or the primary stack.  Stack analysis starting from an IST stack will
now never exceed the stack they start on, and specifically not spill over into
an adjacent IST stack, or the syscall trampoline area.

A sample now looks like:
(XEN) '1' pressed -> testing mce stack printing
(XEN) ----[ Xen-4.5.0-rc  x86_64  debug=n  Not tainted ]----
(XEN) CPU:    0
(XEN) RIP:    e008:[<ffff82d08019196b>] check_mce_test+0xb/0x20
(XEN) RFLAGS: 0000000000010002   CONTEXT: hypervisor
(XEN) rax: 0000000000000000   rbx: ffff82d0802caf58   rcx: 0000000000000000
(XEN) rdx: ffff82d080235d20   rsi: 000000000000000a   rdi: ffff82d0802caf58
(XEN) rbp: 0000000000000031   rsp: ffff82d0802caf40   r8:  ffff83007faf4000
(XEN) r9:  0000000000004000   r10: 0000000000000001   r11: 0000000000000006
(XEN) r12: ffff82d0802cfe58   r13: ffff82d0802cfe58   r14: dead0000c0de0000
(XEN) r15: ffff82d080191910   cr0: 000000008005003b   cr4: 00000000000026f0
(XEN) cr3: 0000000062565000   cr2: 00007fb98a5e8fe8
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e010   cs: e008
(XEN) Xen stack trace from rsp=ffff82d0802caf40:
(XEN)    ffff82d0801ad119 ffff82d080278da0 ffff82d080227907 ffff82d080191910
(XEN)    dead0000c0de0000 ffff82d0802cfe58 ffff82d0802cfe58 0000000000000031
(XEN)    ffff82d080278da0 0000000000000006 0000000000000001 0000000000004000
(XEN)    ffff83007faf4000 ffff82d0803101a0 0000000000000000 ffff82d0802c8000
(XEN)    000000000000000a ffff82d080277620 0000001200000000 ffff82d08018ae6a
(XEN)    000000000000e008 0000000000000292 ffff82d0802cfd18 000000000000e010
(XEN) Xen call trace:
(XEN)    [<ffff82d08019196b>] check_mce_test+0xb/0x20
(XEN)    [<ffff82d0801ad119>] do_machine_check+0x9/0x20
(XEN)    [<ffff82d080227907>] handle_ist_exception+0x8d/0xf6
(XEN)

In this test case, %r15 is deliberately set up with a function pointer in an
attempt to fool the naive stack trace algorithm.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CC: Keir Fraser <keir@xxxxxxx>
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Tim Deegan <tim@xxxxxxx>
CC: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>

---

This was discovered as a one-off by XenServer automated testing where a server
had been woken straight from mwait with an MCE, then suffered an NMI watchdog
timeout while waiting for the mce_logout_lock.  Given no printk() from any
other cpus, and a reset while transitioning into the crash kernel, there is
sadly no more analysis which can be performed at this stage.

The stack trace produces for this issue ended up picking out everything
looking like a function return pointer across 6 pages of stack, leading to a
very confusing call trace.

Konrad: I am requesting a release ack for this change.  It aids the clarity of
certain crash information, and prevents cascade pagefaults in certain
circumstances, which would prevent execution of the crash kernel or a system
reboot.
---
 xen/arch/x86/traps.c          |   83 ++++++++++++++++++++++++++++++++++++++---
 xen/include/asm-x86/current.h |   33 +++++++++++++---
 2 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 10fc2ca..e008295 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -193,6 +193,76 @@ static void show_guest_stack(struct vcpu *v, const struct 
cpu_user_regs *regs)
     printk("\n");
 }
 
+/*
+ * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
+ *
+ * Stack pages 0, 1 and 2:
+ *   These are all 1-page IST stacks.  Each of these stacks have an exception
+ *   frame and saved register state at the top.  The interesting bound for a
+ *   trace is the word adjacent to this, while the bound for a dump is the
+ *   very top, including the exception frame.
+ *
+ * Stack pages 3, 4 and 5:
+ *   None of these are particularly interesting.  With MEMORY_GUARD, page 5 is
+ *   explicitly not present, so attempting to dump or trace it is
+ *   counterproductive.  Without MEMORY_GUARD, it is possible for a call chain
+ *   to use the entire primary stack and wander into page 5.  In this case,
+ *   consider these pages an extension of the primary stack to aid debugging
+ *   hopefully rare situations where the primary stack has effective been
+ *   overflown.
+ *
+ * Stack pages 6 and 7:
+ *   These form the primary stack, and have a cpu_info at the top.  For a
+ *   trace, the interesting bound is adjacent to the cpu_info, while for a
+ *   dump, the entire cpu_info is interesting.
+ *
+ * For the cases where the stack should not be inspected, pretend that the
+ * passed stack pointer is already out of reasonable bounds.
+ */
+unsigned long get_stack_trace_bottom(unsigned long sp)
+{
+    switch ( get_stack_page(sp) )
+    {
+    case 0 ... 2:
+        return ROUNDUP(sp, PAGE_SIZE) -
+            offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
+
+#ifndef MEMORY_GUARD
+    case 3 ... 5:
+#endif
+    case 6 ... 7:
+        return ROUNDUP(sp, STACK_SIZE) -
+            sizeof(struct cpu_info) - sizeof(unsigned long);
+
+#ifdef MEMORY_GUARD
+    case 3 ... 5:
+#endif
+    default:
+        return sp - sizeof(unsigned long);
+    }
+}
+
+unsigned long get_stack_dump_bottom(unsigned long sp)
+{
+    switch ( get_stack_page(sp) )
+    {
+    case 0 ... 2:
+        return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
+
+#ifndef MEMORY_GUARD
+    case 3 ... 5:
+#endif
+    case 6 ... 7:
+        return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
+
+#ifdef MEMORY_GUARD
+    case 3 ... 5:
+#endif
+    default:
+        return sp - sizeof(unsigned long);
+    }
+}
+
 #if !defined(CONFIG_FRAME_POINTER)
 
 /*
@@ -203,7 +273,7 @@ static void show_guest_stack(struct vcpu *v, const struct 
cpu_user_regs *regs)
 static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
 {
     unsigned long *stack = (unsigned long *)sp, addr;
-    unsigned long *bottom = (unsigned long *)get_printable_stack_bottom(sp);
+    unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
 
     while ( stack <= bottom )
     {
@@ -221,7 +291,7 @@ static void _show_trace(unsigned long sp, unsigned long bp)
     unsigned long *frame, next, addr;
 
     /* Bounds for range of valid frame pointer. */
-    unsigned long low = sp, high = get_printable_stack_bottom(sp);
+    unsigned long low = sp, high = get_stack_trace_bottom(sp);
 
     /* The initial frame pointer. */
     next = bp;
@@ -292,7 +362,7 @@ static void show_trace(const struct cpu_user_regs *regs)
 
 void show_stack(const struct cpu_user_regs *regs)
 {
-    unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
+    unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
     int i;
 
     if ( guest_mode(regs) )
@@ -300,10 +370,11 @@ void show_stack(const struct cpu_user_regs *regs)
 
     printk("Xen stack trace from "__OP"sp=%p:\n  ", stack);
 
-    for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
+    stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
+
+    for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
+              (stack <= stack_bottom); i++ )
     {
-        if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
-            break;
         if ( (i != 0) && ((i % stack_words_per_line) == 0) )
             printk("\n  ");
         addr = *stack++;
diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index b95fd79..f011d2d 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -12,6 +12,28 @@
 #include <public/xen.h>
 #include <asm/page.h>
 
+/*
+ * Xen's cpu stacks are 8 pages (8-page aligned), arranged as:
+ *
+ * 7 - Primary stack (with a struct cpu_info at the top)
+ * 6 - Primary stack
+ * 5 - Optionally not preset (MEMORY_GUARD)
+ * 4 - unused
+ * 3 - Syscall trampolines
+ * 2 - MCE IST stack
+ * 1 - NMI IST stack
+ * 0 - Double Fault IST stack
+ */
+
+/*
+ * Identify which stack page the stack pointer is on.  Returns an index
+ * as per the comment above.
+ */
+static inline unsigned int get_stack_page(unsigned long sp)
+{
+    return (sp & (STACK_SIZE-1)) >> PAGE_SHIFT;
+}
+
 struct vcpu;
 
 struct cpu_info {
@@ -51,13 +73,12 @@ static inline struct cpu_info *get_cpu_info(void)
     ((unsigned long)&get_cpu_info()->guest_cpu_user_regs.es)
 
 /*
- * Get the bottom-of-stack, as useful for printing stack traces.  This is the
- * highest word on the stack which might be part of a stack trace, and is the
- * adjacent word to a struct cpu_info on the stack.
+ * Get the reasonable stack bounds for stack traces and stack dumps.  Stack
+ * dumps have a slightly larger range to include exception frames in the
+ * printed information.  The returned word is inside the interesting range.
  */
-#define get_printable_stack_bottom(sp)          \
-    ((sp & (~(STACK_SIZE-1))) +                 \
-     (STACK_SIZE - sizeof(struct cpu_info) - sizeof(unsigned long)))
+unsigned long get_stack_trace_bottom(unsigned long sp);
+unsigned long get_stack_dump_bottom (unsigned long sp);
 
 #define reset_stack_and_jump(__fn)                                      \
     ({                                                                  \
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.