[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC PATCH v3.1 1/2] xsplice: rfc.v3.1



*TODO*:
 - XSM add
 - Contributs/authors on xsplice.markdown
 - Figure out the preemption method (rfc.v4 target). Talk in Seattle?
 - Further work - write out an Wiki detailing what implementation
   pieces to be done for individual contributions. Do it before
   Seattle?

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
---
 docs/misc/xsplice.h           |   80 +++
 docs/misc/xsplice.markdown    | 1230 +++++++++++++++++++++++++++++++++++++++++
 docs/misc/xsplice_test.c      |   78 +++
 tools/libxc/include/xenctrl.h |   16 +
 tools/libxc/xc_misc.c         |  183 ++++++
 tools/misc/Makefile           |    4 +
 tools/misc/xen-xsplice.c      |  360 ++++++++++++
 xen/common/Makefile           |    1 +
 xen/common/keyhandler.c       |    8 +-
 xen/common/sysctl.c           |    5 +
 xen/common/xsplice.c          |  405 ++++++++++++++
 xen/include/public/sysctl.h   |   66 +++
 xen/include/xen/xsplice.h     |    9 +
 13 files changed, 2444 insertions(+), 1 deletion(-)
 create mode 100644 docs/misc/xsplice.h
 create mode 100644 docs/misc/xsplice.markdown
 create mode 100644 docs/misc/xsplice_test.c
 create mode 100644 tools/misc/xen-xsplice.c
 create mode 100644 xen/common/xsplice.c
 create mode 100644 xen/include/xen/xsplice.h

diff --git a/docs/misc/xsplice.h b/docs/misc/xsplice.h
new file mode 100644
index 0000000..00061fc
--- /dev/null
+++ b/docs/misc/xsplice.h
@@ -0,0 +1,80 @@
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#define XSPLICE_HOWTO_INLINE        0x1 /* It is an inline replacement. */  
+#define XSPLICE_HOWTO_RELOC_PATCH   0x2 /* Add an trampoline. */  
+
+#define XSPLICE_HOWTO_FLAG_PC_REL    0x1 /* Is PC relative. */  
+#define XSPLICE_HOWOT_FLAG_SIGN      0x2 /* Should the new value be treated as 
signed value. */  
+
+struct xsplice_reloc_howto {  
+    uint32_t    howto;/* XSPLICE_HOWTO_* */  
+    uint32_t    flag; /* XSPLICE_HOWTO_FLAG_* */  
+    uint32_t    size; /* Size, in bytes, of the item to be relocated. */  
+    uint32_t    r_shift; /* The value the final relocation is shifted right 
by; used to drop unwanted data from the relocation. */  
+    uint64_t    mask; /* Bitmask for which parts of the instruction or data 
are replaced with the relocated value. */  
+    uint8_t     pad[8]; /* Must be zero. */  
+};  
+
+struct xsplice_symbol {  
+    const char *name; /* The ELF name of the symbol. */  
+    const char *label; /* A unique xSplice name for the symbol. */  
+    uint8_t pad[16]; /* Must be zero. */  
+};  
+#define XSPLICE_PATCH_INLINE_TEXT   0x1
+#define XSPLICE_PATCH_INLINE_DATA   0x2
+#define XSPLICE_PATCH_RELOC_TEXT    0x3
+
+struct xsplice_patch {  
+    uint32_t type; /* XSPLICE_PATCH_* .*/  
+    uint32_t size; /* Size of patch. */  
+    uint64_t addr; /* The address of the inline new code (or data). */  
+    void *content; /* The bytes to be installed. */  
+    uint8_t pad[40]; /* Must be zero. */  
+};
+
+#define XSPLICE_SECTION_TEXT   0x00000001 /* Section is in .text */  
+#define XSPLICE_SECTION_RODATA 0x00000002 /* Section is in .rodata */  
+#define XSPLICE_SECTION_DATA   0x00000004 /* Section is in .data */  
+#define XSPLICE_SECTION_STRING 0x00000008 /* Section is in .str */  
+
+#define XSPLICE_SECTION_TEXT_INLINE 0x00000200 /* Change is to be inline. */   
+#define XSPLICE_SECTION_MATCH_EXACT 0x00000400 /* Must match exactly. */  
+#define XSPLICE_SECTION_NO_STACKCHECK 0x00000800 /* Do not check the stack. */ 
 
+
+struct xsplice_section {  
+    struct xsplice_symbol *symbol; /* The symbol associated with this change. 
*/  
+    uint64_t address; /* The address of the section (if known). */  
+    uint32_t size; /* The size of the section. */  
+    uint32_t flags; /* Various XSPLICE_SECTION_* flags. */
+    uint8_t pad[12]; /* To be zero. */  
+};
+
+struct xsplice_reloc {  
+    uint64_t addr; /* The address of the relocation (if known). */  
+    struct xsplice_symbol *symbol; /* Symbol for this relocation. */  
+    int64_t isns_target; /* rest of the ELF addend.  This is equal to the 
offset against the symbol that the relocation refers to. */  
+    struct xsplice_reloc_howto  *howto; /* Pointer to the above structure. */  
+    int64_t isns_added; /* ELF addend resulting from quirks of instruction one 
of whose operands is the relocation. For example, this is -4 on x86 pc-relative 
jumps. */  
+    uint8_t pad[24];  /* Must be zero. */  
+};  
+
+struct xsplice_code {  
+    struct xsplice_reloc *relocs; /* How to patch it. */  
+    uint32_t n_relocs;
+    struct xsplice_section *sections; /* Safety data. */  
+    uint32_t n_sections;
+    struct xsplice_patch *patches; /* Patch code and data */  
+    uint32_t n_patches;
+    uint8_t pad[28]; /* Must be zero. */
+};
+struct xsplice {
+    uint32_t version;
+    const char *name; /* A sensible name for the patch. Up to 40 characters. 
*/  
+    const char *id; /* ID of the hypervisor this binary was built against. */  
+    uint32_t id_size;
+    struct xsplice_code *new; /* Pointer to the new code to be patched. */  
+    struct xsplice_code *old; /* Pointer to the old code to be checked 
against. */  
+    uint8_t pad[24];  /* Must be zero. */  
+};
diff --git a/docs/misc/xsplice.markdown b/docs/misc/xsplice.markdown
new file mode 100644
index 0000000..02fd4d1
--- /dev/null
+++ b/docs/misc/xsplice.markdown
@@ -0,0 +1,1230 @@
+# xSplice Design v1 (EXTERNAL RFC v3)
+
+## Rationale
+
+A mechanism is required to binarily patch the running hypervisor with new
+opcodes that have come about due to primarily security updates.
+
+This document describes the design of the API that would allow us to
+upload to the hypervisor binary patches.
+
+The document is split in four sections:
+ - Detailed descriptions of the problem statement.
+ - Design of the data structures.
+ - Design of the hypercalls.
+ - Implementation notes that should be taken into consideration.
+
+
+## Glossary
+
+ * splice - patch in the binary code with new opcodes
+ * trampoline - a jump to a new instruction.
+ * payload - telemetries of the old code along with binary blob of the new
+   function (if needed).
+ * reloc - telemetries contained in the payload to construct proper trampoline.
+
+## Multiple ways to patch
+
+The mechanism needs to be flexible to patch the hypervisor in multiple ways
+and be as simple as possible. The compiled code is contiguous in memory with
+no gaps - so we have no luxury of 'moving' existing code and must either
+insert a trampoline to the new code to be executed - or only modify in-place
+the code if there is sufficient space. The placement of new code has to be done
+by hypervisor and the virtual address for the new code is allocated 
dynamically.
+
+This implies that the hypervisor must compute the new offsets when splicing
+in the new trampoline code. Where the trampoline is added (inside
+the function we are patching or just the callers?) is also important.
+
+To lessen the amount of code in hypervisor, the consumer of the API
+is responsible for identifying which mechanism to employ and how many locations
+to patch. Combinations of modifying in-place code, adding trampoline, etc
+has to be supported. The API should allow read/write any memory within
+the hypervisor virtual address space.
+
+We must also have a mechanism to query what has been applied and a mechanism
+to revert it if needed.
+
+We must also have a mechanism to: (optional) provide an copy of the old code - 
so
+that the hypervisor can verify it against the code in memory; the new code;
+the symbol name of the function to be patched; or offset from the symbol;
+or virtual address.
+
+The complications that this design will encounter are explained later
+in this document.
+
+## Workflow
+
+
+The expected workflows of higher-level tools that manage multiple patches
+on production machines would be:
+
+ * The first obvious task is loading all available / suggested
+   hotpatches around system start.
+ * Whenever new hotpatches are installed, they should be loaded too.
+ * One wants to query which modules have been loaded at runtime.
+ * If unloading is deemed safe (see unloading below), one may want to
+   support a workflow where a specific hotpatch is marked as bad and
+   unloaded.
+ * If we do no restrict module activation order and want to report tboot
+   state on sequences, we might have a complexity explosion problem, in
+   what system hashes should be considered acceptable.
+
+## Patching code
+
+The first mechanism to patch that comes in mind is in-place replacement.
+That is replace the affected code with new code. Unfortunately the x86
+ISA is variable size which places limits on how much space we have available
+to replace the instructions. That is not a problem if the change is smaller
+than the original opcode and we can fill it with nops. Problems will
+appear if the replacement code is longer.
+
+The second mechanism is by replacing the call or jump to the
+old function with the address of the new function.
+
+A third mechanism is to add a jump to the new function at the
+start of the old function.
+
+### Example of trampoline and in-place splicing
+
+As example we will assume the hypervisor does not have XSA-132 (see
+*domctl/sysctl: don't leak hypervisor stack to toolstacks*
+4ff3449f0e9d175ceb9551d3f2aecb59273f639d) and we would like to binary patch
+the hypervisor with it. The original code looks as so:
+
+<pre>
+   48 89 e0                  mov    %rsp,%rax  
+   48 25 00 80 ff ff         and    $0xffffffffffff8000,%rax  
+</pre>
+
+while the new patched hypervisor would be:
+
+<pre>
+   48 c7 45 b8 00 00 00 00   movq   $0x0,-0x48(%rbp)  
+   48 c7 45 c0 00 00 00 00   movq   $0x0,-0x40(%rbp)  
+   48 c7 45 c8 00 00 00 00   movq   $0x0,-0x38(%rbp)  
+   48 89 e0                  mov    %rsp,%rax  
+   48 25 00 80 ff ff         and    $0xffffffffffff8000,%rax  
+</pre>
+
+This is inside the arch_do_domctl. This new change adds 21 extra
+bytes of code which alters all the offsets inside the function. To alter
+these offsets and add the extra 21 bytes of code we might not have enough
+space in .text to squeeze this in.
+
+As such we could simplify this problem by only patching the site
+which calls arch_do_domctl:
+
+<pre>
+<do_domctl>:  
+ e8 4b b1 05 00          callq  ffff82d08015fbb9 <arch_do_domctl>  
+</pre>
+
+with a new address for where the new `arch_do_domctl` would be (this
+area would be allocated dynamically).
+
+Astute readers will wonder what we need to do if we were to patch `do_domctl`
+- which is not called directly by hypervisor but on behalf of the guests via
+the `compat_hypercall_table` and `hypercall_table`.
+Patching the offset in `hypercall_table` for `do_domctl:
+(ffff82d080103079 <do_domctl>:)
+<pre>
+
+ ffff82d08024d490:   79 30  
+ ffff82d08024d492:   10 80 d0 82 ff ff   
+
+</pre>
+with the new address where the new `do_domctl` is possible. The other
+place where it is used is in `hvm_hypercall64_table` which would need
+to be patched in a similar way. This would require an in-place splicing
+of the new virtual address of `arch_do_domctl`.
+
+In summary this example patched the callee of the affected function by
+ * allocating memory for the new code to live in,
+ * changing the virtual address in all the functions which called the old
+   code (computing the new offset, patching the callq with a new callq).
+ * changing the function pointer tables with the new virtual address of
+   the function (splicing in the new virtual address). Since this table
+   resides in the .rodata section we would need to temporarily change the
+   page table permissions during this part.
+
+
+However it has severe drawbacks - the safety checks which have to make sure
+the function is not on the stack - must also check every caller. For some
+patches this could mean - if there were an sufficient large amount of
+callers - that we would never be able to apply the update.
+
+### Example of different trampoline patching.
+
+An alternative mechanism exists where we can insert a trampoline in the
+existing function to be patched to jump directly to the new code. This
+lessens the locations to be patched to one but it puts pressure on the
+CPU branching logic (I-cache, but it is just one unconditional jump).
+
+For this example we will assume that the hypervisor has not been compiled
+with fe2e079f642effb3d24a6e1a7096ef26e691d93e (XSA-125: *pre-fill structures
+for certain HYPERVISOR_xen_version sub-ops*) which mem-sets an structure
+in `xen_version` hypercall. This function is not called **anywhere** in
+the hypervisor (it is called by the guest) but referenced in the
+`compat_hypercall_table` and `hypercall_table` (and indirectly called
+from that). Patching the offset in `hypercall_table` for the old
+`do_xen_version` (ffff82d080112f9e <do_xen_version>)
+
+</pre>
+ ffff82d08024b270 <hypercall_table>  
+ ...  
+ ffff82d08024b2f8:   9e 2f 11 80 d0 82 ff ff  
+
+</pre>
+with the new address where the new `do_xen_version` is possible. The other
+place where it is used is in `hvm_hypercall64_table` which would need
+to be patched in a similar way. This would require an in-place splicing
+of the new virtual address of `do_xen_version`.
+
+An alternative solution would be to patch insert a trampoline in the
+old `do_xen_version' function to directly jump to the new `do_xen_version`.
+
+<pre>
+ ffff82d080112f9e <do_xen_version>:  
+ ffff82d080112f9e:       48 c7 c0 da ff ff ff    mov    
$0xffffffffffffffda,%rax  
+ ffff82d080112fa5:       83 ff 09                cmp    $0x9,%edi  
+ ffff82d080112fa8:       0f 87 24 05 00 00       ja     ffff82d0801134d2 
<do_xen_version+0x534>  
+</pre>
+
+with:
+
+<pre>
+ ffff82d080112f9e <do_xen_version>:  
+ ffff82d080112f9e:       e9 XX YY ZZ QQ          jmpq   [new do_xen_version]  
+</pre>
+
+which would lessen the amount of patching to just one location.
+
+In summary this example patched the affected function to jump to the
+new replacement function which required:
+ * allocating memory for the new code to live in,
+ * inserting trampoline with new offset in the old function to point to the
+   new function.
+ * Optionally we can insert in the old function a trampoline jump to an 
function
+   providing an BUG_ON to catch errant code.
+
+The disadvantage of this are that the unconditional jump will consume a small
+I-cache penalty. However the simplicity of the patching and higher chance
+of passing safety checks make this a worthwhile option.
+
+### Security
+
+With this method we can re-write the hypervisor - and as such we **MUST** be
+diligent in only allowing certain guests to perform this operation.
+
+Furthermore with SecureBoot or tboot, we **MUST** also verify the signature
+of the payload to be certain it came from a trusted source and integrity
+was intact.
+
+As such the hypercall **MUST** support an XSM policy to limit what the guest
+is allowed to invoke. If the system is booted with signature checking the
+signature checking will be enforced.
+
+## Design of payload format
+
+The payload **MUST** contain enough data to allow us to apply the update
+and also safely reverse it. As such we **MUST** know:
+
+ * (optional) What the old code is expected to be. We **MUST** be able verify 
it
+   against the runtime code if old code is included in the payload.
+ * Verify the build-id of hypervisor against the payload build-id.
+ * The locations in memory to be patched. This can be determined dynamically
+   via symbols or via virtual addresses.
+ * The new code (or data) that will be patched in.
+ * Signature to verify the payload.
+
+This binary format can be constructed using an custom binary format but
+there are severe disadvantages of it:
+
+ * The format might need to be changed and we need an mechanism to accommodate
+   that.
+ * It has to be platform agnostic.
+ * Easily constructed using existing tools.
+
+As such having the payload in an ELF file is the sensible way. We would be
+carrying the various sets of structures (and data) in the ELF sections under
+different names and with definitions. The prefix for the ELF section name
+would always be: *.xsplice* to match up to the names of the structures.
+
+Note that every structure has padding. This is added so that the hypervisor
+can re-use those fields as it sees fit.
+
+Earlier design attempted to ineptly explain the relations of the ELF sections
+to each other without using proper ELF mechanism (sh_info, sh_link, data
+structures using Elf_* types, etc). This design will explain in detail
+the structures and how they are used together and not dig in the ELF
+format - except mention that the section names should match the
+structure names.
+
+### ASCII art of structures.
+
+The diagram below is omitting some entries to easy the relationship 
explanation.
+
+<pre>
+                                                                          
/---------------------\  
+                                                                       +->| 
xsplice_reloc_howto |  
+                                                                      /   
\---------------------/  
+                                                /---------------\ 1:1/  
+                                             +->| xsplice_reloc |   /  
+                                            /   | - howto       +--/  1:1 
/----------------\  
+                                           /    | - symbol      +-------->| 
xsplice_symbol |  
+                                     1:N  /     \---------------/       / 
\----------------/  
+/----------\        /--------------\     /                             /  
+| xsplice  |  1:1   | xsplice_code |    /                          1:1/  
+| - new    +------->|  - relocs    +---/  1:N   /-----------------\  /  
+| - old    +------->|  - sections  +----------->| xsplice_section | /  
+\----------/        |  - patches   +--\         | - symbol        +/ 1:1   
/----------------\  
+                    \--------------/   \        | - addr          +------->| 
.text or .data |  
+                                        \       \----------------/         
\----------------/  
+                                         \  
+                                      1:N \  
+                                           \    /----------------\  
+                                            +-->| xsplice_patch  |  1:1  
/----------------\  
+                                                | - content      +------>| 
binary code or |  
+                                                \----------------/       | 
data           |  
+                                                                         
\----------------/  
+
+</pre>
+
+### xsplice structures
+
+From the top (or left in the above diagram) the structures are:
+
+ *  `xsplice`. The top most structure - contains the the name of the update,
+    the id to match against the hypervisor, the pointer to the metadata for
+    the new code and optionally the metadata for the old code.
+
+ * `xsplice_code`. The structure that ties all of this together and defines
+   the payload. Contains arrays of `xsplice_reloc`, `xsplice_section`, and
+   `xsplice_patch`.
+
+ * `xsplice_reloc` contains telemetry used for patching - which describes the
+   targets to be patched and how to do it.
+
+ * `xsplice_section` - the safety data for the code. Contains pointer to the
+   symbol (`xsplice_symbols`) and pointer to the code (`.text`) or data 
(`.data`),
+   which are to be used during safety and dependency checking.
+
+ * `xsplice_patch`: the description of the new function to be patched in
+   along with the binary code or data.
+
+ * ` xsplice_reloc_howto`: the howto properly construct trampolines for an 
patch.
+   We may have multiple locations for which we need to insert a trampoline for 
a
+   payload and each location might require a different way of handling it.
+
+ * `xsplice_symbols `.  The symbol that will be patched.
+
+In short the *.xsplice* sections (with `xsplice` being the top) represent
+various structures to define the new code and safety checks for the old
+code (optional). The ELF provides the mechanism to glue it all together when
+loaded in memory.
+
+
+Note that a lot of these ideas are borrowed from kSplice which is
+available at: https://github.com/jirislaby/ksplice
+
+### struct xsplice
+
+The top most structure is quite simple. It defines the name, the id
+of the hypervisor, pointer to the new code & data and an pointer to
+the old code & data (optional).
+
+The `new` code uses all of the `xsplice_*` structures while the
+`old`  does not use the `xsplice_reloc` structures.
+
+The sections defining the structures will explicitly state
+when they are not used.
+
+<pre>
+struct xsplice {
+    uint32_t version; /* Version of payload. */
+    const char *name; /* A sensible name for the patch. Up to 40 characters. 
*/  
+    const char *id; /* ID of the hypervisor this binary was built against. */  
+    uint32_t id_size; /* Size of the ID. */  
+    struct xsplice_code *new; /* Pointer to the new code & data to be patched. 
*/  
+    struct xsplice_code *old; /* Pointer to the old code & data to be checked 
against. */  
+    uint8_t pad[24];  /* Must be zero. */  
+};
+</pre>
+
+The size of this structure should be 64 bytes.
+
+### xsplice_code
+
+The structure embedded within this section ties the other
+structures together. It has the pointers with an start and end
+address for each set of structures. This means that an update
+can be split in multiple changes - for example to accomodate
+an update that contains both code and data and will need patching
+in both .text and .data sections.
+
+<pre>
+struct xsplice_code {  
+    struct xsplice_reloc *relocs; /* How to patch it. */  
+    uint32_t n_relocs;
+    struct xsplice_section *sections; /* Safety data. */  
+    uint32_t n_sections;
+    struct xsplice_patch *patches; /* Patch code and data */  
+    uint32_t n_patches;  
+    uint8_t pad[28]; /* Must be zero. */
+};
+</pre>
+
+The size of this structure is 64 bytes.
+
+There can be at most two of those structures in the payload.
+One for the `new` and another for the `old` (optional).
+
+If it is for the old code the relocs, and relocs_end values will be ignored.
+
+
+### xsplice_reloc
+
+The `xsplice_code` defines an array of these structures. As such
+an singular structure defines an singular point where to patch the
+hypervisor.
+
+The structure contains the address of the hypervisor (if known),
+the symbol associated with this address, how the patching is to
+be done, and platform specific details.
+
+The `isns_added` is an value to be used to compute the new offset
+due to the quirks of the operands of the instruction. For example
+to patch in an jump operation to the new code - the offset is relative
+to the program counter of the next instruction - hence the offset
+value has to be subtracted by four bytes - hence this would contain -4 .
+
+The `isns_target` is the offset against the symbol.
+
+The relation of this structure with `xsplice_patch` is 1:1, even
+for inline patches. See the section detailing the structure
+`xsplice_reloc_howto`.
+
+The relation of this structure with `xsplice_section` is 1:1.
+
+This structure is as follow:
+
+<pre>
+struct xsplice_reloc {  
+    uint64_t addr; /* The address of the relocation (if known). */  
+    struct xsplice_symbol *symbol; /* Symbol for this relocation. */  
+    int64_t isns_target; /* rest of the ELF addend.  This is equal to the 
offset against the symbol that the relocation refers to. */  
+    struct xsplice_reloc_howto  *howto; /* Pointer to the above structure. */  
+    int64_t isns_added; /* ELF addend resulting from quirks of instruction one 
of whose operands is the relocation. For example, this is -4 on x86 pc-relative 
jumps. */  
+    uint8_t pad[24];  /* Must be zero. */  
+};  
+
+</pre>
+
+The size of this structure is 64 bytes.
+
+### xsplice_section
+
+The structure defined in this section is used during pre-patching and
+during patching. Pre-patching it is used to verify that it is safe
+to update with the new changes - and contains safety data on the old code
+and what kind of matching we are to expect.
+
+That is whether the address (either provided or resolved when payload is
+loaded by referencing the symbols) is:
+
+ * in memory,
+ * correct size,
+ * in it's proper ELF section,
+ * has been already patched (or not),
+ * is expected not to be on any CPU stack - (or if it is OK for it be on the 
CPU stack).
+
+with what we expect it to be.
+
+Some of the checks can be relaxed, as such the `flag` values
+can be or-ed together.
+
+Depending on the time when patching is done, stack checking might not
+be required.
+<pre>
+
+#define XSPLICE_SECTION_TEXT   0x00000001 /* Section is in .text */  
+#define XSPLICE_SECTION_RODATA 0x00000002 /* Section is in .rodata */  
+#define XSPLICE_SECTION_DATA   0x00000004 /* Section is in .data */  
+#define XSPLICE_SECTION_STRING 0x00000008 /* Section is in .str */  
+
+#define XSPLICE_SECTION_TEXT_INLINE 0x00000200 /* Change is to be inline. */   
+#define XSPLICE_SECTION_MATCH_EXACT 0x00000400 /* Must match exactly. */  
+#define XSPLICE_SECTION_NO_STACKCHECK 0x00000800 /* Do not check the stack. */ 
 
+
+
+struct xsplice_section {  
+    struct xsplice_symbol *symbol; /* The symbol associated with this change. 
*/  
+    uint64_t address; /* The address of the section (if known). */  
+    uint32_t size; /* The size of the section. */  
+    uint32_t flags; /* Various XSPLICE_SECTION_* flags. */
+    uint8_t pad[12]; /* To be zero. */  
+};
+
+</pre>
+
+The size of this structure is 32 bytes.
+
+### xsplice_patch
+
+This structure has the binary code (or data) to be patched. Depending on the
+type it can either an inline patch (data or text) or require an relocation
+change (which requires a trampoline). Naturally it also points to a blob
+of the binary data to patch in, and the size of the patch.
+
+The `addr` is used when the patch is for inline change. It can be
+an virtual address or an offset from the symbol start.
+
+If it is an relocation (requiring a trampoline), the `addr` should be zero.
+
+There must be an corresponding ` struct xsplice_reloc` and
+`struct xsplice_section` describing this patch.
+
+<pre>
+#define XSPLICE_PATCH_INLINE_TEXT   0x1
+#define XSPLICE_PATCH_INLINE_DATA   0x2
+#define XSPLICE_PATCH_RELOC_TEXT    0x3
+
+struct xsplice_patch {  
+    uint32_t type; /* XSPLICE_PATCH_* .*/  
+    uint32_t size; /* Size of patch. */  
+    uint64_t addr; /* The address (or offset from symbol) of the inline new 
code (or data). */  
+    void *content; /* The bytes to be installed. */  
+    uint8_t pad[40]; /* Must be zero. */  
+};
+
+</pre>
+
+The size of this structure is 64 bytes.
+
+### xsplice_symbols
+
+The structure contains an pointer to the name of the ELF symbol
+to be patched and as well an unique name for the symbol.
+
+The `label` is used for diagnostic purposes - such as including the
+name and the offset.
+
+The structure is as follow:
+
+<pre>
+struct xsplice_symbol {  
+    const char *name; /* The ELF name of the symbol. */  
+    const char *label; /* A unique xSplice name for the symbol. */  
+    uint8_t pad[16]; /* Must be zero. */  
+};  
+</pre>
+
+The size of this structure is 32 bytes.
+
+
+### xsplice_reloc_howto
+
+The howto defines in the detail the change. It contains the type,
+whether the relocation is relative, the size of the relocation,
+bitmask for which parts of the instruction or data are to be replaced,
+amount the final relocation is shifted by (to drop unwanted data), and
+whether the replacement should be interpreted as signed value.
+
+The structure is as follow:
+
+<pre>
+#define XSPLICE_HOWTO_INLINE        0x1 /* It is an inline replacement. */  
+#define XSPLICE_HOWTO_RELOC_PATCH   0x2 /* Add a trampoline. */  
+
+#define XSPLICE_HOWTO_FLAG_PC_REL    0x1 /* Is PC relative. */  
+#define XSPLICE_HOWOT_FLAG_SIGN      0x2 /* Should the new value be treated as 
signed value. */  
+
+struct xsplice_reloc_howto {  
+    uint32_t    howto; /* XSPLICE_HOWTO_* */  
+    uint32_t    flag; /* XSPLICE_HOWTO_FLAG_* */  
+    uint32_t    size; /* Size, in bytes, of the item to be relocated. */  
+    uint32_t    r_shift; /* The value the final relocation is shifted right 
by; used to drop unwanted data from the relocation. */  
+    uint64_t    mask; /* Bitmask for which parts of the instruction or data 
are replaced with the relocated value. */  
+    uint8_t     pad[8]; /* Must be zero. */  
+};  
+
+</pre>
+
+The size of this structure is 32 bytes.
+
+### Example
+
+There is a wealth of information that the payload must have to define a simple
+patch.  For this example we will assume that the hypervisor has not been 
compiled
+with fe2e079f642effb3d24a6e1a7096ef26e691d93e (XSA-125: *pre-fill structures
+for certain HYPERVISOR_xen_version sub-ops*) which mem-sets an structure
+in `xen_version` hypercall. This function is not called **anywhere** in
+the hypervisor (it is called by the guest) but referenced in the
+`compat_hypercall_table` and `hypercall_table` (and indirectly called
+from that). There are two ways to patch this:
+inline patch `hvm_hypercall64_table` and `hvm_hypercall` with a new
+address for the new `do_xen_version` , or insert
+trampoline in `do_xen_version` code. The example will focus on the later.
+
+The `do_xen_version` code is located at virtual address ffff82d080112f9e.
+
+<pre>
+struct xsplice_code xsplice_xsa125;  
+struct xsplice_reloc relocs[1];  
+struct xsplice_section sections[1];  
+struct xsplice_patch patches[1];  
+struct xsplice_symbol do_xen_version_symbol;  
+struct xsplice_reloc_howto do_xen_version_howto;  
+char do_xen_version_new_code[1728];  
+
+#ifndef HYPERVISOR_ID  
+#define HYPERVISOR_ID "92dd05a61556c554155b1508c9cf67d993336d28"
+#endif  
+
+struct xsplice xsa125 = {  
+    .name = "xsa125",  
+    .id = HYPERVISOR_ID,  
+    .old = NULL,  
+    .new = &xsplice_xsa125,  
+};  
+
+struct xsplice_code xsplice_xsa125 = {  
+    .relocs = &relocs[0],  
+    .n_relocs = 1,  
+    .sections = &sections[0],  
+    .n_sections = 1,  
+    .patches = &patches[0],  
+    .n_patches = 1,   
+};
+
+struct xsplice_reloc relocs[1] = {  
+    {  
+        .addr = 0xffff82d080112f9e,  
+        .symbol = &do_xen_version_symbol,  
+        .isns_target = 0,  
+        .howto = &do_xen_version_howto,  
+        .isns_added = -4,  
+    },  
+};  
+
+struct xsplice_symbol do_xen_version_symbol = {  
+    .name = "do_xen_version",  
+    .label = "do_xen_version+<0x0>",  
+};  
+
+struct xsplice_reloc_howto do_xen_version_howto = {  
+    .type = XSPLICE_HOWTO_RELOC_PATCH,  
+    .flag = XSPLICE_HOWTO_FLAG_PC_REL,  
+    .r_shift = 0,  
+    .mask = (-1ULL),  
+};  
+
+
+struct xsplice_section sections[1] = {  
+    {  
+        .symbol = &do_xen_version_symbol,  
+        .address = 0xffff82d080112f9e,  
+        .size = 1728,  
+        .flags = XSPLICE_SECTION_TEXT,  
+    },  
+};  
+
+struct xsplice_patch patches[1] = {  
+    {  
+        .type = XSPLICE_PATCH_RELOC_TEXT,  
+        .size = 1728,  
+        .addr = 0,  
+        .content = &do_xen_version_new_code,  
+    },  
+};  
+
+char do_xen_version_new_code[1728] = { 0x83, 0xff, 0x09, /* And more code. 
*/};  
+</pre>
+
+
+## Signature checking requirements.
+
+The signature checking requires that the layout of the data in memory
+**MUST** be same for signature to be verified. This means that the payload
+data layout in ELF format **MUST** match what the hypervisor would be
+expecting such that it can properly do signature verification.
+
+The signature is based on the all of the payloads continuously laid out
+in memory. The signature is to be appended at the end of the ELF payload
+prefixed with the string '~Module signature appended~\n', followed by
+an signature header then followed by the signature, key identifier, and signers
+name.
+
+Specifically the signature header would be:
+
+<pre>
+#define PKEY_ALGO_DSA       0  
+#define PKEY_ALGO_RSA       1  
+
+#define PKEY_ID_PGP         0 /* OpenPGP generated key ID */  
+#define PKEY_ID_X509        1 /* X.509 arbitrary subjectKeyIdentifier */  
+
+#define HASH_ALGO_MD4          0  
+#define HASH_ALGO_MD5          1  
+#define HASH_ALGO_SHA1         2  
+#define HASH_ALGO_RIPE_MD_160  3  
+#define HASH_ALGO_SHA256       4  
+#define HASH_ALGO_SHA384       5  
+#define HASH_ALGO_SHA512       6  
+#define HASH_ALGO_SHA224       7  
+#define HASH_ALGO_RIPE_MD_128  8  
+#define HASH_ALGO_RIPE_MD_256  9  
+#define HASH_ALGO_RIPE_MD_320 10  
+#define HASH_ALGO_WP_256      11  
+#define HASH_ALGO_WP_384      12  
+#define HASH_ALGO_WP_512      13  
+#define HASH_ALGO_TGR_128     14  
+#define HASH_ALGO_TGR_160     15  
+#define HASH_ALGO_TGR_192     16  
+
+
+struct elf_payload_signature {  
+       u8      algo;           /* Public-key crypto algorithm PKEY_ALGO_*. */  
+       u8      hash;           /* Digest algorithm: HASH_ALGO_*. */  
+       u8      id_type;        /* Key identifier type PKEY_ID*. */  
+       u8      signer_len;     /* Length of signer's name */  
+       u8      key_id_len;     /* Length of key identifier */  
+       u8      __pad[3];  
+       __be32  sig_len;        /* Length of signature data */  
+};
+
+</pre>
+(Note that this has been borrowed from Linux module signature code.).
+
+
+## Hypercalls
+
+We will employ the sub operations of the system management hypercall (sysctl).
+There are to be four sub-operations:
+
+ * upload the payloads.
+ * listing of payloads summary uploaded and their state.
+ * getting an particular payload summary and its state.
+ * command to apply, delete, or revert the payload.
+ * querying of the hypervisor ID (TODO).
+
+Most of the actions are asynchronous therefore the caller is responsible
+to verify that it has been applied properly by retrieving the summary of it
+and verifying that there are no error codes associated with the payload.
+
+We **MUST** make some of them asynchronous due to the nature of patching
+it requires every physical CPU to be lock-step with each other.
+The patching mechanism while an implementation detail, is not an short
+operation and as such the design **MUST** assume it will be an long-running
+operation.
+
+The sub-operations will spell out how preemption is to be handled (if at all).
+
+Furthermore it is possible to have multiple different payloads for the same
+function. As such an unique id per payload has to be visible to allow proper 
manipulation.
+
+The hypercall is part of the `xen_sysctl`. The top level structure contains
+one uint32_t to determine the sub-operations:
+
+<pre>
+struct xen_sysctl_xsplice_op {  
+    uint32_t cmd;  
+       union {  
+          ... see below ...  
+        } u;  
+};  
+
+</pre>
+while the rest of hypercall specific structures are part of the this structure.
+
+### XEN_SYSCTL_XSPLICE_UPLOAD (0)
+
+Upload a payload to the hypervisor. The payload is verified and if there
+are any issues the proper return code will be returned. The payload is
+not applied at this time - that is controlled by *XEN_SYSCTL_XSPLICE_ACTION*.
+
+The caller provides:
+
+ * `id` unique id.
+ * `payload` the virtual address of where the ELF payload is.
+
+The `id` could be an UUID in mind that stays fixed forever for a given
+hotpatch. It can be embedded into the Elf payload at creation time
+and extracted by tools.
+
+The return value is zero if the payload was succesfully uploaded and the
+signature was verified. Otherwise an EXX return value is provided.
+Duplicate `id` are not supported.
+
+The `payload` is the ELF payload as mentioned in the `Payload format` section.
+
+This operation can be preempted by the hypercall returning EAGAIN.
+This is due to the nature of signature verification - which may require
+SecureBoot firmware calls which are unbounded.
+
+The structure is as follow:
+
+<pre>
+struct xen_sysctl_xsplice_upload {  
+    char id[40];  /* IN, name of the patch. */  
+    uint64_t size; /* IN, size of the ELF file. */
+    XEN_GUEST_HANDLE_64(uint8) payload; /* ELF file. */  
+}; 
+</pre>
+
+### XEN_SYSCTL_XSPLICE_GET (1)
+
+Retrieve an summary of an specific payload. This caller provides:
+
+ * `id` the unique id.
+ * `status` *MUST* be set to zero.
+
+The `summary` structure contains an summary of payload which includes:
+
+ * `id` the unique id.
+ * `status` - whether it has been:
+ 1. *XSPLICE_STATUS_LOADED* (0x1) has been loaded.
+ 2. *XSPLICE_STATUS_PROGRESS* (0x2) acting on the 
**XEN_SYSCTL_XSPLICE_ACTION** command.
+ 3. *XSPLICE_STATUS_CHECKED*  (0x4) the ELF payload safety checks passed.
+ 4. *XSPLICE_STATUS_APPLIED* (0x8) loaded, checked, and applied.
+ 5. *XSPLICE_STATUS_REVERTED* (0x10) loaded, checked, applied and then also 
reverted.
+ 6. Negative values is an error. The error would be of EXX format.
+
+The return value is zero on success and EXX on failure. This operation
+is synchronous and does not require preemption.
+
+The structure is as follow:
+
+<pre>
+#define XSPLICE_STATUS_LOADED    0x1  
+#define XSPLICE_STATUS_PROGRESS  0x2  
+#define XSPLICE_STATUS_CHECKED   0x4  
+#define XSPLICE_STATUS_APPLIED   0x8  
+#define XSPLICE_STATUS_REVERTED  0x10  
+
+struct xen_sysctl_xsplice_summary {  
+    char id[40];  /* IN/OUT, name of the patch. */  
+    int32_t status;   /* OUT */  
+}; 
+</pre>
+
+### XEN_SYSCTL_XSPLICE_LIST (2)
+
+Retrieve an array of abbreviated summary of payloads that are loaded in the
+hypervisor.
+
+The caller provides:
+
+ * `version`. Initially it *MUST* be zero.
+ * `idx` index iterator. On first call *MUST* be zero, subsequent calls varies.
+ * `count` the max number of entries to populate.
+ * `summary` virtual address of where to write payload summaries.
+
+The hypercall returns zero on success and updates the `idx` (index) iterator
+with the number of payloads returned, `count` with the number of remaining
+payloads, and `summary` with an number of payload summaries. The `version`
+is updated on every hypercall - if it varies from one hypercall to another
+the data is stale and further calls could fail.
+
+
+If the hypercall returns E2BIG the `count` is too big and should be
+lowered.
+
+Note that due to the asynchronous nature of hypercalls the domain might have
+added or removed the number of payloads making this information stale. It is
+the responsibility of the toolstack to use the `version` field to check
+between each invocation. if the version differs it should discard the stale
+data and start from scratch.
+
+This operation is synchronous and does not require preemption.
+
+The `summary` structure contains an summary of payload which includes:
+
+ * `version` version of the data.
+ * `id` unique id per payload.
+ * `status` - whether it has been:
+ 1. *XSPLICE_STATUS_LOADED* (0x1) has been loaded.
+ 2. *XSPLICE_STATUS_PROGRESS* (0x2) acting on the 
**XEN_SYSCTL_XSPLICE_ACTION** command.
+ 3. *XSPLICE_STATUS_CHECKED*  (0x4) the ELF payload safety checks passed.
+ 4. *XSPLICE_STATUS_APPLIED* (0x8) loaded, checked, and applied.
+ 5. *XSPLICE_STATUS_REVERTED* (0x10) loaded, checked, applied and then also 
reverted.
+ 6. Any negative values means there has been error. The value is in EXX format.
+
+The structure is as follow:
+
+<pre>
+struct xen_sysctl_xsplice_list {  
+    uint32_t version; /* OUT */
+    uint32_t idx;  /* IN/OUT */  
+    uint32_t count;  /* IN/OUT */
+    XEN_GUEST_HANDLE_64(xen_sysctl_xsplice_summary) summary;  /* OUT */  
+};  
+
+struct xen_sysctl_xsplice_summary {  
+    char id[40];  /* OUT, name of the patch. */  
+    int32_t status;  /* OUT */  
+};  
+
+</pre>
+### XEN_SYSCTL_XSPLICE_ACTION (3)
+
+Perform an operation on the payload structure referenced by the `id` field.
+The operation request is asynchronous and the status should be retrieved
+by using either **XEN_SYSCTL_XSPLICE_GET** or **XEN_SYSCTL_XSPLICE_LIST** 
hypercall.
+
+There are two ways about doing preemption. Either via returning back EBUSY
+or the mechanism outlined here.
+
+Doing it in userland would remove any tracking of states in
+the hypervisor - except the simple commands apply, unload, and revert.
+
+However we would not be able to patch all the code that is invoked while
+this hypercall is in progress. That is - the do_domctl, the spinlocks,
+anything put on the stack, etc.
+
+The disadvantage of the mechanism outlined here is that the hypervisor
+code has to keep the state atomic and have an upper bound of time
+on actions. If within the time the operation does not succeed the
+operation would go in error state.
+
+ * `id` the unique id.
+ * `time` the upper bound of time the cmd should take. Zero means infinite.
+ * `cmd` the command requested:
+  1. *XSPLICE_ACTION_CHECK* (1) check that the payload will apply properly.
+  2. *XSPLICE_ACTION_UNLOAD* (2) unload the payload.
+   Any further hypercalls against the `id` will result in failure unless
+   **XEN_SYSCTL_XSPLICE_UPLOAD** hypercall is perfomed with same `id`.
+  3. *XSPLICE_ACTION_REVERT* (3) revert the payload. If the operation takes
+  more time than the upper bound of time the `status` will EBUSY.
+  4. *XSPLICE_ACTION_APPLY* (4) apply the payload. If the operation takes
+  more time than the upper bound of time the `status` will be EBUSY.
+  5. *XSPLICE_ACTION_LOADED* is an initial state and cannot be requested.  
+
+The return value will be zero unless the provided fields are incorrect.
+
+The structure is as follow:
+
+<pre>
+#define XSPLICE_ACTION_CHECK  1  
+#define XSPLICE_ACTION_UNLOAD 2  
+#define XSPLICE_ACTION_REVERT 3  
+#define XSPLICE_ACTION_APPLY  4  
+
+struct xen_sysctl_xsplice_action {  
+    char id[40];  /* IN, name of the patch. */  
+    uint64_t time; /* IN, upper bound of time (ms) for the operation to take. 
*/  
+    uint32_t cmd; /* IN */  
+};  
+
+</pre>
+
+## State diagrams of XSPLICE_ACTION values.
+
+There is a strict ordering state of what the commands can be.
+The XSPLICE_ACTION prefix has been dropped to easy reading:
+
+<pre>
+                        /->\  
+                        \  /  
+             /-------< CHECK  
+             |          |  
+             |          +  
+             |       UNLOAD<--\  
+             |                 \  
+             |                   \  
+      /-> APPLY -----------> REVERT --\  
+      |                               |  
+      \-------------------------------/  
+
+</pre>
+Or an state transition table of valid states:
+<pre>
++-------+-------+--------+--------+---------+-------+------------------+  
+| CHECK | APPLY | REVERT | UNLOAD | Current | Next  | Result           |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|   x   |       |        |        | LOADED  | CHECK | Check payload.   |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|       |       |        |   x    | LOADED  | UNLOAD| unload payload.  |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|   x   |       |        |        | CHECK   | CHECK | Check payload.   |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|       |   x   |        |        | CHECK   | APPLY | Apply payload.   |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|       |       |        |   x    | CHECK   | UNLOAD| Unload payload.  |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|       |       |   x    |        | APPLY   | REVERT| Revert payload.  |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|       |   x   |        |        | REVERT  | APPLY | Apply payload.   |  
++-------+-------+--------+--------+---------+-------+------------------+  
+|       |       |        |   x    | REVERT  | UNLOAD| Unload payload.  |  
++-------+-------+--------+--------+---------+-------+------------------+  
+</pre>
+All the other state transitions are invalid.
+
+## Sequence of events.
+
+The normal sequence of events is to:
+
+ 1. *XEN_SYSCTL_XSPLICE_UPLOAD* to upload the payload. If there are errors 
*STOP* here.
+ 2. *XEN_SYSCTL_XSPLICE_GET* to check the `->status`. If in 
*XSPLICE_STATUS_PROGRESS* spin. If in *XSPLICE_STATUS_LOADED* go to next step.
+ 3. *XEN_SYSCTL_XSPLICE_ACTION* with *XSPLICE_ACTION_CHECK* command to verify 
that the payload can be succesfully applied.
+ 4. *XEN_SYSCTL_XSPLICE_GET* to check the `->status`. If in 
*XSPLICE_STATUS_PROGRESS* spin. If in *XSPLICE_STATUS_CHECKED* go to next step.
+ 5. *XEN_SYSCTL_XSPLICE_ACTION* with *XSPLICE_ACTION_APPLY* to apply the patch.
+ 6. *XEN_SYSCTL_XSPLICE_GET* to check the `->status`. If in 
*XSPLICE_STATUS_PROGRESS* spin. If in *XSPLICE_STATUS_APPLIED* exit with 
success.
+
+ 
+## Addendum
+
+Implementation quirks should not be discussed in a design document.
+
+However these observations can provide aid when developing against this
+document.
+
+
+### Alternative assembler
+
+Alternative assembler is a mechanism to use different instructions depending
+on what the CPU supports. This is done by providing multiple streams of code
+that can be patched in - or if the CPU does not support it - padded with
+`nop` operations. The alternative assembler macros cause the compiler to
+expand the code to place a most generic code in place - emit a special
+ELF .section header to tag this location. During run-time the hypervisor
+can leave the areas alone or patch them with an better suited opcodes.
+
+However these sections are part of .init. and as such can't reasonably be
+subject to patching.
+
+### .rodata sections
+
+The patching might require strings to be updated as well. As such we must be
+also able to patch the strings as needed. This sounds simple - but the compiler
+has a habit of coalescing strings that are the same - which means if we 
in-place
+alter the strings - other users will be inadvertently affected as well.
+
+This is also where pointers to functions live - and we may need to patch this
+as well. And switch-style jump tables.
+
+To guard against that we must be prepared to do patching similar to
+trampoline patching or in-line depending on the flavour. If we can
+do in-line patching we would need to:
+
+ * alter `.rodata` to be writeable.
+ * inline patch.
+ * alter `.rodata` to be read-only.
+
+If are doing trampoline patching we would need to:
+
+ * allocate a new memory location for the string.
+ * all locations which use this string will have to be updated to use the
+   offset to the string.
+ * mark the region RO when we are done.
+
+### .bss and .data sections.
+
+Patching writable data is not suitable as it is unclear what should be done
+depending on the current state of data. As such it should not be attempted.
+
+
+### Patching code which is in the stack.
+
+We should not patch the code which is on the stack. That can lead
+to corruption.
+
+### Inline patching
+
+The hypervisor should verify that the in-place patching would fit within
+the code or data.
+
+### Trampoline (e9 opcode)
+
+The e9 opcode used for jmpq uses a 32-bit signed displacement. That means
+we are limited to up to 2GB of virtual address to place the new code
+from the old code. That should not be a problem since Xen hypervisor has
+a very small footprint.
+
+However if we need - we can always add two trampolines. One at the 2GB
+limit that calls the next trampoline.
+
+Please note there is a small limitation for trampolines in
+function entries: The target function (+ trailing padding) must be able
+to accomodate the trampoline. On x86 with +-2 GB relative jumps,
+this means 5 bytes are  required.
+
+Depending on compiler settings, there are several functions in Xen that
+are smaller (without inter-function padding).
+
+<pre> 
+readelf -sW xen-syms | grep " FUNC " | \
+    awk '{ if ($3 < 5) print $3, $4, $5, $8 }'
+
+...
+3 FUNC LOCAL wbinvd_ipi
+3 FUNC LOCAL shadow_l1_index
+...
+</pre>
+A compile-time check for, e.g., a minimum alignment of functions or a
+runtime check that verifies symbol size (+ padding to next symbols) for
+that in the hypervisor is advised.
+
+### When to patch
+
+During the discussion on the design two candidates bubbled where
+the call stack for each CPU would be deterministic. This would
+minimize the chance of the patch not being applied due to safety
+checks failing.
+
+#### Rendezvous code instead of stop_machine for patching
+
+The hypervisor's time rendezvous code runs synchronously across all CPUs
+every second. Using the stop_machine to patch can stall the time rendezvous
+code and result in NMI. As such having the patching be done at the tail
+of rendezvous code should avoid this problem.
+
+However the entrance point for that code is
+do_softirq->timer_softirq_action->time_calibration
+which ends up calling on_selected_cpus on remote CPUs.
+
+The remote CPUs receive CALL_FUNCTION_VECTOR IPI and execute the
+desired function.
+
+
+#### Before entering the guest code.
+
+Before we call VMXResume we check whether any soft IRQs need to be executed.
+This is a good spot because all Xen stacks are effectively empty at
+that point.
+
+To randezvous all the CPUs an barrier with an maximum timeout (which
+could be adjusted), combined with forcing all other CPUs through the
+hypervisor with IPIs, can be utilized to have all the CPUs be lockstep.
+
+The approach is similar in concept to stop_machine and the time rendezvous
+but is time-bound. However the local CPU stack is much shorter and
+a lot more deterministic.
+
+### Compiling the hypervisor code
+
+Hotpatch generation often requires support for compiling the target
+with -ffunction-sections / -fdata-sections.  Changes would have to
+be done to the linker scripts to support this.
+
+
+### Generation of xSplice ELF payloads
+
+The design of that is not discussed in this design.
+
+The author of this design envisions objdump and objcopy along
+with special GCC parameters (see above) to create .o.xsplice files
+which can be used to splice an ELF with the new payload.
+
+The ksplice code can provide inspiration.
+
+### Exception tables and symbol tables growth
+
+We may need support for adapting or augmenting exception tables if
+patching such code.  Hotpatches may need to bring their own small
+exception tables (similar to how Linux modules support this).
+
+If supporting hotpatches that introduce additional exception-locations
+is not important, one could also change the exception table in-place
+and reorder it afterwards.
+
+
+### xSplice interdependencies
+
+xSplice patches interdependencies are tricky.
+
+There are the ways this can be addressed:
+ * A single large patch that subsumes and replaces all previous ones.
+   Over the life-time of patching the hypervisor this large patch
+   grows to accumulate all the code changes.
+ * Hotpatch stack - where an mechanism exists that loads the hotpatches
+   in the same order they were built in. We would need an build-id
+   of the hypevisor to make sure the hot-patches are build against the
+   correct build.
+ * Payload containing the old code to check against that. That allows
+   the hotpatches to be loaded indepedently (if they don't overlap) - or
+   if the old code also containst previously patched code - even if they
+   overlap.
+
+The disadvantage of the first large patch is that it can grow over
+time and not provide an bisection mechanism to identify faulty patches.
+
+The hot-patch stack puts stricts requirements on the order of the patches
+being loaded and requires an hypervisor build-id to match against.
+
+The old code allows much more flexibility and an additional guard,
+but is more complex to implement.
+
+### Hypervisor ID (buid-id)
+
+The build-id can help with:
+
+  * Prevent loading of wrong hotpatches (intended for other builds)
+
+  * Allow to identify suitable hotpatches on disk and help with runtime
+    tooling (if laid out using build ID)
+
+The build-id (aka hypervisor id) can be easily obtained by utilizing
+the ld --build-id operatin which (copied from ld):
+
+<pre>
+--build-id  
+    --build-id=style  
+        Request creation of ".note.gnu.build-id" ELF note section.  The 
contents of the note are unique bits identifying this  
+        linked file.  style can be "uuid" to use 128 random bits, "sha1" to 
use a 160-bit SHA1 hash on the normative parts of the  
+        output contents, "md5" to use a 128-bit MD5 hash on the normative 
parts of the output contents, or "0xhexstring" to use a  
+        chosen bit string specified as an even number of hexadecimal digits 
("-" and ":" characters between digit pairs are  
+        ignored).  If style is omitted, "sha1" is used.  
+
+        The "md5" and "sha1" styles produces an identifier that is always the 
same in an identical output file, but will be  
+        unique among all nonidentical output files.  It is not intended to be 
compared as a checksum for the file's contents.  A  
+        linked file may be changed later by other tools, but the build ID bit 
string identifying the original linked file does  
+        not change.  
+
+        Passing "none" for style disables the setting from any "--build-id" 
options earlier on the command line.  
+
+</pre>
+
+### Symbol names
+
+
+Xen as it is now, has a couple of non-unique symbol names which will
+make runtime symbol identification hard.  Sometimes, static symbols
+simply have the same name in C files, sometimes such symbols get
+included via header files, and some C files are also compiled
+multiple times and linked under different names (guest_walk.c).
+
+As such we need to modify the linker to make sure that the symbol
+table qualifies also symbols by their source file name.
+
+For the awkward situations in which C-files are compiled multiple
+times patches we would need to some modification in the Xen code.
+
+
+The convention for file-type symbols (that would allow to map many
+symbols to their compilation unit) says that only the basename (i.e.,
+without directories) is embedded.  This creates another layer of
+confusion for duplicate file names in the build tree.
+
+That would have to be resolved.
+
+<pre>
+> find . -name \*.c -print0 | xargs -0 -n1 basename | sort | uniq -c | sort -n 
| tail -n10
+      3 shutdown.c
+      3 sysctl.c
+      3 time.c
+      3 xenoprof.c
+      4 gdbstub.c
+      4 irq.c
+      5 domain.c
+      5 mm.c
+      5 pci.c
+      5 traps.c
+</pre>
+
+### Security
+
+Only the privileged domain should be allowed to do this operation.
+
diff --git a/docs/misc/xsplice_test.c b/docs/misc/xsplice_test.c
new file mode 100644
index 0000000..6e0cf93
--- /dev/null
+++ b/docs/misc/xsplice_test.c
@@ -0,0 +1,78 @@
+#include "xsplice.h"
+#include <stdio.h>
+
+struct xsplice_code xsplice_xsa125;  
+struct xsplice_reloc relocs[1];  
+struct xsplice_section sections[1];  
+struct xsplice_patch patches[1];  
+struct xsplice_symbol do_xen_version_symbol;  
+struct xsplice_reloc_howto do_xen_version_howto;  
+char do_xen_version_new_code[1728];  
+
+#ifndef HYPERVISOR_ID  
+#define HYPERVISOR_ID "Xen 4.6-unstable-g9348394"  
+#endif  
+
+struct xsplice xsa125 = {  
+    .name = "xsa125",  
+    .id = HYPERVISOR_ID,  
+    .old = NULL,  
+    .new = &xsplice_xsa125,  
+};  
+
+struct xsplice_code xsplice_xsa125 = {  
+    .relocs = &relocs[0],  
+    .n_relocs = 1,
+    .sections = &sections[0],  
+    .n_sections = 1,
+    .patches = &patches[0],  
+    .n_patches = 1,
+};
+
+struct xsplice_reloc relocs[1] = {  
+    {  
+        .addr = 0xffff82d080112f9e,  
+        .symbol = &do_xen_version_symbol,  
+        .isns_target = 0,  
+        .howto = &do_xen_version_howto,  
+        .isns_added = -4,  
+    },  
+};  
+
+struct xsplice_symbol do_xen_version_symbol = {  
+    .name = "do_xen_version",  
+    .label = "do_xen_version+<0x0>",  
+};  
+
+struct xsplice_reloc_howto do_xen_version_howto = {  
+    .howto = XSPLICE_HOWTO_RELOC_PATCH,  
+    .flag = XSPLICE_HOWTO_FLAG_PC_REL,  
+    .r_shift = 0,  
+    .mask = (-1ULL),  
+};  
+
+
+struct xsplice_section sections[1] = {  
+    {  
+        .symbol = &do_xen_version_symbol,  
+        .address = 0xffff82d080112f9e,  
+        .size = 1728,  
+        .flags = XSPLICE_SECTION_TEXT,  
+    },  
+};  
+
+struct xsplice_patch patches[1] = {  
+    {  
+        .type = XSPLICE_PATCH_RELOC_TEXT,  
+        .size = 1728,  
+        .addr = 0,  
+        .content = &do_xen_version_new_code,  
+    },  
+};  
+
+char do_xen_version_new_code[1728] = { 0x83, 0xff, 0x09, };  
+
+void main()
+{
+   printf("%s\n", xsa125.name); 
+}
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index ac7e5fd..9ef39fd 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2828,6 +2828,22 @@ int xc_psr_cat_get_l3_info(xc_interface *xch, uint32_t 
socket,
                            uint32_t *cos_max, uint32_t *cbm_len);
 #endif
 
+int xc_xsplice_upload(xc_interface *xch,
+                      char *id, char *payload, uint32_t size);
+
+int xc_xsplice_get(xc_interface *xch,
+                   char *id,
+                   xen_sysctl_xsplice_summary_t *summary);
+
+int xc_xsplice_list(xc_interface *xch, unsigned int max, unsigned int start,
+                    xen_sysctl_xsplice_summary_t *info, unsigned int *done,
+                    unsigned int *left);
+
+int xc_xsplice_apply(xc_interface *xch, char *id);
+int xc_xsplice_revert(xc_interface *xch, char *id);
+int xc_xsplice_unload(xc_interface *xch, char *id);
+int xc_xsplice_check(xc_interface *xch, char *id);
+
 #endif /* XENCTRL_H */
 
 /*
diff --git a/tools/libxc/xc_misc.c b/tools/libxc/xc_misc.c
index b827bbb..bb91930 100644
--- a/tools/libxc/xc_misc.c
+++ b/tools/libxc/xc_misc.c
@@ -719,6 +719,189 @@ int xc_hvm_inject_trap(
     return rc;
 }
 
+int xc_xsplice_upload(xc_interface *xch,
+                      char *id,
+                      char *payload,
+                      uint32_t size)
+{
+    int rc;
+    DECLARE_SYSCTL;
+    DECLARE_HYPERCALL_BOUNCE(payload, size, XC_HYPERCALL_BUFFER_BOUNCE_IN);
+
+    if ( !id || !payload )
+        return -1;
+
+    if ( xc_hypercall_bounce_pre(xch, payload) )
+        return -1;
+
+    sysctl.cmd = XEN_SYSCTL_xsplice_op;
+    sysctl.u.xsplice.cmd = XEN_SYSCTL_XSPLICE_UPLOAD;
+    sysctl.u.xsplice.u.upload.size = size;
+    memcpy(sysctl.u.xsplice.u.upload.id, id, XEN_XSPLICE_ID_SIZE);
+    set_xen_guest_handle(sysctl.u.xsplice.u.upload.payload, payload);
+
+    rc = do_sysctl(xch, &sysctl);
+
+    xc_hypercall_bounce_post(xch, payload);
+
+    return rc;
+}
+
+int xc_xsplice_get(xc_interface *xch,
+                   char *id,
+                   xen_sysctl_xsplice_summary_t *summary)
+{
+    int rc;
+    DECLARE_SYSCTL;
+
+    if ( !id )
+        return -1;
+
+    sysctl.cmd = XEN_SYSCTL_xsplice_op;
+    sysctl.u.xsplice.cmd = XEN_SYSCTL_XSPLICE_GET;
+    sysctl.u.xsplice.u.get.status = 0;
+    memcpy(sysctl.u.xsplice.u.get.id, id, XEN_XSPLICE_ID_SIZE);
+
+    rc = do_sysctl(xch, &sysctl);
+
+    memcpy(summary, &sysctl.u.xsplice.u.get, sizeof(*summary));
+
+    return rc;
+}
+
+int xc_xsplice_list(xc_interface *xch, unsigned int max, unsigned int start,
+                    xen_sysctl_xsplice_summary_t *info, unsigned int *done,
+                    unsigned int *left)
+{
+    int rc;
+    DECLARE_SYSCTL;
+    DECLARE_HYPERCALL_BOUNCE(info, 0 /* adjust later. */, 
XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+    uint32_t max_batch_sz, nr;
+    uint32_t version = 0, retries = 0;
+    uint32_t adjust = 0;
+
+    if ( !max )
+        return -1;
+
+    sysctl.cmd = XEN_SYSCTL_xsplice_op;
+    sysctl.u.xsplice.cmd = XEN_SYSCTL_XSPLICE_LIST;
+    sysctl.u.xsplice.u.list.version = 0;
+    sysctl.u.xsplice.u.list.idx = start;
+
+    max_batch_sz = max;
+
+    *done = 0;
+    *left = 0;
+    do {
+        if ( adjust )
+            adjust = 0; /* Used when adjusting the 'max_batch_sz' or 
'retries'. */
+
+        nr = min(max - *done, max_batch_sz);
+
+        sysctl.u.xsplice.u.list.nr = nr;
+        HYPERCALL_BOUNCE_SET_SIZE(info, nr * sizeof(*info));
+
+        /* Move the pointer to proper offset into 'info'. */
+        (HYPERCALL_BUFFER(info))->ubuf = info + *done;
+        if ( xc_hypercall_bounce_pre(xch, info) )
+            return -1;
+
+        set_xen_guest_handle(sysctl.u.xsplice.u.list.summary, info);
+
+        rc = do_sysctl(xch, &sysctl);
+        if ( rc < 0 && errno == E2BIG )
+        {
+            if ( max_batch_sz <= 1 )
+                break;
+            max_batch_sz >>= 1;
+            adjust = 1; /* For the loop conditional to let us loop again. */
+            xc_hypercall_bounce_post(xch, info); /* No memory leaks! */
+            continue;
+        }
+        if ( rc < 0 ) /* For all other errors we bail out. */
+            break;
+
+        if ( !version )
+            version = sysctl.u.xsplice.u.list.version;
+
+        if ( sysctl.u.xsplice.u.list.version != version )
+        {
+            /* TODO: retries should be configurable? */
+            if ( retries++ > 3 )
+            {
+                rc = -1;
+                errno = EBUSY;
+                break;
+            }
+            *done = 0; /* Retry from scratch. */
+            version = sysctl.u.xsplice.u.list.version;
+            adjust = 1; /* And make sure we continue in the loop. */
+            xc_hypercall_bounce_post(xch, info); /* No memory leaks! */
+            continue;
+        }
+
+        /* We should never hit this, but just in case. */
+        if ( rc > nr )
+        {
+            errno = EINVAL; /* Overflow! */
+            rc = -1;
+            break;
+        }
+        *left = sysctl.u.xsplice.u.list.nr; /* Total remaining count. */
+        /* Copy only up 'rc' of data' - we could add 'min(rc,nr) if desired. */
+        HYPERCALL_BOUNCE_SET_SIZE(info, (rc * sizeof(*info)));
+        /* Bounce the data and free the bounce buffer. */
+        xc_hypercall_bounce_post(xch, info);
+        /* And update how many elements of info we have copied into. */
+        *done += rc;
+        /* Update idx. */
+        sysctl.u.xsplice.u.list.idx = rc;
+    } while ( adjust || (*done < max && *left != 0) );
+
+    return rc > 0 ? 0 : rc;
+}
+
+static int _xc_xsplice_action(xc_interface *xch,
+                              char *id,
+                              unsigned int action)
+{
+    int rc;
+    DECLARE_SYSCTL;
+
+    if ( !id )
+        return -1;
+
+    sysctl.cmd = XEN_SYSCTL_xsplice_op;
+    sysctl.u.xsplice.cmd = XEN_SYSCTL_XSPLICE_ACTION;
+    sysctl.u.xsplice.u.action.cmd = action;
+    sysctl.u.xsplice.u.action.time = 0; /* TODO */
+    memcpy(sysctl.u.xsplice.u.action.id, id, XEN_XSPLICE_ID_SIZE);
+
+    rc = do_sysctl(xch, &sysctl);
+
+    return rc;
+}
+
+int xc_xsplice_apply(xc_interface *xch, char *id)
+{
+    return _xc_xsplice_action(xch, id, XSPLICE_ACTION_APPLY);
+}
+
+int xc_xsplice_revert(xc_interface *xch, char *id)
+{
+    return _xc_xsplice_action(xch, id, XSPLICE_ACTION_REVERT);
+}
+
+int xc_xsplice_unload(xc_interface *xch, char *id)
+{
+    return _xc_xsplice_action(xch, id, XSPLICE_ACTION_UNLOAD);
+}
+
+int xc_xsplice_check(xc_interface *xch, char *id)
+{
+    return _xc_xsplice_action(xch, id, XSPLICE_ACTION_CHECK);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index c4490f3..c46873e 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -30,6 +30,7 @@ INSTALL_SBIN                   += xenlockprof
 INSTALL_SBIN                   += xenperf
 INSTALL_SBIN                   += xenpm
 INSTALL_SBIN                   += xenwatchdogd
+INSTALL_SBIN                   += xen-xsplice
 INSTALL_SBIN += $(INSTALL_SBIN-y)
 
 # Everything to be installed in a private bin/
@@ -98,6 +99,9 @@ xen-mfndump: xen-mfndump.o
 xenwatchdogd: xenwatchdogd.o
        $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+xen-xsplice: xen-xsplice.o
+       $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
+
 xen-lowmemd: xen-lowmemd.o
        $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenstore) 
$(APPEND_LDFLAGS)
 
diff --git a/tools/misc/xen-xsplice.c b/tools/misc/xen-xsplice.c
new file mode 100644
index 0000000..7cf9879
--- /dev/null
+++ b/tools/misc/xen-xsplice.c
@@ -0,0 +1,360 @@
+#include <xenctrl.h>
+#include <xenstore.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+static xc_interface *xch;
+
+void show_help(void)
+{
+    fprintf(stderr,
+            "xen-xsplice: Xsplice test tool\n"
+            "Usage: xen-xsplice <command> [args]\n"
+            " <id> An unique name of payload. Up to 40 characters.\n"
+            "Commands:\n"
+            "  help                 display this help\n"
+            "  upload <id> <file>   upload file <cpuid> with <id> name\n"
+            "  list                 list payloads uploaded.\n"
+            "  apply <id>           apply <id> patch.\n"
+            "  revert <id>          revert id <id> patch.\n"
+            "  unload <id>          unload id <id> patch.\n"
+            "  check <id>           check id <id> patch.\n"
+           );
+}
+
+/* wrapper function */
+static int help_func(int argc, char *argv[])
+{
+    show_help();
+    return 0;
+}
+
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+
+static const char *status2str(long status)
+{
+#define STATUS(x) [XSPLICE_STATUS_##x] = #x
+    static const char *const names[] = {
+            STATUS(LOADED),
+            STATUS(PROGRESS),
+            STATUS(CHECKED),
+            STATUS(APPLIED),
+            STATUS(REVERTED),
+    };
+
+    if (status >= ARRAY_SIZE(names))
+        return "unknown";
+
+    if (status < 0)
+        return "-EXX";
+
+    if (!names[status])
+        return "unknown";
+
+    return names[status];
+}
+
+#define MAX 11
+static int list_func(int argc, char *argv[])
+{
+    unsigned int idx, done, left, rc, i;
+    xen_sysctl_xsplice_summary_t *info;
+
+    if ( argc )
+    {
+        show_help();
+        return -1;
+    }
+    idx = left = 0;
+    info = malloc(sizeof(*info) * MAX);
+    if ( !info )
+    {
+        fprintf(stderr, "Could not allocate buffer!\n");
+        return ENOMEM;
+    }
+    fprintf(stdout," ID                                     | status\n"
+                   "----------------------------------------+------------\n");
+    do {
+        done = 0;
+        memset(info, 'A', sizeof(*info) * MAX); /* Optional. */
+        rc = xc_xsplice_list(xch, MAX, idx, info, &done, &left);
+        if ( rc )
+        {
+            fprintf(stderr, "Failed to list %d/%d: %d(%s)!\n", idx, left, 
errno, strerror(errno));
+            return errno;
+        }
+        for ( i = 0; i < done; i++ )
+        {
+            fprintf(stdout, "%-40s| ", info[i].id);
+            if ( info[i].status < 0 )
+                fprintf(stdout, "%s\n", strerror(info[i].status));
+            else
+                fprintf(stdout, "%s\n", status2str(info[i].status));
+        }
+        idx += done;
+    } while ( left );
+
+    return 0;
+}
+
+static int get_id(int argc, char *argv[], char *id)
+{
+    ssize_t len = strlen(argv[0]);
+    if ( len > XEN_XSPLICE_ID_SIZE )
+    {
+        fprintf(stderr, "ID MUST be %d characters!\n", XEN_XSPLICE_ID_SIZE);
+        errno = EINVAL;
+        return errno;
+    }
+    /* Don't want any funny strings from the stack. */
+    memset(id, 0, XEN_XSPLICE_ID_SIZE);
+    strncpy(id, argv[0], len);
+    return 0;
+}
+
+static int upload_func(int argc, char *argv[])
+{
+    char *filename;
+    char id[XEN_XSPLICE_ID_SIZE];
+    int fd = 0, rc;
+    struct stat buf;
+    unsigned char *fbuf;
+    ssize_t len;
+    DECLARE_HYPERCALL_BUFFER(char, payload);
+
+    if ( argc != 2 )
+    {
+        show_help();
+        return -1;
+    }
+
+    if ( get_id(argc, argv, id) )
+        return EINVAL;
+
+    filename = argv[1];
+    fd = open(filename, O_RDONLY);
+    if ( fd < 0 )
+    {
+        fprintf(stderr, "Could not open %s, error: %d(%s)\n",
+                filename, errno, strerror(errno));
+        return errno;
+    }
+    if ( stat(filename, &buf) != 0 )
+    {
+        fprintf(stderr, "Could not get right size %s, error: %d(%s)\n",
+                filename, errno, strerror(errno));
+        close(fd);
+        return errno;
+    }
+
+    len = buf.st_size;
+    fbuf = mmap(0, len, PROT_READ, MAP_PRIVATE, fd, 0);
+    if ( fbuf == MAP_FAILED )
+    {
+        fprintf(stderr,"Could not map: %s, error: %d(%s)\n",
+                filename, errno, strerror(errno));
+        close (fd);
+        return errno;
+    }
+    printf("Uploading %s (%ld bytes)\n", filename, len);
+    payload = xc_hypercall_buffer_alloc(xch, payload, len);
+    memcpy(payload, fbuf, len);
+
+    rc = xc_xsplice_upload(xch, id, payload, len);
+    if ( rc )
+    {
+        fprintf(stderr, "Upload failed: %s, error: %d(%s)!\n",
+                filename, errno, strerror(errno));
+        goto out;
+    }
+    xc_hypercall_buffer_free(xch, payload);
+
+out:
+    if ( munmap( fbuf, len) )
+    {
+        fprintf(stderr, "Could not unmap!? error: %d(%s)!\n",
+                errno, strerror(errno));
+        rc = errno;
+    }
+    close(fd);
+
+    return rc;
+}
+
+struct {
+    int allow; /* State it must be in to call function. */
+    int expected; /* The state to be in after the function. */
+    const char *name;
+    int (*function)(xc_interface *xch, char *id);
+    unsigned int executed; /* Has the function been called?. */
+} action_options[] = {
+    {   .allow = XSPLICE_STATUS_CHECKED | XSPLICE_STATUS_REVERTED,
+        .expected = XSPLICE_STATUS_APPLIED,
+        .name = "apply",
+        .function = xc_xsplice_apply,
+    },
+    {   .allow = XSPLICE_STATUS_APPLIED,
+        .expected = XSPLICE_STATUS_REVERTED,
+        .name = "revert",
+        .function = xc_xsplice_revert,
+    },
+    {   .allow = XSPLICE_STATUS_CHECKED | XSPLICE_STATUS_REVERTED | 
XSPLICE_STATUS_LOADED,
+        .expected = ENOENT,
+        .name = "unload",
+        .function = xc_xsplice_unload,
+    },
+    {   .allow = XSPLICE_STATUS_CHECKED | XSPLICE_STATUS_LOADED,
+        .expected = XSPLICE_STATUS_CHECKED,
+        .name = "check",
+        .function = xc_xsplice_check
+    },
+};
+
+int action_func(int argc, char *argv[], unsigned int idx)
+{
+    char id[40];
+    int rc;
+    xen_sysctl_xsplice_summary_t summary;
+    unsigned int retry = 0;
+
+    if ( argc != 1 )
+    {
+        show_help();
+        return -1;
+    }
+
+    if ( get_id(argc, argv, id) )
+        return EINVAL;
+
+    do {
+        rc = xc_xsplice_get(xch, id, &summary);
+        /* N.B. Successfull unload will return ENOENT. */
+        if ( rc )
+        {
+            rc = errno; /* rc is just -1 and we want proper EXX. */
+            break;
+        }
+
+        if ( summary.status < 0 )
+        { /* We report it outside the loop. */
+            rc = summary.status;
+            break;
+        }
+        if ( summary.status == XSPLICE_STATUS_PROGRESS )
+        {
+            if ( !action_options[idx].executed )
+            {
+                printf("%s is in progress and we didn't initiate it!\n", id);
+                errno = EBUSY;
+                rc = -1;
+                break;
+            }
+            if ( retry++ < 30 )
+            {
+                printf(".");
+                sleep(1);
+                continue;
+            }
+            printf("%s: Waited more than 30 seconds! Bailing out.\n", id);
+            errno = EBUSY;
+            rc = -1;
+            break;
+        }
+        /* We use rc outside loop to deal with EXX type expected values. */
+        rc = summary.status;
+        if ( action_options[idx].expected == rc ) /* Yeey! */
+            break;
+
+        if ( action_options[idx].allow & rc )
+        {
+            if ( action_options[idx].executed )
+            {
+                printf(" (0x%x vs 0x%x) state not reached!?\n",
+                       action_options[idx].expected, rc);
+                errno = EINVAL;
+                break;
+            }
+            printf("%s: State is 0x%x, ok are 0x%x. Commencing %s:",
+                   id, rc, action_options[idx].allow,
+                   action_options[idx].name);
+
+            rc = action_options[idx].function(xch, id);
+            if ( rc ) /* We report it outside the loop. */
+                break;
+
+            action_options[idx].executed = 1;
+            rc = 1; /* Loop again so we can display the dots. */
+        } else {
+            printf("%s: in wrong state (0x%x), expected 0x%x\n",
+                   id, rc, action_options[idx].expected);
+            errno = EINVAL;
+            rc = -1;
+            break;
+        }
+    } while ( rc > 0 );
+
+    if ( action_options[idx].expected == rc )
+    {
+            printf("completed!\n");
+            rc = 0;
+    } else
+        printf("%s failed with %d(%s)\n", id, errno, strerror(errno));
+
+    return rc;
+}
+struct {
+    const char *name;
+    int (*function)(int argc, char *argv[]);
+} main_options[] = {
+    { "help", help_func },
+    { "list", list_func },
+    { "upload", upload_func },
+};
+
+int main(int argc, char *argv[])
+{
+    int i, j, ret;
+
+    if ( argc  <= 1 )
+    {
+        show_help();
+        return 0;
+    }
+    for ( i = 0; i < ARRAY_SIZE(main_options); i++ )
+        if (!strncmp(main_options[i].name, argv[1], strlen(argv[1])))
+            break;
+
+    if ( i == ARRAY_SIZE(main_options) )
+    {
+        for ( j = 0; j < ARRAY_SIZE(action_options); j++ )
+            if (!strncmp(action_options[j].name, argv[1], strlen(argv[1])))
+                break;
+
+        if ( j == ARRAY_SIZE(action_options) )
+        {
+            fprintf(stderr, "Unrecognised command '%s' -- try "
+                   "'xen-xsplice help'\n", argv[1]);
+            return 1;
+        }
+    }
+
+    xch = xc_interface_open(0,0,0);
+    if ( !xch )
+    {
+        fprintf(stderr, "failed to get the handler\n");
+        return 0;
+    }
+
+    if ( i == ARRAY_SIZE(main_options) )
+        ret = action_func(argc -2, argv + 2, j);
+    else
+        ret = main_options[i].function(argc -2, argv + 2);
+
+    xc_interface_close(xch);
+
+    return !!ret;
+}
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3fdf931..7769e5c 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -55,6 +55,7 @@ obj-y += vmap.o
 obj-y += vsprintf.o
 obj-y += wait.o
 obj-y += xmalloc_tlsf.o
+obj-y += xsplice.o
 
 obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo 
unlz4 earlycpio,$(n).init.o)
 
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 5d21e48..1d4574a 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -18,6 +18,7 @@
 #include <xen/mm.h>
 #include <xen/watchdog.h>
 #include <xen/init.h>
+#include <xen/xsplice.h>
 #include <asm/debugger.h>
 #include <asm/div64.h>
 
@@ -455,6 +456,11 @@ static struct keyhandler spinlock_reset_keyhandler = {
     .desc = "reset lock profile info"
 };
 #endif
+static struct keyhandler xsplice_printall_keyhandler = {
+    .diagnostic = 1,
+    .u.fn = xsplice_printall,
+    .desc = "print splicing information"
+};
 
 static void run_all_nonirq_keyhandlers(unsigned long unused)
 {
@@ -567,7 +573,7 @@ void __init initialize_keytable(void)
     register_keyhandler('l', &spinlock_printall_keyhandler);
     register_keyhandler('L', &spinlock_reset_keyhandler);
 #endif
-
+    register_keyhandler('x', &xsplice_printall_keyhandler);
 }
 
 /*
diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c
index f1c0c76..641bb25 100644
--- a/xen/common/sysctl.c
+++ b/xen/common/sysctl.c
@@ -27,6 +27,7 @@
 #include <xsm/xsm.h>
 #include <xen/pmstat.h>
 #include <xen/gcov.h>
+#include <xen/xsplice.h>
 
 long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
 {
@@ -399,6 +400,10 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) 
u_sysctl)
         ret = sysctl_coverage_op(&op->u.coverage_op);
         break;
 #endif
+    case XEN_SYSCTL_xsplice_op:
+        ret = xsplice_control(&op->u.xsplice);
+        copyback = 1;
+        break;
 
 #ifdef HAS_PCI
     case XEN_SYSCTL_pcitopoinfo:
diff --git a/xen/common/xsplice.c b/xen/common/xsplice.c
new file mode 100644
index 0000000..e816394
--- /dev/null
+++ b/xen/common/xsplice.c
@@ -0,0 +1,405 @@
+/*
+ * xSplice - Copyright Oracle Corp. Inc 2015.
+ *
+ * Author: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
+ */
+
+/* TODO: Sort includes .*/
+#include <xen/smp.h>
+#include <xen/keyhandler.h>
+#include <xen/spinlock.h>
+#include <xen/mm.h>
+#include <xen/list.h>
+#include <xen/guest_access.h>
+#include <xen/stdbool.h>
+#include <xen/sched.h>
+#include <xen/lib.h>
+#include <xen/xsplice.h>
+#include <public/sysctl.h>
+
+#include <asm/event.h>
+
+static DEFINE_SPINLOCK(payload_list_lock);
+static LIST_HEAD(payload_list);
+static unsigned int payload_cnt;
+static unsigned int payload_version = 1;
+static int _debug = 1;
+#define where() { if (_debug) printk("%s:%d\n", __func__, __LINE__); }
+
+struct payload {
+    char  id[XEN_XSPLICE_ID_SIZE]; /* Unique id. */
+    int32_t    status; /* XSPLICE_STATUS_* or Exx type value. */
+    int32_t    old_status; /* XSPLICE_STATUS_* or Exx type value. */
+
+    uint32_t    cmd; /* Action request. XSPLICE_ACTION_* */
+    struct spinlock cmd_lock; /* Lock against the action. */
+
+    uint8_t     *raw;   /* Pointer to Elf file. */
+    ssize_t     size;   /* Size of 'raw'. */
+
+    struct tasklet tasklet;
+    struct list_head   list;   /* Linked to 'payload_list'. */
+};
+
+static const char *status2str(int64_t status)
+{
+#define STATUS(x) [XSPLICE_STATUS_##x] = #x
+    static const char *const names[] = {
+            STATUS(LOADED),
+            STATUS(PROGRESS),
+            STATUS(CHECKED),
+            STATUS(APPLIED),
+            STATUS(REVERTED),
+    };
+
+    if (status >= ARRAY_SIZE(names))
+        return "unknown";
+
+    if (status < 0)
+        return "-EXX";
+
+    if (!names[status])
+        return "unknown";
+
+    return names[status];
+}
+
+void xsplice_printall(unsigned char key)
+{
+    struct payload *data;
+
+    spin_lock(&payload_list_lock);
+
+    list_for_each_entry ( data, &payload_list, list )
+    {
+        printk(" id=%s status=%s(%d,old=%d): \n", data->id,
+               status2str(data->status), data->status, data->old_status);
+    }
+    spin_unlock(&payload_list_lock);
+}
+
+struct payload *find_payload(const char *id, bool_t need_lock)
+{
+    struct payload *data, *found = NULL;
+
+    if ( need_lock )
+        spin_lock(&payload_list_lock);
+
+    list_for_each_entry ( data, &payload_list, list )
+    {
+        if ( !strncmp(data->id, id, XEN_XSPLICE_ID_SIZE) )
+        {
+            found = data;
+            break;
+        }
+    }
+
+    if ( need_lock )
+        spin_unlock(&payload_list_lock);
+
+    return found;
+}
+
+static int verify_payload(xen_sysctl_xsplice_upload_t *upload)
+{
+    if ( upload->id[0] == '\0' )
+    {
+        where();
+        return -EINVAL;
+    }
+    if ( upload->size == 0 )
+    {
+        where();
+        return -EINVAL;
+    }
+    if ( !guest_handle_okay(upload->payload, upload->size) )
+    {
+        where();
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+/*
+ * We MUST be holding the spinlock.
+ */
+static void __free_payload(struct payload *data)
+{
+
+    free_xenheap_pages(data->raw, get_order_from_bytes(data->size));
+    list_del(&data->list);
+    payload_cnt --;
+    payload_version ++;
+    tasklet_kill(&data->tasklet);
+    xfree(data);
+}
+#include <xen/delay.h>
+static void xsplice_tasklet(unsigned long _data)
+{
+    struct payload *data = (struct payload *)_data;
+
+    /* TODO: Remove it. */
+    mdelay(1000);
+
+    spin_lock(&data->cmd_lock);
+    switch ( data->cmd ) {
+    case XSPLICE_ACTION_CHECK:
+            /* Do the operation here. */
+            data->status = XSPLICE_STATUS_CHECKED;
+            break;
+    case XSPLICE_ACTION_APPLY:
+            /* TODO: Well, do the work :-) */
+            data->status = XSPLICE_STATUS_APPLIED;
+            break;
+    case XSPLICE_ACTION_REVERT:
+            /* TODO: Well, do the work :-) */
+            data->status = XSPLICE_STATUS_REVERTED;
+            break;
+    default:
+            data->status = -EINVAL;
+    }
+    spin_unlock(&data->cmd_lock);
+}
+
+static int xsplice_upload(xen_sysctl_xsplice_upload_t *upload)
+{
+    struct payload *data;
+    uint8_t *raw = NULL;
+    int rc;
+
+    rc = verify_payload(upload);
+    if ( rc )
+        return rc;
+
+    /*
+     * Compute the size of the structures which need to be verified.
+     */
+
+    data = find_payload(upload->id, true);
+    if ( data )
+    {
+        where();
+        return -EEXIST;
+    }
+    rc = -ENOMEM;
+    data = xzalloc(struct payload);
+    if ( !data )
+    {
+        where();
+        return rc;
+    }
+
+    raw = alloc_xenheap_pages(get_order_from_bytes(upload->size), 0);
+    if ( !raw )
+    {
+        printk("%s: alloc for %ld bytes, %d order\n", __func__, upload->size, 
get_order_from_bytes(upload->size));
+        xfree(data);
+        return rc;
+    }
+    if ( copy_from_guest(raw, upload->payload, upload->size) )
+    {
+        rc = -EFAULT;
+        goto err_out;
+    }
+
+    printk("%s: size %ld %p [%02x %02x ..] \n", __func__, upload->size,
+           raw, (unsigned int)raw[0], (unsigned int)raw[1]);
+
+    /* TODO: Verify signature . */
+    memcpy(data->id, upload->id, XEN_XSPLICE_ID_SIZE);
+    data->status = XSPLICE_STATUS_LOADED;
+    INIT_LIST_HEAD(&data->list);
+    data->raw = raw;
+    data->size = upload->size;
+    spin_lock_init(&data->cmd_lock);
+    data->cmd = 0;
+    tasklet_init(&data->tasklet, xsplice_tasklet, (unsigned long)data);
+
+    spin_lock(&payload_list_lock);
+    list_add_tail(&data->list, &payload_list);
+    payload_cnt ++;
+    payload_version ++;
+    spin_unlock(&payload_list_lock);
+
+    return 0;
+
+ err_out:
+    if ( raw )
+        free_xenheap_pages(raw, get_order_from_bytes(upload->size));
+    if ( data )
+        xfree(data);
+    return rc;
+}
+
+static int xsplice_get(xen_sysctl_xsplice_summary_t *summary)
+{
+    struct payload *data;
+
+    if ( summary->status )
+        return -EINVAL;
+
+    data = find_payload(summary->id, true);
+    if ( !data )
+        return -ENOENT;
+
+    summary->status = data->status;
+
+    return 0;
+}
+
+static int xsplice_list(xen_sysctl_xsplice_list_t *list)
+{
+    xen_sysctl_xsplice_summary_t summary;
+    struct payload *data;
+    unsigned int idx = 0, i = 0;
+    int rc = 0;
+    unsigned int ver = payload_version;
+
+    // TODO: Increase to 64. Leave 4 for debug.
+    if ( list->nr > 4 )
+        return -E2BIG;
+
+    if ( guest_handle_is_null(list->summary) )
+        return -EINVAL;
+
+    spin_lock(&payload_list_lock);
+    if ( list->idx > payload_cnt )
+    {
+        spin_unlock(&payload_list_lock);
+        where();
+        return -EINVAL;
+    }
+
+    list_for_each_entry( data, &payload_list, list )
+    {
+        if ( list->idx > i++ )
+            continue;
+
+        /* Copy all of the bytes avoid leaking stack data. */
+        memcpy(summary.id, data->id, XEN_XSPLICE_ID_SIZE);
+        summary.status = data->status;
+
+        /* N.B. 'idx' != 'i'. */
+        if ( copy_to_guest_offset(list->summary, idx++, &summary, 1) )
+        {
+            rc = -EFAULT;
+            break;
+        }
+        if ( hypercall_preempt_check() || (idx + 1 > list->nr) )
+        {
+            break;
+        }
+    }
+    list->nr = payload_cnt - i; /* Remaining amount. */
+    spin_unlock(&payload_list_lock);
+    list->version = ver;
+
+    /* And how many we have processed. */
+    return rc ? rc : idx;
+}
+
+static int xsplice_action(xen_sysctl_xsplice_action_t *action)
+{
+    struct payload *data;
+    int rc = -EINVAL;
+
+    if ( action->id[0] == '\0' )
+        return rc;
+
+    spin_lock(&payload_list_lock);
+    data = find_payload(action->id, false /* we are holding the lock. */);
+    if ( !data )
+    {
+        rc = -ENOENT;
+        goto out;
+    }
+    if ( action->cmd != XSPLICE_ACTION_UNLOAD )
+        spin_lock(&data->cmd_lock);
+
+    switch ( action->cmd )
+    {
+    case XSPLICE_ACTION_CHECK:
+        if ( ( data->status == XSPLICE_STATUS_LOADED ) )
+        {
+            data->old_status = data->status;
+            data->status = XSPLICE_STATUS_PROGRESS;
+            data->cmd = action->cmd;
+            tasklet_schedule(&data->tasklet);
+            rc = 0;
+        } else if ( data->status == XSPLICE_STATUS_CHECKED )
+        {
+            rc = 0;
+        }
+        break;
+    case XSPLICE_ACTION_UNLOAD:
+        if ( ( data->status == XSPLICE_STATUS_REVERTED ) ||
+             ( data->status == XSPLICE_STATUS_LOADED ) ||
+             ( data->status == XSPLICE_STATUS_CHECKED ) )
+        {
+            __free_payload(data);
+            /* No touching 'data' from here on! */
+            rc = 0;
+        }
+        break;
+    case XSPLICE_ACTION_REVERT:
+        if ( data->status == XSPLICE_STATUS_APPLIED )
+        {
+            data->old_status = data->status;
+            data->status = XSPLICE_STATUS_PROGRESS;
+            data->cmd = action->cmd;
+            rc = 0;
+            /* TODO: Tasklet is not good for this. We need a different 
vehicle. */
+            tasklet_schedule(&data->tasklet);
+        }
+        break;
+    case XSPLICE_ACTION_APPLY:
+        if ( ( data->status == XSPLICE_STATUS_CHECKED ) ||
+             ( data->status == XSPLICE_STATUS_REVERTED ))
+        {
+            data->old_status = data->status;
+            data->status = XSPLICE_STATUS_PROGRESS;
+            data->cmd = action->cmd;
+            rc = 0;
+            /* TODO: Tasklet is not good for this. We need a different 
vehicle. */
+            tasklet_schedule(&data->tasklet);
+        }
+        break;
+    default:
+        rc = -ENOSYS;
+        break;
+    }
+
+    if ( action->cmd != XSPLICE_ACTION_UNLOAD )
+        spin_unlock(&data->cmd_lock);
+ out:
+    spin_unlock(&payload_list_lock);
+
+    return rc;
+}
+
+int xsplice_control(xen_sysctl_xsplice_op_t *xsplice)
+{
+    int rc;
+
+    switch ( xsplice->cmd )
+    {
+    case XEN_SYSCTL_XSPLICE_UPLOAD:
+        rc = xsplice_upload(&xsplice->u.upload);
+        break;
+    case XEN_SYSCTL_XSPLICE_GET:
+        rc = xsplice_get(&xsplice->u.get);
+        break;
+    case XEN_SYSCTL_XSPLICE_LIST:
+        rc = xsplice_list(&xsplice->u.list);
+        break;
+    case XEN_SYSCTL_XSPLICE_ACTION:
+        rc = xsplice_action(&xsplice->u.action);
+        break;
+    default:
+        rc = -ENOSYS;
+        break;
+   }
+
+    return rc;
+}
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
index 58c9be2..48dd511 100644
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -710,6 +710,70 @@ struct xen_sysctl_psr_cat_op {
 typedef struct xen_sysctl_psr_cat_op xen_sysctl_psr_cat_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cat_op_t);
 
+/*
+ * XEN_SYSCTL_XSPLICE_op
+ *
+ */
+#define XEN_SYSCTL_XSPLICE_UPLOAD 0
+#define XEN_XSPLICE_ID_SIZE 40
+
+struct xen_sysctl_xsplice_upload {
+    char        id[XEN_XSPLICE_ID_SIZE];    /* IN, name of the patch. */
+    uint64_t    size;                       /* IN, size of the ELF file. */
+    XEN_GUEST_HANDLE_64(uint8) payload;     /* IN, the ELF file. */
+};
+typedef struct xen_sysctl_xsplice_upload xen_sysctl_xsplice_upload_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_xsplice_upload_t);
+
+#define XEN_SYSCTL_XSPLICE_GET 1
+struct xen_sysctl_xsplice_summary {
+    char        id[XEN_XSPLICE_ID_SIZE];    /* IN, name of the patch. */
+#define XSPLICE_STATUS_LOADED       0x01
+#define XSPLICE_STATUS_PROGRESS     0x02
+#define XSPLICE_STATUS_CHECKED      0x04
+#define XSPLICE_STATUS_APPLIED      0x08
+#define XSPLICE_STATUS_REVERTED     0x10
+ /* Any negative value is an error. The error would be in -EXX format. */
+       int32_t status;   /* OUT, On IN has to be zero. */
+};
+typedef struct xen_sysctl_xsplice_summary xen_sysctl_xsplice_summary_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_xsplice_summary_t);
+
+#define XEN_SYSCTL_XSPLICE_LIST 2
+struct xen_sysctl_xsplice_list {
+    uint32_t version;           /* OUT. */
+    uint32_t idx;               /* IN/OUT */
+    uint32_t nr;                /* IN/OUT */
+    XEN_GUEST_HANDLE_64(xen_sysctl_xsplice_summary_t) summary;  /* OUT */
+};
+typedef struct xen_sysctl_xsplice_list xen_sysctl_xsplice_list_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_xsplice_list_t);
+
+#define XEN_SYSCTL_XSPLICE_ACTION 3
+struct xen_sysctl_xsplice_action {
+    char        id[XEN_XSPLICE_ID_SIZE];    /* IN, name of the patch. */
+#define XSPLICE_ACTION_CHECK        1
+#define XSPLICE_ACTION_UNLOAD       2
+#define XSPLICE_ACTION_REVERT       3
+#define XSPLICE_ACTION_APPLY        4
+    uint32_t    cmd; /* IN */
+    uint64_aligned_t time;  /* IN */
+};
+typedef struct xen_sysctl_xsplice_action xen_sysctl_xsplice_action_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_xsplice_action_t);
+
+struct xen_sysctl_xsplice_op {
+    uint32_t cmd; /* IN */
+    union {
+        xen_sysctl_xsplice_upload_t upload;
+        xen_sysctl_xsplice_list_t list;
+        xen_sysctl_xsplice_summary_t get;
+        xen_sysctl_xsplice_action_t action;
+    } u;
+};
+typedef struct xen_sysctl_xsplice_op xen_sysctl_xsplice_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_xsplice_op_t);
+
 struct xen_sysctl {
     uint32_t cmd;
 #define XEN_SYSCTL_readconsole                    1
@@ -734,6 +798,7 @@ struct xen_sysctl {
 #define XEN_SYSCTL_psr_cmt_op                    21
 #define XEN_SYSCTL_pcitopoinfo                   22
 #define XEN_SYSCTL_psr_cat_op                    23
+#define XEN_SYSCTL_xsplice_op                    24
     uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
     union {
         struct xen_sysctl_readconsole       readconsole;
@@ -758,6 +823,7 @@ struct xen_sysctl {
         struct xen_sysctl_coverage_op       coverage_op;
         struct xen_sysctl_psr_cmt_op        psr_cmt_op;
         struct xen_sysctl_psr_cat_op        psr_cat_op;
+        struct xen_sysctl_xsplice_op        xsplice;
         uint8_t                             pad[128];
     } u;
 };
diff --git a/xen/include/xen/xsplice.h b/xen/include/xen/xsplice.h
new file mode 100644
index 0000000..41e28da
--- /dev/null
+++ b/xen/include/xen/xsplice.h
@@ -0,0 +1,9 @@
+#ifndef __XEN_XSPLICE_H__
+#define __XEN_XSPLICE_H__
+
+struct xen_sysctl_xsplice_op;
+int xsplice_control(struct xen_sysctl_xsplice_op *);
+
+extern void xsplice_printall(unsigned char key);
+
+#endif /* __XEN_XSPLICE_H__ */
-- 
1.8.4.2


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.