[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] merge with xen-unstable.hg



# HG changeset patch
# User Alex Williamson <alex.williamson@xxxxxx>
# Date 1204301998 25200
# Node ID 71a8366fb212b9199090bf9e87e389bdd65e5cbd
# Parent  0b20ac6ec64aa50558bea7145552c341277f9f19
# Parent  9049b0b62e0891e9bfb188bef40f68c04b5ea653
merge with xen-unstable.hg
---
 extras/mini-os/fbfront.c                                   |   12 
 extras/mini-os/include/fbfront.h                           |    3 
 extras/mini-os/kernel.c                                    |    6 
 extras/mini-os/xenbus/xenbus.c                             |    4 
 stubdom/README                                             |   91 +
 stubdom/stubdom-dm                                         |   27 
 tools/blktap/drivers/block-qcow2.c                         |  161 --
 tools/ioemu/block-qcow.c                                   |    2 
 tools/ioemu/block-qcow2.c                                  |    2 
 tools/ioemu/block-raw.c                                    |    2 
 tools/ioemu/block-vmdk.c                                   |    2 
 tools/ioemu/block.c                                        |   17 
 tools/ioemu/block_int.h                                    |    4 
 tools/ioemu/hw/e1000.c                                     |    2 
 tools/ioemu/hw/xenfb.c                                     |  216 +++
 tools/ioemu/keymaps.c                                      |    4 
 tools/ioemu/monitor.c                                      |    2 
 tools/ioemu/vl.c                                           |    8 
 tools/ioemu/vl.h                                           |    7 
 tools/ioemu/xenstore.c                                     |   31 
 tools/python/xen/xend/XendAPI.py                           |    3 
 tools/python/xen/xend/XendCheckpoint.py                    |   33 
 tools/python/xen/xend/XendDomain.py                        |   10 
 tools/python/xen/xend/XendDomainInfo.py                    |   36 
 tools/python/xen/xend/image.py                             |   29 
 tools/python/xen/xm/main.py                                |    9 
 tools/python/xen/xm/migrate.py                             |   10 
 tools/xenstat/libxenstat/src/xenstat_solaris.c             |   44 
 tools/xentrace/xentrace.c                                  |    8 
 unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c |    3 
 xen/arch/ia64/xen/machine_kexec.c                          |   49 
 xen/arch/powerpc/machine_kexec.c                           |    6 
 xen/arch/x86/machine_kexec.c                               |   10 
 xen/arch/x86/mm/shadow/multi.c                             |  202 ++-
 xen/arch/x86/mm/shadow/types.h                             |    1 
 xen/arch/x86/x86_32/Makefile                               |    1 
 xen/arch/x86/x86_32/machine_kexec.c                        |   33 
 xen/arch/x86/x86_64/Makefile                               |    1 
 xen/arch/x86/x86_64/machine_kexec.c                        |   32 
 xen/common/compat/kexec.c                                  |    5 
 xen/common/kexec.c                                         |   97 -
 xen/drivers/acpi/tables.c                                  |    1 
 xen/drivers/passthrough/amd/Makefile                       |    1 
 xen/drivers/passthrough/amd/iommu_acpi.c                   |  874 +++++++++++++
 xen/drivers/passthrough/amd/iommu_detect.c                 |   36 
 xen/drivers/passthrough/amd/iommu_init.c                   |   41 
 xen/drivers/passthrough/amd/iommu_map.c                    |   42 
 xen/drivers/passthrough/amd/pci_amd_iommu.c                |  142 +-
 xen/include/asm-x86/amd-iommu.h                            |   36 
 xen/include/asm-x86/domain.h                               |    5 
 xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h               |  176 ++
 xen/include/asm-x86/hvm/svm/amd-iommu-defs.h               |    6 
 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h              |   24 
 xen/include/asm-x86/perfc_defn.h                           |    5 
 xen/include/public/io/kbdif.h                              |    2 
 xen/include/public/kexec.h                                 |   15 
 xen/include/xen/acpi.h                                     |    1 
 xen/include/xen/kexec.h                                    |    1 
 58 files changed, 2173 insertions(+), 460 deletions(-)

diff -r 0b20ac6ec64a -r 71a8366fb212 extras/mini-os/fbfront.c
--- a/extras/mini-os/fbfront.c  Fri Feb 29 09:18:01 2008 -0700
+++ b/extras/mini-os/fbfront.c  Fri Feb 29 09:19:58 2008 -0700
@@ -31,13 +31,6 @@ struct kbdfront_dev {
     char *nodename;
     char *backend;
 
-    char *data;
-    int width;
-    int height;
-    int depth;
-    int line_length;
-    int mem_length;
-
 #ifdef HAVE_LIBC
     int fd;
 #endif
@@ -316,7 +309,10 @@ struct fbfront_dev *init_fbfront(char *n
     for (i = 0; mapped < mem_length && i < max_pd; i++) {
         unsigned long *pd = (unsigned long *) alloc_page();
         for (j = 0; mapped < mem_length && j < PAGE_SIZE / sizeof(unsigned 
long); j++) {
-            pd[j] = virt_to_mfn((unsigned long) data + mapped);
+            /* Trigger CoW */
+            * ((char *)data + mapped) = 0;
+            barrier();
+            pd[j] = virtual_to_mfn((unsigned long) data + mapped);
             mapped += PAGE_SIZE;
         }
         for ( ; j < PAGE_SIZE / sizeof(unsigned long); j++)
diff -r 0b20ac6ec64a -r 71a8366fb212 extras/mini-os/include/fbfront.h
--- a/extras/mini-os/include/fbfront.h  Fri Feb 29 09:18:01 2008 -0700
+++ b/extras/mini-os/include/fbfront.h  Fri Feb 29 09:19:58 2008 -0700
@@ -14,6 +14,9 @@
 #ifndef KEY_Q
 #define KEY_Q 16
 #endif
+#ifndef KEY_MAX
+#define KEY_MAX 0x1ff
+#endif
 
 
 struct kbdfront_dev;
diff -r 0b20ac6ec64a -r 71a8366fb212 extras/mini-os/kernel.c
--- a/extras/mini-os/kernel.c   Fri Feb 29 09:18:01 2008 -0700
+++ b/extras/mini-os/kernel.c   Fri Feb 29 09:19:58 2008 -0700
@@ -360,13 +360,13 @@ static void kbdfront_thread(void *p)
                 refresh_cursor(x, y);
                 break;
             case XENKBD_TYPE_POS:
-                printk("pos x:%d y:%d z:%d\n",
+                printk("pos x:%d y:%d dz:%d\n",
                         event.pos.abs_x,
                         event.pos.abs_y,
-                        event.pos.abs_z);
+                        event.pos.rel_z);
                 x = event.pos.abs_x;
                 y = event.pos.abs_y;
-                z = event.pos.abs_z;
+                z = event.pos.rel_z;
                 clip_cursor(&x, &y);
                 refresh_cursor(x, y);
                 break;
diff -r 0b20ac6ec64a -r 71a8366fb212 extras/mini-os/xenbus/xenbus.c
--- a/extras/mini-os/xenbus/xenbus.c    Fri Feb 29 09:18:01 2008 -0700
+++ b/extras/mini-os/xenbus/xenbus.c    Fri Feb 29 09:19:58 2008 -0700
@@ -637,9 +637,7 @@ char* xenbus_printf(xenbus_transaction_t
     va_start(args, fmt);
     vsprintf(val, fmt, args);
     va_end(args);
-    xenbus_write(xbt,fullpath,val);
-
-    return NULL;
+    return xenbus_write(xbt,fullpath,val);
 }
 
 static void do_ls_test(const char *pre)
diff -r 0b20ac6ec64a -r 71a8366fb212 stubdom/README
--- a/stubdom/README    Fri Feb 29 09:18:01 2008 -0700
+++ b/stubdom/README    Fri Feb 29 09:19:58 2008 -0700
@@ -6,6 +6,73 @@ Then make install to install the result.
 
 Also, run make and make install in $XEN_ROOT/tools/fs-back
 
+General Configuration
+=====================
+
+In your HVM config "hvmconfig",
+
+- use /usr/lib/xen/bin/stubdom-dm as dm script
+
+device_model = '/usr/lib/xen/bin/stubdom-dm'
+
+- comment the disk statement:
+
+#disk = [  'file:/tmp/install.iso,hdc:cdrom,r', 'phy:/dev/sda6,hda,w', 
'file:/tmp/test,hdb,r' ]
+
+
+Create /etc/xen/stubdom-hvmconfig (where "hvmconfig" is the name of your HVM
+guest) with
+
+kernel = "/usr/lib/xen/boot/stubdom.gz"
+vif = [ '', 'ip=10.0.1.1,mac=aa:00:00:12:23:34']
+disk = [  'file:/tmp/install.iso,hdc:cdrom,r', 'phy:/dev/sda6,hda,w', 
'file:/tmp/test,hdb,r' ]
+
+where
+- the first vif ('') is reserved for VNC (see below)
+- 'ip=10.0.1.1,mac= etc...' is the same net configuration as in the hvmconfig
+script,
+- and disk = is the same block configuration as in the hvmconfig script.
+
+Display Configuration
+=====================
+
+There are three posibilities
+
+* Using SDL
+
+In hvmconfig, disable vnc:
+
+vnc = 0
+
+In stubdom-hvmconfig, set a vfb:
+
+vfb = [ 'type=sdl' ]
+
+* Using a VNC server in the stub domain
+
+In hvmconfig, set vnclisten to "172.30.206.1" for instance.  Do not use a host
+name as Mini-OS does not have a name resolver.  Do not use 127.0.0.1 since then
+you will not be able to connect to it.
+
+vnc = 1
+vnclisten = "172.30.206.1"
+
+In stubdom-hvmconfig, fill the reserved vif with the same IP, for instance:
+
+vif = [ 'ip=172.30.206.1', 'ip=10.0.1.1,mac=aa:00:00:12:23:34']
+
+* Using a VNC server in dom0
+
+In hvmconfig, disable vnc:
+
+vnc = 0
+
+In stubdom-hvmconfig, set a vfb:
+
+vfb = [ 'type=vnc' ]
+
+and any other parameter as wished.
+
 To run
 ======
 
@@ -13,32 +80,4 @@ ln -s /usr/share/qemu/keymaps /exports/u
 ln -s /usr/share/qemu/keymaps /exports/usr/share/qemu
 /usr/sbin/fs-backend &
 
-
-In your HVM config "hvmconfig",
-
-- use VNC, set vnclisten to "172.30.206.1" for instance.  Do not use a host 
name
-as Mini-OS does not have a name resolver.  Do not use 127.0.0.1 since then you
-will not be able to connect to it.
-
-vnc = 1
-vnclisten = "172.30.206.1"
-
-- use /usr/lib/xen/bin/stubdom-dm as dm script
-
-device_model = '/usr/lib/xen/bin/stubdom-dm'
-
-- comment the disk statement:
-#disk = [  'file:/tmp/install.iso,hdc:cdrom,r', 'phy:/dev/sda6,hda,w', 
'file:/tmp/test,hdb,r' ]
-
-Create /etc/xen/stubdom-hvmconfig (where "hvmconfig" is your HVM guest domain
-name) with
-
-kernel = "/usr/lib/xen/boot/stubdom.gz"
-vif = [ 'ip=172.30.206.1', 'ip=10.0.1.1,mac=aa:00:00:12:23:34']
-disk = [  'file:/tmp/install.iso,hdc:cdrom,r', 'phy:/dev/sda6,hda,w', 
'file:/tmp/test,hdb,r' ]
-
-where
-- 172.30.206.1 is the IP for vnc,
-- 'ip=10.0.1.1,mac= etc...' is the same net configuration as in the hvmconfig
-script,
-- and disk = is the same block configuration as in the hvmconfig script.
+xm create hvmconfig
diff -r 0b20ac6ec64a -r 71a8366fb212 stubdom/stubdom-dm
--- a/stubdom/stubdom-dm        Fri Feb 29 09:18:01 2008 -0700
+++ b/stubdom/stubdom-dm        Fri Feb 29 09:19:58 2008 -0700
@@ -62,32 +62,23 @@ done
 
 creation="xm create -c stubdom-$domname target=$domid memory=32"
 
-(while true ; do sleep 60 ; done) | $creation > 
/var/log/xen/qemu-dm-$domid.log &
+(while true ; do sleep 60 ; done) | $creation &
 #xterm -geometry +0+0 -e /bin/sh -c "$creation ; echo ; echo press ENTER to 
shut down ; read" &
 consolepid=$!
 
-
-# Wait for vnc server to appear
-while ! vnc_port=`xenstore-read /local/domain/$domid/console/vnc-port`
-do
-        # Check that the stubdom job is still alive
-        kill -0 $consolepid || term
-       sleep 1
-done
-
-################
-# DEBUG: tcpdump
-#while ! stubdomid=`xm domid stubdom-$domname`
-#do
-#        sleep 1
-#done
-#xterm -geometry 160x25+0+$height -e /bin/sh -c "tcpdump -n -i 
vif$stubdomid.0" &
-#xterm -geometry 160x25+0+$((2 * $height)) -e /bin/sh -c "tcpdump -n -i 
vif$stubdomid.1" &
 
 ###########
 # vncviewer
 if [ "$vncviewer" = 1 ]
 then
+    # Wait for vnc server to appear
+    while ! vnc_port=`xenstore-read /local/domain/$domid/console/vnc-port`
+    do
+        # Check that the stubdom job is still alive
+        kill -0 $consolepid || term
+       sleep 1
+    done
+
     vncviewer $ip:$vnc_port &
     vncpid=$!
 fi
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/blktap/drivers/block-qcow2.c
--- a/tools/blktap/drivers/block-qcow2.c        Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/blktap/drivers/block-qcow2.c        Fri Feb 29 09:19:58 2008 -0700
@@ -1241,167 +1241,6 @@ static void create_refcount_update(QCowC
                refcount++;
                *p = cpu_to_be16(refcount);
        }
-}
-
-static int qcow2_create(const char *filename, int64_t total_size,
-               const char *backing_file, int flags)
-{
-       int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
-       QCowHeader header;
-       uint64_t tmp, offset;
-       QCowCreateState s1, *s = &s1;
-
-       memset(s, 0, sizeof(*s));
-
-       fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
-       if (fd < 0)
-               return -1;
-       memset(&header, 0, sizeof(header));
-       header.magic = cpu_to_be32(QCOW_MAGIC);
-       header.version = cpu_to_be32(QCOW_VERSION);
-       header.size = cpu_to_be64(total_size * 512);
-       header_size = sizeof(header);
-       backing_filename_len = 0;
-       if (backing_file) {
-               header.backing_file_offset = cpu_to_be64(header_size);
-               backing_filename_len = strlen(backing_file);
-               header.backing_file_size = cpu_to_be32(backing_filename_len);
-               header_size += backing_filename_len;
-       }
-       s->cluster_bits = 12;  /* 4 KB clusters */
-       s->cluster_size = 1 << s->cluster_bits;
-       header.cluster_bits = cpu_to_be32(s->cluster_bits);
-       header_size = (header_size + 7) & ~7;
-       if (flags & BLOCK_FLAG_ENCRYPT) {
-               header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
-       } else {
-               header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
-       }
-       l2_bits = s->cluster_bits - 3;
-       shift = s->cluster_bits + l2_bits;
-       l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
-       offset = align_offset(header_size, s->cluster_size);
-       s->l1_table_offset = offset;
-       header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
-       header.l1_size = cpu_to_be32(l1_size);
-       offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
-
-       s->refcount_table = qemu_mallocz(s->cluster_size);
-       if (!s->refcount_table)
-               goto fail;
-       s->refcount_block = qemu_mallocz(s->cluster_size);
-       if (!s->refcount_block)
-               goto fail;
-
-       s->refcount_table_offset = offset;
-       header.refcount_table_offset = cpu_to_be64(offset);
-       header.refcount_table_clusters = cpu_to_be32(1);
-       offset += s->cluster_size;
-
-       s->refcount_table[0] = cpu_to_be64(offset);
-       s->refcount_block_offset = offset;
-       offset += s->cluster_size;
-
-       /* update refcounts */
-       create_refcount_update(s, 0, header_size);
-       create_refcount_update(s, s->l1_table_offset, l1_size * 
sizeof(uint64_t));
-       create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
-       create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
-
-       /* write all the data */
-       write(fd, &header, sizeof(header));
-       if (backing_file) {
-               write(fd, backing_file, backing_filename_len);
-       }
-       lseek(fd, s->l1_table_offset, SEEK_SET);
-       tmp = 0;
-       for(i = 0;i < l1_size; i++) {
-               write(fd, &tmp, sizeof(tmp));
-       }
-       lseek(fd, s->refcount_table_offset, SEEK_SET);
-       write(fd, s->refcount_table, s->cluster_size);
-
-       lseek(fd, s->refcount_block_offset, SEEK_SET);
-       write(fd, s->refcount_block, s->cluster_size);
-
-       qemu_free(s->refcount_table);
-       qemu_free(s->refcount_block);
-       close(fd);
-       return 0;
-fail:
-       qemu_free(s->refcount_table);
-       qemu_free(s->refcount_block);
-       close(fd);
-       return -ENOMEM;
-}
-
-/* XXX: put compressed sectors first, then all the cluster aligned
-   tables to avoid losing bytes in alignment */
-static int qcow_write_compressed(struct disk_driver *bs, int64_t sector_num,
-               const uint8_t *buf, int nb_sectors)
-{
-       BDRVQcowState *s = bs->private;
-       z_stream strm;
-       int ret, out_len;
-       uint8_t *out_buf;
-       uint64_t cluster_offset;
-
-       if (nb_sectors == 0) {
-               /* align end of file to a sector boundary to ease reading with
-                  sector based I/Os */
-               cluster_offset = 512 * s->total_sectors;
-               cluster_offset = (cluster_offset + 511) & ~511;
-               ftruncate(s->fd, cluster_offset);
-               return 0;
-       }
-
-       if (nb_sectors != s->cluster_sectors)
-               return -EINVAL;
-
-       out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
-       if (!out_buf)
-               return -ENOMEM;
-
-       /* best compression, small window, no zlib header */
-       memset(&strm, 0, sizeof(strm));
-       ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
-                       Z_DEFLATED, -12,
-                       9, Z_DEFAULT_STRATEGY);
-       if (ret != 0) {
-               qemu_free(out_buf);
-               return -1;
-       }
-
-       strm.avail_in = s->cluster_size;
-       strm.next_in = (uint8_t *)buf;
-       strm.avail_out = s->cluster_size;
-       strm.next_out = out_buf;
-
-       ret = deflate(&strm, Z_FINISH);
-       if (ret != Z_STREAM_END && ret != Z_OK) {
-               qemu_free(out_buf);
-               deflateEnd(&strm);
-               return -1;
-       }
-       out_len = strm.next_out - out_buf;
-
-       deflateEnd(&strm);
-
-       if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
-               /* could not compress: write normal cluster */
-               qcow_write(bs, sector_num, buf, s->cluster_sectors);
-       } else {
-               cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
-                                                                               
        out_len, 0, 0);
-               cluster_offset &= s->cluster_offset_mask;
-               if (bdrv_pwrite(s->fd, cluster_offset, out_buf, out_len) != 
out_len) {
-                       qemu_free(out_buf);
-                       return -1;
-               }
-       }
-
-       qemu_free(out_buf);
-       return 0;
 }
 
 static int qcow_submit(struct disk_driver *bs)
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/block-qcow.c
--- a/tools/ioemu/block-qcow.c  Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/block-qcow.c  Fri Feb 29 09:19:58 2008 -0700
@@ -95,7 +95,7 @@ static int qcow_open(BlockDriverState *b
     int len, i, shift, ret;
     QCowHeader header;
 
-    ret = bdrv_file_open(&s->hd, filename, flags);
+    ret = bdrv_file_open(&s->hd, filename, flags | BDRV_O_EXTENDABLE);
     if (ret < 0)
         return ret;
     if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/block-qcow2.c
--- a/tools/ioemu/block-qcow2.c Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/block-qcow2.c Fri Feb 29 09:19:58 2008 -0700
@@ -191,7 +191,7 @@ static int qcow_open(BlockDriverState *b
     int len, i, shift, ret;
     QCowHeader header;
 
-    ret = bdrv_file_open(&s->hd, filename, flags);
+    ret = bdrv_file_open(&s->hd, filename, flags | BDRV_O_EXTENDABLE);
     if (ret < 0)
         return ret;
     if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/block-raw.c
--- a/tools/ioemu/block-raw.c   Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/block-raw.c   Fri Feb 29 09:19:58 2008 -0700
@@ -1489,5 +1489,7 @@ BlockDriver bdrv_host_device = {
     .bdrv_pread = raw_pread,
     .bdrv_pwrite = raw_pwrite,
     .bdrv_getlength = raw_getlength,
+
+    .bdrv_flags = BLOCK_DRIVER_FLAG_EXTENDABLE
 };
 #endif /* _WIN32 */
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/block-vmdk.c
--- a/tools/ioemu/block-vmdk.c  Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/block-vmdk.c  Fri Feb 29 09:19:58 2008 -0700
@@ -352,7 +352,7 @@ static int vmdk_open(BlockDriverState *b
     uint32_t magic;
     int l1_size, i, ret;
 
-    ret = bdrv_file_open(&s->hd, filename, flags);
+    ret = bdrv_file_open(&s->hd, filename, flags | BDRV_O_EXTENDABLE);
     if (ret < 0)
         return ret;
     if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic))
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/block.c
--- a/tools/ioemu/block.c       Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/block.c       Fri Feb 29 09:19:58 2008 -0700
@@ -123,20 +123,23 @@ static int bdrv_rw_badreq_sectors(BlockD
 static int bdrv_rw_badreq_sectors(BlockDriverState *bs,
                                int64_t sector_num, int nb_sectors)
 {
-    return
+    return (
        nb_sectors < 0 ||
        nb_sectors > bs->total_sectors ||
-       sector_num > bs->total_sectors - nb_sectors;
+       sector_num > bs->total_sectors - nb_sectors
+       ) && !bs->extendable;
 }
 
 static int bdrv_rw_badreq_bytes(BlockDriverState *bs,
                                  int64_t offset, int count)
 {
     int64_t size = bs->total_sectors << SECTOR_BITS;
-    return
+    return (
        count < 0 ||
        count > size ||
-       offset > size - count;
+       offset > size - count
+       ) && !bs->extendable;
+    
 }
 
 void bdrv_register(BlockDriver *bdrv)
@@ -347,6 +350,12 @@ int bdrv_open2(BlockDriverState *bs, con
     bs->is_temporary = 0;
     bs->encrypted = 0;
 
+    if (flags & BDRV_O_EXTENDABLE) {
+       if (!(drv->bdrv_flags & BLOCK_DRIVER_FLAG_EXTENDABLE))
+           return -ENOSYS;
+       bs->extendable = 1;
+    }
+
     if (flags & BDRV_O_SNAPSHOT) {
         BlockDriverState *bs1;
         int64_t total_size;
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/block_int.h
--- a/tools/ioemu/block_int.h   Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/block_int.h   Fri Feb 29 09:19:58 2008 -0700
@@ -23,6 +23,8 @@
  */
 #ifndef BLOCK_INT_H
 #define BLOCK_INT_H
+
+#define BLOCK_DRIVER_FLAG_EXTENDABLE  0x0001u
 
 struct BlockDriver {
     const char *format_name;
@@ -76,6 +78,7 @@ struct BlockDriver {
     int (*bdrv_eject)(BlockDriverState *bs, int eject_flag);
     int (*bdrv_set_locked)(BlockDriverState *bs, int locked);
     
+    unsigned bdrv_flags;
     BlockDriverAIOCB *free_aiocb;
     struct BlockDriver *next;
 };
@@ -87,6 +90,7 @@ struct BlockDriverState {
     int removable; /* if true, the media can be removed */
     int locked;    /* if true, the media cannot temporarily be ejected */
     int encrypted; /* if true, the media is encrypted */
+    int extendable;/* if true, we may write out of original range */
     /* event callback when inserting/removing */
     void (*change_cb)(void *opaque);
     void *change_opaque;
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/hw/e1000.c
--- a/tools/ioemu/hw/e1000.c    Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/hw/e1000.c    Fri Feb 29 09:19:58 2008 -0700
@@ -48,7 +48,7 @@ static int debugflags = DBGBIT(TXERR) | 
 #endif
 
 #define IOPORT_SIZE       0x40
-#define PNPMMIO_SIZE      0x60000
+#define PNPMMIO_SIZE      0x20000
 
 /*
  * HW models:
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/hw/xenfb.c
--- a/tools/ioemu/hw/xenfb.c    Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/hw/xenfb.c    Fri Feb 29 09:19:58 2008 -0700
@@ -18,6 +18,12 @@
 #include <xs.h>
 
 #include "xenfb.h"
+
+#ifdef CONFIG_STUBDOM
+#include <semaphore.h>
+#include <sched.h>
+#include <fbfront.h>
+#endif
 
 #ifndef BTN_LEFT
 #define BTN_LEFT 0x110 /* from <linux/input.h> */
@@ -592,7 +598,8 @@ static int xenfb_send_key(struct xenfb *
 }
 
 /* Send a relative mouse movement event */
-static int xenfb_send_motion(struct xenfb *xenfb, int rel_x, int rel_y, int 
rel_z)
+static int xenfb_send_motion(struct xenfb *xenfb,
+                            int rel_x, int rel_y, int rel_z)
 {
        union xenkbd_in_event event;
 
@@ -606,7 +613,8 @@ static int xenfb_send_motion(struct xenf
 }
 
 /* Send an absolute mouse movement event */
-static int xenfb_send_position(struct xenfb *xenfb, int abs_x, int abs_y, int 
abs_z)
+static int xenfb_send_position(struct xenfb *xenfb,
+                              int abs_x, int abs_y, int rel_z)
 {
        union xenkbd_in_event event;
 
@@ -614,7 +622,7 @@ static int xenfb_send_position(struct xe
        event.type = XENKBD_TYPE_POS;
        event.pos.abs_x = abs_x;
        event.pos.abs_y = abs_y;
-       event.pos.abs_z = abs_z;
+       event.pos.rel_z = rel_z;
 
        return xenfb_kbd_event(xenfb, &event);
 }
@@ -1124,12 +1132,10 @@ static void xenfb_guest_copy(struct xenf
     dpy_update(xenfb->ds, x, y, w, h);
 }
 
-/* QEMU display state changed, so refresh the framebuffer copy */
-/* XXX - can we optimize this, or the next func at all ? */ 
+/* Periodic update of display, no need for any in our case */
 static void xenfb_update(void *opaque)
 {
     struct xenfb *xenfb = opaque;
-    xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height);
 }
 
 /* QEMU display state changed, so refresh the framebuffer copy */
@@ -1169,6 +1175,204 @@ static int xenfb_register_console(struct
         return 0;
 }
 
+#ifdef CONFIG_STUBDOM
+static struct semaphore kbd_sem = __SEMAPHORE_INITIALIZER(kbd_sem, 0);
+static struct kbdfront_dev *kbd_dev;
+static char *kbd_path, *fb_path;
+
+static unsigned char linux2scancode[KEY_MAX + 1];
+
+#define WIDTH 1024
+#define HEIGHT 768
+#define DEPTH 32
+#define LINESIZE (1280 * (DEPTH / 8))
+#define MEMSIZE (LINESIZE * HEIGHT)
+
+int xenfb_connect_vkbd(const char *path)
+{
+    kbd_path = strdup(path);
+    return 0;
+}
+
+int xenfb_connect_vfb(const char *path)
+{
+    fb_path = strdup(path);
+    return 0;
+}
+
+static void xenfb_pv_update(DisplayState *s, int x, int y, int w, int h)
+{
+    struct fbfront_dev *fb_dev = s->opaque;
+    fbfront_update(fb_dev, x, y, w, h);
+}
+
+static void xenfb_pv_resize(DisplayState *s, int w, int h)
+{
+    struct fbfront_dev *fb_dev = s->opaque;
+    fprintf(stderr,"resize to %dx%d required\n", w, h);
+    s->width = w;
+    s->height = h;
+    /* TODO: send resize event if supported */
+    memset(s->data, 0, MEMSIZE);
+    fbfront_update(fb_dev, 0, 0, WIDTH, HEIGHT);
+}
+
+static void xenfb_pv_colourdepth(DisplayState *s, int depth)
+{
+    /* TODO: send redepth event if supported */
+    fprintf(stderr,"redepth to %d required\n", depth);
+}
+
+static void xenfb_kbd_handler(void *opaque)
+{
+#define KBD_NUM_BATCH 64
+    union xenkbd_in_event buf[KBD_NUM_BATCH];
+    int n, i;
+    DisplayState *s = opaque;
+    static int buttons;
+    static int x, y;
+
+    n = kbdfront_receive(kbd_dev, buf, KBD_NUM_BATCH);
+    for (i = 0; i < n; i++) {
+        switch (buf[i].type) {
+
+            case XENKBD_TYPE_MOTION:
+                fprintf(stderr, "FB backend sent us relative mouse motion 
event!\n");
+                break;
+
+            case XENKBD_TYPE_POS:
+            {
+                int new_x = buf[i].pos.abs_x;
+                int new_y = buf[i].pos.abs_y;
+                if (new_x >= s->width)
+                    new_x = s->width - 1;
+                if (new_y >= s->height)
+                    new_y = s->height - 1;
+                if (kbd_mouse_is_absolute()) {
+                    kbd_mouse_event(
+                            new_x * 0x7FFF / (s->width - 1),
+                            new_y * 0x7FFF / (s->height - 1),
+                            buf[i].pos.rel_z,
+                            buttons);
+                } else {
+                    kbd_mouse_event(
+                            new_x - x,
+                            new_y - y,
+                            buf[i].pos.rel_z,
+                            buttons);
+                }
+                x = new_x;
+                y = new_y;
+                break;
+            }
+
+            case XENKBD_TYPE_KEY:
+            {
+                int keycode = buf[i].key.keycode;
+                int button = 0;
+
+                if (keycode == BTN_LEFT)
+                    button = MOUSE_EVENT_LBUTTON;
+                else if (keycode == BTN_RIGHT)
+                    button = MOUSE_EVENT_RBUTTON;
+                else if (keycode == BTN_MIDDLE)
+                    button = MOUSE_EVENT_MBUTTON;
+
+                if (button) {
+                    if (buf[i].key.pressed)
+                        buttons |=  button;
+                    else
+                        buttons &= ~button;
+                    if (kbd_mouse_is_absolute())
+                        kbd_mouse_event(
+                                x * 0x7FFF / s->width,
+                                y * 0x7FFF / s->height,
+                                0,
+                                buttons);
+                    else
+                        kbd_mouse_event(0, 0, 0, buttons);
+                } else {
+                    int scancode = linux2scancode[keycode];
+                    if (!scancode) {
+                        fprintf(stderr, "Can't convert keycode %x to 
scancode\n", keycode);
+                        break;
+                    }
+                    if (scancode & 0x80) {
+                        kbd_put_keycode(0xe0);
+                        scancode &= 0x7f;
+                    }
+                    if (!buf[i].key.pressed)
+                        scancode |= 0x80;
+                    kbd_put_keycode(scancode);
+                }
+                break;
+            }
+        }
+    }
+}
+
+static void xenfb_pv_refresh(DisplayState *ds)
+{
+    vga_hw_update();
+}
+
+static void kbdfront_thread(void *p)
+{
+    int scancode, keycode;
+    kbd_dev = init_kbdfront(p, 1);
+    if (!kbd_dev) {
+        fprintf(stderr,"can't open keyboard\n");
+        exit(1);
+    }
+    up(&kbd_sem);
+    for (scancode = 0; scancode < 128; scancode++) {
+        keycode = atkbd_set2_keycode[atkbd_unxlate_table[scancode]];
+        linux2scancode[keycode] = scancode;
+        keycode = atkbd_set2_keycode[atkbd_unxlate_table[scancode] | 0x80];
+        linux2scancode[keycode] = scancode | 0x80;
+    }
+}
+
+int xenfb_pv_display_init(DisplayState *ds)
+{
+    void *data;
+    struct fbfront_dev *fb_dev;
+    int kbd_fd;
+
+    if (!fb_path || !kbd_path)
+        return -1;
+
+    create_thread("kbdfront", kbdfront_thread, (void*) kbd_path);
+
+    data = qemu_memalign(PAGE_SIZE, VGA_RAM_SIZE);
+    fb_dev = init_fbfront(fb_path, data, WIDTH, HEIGHT, DEPTH, LINESIZE, 
MEMSIZE);
+    if (!fb_dev) {
+        fprintf(stderr,"can't open frame buffer\n");
+        exit(1);
+    }
+    free(fb_path);
+
+    down(&kbd_sem);
+    free(kbd_path);
+
+    kbd_fd = kbdfront_open(kbd_dev);
+    qemu_set_fd_handler(kbd_fd, xenfb_kbd_handler, NULL, ds);
+
+    ds->data = data;
+    ds->linesize = LINESIZE;
+    ds->depth = DEPTH;
+    ds->bgr = 0;
+    ds->width = WIDTH;
+    ds->height = HEIGHT;
+    ds->dpy_update = xenfb_pv_update;
+    ds->dpy_resize = xenfb_pv_resize;
+    ds->dpy_colourdepth = NULL; //xenfb_pv_colourdepth;
+    ds->dpy_refresh = xenfb_pv_refresh;
+    ds->opaque = fb_dev;
+    return 0;
+}
+#endif
+
 /*
  * Local variables:
  *  c-indent-level: 8
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/keymaps.c
--- a/tools/ioemu/keymaps.c     Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/keymaps.c     Fri Feb 29 09:19:58 2008 -0700
@@ -126,11 +126,11 @@ static kbd_layout_t *parse_keyboard_layo
                    if (rest && strstr(rest, "numlock")) {
                        add_to_key_range(&k->keypad_range, keycode);
                        add_to_key_range(&k->numlock_range, keysym);
-                       fprintf(stderr, "keypad keysym %04x keycode %d\n", 
keysym, keycode);
+                       //fprintf(stderr, "keypad keysym %04x keycode %d\n", 
keysym, keycode);
                    }
                    if (rest && strstr(rest, "shift")) {
                        add_to_key_range(&k->shift_range, keysym);
-                       fprintf(stderr, "shift keysym %04x keycode %d\n", 
keysym, keycode);
+                       //fprintf(stderr, "shift keysym %04x keycode %d\n", 
keysym, keycode);
                    }
 
                    /* if(keycode&0x80)
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/monitor.c
--- a/tools/ioemu/monitor.c     Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/monitor.c     Fri Feb 29 09:19:58 2008 -0700
@@ -2520,7 +2520,7 @@ static void monitor_handle_command1(void
 
 static void monitor_start_input(void)
 {
-    readline_start("(HVMXen) ", 0, monitor_handle_command1, NULL);
+    readline_start("(qemu) ", 0, monitor_handle_command1, NULL);
 }
 
 static void term_event(void *opaque, int event)
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/vl.c  Fri Feb 29 09:19:58 2008 -0700
@@ -7611,9 +7611,7 @@ int main(int argc, char **argv)
         }
     }
 
-    /* Now send logs to our named config */
-    sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm-%d.log", domid);
-    cpu_set_log_filename(qemu_dm_logfilename);
+    cpu_set_log(0);
 
 #ifndef NO_DAEMONIZE
     if (daemonize && !nographic && vnc_display == NULL && vncunused == 0) {
@@ -7831,6 +7829,10 @@ int main(int argc, char **argv)
     init_ioports();
 
     /* terminal init */
+#ifdef CONFIG_STUBDOM
+    if (xenfb_pv_display_init(ds) == 0) {
+    } else
+#endif
     if (nographic) {
         dumb_display_init(ds);
     } else if (vnc_display != NULL || vncunused != 0) {
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/vl.h
--- a/tools/ioemu/vl.h  Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/vl.h  Fri Feb 29 09:19:58 2008 -0700
@@ -614,6 +614,8 @@ typedef struct QEMUSnapshotInfo {
                                      use a disk image format on top of
                                      it (default for
                                      bdrv_file_open()) */
+#define BDRV_O_EXTENDABLE  0x0080 /* allow writes out of original size range;
+                                    only effective for some drivers */
 
 void bdrv_init(void);
 BlockDriver *bdrv_find_format(const char *format_name);
@@ -1525,6 +1527,11 @@ int xenstore_vm_write(int domid, char *k
 int xenstore_vm_write(int domid, char *key, char *val);
 char *xenstore_vm_read(int domid, char *key, unsigned int *len);
 
+/* xenfb.c */
+int xenfb_pv_display_init(DisplayState *ds);
+int xenfb_connect_vkbd(const char *path);
+int xenfb_connect_vfb(const char *path);
+
 /* helper2.c */
 extern long time_offset;
 void timeoffset_get(void);
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c    Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/ioemu/xenstore.c    Fri Feb 29 09:19:58 2008 -0700
@@ -238,6 +238,37 @@ void xenstore_parse_domain_config(int do
         }
     }
 
+#ifdef CONFIG_STUBDOM
+    if (pasprintf(&buf, "%s/device/vkbd", path) == -1)
+        goto out;
+
+    free(e);
+    e = xs_directory(xsh, XBT_NULL, buf, &num);
+
+    if (e) {
+        for (i = 0; i < num; i++) {
+            if (pasprintf(&buf, "%s/device/vkbd/%s", path, e[i]) == -1)
+                continue;
+            xenfb_connect_vkbd(buf);
+        }
+    }
+
+    if (pasprintf(&buf, "%s/device/vfb", path) == -1)
+        goto out;
+
+    free(e);
+    e = xs_directory(xsh, XBT_NULL, buf, &num);
+
+    if (e) {
+        for (i = 0; i < num; i++) {
+            if (pasprintf(&buf, "%s/device/vfb/%s", path, e[i]) == -1)
+                continue;
+            xenfb_connect_vfb(buf);
+        }
+    }
+#endif
+
+
     /* Set a watch for log-dirty requests from the migration tools */
     if (pasprintf(&buf, "/local/domain/0/device-model/%u/logdirty/next-active",
                   domid) != -1) {
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xend/XendAPI.py
--- a/tools/python/xen/xend/XendAPI.py  Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xend/XendAPI.py  Fri Feb 29 09:19:58 2008 -0700
@@ -1761,9 +1761,10 @@ class XendAPI(object):
 
         resource = other_config.get("resource", 0)
         port = other_config.get("port", 0)
+        node = other_config.get("node", 0)
         
         xendom.domain_migrate(xeninfo.getDomid(), destination_url,
-                              bool(live), resource, port)
+                              bool(live), resource, port, node)
         return xen_api_success_void()
 
     def VM_save(self, _, vm_ref, dest, checkpoint):
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xend/XendCheckpoint.py   Fri Feb 29 09:19:58 2008 -0700
@@ -22,6 +22,7 @@ from xen.xend.XendLogging import log
 from xen.xend.XendLogging import log
 from xen.xend.XendConfig import XendConfig
 from xen.xend.XendConstants import *
+from xen.xend import XendNode
 
 SIGNATURE = "LinuxGuestRecord"
 QEMU_SIGNATURE = "QemuDeviceModelRecord"
@@ -56,10 +57,23 @@ def read_exact(fd, size, errmsg):
     return buf
 
 
-def save(fd, dominfo, network, live, dst, checkpoint=False):
+def insert_after(list, pred, value):
+    for i,k in enumerate(list):
+        if type(k) == type([]):
+           if k[0] == pred:
+              list.insert (i+1, value)
+    return
+
+
+def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1):
     write_exact(fd, SIGNATURE, "could not write guest state file: signature")
 
-    config = sxp.to_string(dominfo.sxpr())
+    sxprep = dominfo.sxpr()
+
+    if node > -1:
+        insert_after(sxprep,'vcpus',['node', str(node)])
+
+    config = sxp.to_string(sxprep)
 
     domain_name = dominfo.getName()
     # Rename the domain temporarily, so that we don't get a name clash if this
@@ -191,6 +205,21 @@ def restore(xd, fd, dominfo = None, paus
         dominfo.resume()
     else:
         dominfo = xd.restore_(vmconfig)
+
+    # repin domain vcpus if a target node number was specified 
+    # this is done prior to memory allocation to aide in memory
+    # distribution for NUMA systems.
+    nodenr = -1
+    for i,l in enumerate(vmconfig):
+        if type(l) == type([]):
+            if l[0] == 'node':
+                nodenr = int(l[1])
+
+    if nodenr >= 0:
+        node_to_cpu = XendNode.instance().xc.physinfo()['node_to_cpu']
+        if nodenr < len(node_to_cpu):
+            for v in range(0, dominfo.info['VCPUs_max']):
+                 xc.vcpu_setaffinity(dominfo.domid, v, node_to_cpu[nodenr])
 
     store_port   = dominfo.getStorePort()
     console_port = dominfo.getConsolePort()
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xend/XendDomain.py       Fri Feb 29 09:19:58 2008 -0700
@@ -865,7 +865,7 @@ class XendDomain:
                 raise XendInvalidDomain(domname)
 
             if dominfo.getDomid() == DOM0_ID:
-                raise XendError("Cannot save privileged domain %s" % domname)
+                raise XendError("Cannot suspend privileged domain %s" % 
domname)
 
             if dominfo._stateGet() != DOM_STATE_RUNNING:
                 raise VMBadState("Domain is not running",
@@ -910,7 +910,7 @@ class XendDomain:
                     raise XendInvalidDomain(domname)
 
                 if dominfo.getDomid() == DOM0_ID:
-                    raise XendError("Cannot save privileged domain %s" % 
domname)
+                    raise XendError("Cannot resume privileged domain %s" % 
domname)
 
                 if dominfo._stateGet() != XEN_API_VM_POWER_STATE_SUSPENDED:
                     raise XendError("Cannot resume domain that is not 
suspended.")
@@ -1258,7 +1258,7 @@ class XendDomain:
 
         return val       
 
-    def domain_migrate(self, domid, dst, live=False, resource=0, port=0):
+    def domain_migrate(self, domid, dst, live=False, resource=0, port=0, 
node=-1):
         """Start domain migration.
         
         @param domid: Domain ID or Name
@@ -1271,6 +1271,8 @@ class XendDomain:
         @type live: bool
         @keyword resource: not used??
         @rtype: None
+        @keyword node: use node number for target
+        @rtype: int 
         @raise XendError: Failed to migrate
         @raise XendInvalidDomain: Domain is not valid        
         """
@@ -1299,7 +1301,7 @@ class XendDomain:
 
         sock.send("receive\n")
         sock.recv(80)
-        XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst)
+        XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
         sock.close()
 
     def domain_save(self, domid, dst, checkpoint=False):
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xend/XendDomainInfo.py   Fri Feb 29 09:19:58 2008 -0700
@@ -1406,9 +1406,6 @@ class XendDomainInfo:
     def setWeight(self, cpu_weight):
         self.info['vcpus_params']['weight'] = cpu_weight
 
-    def setResume(self, state):
-        self._resume = state
-
     def getRestartCount(self):
         return self._readVm('xend/restart_count')
 
@@ -1963,6 +1960,39 @@ class XendDomainInfo:
             if self.info['cpus'] is not None and len(self.info['cpus']) > 0:
                 for v in range(0, self.info['VCPUs_max']):
                     xc.vcpu_setaffinity(self.domid, v, self.info['cpus'])
+            else:
+                info = xc.physinfo()
+                if info['nr_nodes'] > 1:
+                    node_memory_list = info['node_to_memory']
+                    needmem = 
self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
+                    candidate_node_list = []
+                    for i in range(0, info['nr_nodes']):
+                        if node_memory_list[i] >= needmem:
+                            candidate_node_list.append(i)
+                    if candidate_node_list is None or len(candidate_node_list) 
== 1:
+                        index = node_memory_list.index( max(node_memory_list) )
+                        cpumask = info['node_to_cpu'][index]
+                    else:
+                        nodeload = [0]
+                        nodeload = nodeload * info['nr_nodes']
+                        from xen.xend import XendDomain
+                        doms = XendDomain.instance().list('all')
+                        for dom in doms:
+                            cpuinfo = dom.getVCPUInfo()
+                            for vcpu in sxp.children(cpuinfo, 'vcpu'):
+                                def vinfo(n, t):
+                                    return t(sxp.child_value(vcpu, n))
+                                cpumap = vinfo('cpumap', list)
+                                for i in candidate_node_list:
+                                    node_cpumask = info['node_to_cpu'][i]
+                                    for j in node_cpumask:
+                                        if j in cpumap:
+                                            nodeload[i] += 1
+                                            break
+                        index = nodeload.index( min(nodeload) )
+                        cpumask = info['node_to_cpu'][index]
+                    for v in range(0, self.info['VCPUs_max']):
+                        xc.vcpu_setaffinity(self.domid, v, cpumask)
 
             # Use architecture- and image-specific calculations to determine
             # the various headrooms necessary, given the raw configured
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xend/image.py    Fri Feb 29 09:19:58 2008 -0700
@@ -296,7 +296,34 @@ class ImageHandler:
                         { 'dom': self.vm.getDomid(), 'read': True, 'write': 
True })
         log.info("spawning device models: %s %s", self.device_model, args)
         # keep track of pid and spawned options to kill it later
-        self.pid = os.spawnve(os.P_NOWAIT, self.device_model, args, env)
+
+        logfile = "/var/log/xen/qemu-dm-%s.log" %  
str(self.vm.info['name_label'])
+        if os.path.exists(logfile):
+            if os.path.exists(logfile + ".1"):
+                os.unlink(logfile + ".1")
+            os.rename(logfile, logfile + ".1")
+
+        null = os.open("/dev/null", os.O_RDONLY)
+        logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
+        
+        pid = os.fork()
+        if pid == 0: #child
+            try:
+                os.dup2(null, 0)
+                os.dup2(logfd, 1)
+                os.dup2(logfd, 2)
+                os.close(null)
+                os.close(logfd)
+                try:
+                    os.execve(self.device_model, args, env)
+                except:
+                    os._exit(127)
+            except:
+                os._exit(127)
+        else:
+            self.pid = pid
+            os.close(null)
+            os.close(logfd)
         self.vm.storeDom("image/device-model-pid", self.pid)
         log.info("device model pid: %d", self.pid)
 
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xm/main.py       Fri Feb 29 09:19:58 2008 -0700
@@ -699,9 +699,6 @@ def xm_save(args):
         err(opterr)
         sys.exit(1)
 
-    dom = params[0]
-    savefile = params[1]
-
     checkpoint = False
     for (k, v) in options:
         if k in ['-c', '--checkpoint']:
@@ -710,9 +707,9 @@ def xm_save(args):
     if len(params) != 2:
         err("Wrong number of parameters")
         usage('save')
-        sys.exit(1)
-
-    savefile = os.path.abspath(savefile)
+
+    dom = params[0]
+    savefile = os.path.abspath(params[1])
 
     if not os.access(os.path.dirname(savefile), os.W_OK):
         err("xm save: Unable to create file %s" % savefile)
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/python/xen/xm/migrate.py
--- a/tools/python/xen/xm/migrate.py    Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/python/xen/xm/migrate.py    Fri Feb 29 09:19:58 2008 -0700
@@ -43,6 +43,10 @@ gopts.opt('port', short='p', val='portnu
           fn=set_int, default=0,
           use="Use specified port for migration.")
 
+gopts.opt('node', short='n', val='nodenum',
+          fn=set_int, default=-1,
+          use="Use specified NUMA node on target.")
+
 gopts.opt('resource', short='r', val='MBIT',
           fn=set_int, default=0,
           use="Set level of resource usage for migration.")
@@ -65,11 +69,13 @@ def main(argv):
         vm_ref = get_single_vm(dom)
         other_config = {
             "port":     opts.vals.port,
-            "resource": opts.vals.resource
+            "resource": opts.vals.resource,
+            "node":     opts.vals.node
             }
         server.xenapi.VM.migrate(vm_ref, dst, bool(opts.vals.live),
                                  other_config)
     else:
         server.xend.domain.migrate(dom, dst, opts.vals.live,
                                    opts.vals.resource,
-                                   opts.vals.port)
+                                   opts.vals.port,
+                                   opts.vals.node)
diff -r 0b20ac6ec64a -r 71a8366fb212 
tools/xenstat/libxenstat/src/xenstat_solaris.c
--- a/tools/xenstat/libxenstat/src/xenstat_solaris.c    Fri Feb 29 09:18:01 
2008 -0700
+++ b/tools/xenstat/libxenstat/src/xenstat_solaris.c    Fri Feb 29 09:19:58 
2008 -0700
@@ -113,49 +113,23 @@ static void xenstat_uninit_devs(xenstat_
        priv->kc = NULL;
 }
 
-static int parse_nic(const char *nic, char *module, int *instance)
-{
-       const char *c;
-
-       for (c = &nic[strlen(nic) - 1]; c != nic && isdigit(*c); c--)
-               ;
-
-       if (c == nic)
-               return 0;
-
-       c++;
-
-       if (sscanf(c, "%d", instance) != 1)
-               return 0;
-
-       strncpy(module, nic, c - nic);
-       module[c - nic] = '\0';
-       return 1;
-}
-
 static int update_dev_stats(priv_data_t *priv, stdevice_t *dev)
 {
-       char mod[256];
-       const char *name;
-       int inst;
        kstat_t *ksp;
 
+       if (kstat_chain_update(priv->kc) == -1)
+               return 0;
+
        if (dev->type == DEVICE_NIC) {
-               if (!parse_nic(dev->name, mod, &inst))
-                       return 0;
-               name = "mac";
+               ksp = kstat_lookup(priv->kc, "link", 0, (char *)dev->name);
        } else {
-               strcpy(mod, "xdb");
-               inst = dev->instance;
-               name = "req_statistics";
-       }
-
-       if (kstat_chain_update(priv->kc) == -1)
-               return 0;
-
-       ksp = kstat_lookup(priv->kc, mod, inst, (char *)name);
+               ksp = kstat_lookup(priv->kc, "xdb", dev->instance,
+                   (char *)"req_statistics");
+       }
+
        if (ksp == NULL)
                return 0;
+
        if (kstat_read(priv->kc, ksp, NULL) == -1)
                return 0;
 
diff -r 0b20ac6ec64a -r 71a8366fb212 tools/xentrace/xentrace.c
--- a/tools/xentrace/xentrace.c Fri Feb 29 09:18:01 2008 -0700
+++ b/tools/xentrace/xentrace.c Fri Feb 29 09:19:58 2008 -0700
@@ -15,7 +15,6 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#include <sys/vfs.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
@@ -25,6 +24,7 @@
 #include <getopt.h>
 #include <assert.h>
 #include <sys/poll.h>
+#include <sys/statvfs.h>
 
 #include <xen/xen.h>
 #include <xen/trace.h>
@@ -87,7 +87,7 @@ void write_buffer(unsigned int cpu, unsi
 void write_buffer(unsigned int cpu, unsigned char *start, int size,
                int total_size, int outfd)
 {
-    struct statfs stat;
+    struct statvfs stat;
     size_t written = 0;
     
     if ( opts.disk_rsvd != 0 )
@@ -95,13 +95,13 @@ void write_buffer(unsigned int cpu, unsi
         unsigned long long freespace;
 
         /* Check that filesystem has enough space. */
-        if ( fstatfs (outfd, &stat) )
+        if ( fstatvfs (outfd, &stat) )
         {
                 fprintf(stderr, "Statfs failed!\n");
                 goto fail;
         }
 
-        freespace = stat.f_bsize * (unsigned long long)stat.f_bfree;
+        freespace = stat.f_frsize * (unsigned long long)stat.f_bfree;
 
         if ( total_size )
             freespace -= total_size;
diff -r 0b20ac6ec64a -r 71a8366fb212 
unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c
--- a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c        Fri Feb 
29 09:18:01 2008 -0700
+++ b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c        Fri Feb 
29 09:19:58 2008 -0700
@@ -71,7 +71,7 @@ static int bp_suspend(void)
        return suspend_cancelled;
 }
 
-int __xen_suspend(int fast_suspend)
+int __xen_suspend(int fast_suspend, void (*resume_notifier)(void))
 {
        int err, suspend_cancelled, nr_cpus;
        struct ap_suspend_info info;
@@ -101,6 +101,7 @@ int __xen_suspend(int fast_suspend)
 
        local_irq_disable();
        suspend_cancelled = bp_suspend();
+       resume_notifier();
        local_irq_enable();
 
        smp_mb();
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/ia64/xen/machine_kexec.c
--- a/xen/arch/ia64/xen/machine_kexec.c Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/ia64/xen/machine_kexec.c Fri Feb 29 09:19:58 2008 -0700
@@ -24,6 +24,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <asm/dom_fw_dom0.h>
+#include <asm-generic/sections.h>
 
 #define kexec_flush_icache_page(page)                                  \
 do {                                                                   \
@@ -144,6 +145,54 @@ void machine_reboot_kexec(xen_kexec_imag
        machine_kexec(image);
 }
 
+static int machine_kexec_get_xen(xen_kexec_range_t *range)
+{
+       range->start = range->start = ia64_tpa(_text);
+       range->size = (unsigned long)_end - (unsigned long)_text;
+       return 0;
+}
+
+#define ELF_PAGE_SHIFT 16
+#define ELF_PAGE_SIZE  (__IA64_UL_CONST(1) << ELF_PAGE_SHIFT)
+#define ELF_PAGE_MASK  (~(ELF_PAGE_SIZE - 1))
+
+static int machine_kexec_get_xenheap(xen_kexec_range_t *range)
+{
+       range->start = (ia64_tpa(_end) + (ELF_PAGE_SIZE - 1)) & ELF_PAGE_MASK;
+       range->size = (unsigned long)xenheap_phys_end -
+                     (unsigned long)range->start;
+       return 0;
+}
+
+static int machine_kexec_get_boot_param(xen_kexec_range_t *range)
+{
+       range->start = __pa(ia64_boot_param);
+       range->size = sizeof(*ia64_boot_param);
+       return 0;
+}
+
+static int machine_kexec_get_efi_memmap(xen_kexec_range_t *range)
+{
+       range->start = ia64_boot_param->efi_memmap;
+       range->size = ia64_boot_param->efi_memmap_size;
+       return 0;
+}
+
+int machine_kexec_get(xen_kexec_range_t *range)
+{
+       switch (range->range) {
+       case KEXEC_RANGE_MA_XEN:
+               return machine_kexec_get_xen(range);
+       case KEXEC_RANGE_MA_XENHEAP:
+               return machine_kexec_get_xenheap(range);
+       case KEXEC_RANGE_MA_BOOT_PARAM:
+               return machine_kexec_get_boot_param(range);
+       case KEXEC_RANGE_MA_EFI_MEMMAP:
+               return machine_kexec_get_efi_memmap(range);
+       }
+       return -EINVAL;
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/powerpc/machine_kexec.c
--- a/xen/arch/powerpc/machine_kexec.c  Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/powerpc/machine_kexec.c  Fri Feb 29 09:19:58 2008 -0700
@@ -24,6 +24,12 @@ void machine_kexec(xen_kexec_image_t *im
     printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
 }
 
+int machine_kexec_get(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/machine_kexec.c
--- a/xen/arch/x86/machine_kexec.c      Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/x86/machine_kexec.c      Fri Feb 29 09:19:58 2008 -0700
@@ -23,6 +23,9 @@ typedef void (*relocate_new_kernel_t)(
                 unsigned long indirection_page,
                 unsigned long *page_list,
                 unsigned long start_address);
+
+extern int machine_kexec_get_xen(xen_kexec_range_t *range);
+
 
 int machine_kexec_load(int type, int slot, xen_kexec_image_t *image)
 {
@@ -135,6 +138,13 @@ void machine_kexec(xen_kexec_image_t *im
     }
 }
 
+int machine_kexec_get(xen_kexec_range_t *range)
+{
+       if (range->range != KEXEC_RANGE_MA_XEN)
+               return -EINVAL;
+       return machine_kexec_get_xen(range);
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/x86/mm/shadow/multi.c    Fri Feb 29 09:19:58 2008 -0700
@@ -55,12 +55,6 @@
  * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
  * to share l2h pages again. 
  *
- * GUEST_WALK_TABLES TLB FLUSH COALESCE
- * guest_walk_tables can do up to three remote TLB flushes as it walks to
- * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
- * and if we do flush, re-do the walk.  If anything has changed, then 
- * pause all the other vcpus and do the walk *again*.
- *
  * PSE disabled / PSE36
  * We don't support any modes other than PSE enabled, PSE36 disabled.
  * Neither of those would be hard to change, but we'd need to be able to 
@@ -246,10 +240,95 @@ static uint32_t set_ad_bits(void *guest_
     return 0;
 }
 
+/* This validation is called with lock held, and after write permission
+ * removal. Then check is atomic and no more inconsistent content can
+ * be observed before lock is released
+ *
+ * Return 1 to indicate success and 0 for inconsistency
+ */
+static inline uint32_t
+shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
+{
+    struct domain *d = v->domain;
+    guest_l1e_t *l1p;
+    guest_l2e_t *l2p;
+#if GUEST_PAGING_LEVELS >= 4
+    guest_l3e_t *l3p;
+    guest_l4e_t *l4p;
+#endif
+    int mismatch = 0;
+
+    ASSERT(shadow_locked_by_me(d));
+
+    if ( gw->version ==
+         atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
+        return 1;
+
+    /* We may consider caching guest page mapping from last
+     * guest table walk. However considering this check happens
+     * relatively less-frequent, and a bit burden here to
+     * remap guest page is better than caching mapping in each
+     * guest table walk.
+     *
+     * Also when inconsistency occurs, simply return to trigger
+     * another fault instead of re-validate new path to make
+     * logic simple.
+     */
+    perfc_incr(shadow_check_gwalk);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
+    mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
+    l3p = sh_map_domain_page(gw->l3mfn);
+    mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
+    sh_unmap_domain_page(l3p);
+#else
+    mismatch |= (gw->l3e.l3 !=
+                 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
+#endif
+    l2p = sh_map_domain_page(gw->l2mfn);
+    mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
+    sh_unmap_domain_page(l2p);
+#else
+    l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
+    mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
+#endif
+    if ( !(guest_supports_superpages(v) &&
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
+    {
+        l1p = sh_map_domain_page(gw->l1mfn);
+        mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
+        sh_unmap_domain_page(l1p);
+    }
+
+    return !mismatch;
+}
+
+/* Remove write access permissions from a gwalk_t in a batch, and
+ * return OR-ed result for TLB flush hint
+ */
+static inline uint32_t
+gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
+{
+    int rc = 0;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
+#endif
+    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
+#endif
+    if ( !(guest_supports_superpages(v) &&
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
+        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+
+    return rc;
+}
+
 /* Walk the guest pagetables, after the manner of a hardware walker. 
  *
  * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
- *         pointer to a pagefault code, and a flag "shadow_op".
+ *         pointer to a pagefault code
  * 
  * We walk the vcpu's guest pagetables, filling the walk_t with what we
  * see and adding any Accessed and Dirty bits that are needed in the
@@ -257,10 +336,9 @@ static uint32_t set_ad_bits(void *guest_
  * we go.  For the purposes of reading pagetables we treat all non-RAM
  * memory as contining zeroes.
  * 
- * If "shadow_op" is non-zero, we are serving a genuine guest memory access, 
- * and must (a) be under the shadow lock, and (b) remove write access
- * from any guest PT pages we see, as we will be shadowing them soon
- * and will rely on the contents' not having changed.
+ * The walk is done in a lock-free style, with some sanity check postponed
+ * after grabbing shadow lock later. Those delayed checks will make sure
+ * no inconsistent mapping being translated into shadow page table.
  * 
  * Returns 0 for success, or the set of permission bits that we failed on 
  * if the walk did not complete.
@@ -268,8 +346,7 @@ static uint32_t set_ad_bits(void *guest_
  * checked the old return code anyway.
  */
 static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
-                  uint32_t pfec, int shadow_op)
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -282,11 +359,12 @@ guest_walk_tables(struct vcpu *v, unsign
     uint32_t gflags, mflags, rc = 0;
     int pse;
 
-    ASSERT(!shadow_op || shadow_locked_by_me(d));
-    
     perfc_incr(shadow_guest_walk);
     memset(gw, 0, sizeof(*gw));
     gw->va = va;
+
+    gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
+    rmb();
 
     /* Mandatory bits that must be set in every entry.  We invert NX, to
      * calculate as if there were an "X" bit that allowed access. 
@@ -312,9 +390,7 @@ guest_walk_tables(struct vcpu *v, unsign
         goto out;
     }
     ASSERT(mfn_valid(gw->l3mfn));
-    /* This mfn is a pagetable: make sure the guest can't write to it. */
-    if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
-        flush_tlb_mask(d->domain_dirty_cpumask); 
+
     /* Get the l3e and check its flags*/
     l3p = sh_map_domain_page(gw->l3mfn);
     gw->l3e = l3p[guest_l3_table_offset(va)];
@@ -343,9 +419,7 @@ guest_walk_tables(struct vcpu *v, unsign
         goto out;
     }
     ASSERT(mfn_valid(gw->l2mfn));
-    /* This mfn is a pagetable: make sure the guest can't write to it. */
-    if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
-        flush_tlb_mask(d->domain_dirty_cpumask); 
+
     /* Get the l2e */
     l2p = sh_map_domain_page(gw->l2mfn);
     gw->l2e = l2p[guest_l2_table_offset(va)];
@@ -403,10 +477,6 @@ guest_walk_tables(struct vcpu *v, unsign
             goto out;
         }
         ASSERT(mfn_valid(gw->l1mfn));
-        /* This mfn is a pagetable: make sure the guest can't write to it. */
-        if ( shadow_op 
-             && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
-            flush_tlb_mask(d->domain_dirty_cpumask); 
         l1p = sh_map_domain_page(gw->l1mfn);
         gw->l1e = l1p[guest_l1_table_offset(va)];
         gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
@@ -548,8 +618,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    shadow_lock(v->domain);
-    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0 
+    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
          && mfn_valid(gw.l1mfn) )
     {
         if ( gl1mfn )
@@ -558,8 +627,6 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
             (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
     }
 
-    shadow_unlock(v->domain);
-
     return pl1e;
 }
 
@@ -573,10 +640,8 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    shadow_lock(v->domain);
-    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
+    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
     *(guest_l1e_t *)eff_l1e = gw.l1e;
-    shadow_unlock(v->domain);
 }
 #endif /* CONFIG==SHADOW==GUEST */
 
@@ -2842,14 +2907,12 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    shadow_lock(d);
-    
-    shadow_audit_tables(v);
-    
-    if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
+    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
-        goto not_a_shadow_fault;
+        SHADOW_PRINTK("not a shadow fault\n");
+        reset_early_unshadow(v);
+        return 0;
     }
 
     /* It's possible that the guest has put pagetables in memory that it has 
@@ -2859,11 +2922,8 @@ static int sh_page_fault(struct vcpu *v,
     if ( unlikely(d->is_shutting_down) )
     {
         SHADOW_PRINTK("guest is shutting down\n");
-        shadow_unlock(d);
         return 0;
     }
-
-    sh_audit_gw(v, &gw);
 
     /* What kind of access are we dealing with? */
     ft = ((regs->error_code & PFEC_write_access)
@@ -2879,7 +2939,8 @@ static int sh_page_fault(struct vcpu *v,
         perfc_incr(shadow_fault_bail_bad_gfn);
         SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", 
                       gfn_x(gfn), mfn_x(gmfn));
-        goto not_a_shadow_fault;
+        reset_early_unshadow(v);
+        return 0;
     }
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
@@ -2887,6 +2948,28 @@ static int sh_page_fault(struct vcpu *v,
     vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), 
                 regs->error_code | PFEC_page_present);
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+
+    shadow_lock(d);
+
+    if ( gw_remove_write_accesses(v, va, &gw) )
+    {
+        /* Write permission removal is also a hint that other gwalks
+         * overlapping with this one may be inconsistent
+         */
+        perfc_incr(shadow_rm_write_flush_tlb);
+        atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
+        flush_tlb_mask(d->domain_dirty_cpumask);
+    }
+
+    if ( !shadow_check_gwalk(v, va, &gw) )
+    {
+        perfc_incr(shadow_inconsistent_gwalk);
+        shadow_unlock(d);
+        return EXCRET_fault_fixed;
+    }
+
+    shadow_audit_tables(v);
+    sh_audit_gw(v, &gw);
 
     /* Make sure there is enough free shadow memory to build a chain of
      * shadow tables. (We never allocate a top-level shadow on this path,
@@ -3223,7 +3306,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( guest_walk_tables(v, va, &gw, pfec[0], 0) != 0 )
+    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
     {
         if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
             pfec[0] &= ~PFEC_page_present;
@@ -4276,6 +4359,8 @@ static void emulate_unmap_dest(struct vc
     }
     else 
         sh_unmap_domain_page(addr);
+
+    atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
 }
 
 int
@@ -4430,29 +4515,13 @@ static char * sh_audit_flags(struct vcpu
     return NULL;
 }
 
-static inline mfn_t
-audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
-/* Convert this gfn to an mfn in the manner appropriate for the
- * guest pagetable it's used in (gmfn) */ 
-{
-    p2m_type_t p2mt;
-    if ( !shadow_mode_translate(v->domain) )
-        return _mfn(gfn_x(gfn));
-    
-    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
-         != PGT_writable_page ) 
-        return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
-    else 
-        return gfn_to_mfn(v->domain, gfn, &p2mt);
-} 
-
-
 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
 {
     guest_l1e_t *gl1e, *gp;
     shadow_l1e_t *sl1e;
     mfn_t mfn, gmfn, gl1mfn;
     gfn_t gfn;
+    p2m_type_t p2mt;
     char *s;
     int done = 0;
     
@@ -4491,7 +4560,7 @@ int sh_audit_l1_table(struct vcpu *v, mf
             {
                 gfn = guest_l1e_get_gfn(*gl1e);
                 mfn = shadow_l1e_get_mfn(*sl1e);
-                gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
+                gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
                 if ( mfn_x(gmfn) != mfn_x(mfn) )
                     AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
                                " --> %" PRI_mfn " != mfn %" PRI_mfn,
@@ -4532,6 +4601,7 @@ int sh_audit_l2_table(struct vcpu *v, mf
     shadow_l2e_t *sl2e;
     mfn_t mfn, gmfn, gl2mfn;
     gfn_t gfn;
+    p2m_type_t p2mt;
     char *s;
     int done = 0;
 
@@ -4550,7 +4620,7 @@ int sh_audit_l2_table(struct vcpu *v, mf
             mfn = shadow_l2e_get_mfn(*sl2e);
             gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)  
                 ? get_fl1_shadow_status(v, gfn)
-                : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), 
+                : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt), 
                                     SH_type_l1_shadow);
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
@@ -4558,7 +4628,7 @@ int sh_audit_l2_table(struct vcpu *v, mf
                            " --> %" PRI_mfn " != mfn %" PRI_mfn,
                            gfn_x(gfn), 
                            (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
-                           : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
+                           : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
                            mfn_x(gmfn), mfn_x(mfn));
         }
     });
@@ -4573,6 +4643,7 @@ int sh_audit_l3_table(struct vcpu *v, mf
     shadow_l3e_t *sl3e;
     mfn_t mfn, gmfn, gl3mfn;
     gfn_t gfn;
+    p2m_type_t p2mt;
     char *s;
     int done = 0;
 
@@ -4589,7 +4660,7 @@ int sh_audit_l3_table(struct vcpu *v, mf
         {
             gfn = guest_l3e_get_gfn(*gl3e);
             mfn = shadow_l3e_get_mfn(*sl3e);
-            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), 
+            gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt), 
                                      ((GUEST_PAGING_LEVELS == 3 ||
                                        is_pv_32on64_vcpu(v))
                                       && !shadow_mode_external(v->domain)
@@ -4612,6 +4683,7 @@ int sh_audit_l4_table(struct vcpu *v, mf
     shadow_l4e_t *sl4e;
     mfn_t mfn, gmfn, gl4mfn;
     gfn_t gfn;
+    p2m_type_t p2mt;
     char *s;
     int done = 0;
 
@@ -4628,7 +4700,7 @@ int sh_audit_l4_table(struct vcpu *v, mf
         {
             gfn = guest_l4e_get_gfn(*gl4e);
             mfn = shadow_l4e_get_mfn(*sl4e);
-            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), 
+            gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt), 
                                      SH_type_l3_shadow);
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/x86/mm/shadow/types.h    Fri Feb 29 09:19:58 2008 -0700
@@ -435,6 +435,7 @@ struct shadow_walk_t
 #endif
     mfn_t l2mfn;                /* MFN that the level 2 entry was in */
     mfn_t l1mfn;                /* MFN that the level 1 entry was in */
+    int version;                /* Saved guest dirty version */
 };
 
 /* macros for dealing with the naming of the internal function names of the
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/x86_32/Makefile
--- a/xen/arch/x86/x86_32/Makefile      Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/x86/x86_32/Makefile      Fri Feb 29 09:19:58 2008 -0700
@@ -4,6 +4,7 @@ obj-y += mm.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(crash_debug) += gdbstub.o
 
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/x86_32/machine_kexec.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/x86_32/machine_kexec.c       Fri Feb 29 09:19:58 2008 -0700
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * machine_kexec.c
+ *
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@xxxxxxxxxxxx>
+ * - Magnus Damm <magnus@xxxxxxxxxxxxx>
+ */
+
+#ifndef CONFIG_COMPAT
+
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <asm/page.h>
+#include <public/kexec.h>
+
+int machine_kexec_get_xen(xen_kexec_range_t *range)
+{
+        range->start = virt_to_maddr(_start);
+        range->size = (unsigned long)xenheap_phys_end -
+                      (unsigned long)range->start;
+        return 0;
+}
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/x86_64/Makefile
--- a/xen/arch/x86/x86_64/Makefile      Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/arch/x86/x86_64/Makefile      Fri Feb 29 09:19:58 2008 -0700
@@ -4,6 +4,7 @@ obj-y += gpr_switch.o
 obj-y += gpr_switch.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(crash_debug)   += gdbstub.o
 obj-$(CONFIG_COMPAT) += compat.o
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/arch/x86/x86_64/machine_kexec.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/x86_64/machine_kexec.c       Fri Feb 29 09:19:58 2008 -0700
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * machine_kexec.c
+ *
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@xxxxxxxxxxxx>
+ * - Magnus Damm <magnus@xxxxxxxxxxxxx>
+ */
+
+#ifndef CONFIG_COMPAT
+
+#include <xen/types.h>
+#include <asm/page.h>
+#include <public/kexec.h>
+
+int machine_kexec_get_xen(xen_kexec_range_t *range)
+{
+        range->start = xenheap_phys_start;
+        range->size = (unsigned long)xenheap_phys_end -
+                      (unsigned long)range->start;
+        return 0;
+}
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/common/compat/kexec.c
--- a/xen/common/compat/kexec.c Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/common/compat/kexec.c Fri Feb 29 09:19:58 2008 -0700
@@ -8,11 +8,6 @@
 #define ret_t int
 
 #define do_kexec_op compat_kexec_op
-
-#undef kexec_get
-#define kexec_get(x)      compat_kexec_get_##x
-#define xen_kexec_range   compat_kexec_range
-#define xen_kexec_range_t compat_kexec_range_t
 
 #define kexec_load_unload compat_kexec_load_unload
 #define xen_kexec_load    compat_kexec_load
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/common/kexec.c
--- a/xen/common/kexec.c        Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/common/kexec.c        Fri Feb 29 09:19:58 2008 -0700
@@ -20,6 +20,7 @@
 #include <xen/spinlock.h>
 #include <xen/version.h>
 #include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/elfnote.h>
 #include <xsm/xsm.h>
 
@@ -153,11 +154,7 @@ static int sizeof_note(const char *name,
             ELFNOTE_ALIGN(descsz));
 }
 
-#define kexec_get(x)      kexec_get_##x
-
-#endif
-
-static int kexec_get(reserve)(xen_kexec_range_t *range)
+static int kexec_get_reserve(xen_kexec_range_t *range)
 {
     if ( kexec_crash_area.size > 0 && kexec_crash_area.start > 0) {
         range->start = kexec_crash_area.start;
@@ -168,18 +165,7 @@ static int kexec_get(reserve)(xen_kexec_
     return 0;
 }
 
-static int kexec_get(xen)(xen_kexec_range_t *range)
-{
-#ifdef CONFIG_X86_64
-    range->start = xenheap_phys_start;
-#else
-    range->start = virt_to_maddr(_start);
-#endif
-    range->size = (unsigned long)xenheap_phys_end - (unsigned long)range->start;
-    return 0;
-}
-
-static int kexec_get(cpu)(xen_kexec_range_t *range)
+static int kexec_get_cpu(xen_kexec_range_t *range)
 {
     int nr = range->nr;
     int nr_bytes = 0;
@@ -223,7 +209,27 @@ static int kexec_get(cpu)(xen_kexec_rang
     return 0;
 }
 
-static int kexec_get(range)(XEN_GUEST_HANDLE(void) uarg)
+static int kexec_get_range_internal(xen_kexec_range_t *range)
+{
+    int ret = -EINVAL;
+
+    switch ( range->range )
+    {
+    case KEXEC_RANGE_MA_CRASH:
+        ret = kexec_get_reserve(range);
+        break;
+    case KEXEC_RANGE_MA_CPU:
+        ret = kexec_get_cpu(range);
+        break;
+    default:
+        ret = machine_kexec_get(range);
+        break;
+    }
+
+    return ret;
+}
+
+static int kexec_get_range(XEN_GUEST_HANDLE(void) uarg)
 {
     xen_kexec_range_t range;
     int ret = -EINVAL;
@@ -231,24 +237,49 @@ static int kexec_get(range)(XEN_GUEST_HA
     if ( unlikely(copy_from_guest(&range, uarg, 1)) )
         return -EFAULT;
 
-    switch ( range.range )
-    {
-    case KEXEC_RANGE_MA_CRASH:
-        ret = kexec_get(reserve)(&range);
-        break;
-    case KEXEC_RANGE_MA_XEN:
-        ret = kexec_get(xen)(&range);
-        break;
-    case KEXEC_RANGE_MA_CPU:
-        ret = kexec_get(cpu)(&range);
-        break;
-    }
+    ret = kexec_get_range_internal(&range);
 
     if ( ret == 0 && unlikely(copy_to_guest(uarg, &range, 1)) )
         return -EFAULT;
 
     return ret;
 }
+
+#else /* COMPAT */
+
+#ifdef CONFIG_COMPAT
+static int kexec_get_range_compat(XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_range_t range;
+    compat_kexec_range_t compat_range;
+    int ret = -EINVAL;
+
+    if ( unlikely(copy_from_guest(&compat_range, uarg, 1)) )
+        return -EFAULT;
+
+    range.range = compat_range.range;
+    range.nr = compat_range.nr;
+    range.size = compat_range.size;
+    range.start = compat_range.start;
+
+    ret = kexec_get_range_internal(&range);
+
+    if ( ret == 0 ) {
+        compat_range.range = range.range;
+        compat_range.nr = range.nr;
+        compat_range.size = range.size;
+        compat_range.start = range.start;
+
+        if ( unlikely(copy_to_guest(uarg, &compat_range, 1)) )
+             return -EFAULT;
+    }
+
+    return ret;
+}
+#endif /* CONFIG_COMPAT */
+
+#endif /* COMPAT */
+
 
 #ifndef COMPAT
 
@@ -375,7 +406,11 @@ ret_t do_kexec_op(unsigned long op, XEN_
     switch ( op )
     {
     case KEXEC_CMD_kexec_get_range:
-        ret = kexec_get(range)(uarg);
+#ifndef COMPAT
+        ret = kexec_get_range(uarg);
+#else
+        ret = kexec_get_range_compat(uarg);
+#endif
         break;
     case KEXEC_CMD_kexec_load:
     case KEXEC_CMD_kexec_unload:
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/acpi/tables.c
--- a/xen/drivers/acpi/tables.c Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/drivers/acpi/tables.c Fri Feb 29 09:19:58 2008 -0700
@@ -60,6 +60,7 @@ static char *acpi_table_signatures[ACPI_
        [ACPI_HPET] = "HPET",
        [ACPI_MCFG] = "MCFG",
        [ACPI_DMAR] = "DMAR",
+       [ACPI_IVRS] = "IVRS",
 };
 
 static char *mps_inti_flags_polarity[] = { "dfl", "high", "res", "low" };
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/passthrough/amd/Makefile
--- a/xen/drivers/passthrough/amd/Makefile      Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/drivers/passthrough/amd/Makefile      Fri Feb 29 09:19:58 2008 -0700
@@ -2,3 +2,4 @@ obj-y += iommu_init.o
 obj-y += iommu_init.o
 obj-y += iommu_map.o
 obj-y += pci_amd_iommu.o
+obj-y += iommu_acpi.o
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/passthrough/amd/iommu_acpi.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c  Fri Feb 29 09:19:58 2008 -0700
@@ -0,0 +1,874 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@xxxxxxx>
+ * Author: Wei Wang <wei.wang2@xxxxxxx> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+#include <asm/hvm/svm/amd-iommu-acpi.h>
+
+extern unsigned long amd_iommu_page_entries;
+extern unsigned short ivrs_bdf_entries;
+extern struct ivrs_mappings *ivrs_mappings;
+
+static struct amd_iommu * __init find_iommu_from_bdf_cap(
+           u16 bdf, u8 cap_offset)
+{
+    struct amd_iommu *iommu;
+
+    for_each_amd_iommu( iommu )
+        if ( iommu->bdf == bdf && iommu->cap_offset == cap_offset )
+            return iommu;
+
+    return NULL;
+}
+
+static void __init reserve_iommu_exclusion_range(
+    struct amd_iommu *iommu, uint64_t base, uint64_t limit)
+{
+    /* need to extend exclusion range? */
+    if ( iommu->exclusion_enable )
+    {
+        if ( iommu->exclusion_base < base )
+            base = iommu->exclusion_base;
+        if ( iommu->exclusion_limit > limit )
+            limit = iommu->exclusion_limit;
+    }
+
+    iommu->exclusion_enable = IOMMU_CONTROL_ENABLED;
+    iommu->exclusion_base = base;
+    iommu->exclusion_limit = limit;
+}
+
+static void __init reserve_iommu_exclusion_range_all(struct amd_iommu *iommu,
+           unsigned long base, unsigned long limit)
+{
+    reserve_iommu_exclusion_range(iommu, base, limit);
+    iommu->exclusion_allow_all = IOMMU_CONTROL_ENABLED;
+}
+
+static void __init reserve_unity_map_for_device(u16 bdf, unsigned long base,
+           unsigned long length, u8 iw, u8 ir)
+{
+    unsigned long old_top, new_top;
+
+    /* need to extend unity-mapped range? */
+    if ( ivrs_mappings[bdf].unity_map_enable )
+    {
+        old_top = ivrs_mappings[bdf].addr_range_start +
+            ivrs_mappings[bdf].addr_range_length;
+        new_top = base + length;
+        if ( old_top > new_top )
+            new_top = old_top;
+        if ( ivrs_mappings[bdf].addr_range_start < base )
+            base = ivrs_mappings[bdf].addr_range_start;
+        length = new_top - base;
+   }
+
+    /* extend r/w permissioms and keep aggregate */
+    if ( iw )
+        ivrs_mappings[bdf].write_permission = IOMMU_CONTROL_ENABLED;
+    if ( ir )
+        ivrs_mappings[bdf].read_permission = IOMMU_CONTROL_ENABLED;
+    ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_ENABLED;
+    ivrs_mappings[bdf].addr_range_start = base;
+    ivrs_mappings[bdf].addr_range_length = length;
+}
+
+static int __init register_exclusion_range_for_all_devices(
+           unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    unsigned long range_top, iommu_top, length;
+    struct amd_iommu *iommu;
+    u16 bdf;
+
+    /* is part of exclusion range inside of IOMMU virtual address space? */
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    range_top = limit + PAGE_SIZE;
+    iommu_top = max_page * PAGE_SIZE;
+    if ( base < iommu_top )
+    {
+        if (range_top > iommu_top)
+            range_top = iommu_top;
+        length = range_top - base;
+        /* reserve r/w unity-mapped page entries for devices */
+        /* note: these entries are part of the exclusion range */
+        for (bdf = 0; bdf < ivrs_bdf_entries; ++bdf)
+            reserve_unity_map_for_device(bdf, base, length, iw, ir);
+        /* push 'base' just outside of virtual address space */
+        base = iommu_top;
+    }
+    /* register IOMMU exclusion range settings */
+    if (limit >= iommu_top)
+    {
+        for_each_amd_iommu( iommu )
+            reserve_iommu_exclusion_range_all(iommu, base, limit);
+    }
+
+    return 0;
+}
+
+static int __init register_exclusion_range_for_device(u16 bdf,
+           unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    unsigned long range_top, iommu_top, length;
+    struct amd_iommu *iommu;
+    u16 bus, devfn, req;
+
+    bus = bdf >> 8;
+    devfn = bdf & 0xFF;
+    iommu = find_iommu_for_device(bus, devfn);
+    if ( !iommu )
+    {
+        dprintk(XENLOG_ERR, "IVMD Error: No IOMMU for Dev_Id 0x%x!\n", bdf);
+        return -ENODEV;
+    }
+    req = ivrs_mappings[bdf].dte_requestor_id;
+
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    range_top = limit + PAGE_SIZE;
+    iommu_top = max_page * PAGE_SIZE;
+    if ( base < iommu_top )
+    {
+        if (range_top > iommu_top)
+            range_top = iommu_top;
+        length = range_top - base;
+        /* reserve unity-mapped page entries for device */
+        /* note: these entries are part of the exclusion range */
+        reserve_unity_map_for_device(bdf, base, length, iw, ir);
+        reserve_unity_map_for_device(req, base, length, iw, ir);
+
+        /* push 'base' just outside of virtual address space */
+        base = iommu_top;
+    }
+
+   /* register IOMMU exclusion range settings for device */
+   if ( limit >= iommu_top  )
+    {
+        reserve_iommu_exclusion_range(iommu, base, limit);
+        ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_ENABLED;
+        ivrs_mappings[req].dte_allow_exclusion = IOMMU_CONTROL_ENABLED;
+    }
+
+    return 0;
+}
+
+static int __init register_exclusion_range_for_iommu_devices(
+           struct amd_iommu *iommu,
+           unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    unsigned long range_top, iommu_top, length;
+    u16 bus, devfn, bdf, req;
+
+    /* is part of exclusion range inside of IOMMU virtual address space? */
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    range_top = limit + PAGE_SIZE;
+    iommu_top = max_page * PAGE_SIZE;
+    if ( base < iommu_top )
+    {
+        if (range_top > iommu_top)
+            range_top = iommu_top;
+        length = range_top - base;
+        /* reserve r/w unity-mapped page entries for devices */
+        /* note: these entries are part of the exclusion range */
+        for ( bdf = 0; bdf < ivrs_bdf_entries; ++bdf )
+        {
+            bus = bdf >> 8;
+            devfn = bdf & 0xFF;
+            if ( iommu == find_iommu_for_device(bus, devfn) )
+            {
+                reserve_unity_map_for_device(bdf, base, length, iw, ir);
+                req = ivrs_mappings[bdf].dte_requestor_id;
+                reserve_unity_map_for_device(req, base, length, iw, ir);
+            }
+        }
+
+        /* push 'base' just outside of virtual address space */
+        base = iommu_top;
+    }
+
+    /* register IOMMU exclusion range settings */
+    if (limit >= iommu_top)
+        reserve_iommu_exclusion_range_all(iommu, base, limit);
+    return 0;
+}
+
+static int __init parse_ivmd_device_select(
+           struct acpi_ivmd_block_header *ivmd_block,
+           unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    u16 bdf;
+
+    bdf = ivmd_block->header.dev_id;
+    if (bdf >= ivrs_bdf_entries)
+    {
+        dprintk(XENLOG_ERR, "IVMD Error: Invalid Dev_Id 0x%x\n", bdf);
+        return -ENODEV;
+    }
+
+    return register_exclusion_range_for_device(bdf, base, limit, iw, ir);
+}
+
+static int __init parse_ivmd_device_range(
+           struct acpi_ivmd_block_header *ivmd_block,
+           unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    u16 first_bdf, last_bdf, bdf;
+    int error;
+
+    first_bdf = ivmd_block->header.dev_id;
+    if (first_bdf >= ivrs_bdf_entries)
+    {
+       dprintk(XENLOG_ERR, "IVMD Error: "
+                    "Invalid Range_First Dev_Id 0x%x\n", first_bdf);
+       return -ENODEV;
+    }
+
+    last_bdf = ivmd_block->last_dev_id;
+    if (last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf)
+    {
+        dprintk(XENLOG_ERR, "IVMD Error: "
+                    "Invalid Range_Last Dev_Id 0x%x\n", last_bdf);
+        return -ENODEV;
+    }
+
+    dprintk(XENLOG_INFO, " Dev_Id Range: 0x%x -> 0x%x\n",
+            first_bdf, last_bdf);
+
+    for ( bdf = first_bdf, error = 0;
+       bdf <= last_bdf && !error; ++bdf )
+    {
+       error = register_exclusion_range_for_device(
+                     bdf, base, limit, iw, ir);
+    }
+
+   return error;
+}
+
+static int __init parse_ivmd_device_iommu(
+           struct acpi_ivmd_block_header *ivmd_block,
+           unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    struct amd_iommu *iommu;
+
+    /* find target IOMMU */
+    iommu = find_iommu_from_bdf_cap(ivmd_block->header.dev_id,
+                                    ivmd_block->cap_offset);
+    if ( !iommu )
+    {
+       dprintk(XENLOG_ERR,
+           "IVMD Error: No IOMMU for Dev_Id 0x%x  Cap 0x%x\n",
+            ivmd_block->header.dev_id, ivmd_block->cap_offset);
+       return -ENODEV;
+    }
+
+    return register_exclusion_range_for_iommu_devices(
+                 iommu, base, limit, iw, ir);
+}
+
+static int __init parse_ivmd_block(struct acpi_ivmd_block_header *ivmd_block)
+{
+    unsigned long start_addr, mem_length, base, limit;
+    u8 iw, ir;
+
+    if (ivmd_block->header.length <
+       sizeof(struct acpi_ivmd_block_header))
+    {
+       dprintk(XENLOG_ERR, "IVMD Error: Invalid Block Length!\n");
+       return -ENODEV;
+    }
+
+    start_addr = (unsigned long)ivmd_block->start_addr;
+    mem_length = (unsigned long)ivmd_block->mem_length;
+    base = start_addr & PAGE_MASK;
+    limit = (start_addr + mem_length - 1) & PAGE_MASK;
+
+    dprintk(XENLOG_INFO, "IVMD Block: Type 0x%x\n",
+                  ivmd_block->header.type);
+    dprintk(XENLOG_INFO, " Start_Addr_Phys 0x%lx\n", start_addr);
+    dprintk(XENLOG_INFO, " Mem_Length 0x%lx\n", mem_length);
+
+    if ( get_field_from_byte(ivmd_block->header.flags,
+                             AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK,
+                             AMD_IOMMU_ACPI_EXCLUSION_RANGE_SHIFT) )
+        iw = ir = IOMMU_CONTROL_ENABLED;
+    else if ( get_field_from_byte(ivmd_block->header.flags,
+                                  AMD_IOMMU_ACPI_UNITY_MAPPING_MASK,
+                                  AMD_IOMMU_ACPI_UNITY_MAPPING_SHIFT) )
+    {
+        iw = get_field_from_byte(ivmd_block->header.flags,
+                                 AMD_IOMMU_ACPI_IW_PERMISSION_MASK,
+                                 AMD_IOMMU_ACPI_IW_PERMISSION_SHIFT);
+        ir = get_field_from_byte(ivmd_block->header.flags,
+                                 AMD_IOMMU_ACPI_IR_PERMISSION_MASK,
+                                 AMD_IOMMU_ACPI_IR_PERMISSION_SHIFT);
+    }
+    else
+    {
+       dprintk(XENLOG_ERR, "IVMD Error: Invalid Flag Field!\n");
+       return -ENODEV;
+    }
+
+    switch( ivmd_block->header.type )
+    {
+    case AMD_IOMMU_ACPI_IVMD_ALL_TYPE:
+        return register_exclusion_range_for_all_devices(
+           base, limit, iw, ir);
+
+    case AMD_IOMMU_ACPI_IVMD_ONE_TYPE:
+        return parse_ivmd_device_select(ivmd_block,
+           base, limit, iw, ir);
+
+    case AMD_IOMMU_ACPI_IVMD_RANGE_TYPE:
+        return parse_ivmd_device_range(ivmd_block,
+            base, limit, iw, ir);
+
+    case AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE:
+        return parse_ivmd_device_iommu(ivmd_block,
+           base, limit, iw, ir);
+
+    default:
+        dprintk(XENLOG_ERR, "IVMD Error: Invalid Block Type!\n");
+        return -ENODEV;
+    }
+}
+
+static u16 __init parse_ivhd_device_padding(u16 pad_length,
+           u16 header_length, u16 block_length)
+{
+    if ( header_length < (block_length + pad_length) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    return pad_length;
+}
+
+static u16 __init parse_ivhd_device_select(
+           union acpi_ivhd_device *ivhd_device)
+{
+    u16 bdf;
+
+    bdf = ivhd_device->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    /* override flags for device */
+    ivrs_mappings[bdf].dte_sys_mgt_enable =
+        get_field_from_byte(ivhd_device->header.flags,
+                            AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                            AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+
+    return sizeof(struct acpi_ivhd_device_header);
+}
+
+static u16 __init parse_ivhd_device_range(
+           union acpi_ivhd_device *ivhd_device,
+           u16 header_length, u16 block_length)
+{
+    u16 dev_length, first_bdf, last_bdf, bdf;
+    u8 sys_mgt;
+
+    dev_length = sizeof(struct acpi_ivhd_device_range);
+    if ( header_length < (block_length + dev_length) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    if ( ivhd_device->range.trailer.type !=
+        AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END) {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Range: End_Type 0x%x\n",
+                ivhd_device->range.trailer.type);
+        return 0;
+    }
+
+    first_bdf = ivhd_device->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+       dprintk(XENLOG_ERR, "IVHD Error: "
+           "Invalid Range: First Dev_Id 0x%x\n", first_bdf);
+       return 0;
+    }
+
+    last_bdf = ivhd_device->range.trailer.dev_id;
+    if ( last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf )
+    {
+       dprintk(XENLOG_ERR, "IVHD Error: "
+           "Invalid Range: Last Dev_Id 0x%x\n", last_bdf);
+       return 0;
+    }
+
+    dprintk(XENLOG_INFO, " Dev_Id Range: 0x%x -> 0x%x\n",
+        first_bdf, last_bdf);
+
+    /* override flags for range of devices */
+    sys_mgt = get_field_from_byte(ivhd_device->header.flags,
+                                 AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                                 AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+    for ( bdf = first_bdf; bdf <= last_bdf; ++bdf )
+        ivrs_mappings[bdf].dte_sys_mgt_enable = sys_mgt;
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_alias(
+           union acpi_ivhd_device *ivhd_device,
+           u16 header_length, u16 block_length)
+{
+    u16 dev_length, alias_id, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_alias);
+    if ( header_length < (block_length + dev_length) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+            "Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    bdf = ivhd_device->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    alias_id = ivhd_device->alias.dev_id;
+    if ( alias_id >= ivrs_bdf_entries )
+    {
+       dprintk(XENLOG_ERR, "IVHD Error: "
+               "Invalid Alias Dev_Id 0x%x\n", alias_id);
+       return 0;
+    }
+
+    dprintk(XENLOG_INFO, " Dev_Id Alias: 0x%x\n", alias_id);
+
+    /* override requestor_id and flags for device */
+    ivrs_mappings[bdf].dte_requestor_id = alias_id;
+    ivrs_mappings[bdf].dte_sys_mgt_enable =
+            get_field_from_byte(ivhd_device->header.flags,
+                                AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                                AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+    ivrs_mappings[alias_id].dte_sys_mgt_enable =
+            ivrs_mappings[bdf].dte_sys_mgt_enable;
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_alias_range(
+           union acpi_ivhd_device *ivhd_device,
+           u16 header_length, u16 block_length)
+{
+
+    u16 dev_length, first_bdf, last_bdf, alias_id, bdf;
+    u8 sys_mgt;
+
+    dev_length = sizeof(struct acpi_ivhd_device_alias_range);
+    if ( header_length < (block_length + dev_length) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    if ( ivhd_device->alias_range.trailer.type !=
+       AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Range: End_Type 0x%x\n",
+                ivhd_device->alias_range.trailer.type);
+        return 0;
+    }
+
+    first_bdf = ivhd_device->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+        dprintk(XENLOG_ERR,"IVHD Error: "
+                "Invalid Range: First Dev_Id 0x%x\n", first_bdf);
+        return 0;
+    }
+
+    last_bdf = ivhd_device->alias_range.trailer.dev_id;
+    if ( last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Range: Last Dev_Id 0x%x\n", last_bdf);
+        return 0;
+    }
+
+    alias_id = ivhd_device->alias_range.alias.dev_id;
+    if ( alias_id >= ivrs_bdf_entries )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Alias Dev_Id 0x%x\n", alias_id);
+        return 0;
+    }
+
+    dprintk(XENLOG_INFO, " Dev_Id Range: 0x%x -> 0x%x\n",
+            first_bdf, last_bdf);
+    dprintk(XENLOG_INFO, " Dev_Id Alias: 0x%x\n", alias_id);
+
+    /* override requestor_id and flags for range of devices */
+    sys_mgt = get_field_from_byte(ivhd_device->header.flags,
+                                  AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                                  AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+    for ( bdf = first_bdf; bdf <= last_bdf; ++bdf )
+    {
+        ivrs_mappings[bdf].dte_requestor_id = alias_id;
+        ivrs_mappings[bdf].dte_sys_mgt_enable = sys_mgt;
+    }
+    ivrs_mappings[alias_id].dte_sys_mgt_enable = sys_mgt;
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_extended(
+           union acpi_ivhd_device *ivhd_device,
+           u16 header_length, u16 block_length)
+{
+    u16 dev_length, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_extended);
+    if ( header_length < (block_length + dev_length) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    bdf = ivhd_device->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    /* override flags for device */
+    ivrs_mappings[bdf].dte_sys_mgt_enable =
+        get_field_from_byte(ivhd_device->header.flags,
+                            AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                            AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_extended_range(
+           union acpi_ivhd_device *ivhd_device,
+           u16 header_length, u16 block_length)
+{
+    u16 dev_length, first_bdf, last_bdf, bdf;
+    u8 sys_mgt;
+
+    dev_length = sizeof(struct acpi_ivhd_device_extended_range);
+    if ( header_length < (block_length + dev_length) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    if ( ivhd_device->extended_range.trailer.type !=
+        AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Range: End_Type 0x%x\n",
+                ivhd_device->extended_range.trailer.type);
+        return 0;
+    }
+
+    first_bdf = ivhd_device->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+       dprintk(XENLOG_ERR, "IVHD Error: "
+           "Invalid Range: First Dev_Id 0x%x\n", first_bdf);
+       return 0;
+    }
+
+    last_bdf = ivhd_device->extended_range.trailer.dev_id;
+    if ( last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Range: Last Dev_Id 0x%x\n", last_bdf);
+        return 0;
+    }
+
+    dprintk(XENLOG_INFO, " Dev_Id Range: 0x%x -> 0x%x\n",
+            first_bdf, last_bdf);
+
+    /* override flags for range of devices */
+    sys_mgt = get_field_from_byte(ivhd_device->header.flags,
+                                  AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                                  AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+    for ( bdf = first_bdf; bdf <= last_bdf; ++bdf )
+        ivrs_mappings[bdf].dte_sys_mgt_enable = sys_mgt;
+
+    return dev_length;
+}
+
+static int __init parse_ivhd_block(struct acpi_ivhd_block_header *ivhd_block)
+{
+    union acpi_ivhd_device *ivhd_device;
+    u16 block_length, dev_length;
+    struct amd_iommu *iommu;
+
+    if ( ivhd_block->header.length <
+        sizeof(struct acpi_ivhd_block_header) )
+    {
+        dprintk(XENLOG_ERR, "IVHD Error: Invalid Block Length!\n");
+        return -ENODEV;
+    }
+
+    iommu = find_iommu_from_bdf_cap(ivhd_block->header.dev_id,
+            ivhd_block->cap_offset);
+    if ( !iommu )
+    {
+        dprintk(XENLOG_ERR,
+                "IVHD Error: No IOMMU for Dev_Id 0x%x  Cap 0x%x\n",
+                ivhd_block->header.dev_id, ivhd_block->cap_offset);
+       return -ENODEV;
+    }
+
+    dprintk(XENLOG_INFO, "IVHD Block:\n");
+    dprintk(XENLOG_INFO, " Cap_Offset 0x%x\n",
+            ivhd_block->cap_offset);
+    dprintk(XENLOG_INFO, " MMIO_BAR_Phys 0x%lx\n",
+            (unsigned long)ivhd_block->mmio_base);
+    dprintk(XENLOG_INFO, " PCI_Segment 0x%x\n",
+            ivhd_block->pci_segment);
+    dprintk(XENLOG_INFO, " IOMMU_Info 0x%x\n",
+            ivhd_block->iommu_info);
+
+    /* override IOMMU support flags */
+    iommu->coherent = get_field_from_byte(ivhd_block->header.flags,
+                                          AMD_IOMMU_ACPI_COHERENT_MASK,
+                                          AMD_IOMMU_ACPI_COHERENT_SHIFT);
+    iommu->iotlb_support = get_field_from_byte(ivhd_block->header.flags,
+                                          AMD_IOMMU_ACPI_IOTLB_SUP_MASK,
+                                          AMD_IOMMU_ACPI_IOTLB_SUP_SHIFT);
+    iommu->isochronous = get_field_from_byte(ivhd_block->header.flags,
+                                          AMD_IOMMU_ACPI_ISOC_MASK,
+                                          AMD_IOMMU_ACPI_ISOC_SHIFT);
+    iommu->res_pass_pw = get_field_from_byte(ivhd_block->header.flags,
+                                          AMD_IOMMU_ACPI_RES_PASS_PW_MASK,
+                                          AMD_IOMMU_ACPI_RES_PASS_PW_SHIFT);
+    iommu->pass_pw = get_field_from_byte(ivhd_block->header.flags,
+                                          AMD_IOMMU_ACPI_PASS_PW_MASK,
+                                          AMD_IOMMU_ACPI_PASS_PW_SHIFT);
+    iommu->ht_tunnel_enable = get_field_from_byte(
+                                          ivhd_block->header.flags,
+                                          AMD_IOMMU_ACPI_HT_TUN_ENB_MASK,
+                                          AMD_IOMMU_ACPI_HT_TUN_ENB_SHIFT);
+
+    /* parse Device Entries */
+    block_length = sizeof(struct acpi_ivhd_block_header);
+    while( ivhd_block->header.length >=
+       (block_length + sizeof(struct acpi_ivhd_device_header)) )
+    {
+        ivhd_device = (union acpi_ivhd_device *)
+                ((u8 *)ivhd_block + block_length);
+
+        dprintk(XENLOG_INFO, "IVHD Device Entry:\n");
+        dprintk(XENLOG_INFO, " Type 0x%x\n",
+                ivhd_device->header.type);
+        dprintk(XENLOG_INFO, " Dev_Id 0x%x\n",
+                ivhd_device->header.dev_id);
+        dprintk(XENLOG_INFO, " Flags 0x%x\n",
+                ivhd_device->header.flags);
+
+        switch( ivhd_device->header.type )
+        {
+        case AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD:
+            dev_length = parse_ivhd_device_padding(
+                sizeof(u32),
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD:
+            dev_length = parse_ivhd_device_padding(
+                sizeof(u64),
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_SELECT:
+            dev_length = parse_ivhd_device_select(ivhd_device);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START:
+            dev_length = parse_ivhd_device_range(ivhd_device,
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT:
+            dev_length = parse_ivhd_device_alias(
+                ivhd_device,
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE:
+            dev_length = parse_ivhd_device_alias_range(
+                ivhd_device,
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT:
+            dev_length = parse_ivhd_device_extended(
+                ivhd_device,
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE:
+            dev_length = parse_ivhd_device_extended_range(
+                ivhd_device,
+                ivhd_block->header.length, block_length);
+            break;
+        default:
+            dprintk(XENLOG_ERR, "IVHD Error: "
+                "Invalid Device Type!\n");
+            dev_length = 0;
+            break;
+        }
+
+        block_length += dev_length;
+        if ( !dev_length )
+            return -ENODEV;
+    }
+
+    return 0;
+}
+
+static int __init parse_ivrs_block(struct acpi_ivrs_block_header *ivrs_block)
+{
+    struct acpi_ivhd_block_header *ivhd_block;
+    struct acpi_ivmd_block_header *ivmd_block;
+
+    switch(ivrs_block->type)
+    {
+    case AMD_IOMMU_ACPI_IVHD_TYPE:
+        ivhd_block = (struct acpi_ivhd_block_header *)ivrs_block;
+        return parse_ivhd_block(ivhd_block);
+
+    case AMD_IOMMU_ACPI_IVMD_ALL_TYPE:
+    case AMD_IOMMU_ACPI_IVMD_ONE_TYPE:
+    case AMD_IOMMU_ACPI_IVMD_RANGE_TYPE:
+    case AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE:
+        ivmd_block = (struct acpi_ivmd_block_header *)ivrs_block;
+        return parse_ivmd_block(ivmd_block);
+
+    default:
+        dprintk(XENLOG_ERR, "IVRS Error: Invalid Block Type!\n");
+        return -ENODEV;
+    }
+
+    return 0;
+}
+
+void __init dump_acpi_table_header(struct acpi_table_header *table)
+{
+    int i;
+
+    printk(XENLOG_INFO "AMD IOMMU: ACPI Table:\n");
+    printk(XENLOG_INFO " Signature ");
+    for ( i = 0; i < ACPI_NAME_SIZE; ++i )
+        printk("%c", table->signature[i]);
+    printk("\n");
+
+    printk(" Length 0x%x\n", table->length);
+    printk(" Revision 0x%x\n", table->revision);
+    printk(" CheckSum 0x%x\n", table->checksum);
+
+    printk(" OEM_Id ");
+    for ( i = 0; i < ACPI_OEM_ID_SIZE; ++i )
+        printk("%c", table->oem_id[i]);
+    printk("\n");
+
+    printk(" OEM_Table_Id ");
+    for ( i = 0; i < ACPI_OEM_TABLE_ID_SIZE; ++i )
+        printk("%c", table->oem_table_id[i]);
+    printk("\n");
+
+    printk(" OEM_Revision 0x%x\n", table->oem_revision);
+
+    printk(" Creator_Id ");
+    for ( i = 0; i < ACPI_NAME_SIZE; ++i )
+        printk("%c", table->asl_compiler_id[i]);
+    printk("\n");
+
+    printk(" Creator_Revision 0x%x\n",
+       table->asl_compiler_revision);
+}
+
+int __init parse_ivrs_table(unsigned long phys_addr,
+                                  unsigned long size)
+{
+    struct acpi_ivrs_block_header *ivrs_block;
+    unsigned long length, i;
+    u8 checksum, *raw_table;
+    int error = 0;
+    struct acpi_table_header  *table =
+        (struct acpi_table_header *) __acpi_map_table(phys_addr, size);
+
+    BUG_ON(!table);
+
+#if 0
+    dump_acpi_table_header(table);
+#endif
+
+    /* validate checksum: sum of entire table == 0 */
+    checksum = 0;
+    raw_table = (u8 *)table;
+    for ( i = 0; i < table->length; ++i )
+        checksum += raw_table[i];
+    if ( checksum )
+    {
+        dprintk(XENLOG_ERR, "IVRS Error: "
+                "Invalid Checksum 0x%x\n", checksum);
+        return -ENODEV;
+    }
+
+    /* parse IVRS blocks */
+    length = sizeof(struct acpi_ivrs_table_header);
+    while( error == 0 && table->length >
+       (length + sizeof(struct acpi_ivrs_block_header)) )
+    {
+        ivrs_block = (struct acpi_ivrs_block_header *)
+                ((u8 *)table + length);
+
+        dprintk(XENLOG_INFO, "IVRS Block:\n");
+        dprintk(XENLOG_INFO, " Type 0x%x\n", ivrs_block->type);
+        dprintk(XENLOG_INFO, " Flags 0x%x\n", ivrs_block->flags);
+        dprintk(XENLOG_INFO, " Length 0x%x\n", ivrs_block->length);
+        dprintk(XENLOG_INFO, " Dev_Id 0x%x\n", ivrs_block->dev_id);
+
+        if (table->length >= (length + ivrs_block->length))
+           error = parse_ivrs_block(ivrs_block);
+        else
+        {
+           dprintk(XENLOG_ERR, "IVRS Error: "
+               "Table Length Exceeded: 0x%x -> 0x%lx\n",
+               table->length,
+               (length + ivrs_block->length));
+           return -ENODEV;
+        }
+        length += ivrs_block->length;
+    }
+
+    return error;
+}
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/passthrough/amd/iommu_detect.c
--- a/xen/drivers/passthrough/amd/iommu_detect.c        Fri Feb 29 09:18:01 
2008 -0700
+++ b/xen/drivers/passthrough/amd/iommu_detect.c        Fri Feb 29 09:19:58 
2008 -0700
@@ -86,30 +86,24 @@ int __init get_iommu_capabilities(u8 bus
 int __init get_iommu_capabilities(u8 bus, u8 dev, u8 func, u8 cap_ptr,
             struct amd_iommu *iommu)
 {
-    u32 cap_header, cap_range;
+    u32 cap_header, cap_range, misc_info;
     u64 mmio_bar;
 
-#if HACK_BIOS_SETTINGS
-    /* remove it when BIOS available */
-    write_pci_config(bus, dev, func,
-        cap_ptr + PCI_CAP_MMIO_BAR_HIGH_OFFSET, 0x00000000);
-    write_pci_config(bus, dev, func,
-        cap_ptr + PCI_CAP_MMIO_BAR_LOW_OFFSET, 0x40000001);
-    /* remove it when BIOS available */
-#endif
-
     mmio_bar = (u64)read_pci_config(bus, dev, func,
-             cap_ptr + PCI_CAP_MMIO_BAR_HIGH_OFFSET) << 32;
+            cap_ptr + PCI_CAP_MMIO_BAR_HIGH_OFFSET) << 32;
     mmio_bar |= read_pci_config(bus, dev, func,
-            cap_ptr + PCI_CAP_MMIO_BAR_LOW_OFFSET) &
-            PCI_CAP_MMIO_BAR_LOW_MASK;
-    iommu->mmio_base_phys = (unsigned long)mmio_bar;
-
-    if ( (mmio_bar == 0) || ( (mmio_bar & 0x3FFF) != 0 ) ) {
+            cap_ptr + PCI_CAP_MMIO_BAR_LOW_OFFSET); 
+    iommu->mmio_base_phys = mmio_bar & (u64)~0x3FFF;
+
+    if ( (mmio_bar & 0x1) == 0 || iommu->mmio_base_phys == 0 )
+    {
         dprintk(XENLOG_ERR ,
                 "AMD IOMMU: Invalid MMIO_BAR = 0x%"PRIx64"\n", mmio_bar);
         return -ENODEV;
     }
+
+    iommu->bdf = (bus << 8) | PCI_DEVFN(dev, func);
+    iommu->cap_offset = cap_ptr;
 
     cap_header = read_pci_config(bus, dev, func, cap_ptr);
     iommu->revision = get_field_from_reg_u32(cap_header,
@@ -119,12 +113,15 @@ int __init get_iommu_capabilities(u8 bus
     iommu->ht_tunnel_support = get_field_from_reg_u32(cap_header,
                     PCI_CAP_HT_TUNNEL_MASK,
                     PCI_CAP_HT_TUNNEL_SHIFT);
-    iommu->not_present_cached = get_field_from_reg_u32(cap_header,
+    iommu->pte_not_present_cached = get_field_from_reg_u32(cap_header,
                     PCI_CAP_NP_CACHE_MASK,
                     PCI_CAP_NP_CACHE_SHIFT);
 
     cap_range = read_pci_config(bus, dev, func,
             cap_ptr + PCI_CAP_RANGE_OFFSET);
+    iommu->unit_id = get_field_from_reg_u32(cap_range,
+                PCI_CAP_UNIT_ID_MASK,
+                PCI_CAP_UNIT_ID_SHIFT);
     iommu->root_bus = get_field_from_reg_u32(cap_range,
                 PCI_CAP_BUS_NUMBER_MASK,
                 PCI_CAP_BUS_NUMBER_SHIFT);
@@ -135,6 +132,11 @@ int __init get_iommu_capabilities(u8 bus
                 PCI_CAP_LAST_DEVICE_MASK,
                 PCI_CAP_LAST_DEVICE_SHIFT);
 
+    misc_info = read_pci_config(bus, dev, func,
+            cap_ptr + PCI_MISC_INFO_OFFSET);
+    iommu->msi_number = get_field_from_reg_u32(misc_info,
+                PCI_CAP_MSI_NUMBER_MASK,
+                PCI_CAP_MSI_NUMBER_SHIFT);
     return 0;
 }
 
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/passthrough/amd/iommu_init.c
--- a/xen/drivers/passthrough/amd/iommu_init.c  Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/drivers/passthrough/amd/iommu_init.c  Fri Feb 29 09:19:58 2008 -0700
@@ -137,8 +137,49 @@ static void __init set_iommu_command_buf
     writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
 }
 
+static void __init register_iommu_exclusion_range(struct amd_iommu *iommu)
+{
+    u64 addr_lo, addr_hi;
+    u32 entry;
+
+    addr_lo = iommu->exclusion_limit & DMA_32BIT_MASK;
+    addr_hi = iommu->exclusion_limit >> 32;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+        IOMMU_EXCLUSION_LIMIT_HIGH_MASK,
+        IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET);
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+        IOMMU_EXCLUSION_LIMIT_LOW_MASK,
+        IOMMU_EXCLUSION_LIMIT_LOW_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_LOW_OFFSET);
+
+    addr_lo = iommu->exclusion_base & DMA_32BIT_MASK;
+    addr_hi = iommu->exclusion_base >> 32;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+        IOMMU_EXCLUSION_BASE_HIGH_MASK,
+        IOMMU_EXCLUSION_BASE_HIGH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_HIGH_OFFSET);
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+        IOMMU_EXCLUSION_BASE_LOW_MASK,
+        IOMMU_EXCLUSION_BASE_LOW_SHIFT, &entry);
+
+    set_field_in_reg_u32(iommu->exclusion_allow_all, entry,
+        IOMMU_EXCLUSION_ALLOW_ALL_MASK,
+        IOMMU_EXCLUSION_ALLOW_ALL_SHIFT, &entry);
+
+    set_field_in_reg_u32(iommu->exclusion_enable, entry,
+        IOMMU_EXCLUSION_RANGE_ENABLE_MASK,
+        IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_LOW_OFFSET);
+}
+
 void __init enable_iommu(struct amd_iommu *iommu)
 {
+    register_iommu_exclusion_range(iommu);
     set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED);
     set_iommu_translation_control(iommu, IOMMU_CONTROL_ENABLED);
     printk("AMD IOMMU %d: Enabled\n", nr_amd_iommus);
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/passthrough/amd/iommu_map.c
--- a/xen/drivers/passthrough/amd/iommu_map.c   Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/drivers/passthrough/amd/iommu_map.c   Fri Feb 29 09:19:58 2008 -0700
@@ -234,16 +234,19 @@ static void amd_iommu_set_page_directory
 }
 
 void amd_iommu_set_dev_table_entry(u32 *dte, u64 root_ptr, u16 domain_id,
-                                   u8 paging_mode)
+           u8 sys_mgt, u8 dev_ex, u8 paging_mode)
 {
     u64 addr_hi, addr_lo;
     u32 entry;
 
-    dte[6] = dte[5] = dte[4] = 0;
-
-    set_field_in_reg_u32(IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED, 0,
+    dte[7] = dte[6] = dte[5] = dte[4] = 0;
+
+    set_field_in_reg_u32(sys_mgt, 0,
                          IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK,
                          IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT, &entry);
+    set_field_in_reg_u32(dev_ex, entry,
+                         IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK,
+                         IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT, &entry);
     dte[3] = entry;
 
     set_field_in_reg_u32(domain_id, 0,
@@ -448,3 +451,34 @@ int amd_iommu_unmap_page(struct domain *
 
     return 0;
 }
+
+int amd_iommu_reserve_domain_unity_map(
+           struct domain *domain,
+           unsigned long phys_addr,
+           unsigned long size, int iw, int ir)
+{
+    unsigned long flags, npages, i;
+    void *pte;
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+
+    npages = region_to_pages(phys_addr, size);
+
+    spin_lock_irqsave(&hd->mapping_lock, flags);
+    for ( i = 0; i < npages; ++i )
+    {
+        pte = get_pte_from_page_tables(hd->root_table,
+           hd->paging_mode, phys_addr>>PAGE_SHIFT);
+        if ( pte == 0 )
+        {
+            dprintk(XENLOG_ERR,
+                    "AMD IOMMU: Invalid IO pagetable entry phys_addr = %lx\n", 
phys_addr);
+            spin_unlock_irqrestore(&hd->mapping_lock, flags);
+            return -EFAULT;
+        }
+        set_page_table_entry_present((u32 *)pte,
+           phys_addr, iw, ir);
+        phys_addr += PAGE_SIZE;
+    }
+    spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    return 0;
+}
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/drivers/passthrough/amd/pci_amd_iommu.c
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c       Fri Feb 29 09:18:01 
2008 -0700
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c       Fri Feb 29 09:19:58 
2008 -0700
@@ -20,6 +20,7 @@
 
 #include <asm/amd-iommu.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
+#include <asm/hvm/svm/amd-iommu-acpi.h>
 #include <xen/sched.h>
 #include <asm/mm.h>
 #include "../pci-direct.h"
@@ -30,6 +31,9 @@ static long amd_iommu_cmd_buffer_entries
 static long amd_iommu_cmd_buffer_entries = IOMMU_CMD_BUFFER_DEFAULT_ENTRIES;
 int nr_amd_iommus = 0;
 
+unsigned short ivrs_bdf_entries = 0;
+struct ivrs_mappings *ivrs_mappings = NULL;
+
 /* will set if amd-iommu HW is found */
 int amd_iommu_enabled = 0;
 
@@ -82,13 +86,12 @@ static void __init detect_cleanup(void)
         deallocate_iommu_resources(iommu);
         xfree(iommu);
     }
-}
-
-static int requestor_id_from_bdf(int bdf)
-{
-    /* HACK - HACK */
-    /* account for possible 'aliasing' by parent device */
-    return bdf;
+
+    if ( ivrs_mappings )
+    {
+        xfree(ivrs_mappings);
+        ivrs_mappings = NULL;
+    }
 }
 
 static int __init allocate_iommu_table_struct(struct table_struct *table,
@@ -179,10 +182,21 @@ static int __init amd_iommu_init(void)
 {
     struct amd_iommu *iommu;
     unsigned long flags;
+    u16 bdf;
 
     for_each_amd_iommu ( iommu )
     {
         spin_lock_irqsave(&iommu->lock, flags);
+
+        /* assign default IOMMU values */
+        iommu->coherent = IOMMU_CONTROL_ENABLED;
+        iommu->isochronous = IOMMU_CONTROL_ENABLED;
+        iommu->res_pass_pw = IOMMU_CONTROL_ENABLED;
+        iommu->pass_pw = IOMMU_CONTROL_ENABLED;
+        iommu->ht_tunnel_enable = iommu->ht_tunnel_support ?
+            IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED;
+        iommu->exclusion_enable = IOMMU_CONTROL_DISABLED;
+        iommu->exclusion_allow_all = IOMMU_CONTROL_DISABLED;
 
         /* register IOMMU data strucures in MMIO space */
         if ( map_iommu_mmio_region(iommu) != 0 )
@@ -190,10 +204,30 @@ static int __init amd_iommu_init(void)
         register_iommu_dev_table_in_mmio_space(iommu);
         register_iommu_cmd_buffer_in_mmio_space(iommu);
 
+        spin_unlock_irqrestore(&iommu->lock, flags);
+    }
+
+    /* assign default values for device entries */
+    for ( bdf = 0; bdf < ivrs_bdf_entries; ++bdf )
+    {
+        ivrs_mappings[bdf].dte_requestor_id = bdf;
+        ivrs_mappings[bdf].dte_sys_mgt_enable =
+            IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED;
+        ivrs_mappings[bdf].dte_allow_exclusion =
+            IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].unity_map_enable =
+            IOMMU_CONTROL_DISABLED;
+    }
+
+    if ( acpi_table_parse(ACPI_IVRS, parse_ivrs_table) != 0 )
+        dprintk(XENLOG_INFO, "AMD IOMMU: Did not find IVRS table!\n");
+
+    for_each_amd_iommu ( iommu )
+    {
+        spin_lock_irqsave(&iommu->lock, flags);
         /* enable IOMMU translation services */
         enable_iommu(iommu);
         nr_amd_iommus++;
-
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
 
@@ -229,31 +263,38 @@ struct amd_iommu *find_iommu_for_device(
 }
 
 void amd_iommu_setup_domain_device(
-    struct domain *domain, struct amd_iommu *iommu, int requestor_id)
+    struct domain *domain, struct amd_iommu *iommu, int bdf)
 {
     void *dte;
     u64 root_ptr;
     unsigned long flags;
+    int req_id;
+    u8 sys_mgt, dev_ex;
     struct hvm_iommu *hd = domain_hvm_iommu(domain);
 
-    BUG_ON( !hd->root_table||!hd->paging_mode );
+    BUG_ON( !hd->root_table || !hd->paging_mode );
 
     root_ptr = (u64)virt_to_maddr(hd->root_table);
+    /* get device-table entry */
+    req_id = ivrs_mappings[bdf].dte_requestor_id;
     dte = iommu->dev_table.buffer +
-        (requestor_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+        (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
 
     if ( !amd_iommu_is_dte_page_translation_valid((u32 *)dte) )
     {
         spin_lock_irqsave(&iommu->lock, flags); 
 
-        amd_iommu_set_dev_table_entry(
-            (u32 *)dte,
-            root_ptr, hd->domain_id, hd->paging_mode);
-        invalidate_dev_table_entry(iommu, requestor_id);
+        /* bind DTE to domain page-tables */
+        sys_mgt = ivrs_mappings[req_id].dte_sys_mgt_enable;
+        dev_ex = ivrs_mappings[req_id].dte_allow_exclusion;
+        amd_iommu_set_dev_table_entry((u32 *)dte, root_ptr,
+            req_id, sys_mgt, dev_ex, hd->paging_mode);
+
+        invalidate_dev_table_entry(iommu, req_id);
         flush_command_buffer(iommu);
         dprintk(XENLOG_INFO, "AMD IOMMU: Set DTE req_id:%x, "
                 "root_ptr:%"PRIx64", domain_id:%d, paging_mode:%d\n",
-                requestor_id, root_ptr, hd->domain_id, hd->paging_mode);
+                req_id, root_ptr, hd->domain_id, hd->paging_mode);
 
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
@@ -266,7 +307,7 @@ void __init amd_iommu_setup_dom0_devices
     struct pci_dev *pdev;
     int bus, dev, func;
     u32 l;
-    int req_id, bdf;
+    int bdf;
 
     for ( bus = 0; bus < 256; bus++ )
     {
@@ -286,11 +327,12 @@ void __init amd_iommu_setup_dom0_devices
                 list_add_tail(&pdev->list, &hd->pdev_list);
 
                 bdf = (bus << 8) | pdev->devfn;
-                req_id = requestor_id_from_bdf(bdf);
-                iommu = find_iommu_for_device(bus, pdev->devfn);
+                /* supported device? */
+                iommu = (bdf < ivrs_bdf_entries) ?
+                    find_iommu_for_device(bus, pdev->devfn) : NULL;
 
                 if ( iommu )
-                    amd_iommu_setup_domain_device(dom0, iommu, req_id);
+                    amd_iommu_setup_domain_device(dom0, iommu, bdf);
             }
         }
     }
@@ -299,6 +341,8 @@ int amd_iommu_detect(void)
 int amd_iommu_detect(void)
 {
     unsigned long i;
+    int last_bus;
+    struct amd_iommu *iommu;
 
     if ( !enable_amd_iommu )
     {
@@ -318,6 +362,28 @@ int amd_iommu_detect(void)
     {
         printk("AMD IOMMU: Not found!\n");
         return 0;
+    }
+    else
+    {
+        /* allocate 'ivrs mappings' table */
+        /* note: the table has entries to accomodate all IOMMUs */
+        last_bus = 0;
+        for_each_amd_iommu (iommu)
+           if (iommu->last_downstream_bus > last_bus)
+               last_bus = iommu->last_downstream_bus;
+
+        ivrs_bdf_entries = (last_bus + 1) *
+                IOMMU_DEV_TABLE_ENTRIES_PER_BUS;
+        ivrs_mappings = xmalloc_array( struct ivrs_mappings, ivrs_bdf_entries);
+
+        if ( !ivrs_mappings )
+        {
+            dprintk(XENLOG_ERR, "AMD IOMMU:"
+                        " Error allocating IVRS DevMappings table\n");
+            goto error_out;
+        }
+        memset(ivrs_mappings, 0,
+            ivrs_bdf_entries * sizeof(struct ivrs_mappings));
     }
 
     if ( amd_iommu_init() != 0 )
@@ -407,23 +473,25 @@ int amd_iommu_domain_init(struct domain 
 }
 
 static void amd_iommu_disable_domain_device(
-    struct domain *domain, struct amd_iommu *iommu, u16 requestor_id)
+    struct domain *domain, struct amd_iommu *iommu, int bdf)
 {
     void *dte;
     unsigned long flags;
-
+    int req_id;
+
+    req_id = ivrs_mappings[bdf].dte_requestor_id;
     dte = iommu->dev_table.buffer +
-        (requestor_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+        (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
 
     if ( amd_iommu_is_dte_page_translation_valid((u32 *)dte) )
     {
         spin_lock_irqsave(&iommu->lock, flags); 
         memset (dte, 0, IOMMU_DEV_TABLE_ENTRY_SIZE);
-        invalidate_dev_table_entry(iommu, requestor_id);
+        invalidate_dev_table_entry(iommu, req_id);
         flush_command_buffer(iommu);
         dprintk(XENLOG_INFO , "AMD IOMMU: disable DTE 0x%x,"
                 " domain_id:%d, paging_mode:%d\n",
-                requestor_id,  domain_hvm_iommu(domain)->domain_id,
+                req_id,  domain_hvm_iommu(domain)->domain_id,
                 domain_hvm_iommu(domain)->paging_mode);
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
@@ -438,7 +506,7 @@ static int reassign_device( struct domai
     struct hvm_iommu *target_hd = domain_hvm_iommu(target);
     struct pci_dev *pdev;
     struct amd_iommu *iommu;
-    int req_id, bdf;
+    int bdf;
     unsigned long flags;
 
     for_each_pdev( source, pdev )
@@ -450,12 +518,13 @@ static int reassign_device( struct domai
         pdev->devfn = devfn;
 
         bdf = (bus << 8) | devfn;
-        req_id = requestor_id_from_bdf(bdf);
-        iommu = find_iommu_for_device(bus, devfn);
+        /* supported device? */
+        iommu = (bdf < ivrs_bdf_entries) ?
+            find_iommu_for_device(bus, pdev->devfn) : NULL;
 
         if ( iommu )
         {
-            amd_iommu_disable_domain_device(source, iommu, req_id);
+            amd_iommu_disable_domain_device(source, iommu, bdf);
             /* Move pci device from the source domain to target domain. */
             spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
             spin_lock_irqsave(&target_hd->iommu_list_lock, flags);
@@ -463,7 +532,7 @@ static int reassign_device( struct domai
             spin_unlock_irqrestore(&target_hd->iommu_list_lock, flags);
             spin_unlock_irqrestore(&source_hd->iommu_list_lock, flags);
 
-            amd_iommu_setup_domain_device(target, iommu, req_id);
+            amd_iommu_setup_domain_device(target, iommu, bdf);
             gdprintk(XENLOG_INFO ,
                      "AMD IOMMU: reassign %x:%x.%x domain %d -> domain %d\n",
                      bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
@@ -484,6 +553,19 @@ static int reassign_device( struct domai
 
 int amd_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
 {
+    int bdf = (bus << 8) | devfn;
+    int req_id;
+    req_id = ivrs_mappings[bdf].dte_requestor_id;
+
+    if (ivrs_mappings[req_id].unity_map_enable)
+    {
+        amd_iommu_reserve_domain_unity_map(d,
+            ivrs_mappings[req_id].addr_range_start,
+            ivrs_mappings[req_id].addr_range_length,
+            ivrs_mappings[req_id].write_permission,
+            ivrs_mappings[req_id].read_permission);
+    }
+
     pdev_flr(bus, devfn);
     return reassign_device(dom0, d, bus, devfn);
 }
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/asm-x86/amd-iommu.h
--- a/xen/include/asm-x86/amd-iommu.h   Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/asm-x86/amd-iommu.h   Fri Feb 29 09:19:58 2008 -0700
@@ -43,14 +43,25 @@ struct amd_iommu {
     struct list_head list;
     spinlock_t lock; /* protect iommu */
 
-    int iotlb_support;
-    int ht_tunnel_support;
-    int not_present_cached;
+    u16 bdf;
+    u8  cap_offset;
     u8  revision;
+    u8  unit_id;
+    u8  msi_number;
 
     u8  root_bus;
     u8  first_devfn;
     u8  last_devfn;
+
+    u8 pte_not_present_cached;
+    u8 ht_tunnel_support;
+    u8 iotlb_support;
+
+    u8 isochronous;
+    u8 coherent;
+    u8 res_pass_pw;
+    u8 pass_pw;
+    u8 ht_tunnel_enable;
 
     int last_downstream_bus;
     int downstream_bus_present[PCI_MAX_BUS_COUNT];
@@ -61,10 +72,23 @@ struct amd_iommu {
     struct table_struct dev_table;
     struct table_struct cmd_buffer;
     u32 cmd_buffer_tail;
+    struct table_struct event_log;
+    u32 event_log_head;
 
-    int exclusion_enabled;
-    unsigned long exclusion_base;
-    unsigned long exclusion_limit;
+    int exclusion_enable;
+    int exclusion_allow_all;
+    uint64_t exclusion_base;
+    uint64_t exclusion_limit;
 };
 
+struct ivrs_mappings {
+    u16 dte_requestor_id;
+    u8 dte_sys_mgt_enable;
+    u8 dte_allow_exclusion;
+    u8 unity_map_enable;
+    u8 write_permission;
+    u8 read_permission;
+    unsigned long addr_range_start;
+    unsigned long addr_range_length;
+};
 #endif /* _ASM_X86_64_AMD_IOMMU_H */
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/asm-x86/domain.h      Fri Feb 29 09:19:58 2008 -0700
@@ -97,6 +97,11 @@ struct shadow_domain {
 
     /* Fast MMIO path heuristic */
     int has_fast_mmio_entries;
+
+    /* reflect guest table dirty status, incremented by write
+     * emulation and remove write permission
+     */
+    atomic_t          gtable_dirty_version;
 };
 
 struct shadow_vcpu {
diff -r 0b20ac6ec64a -r 71a8366fb212 
xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h      Fri Feb 29 09:19:58 
2008 -0700
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@xxxxxxx>
+ * Author: Wei Wang <wei.wang2@xxxxxxx> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_64_AMD_IOMMU_ACPI_H
+#define _ASM_X86_64_AMD_IOMMU_ACPI_H
+
+#include <xen/acpi.h>
+
+/* I/O Virtualization Reporting Structure */
+#define AMD_IOMMU_ACPI_IVRS_SIG            "IVRS"
+#define AMD_IOMMU_ACPI_IVHD_TYPE       0x10
+#define AMD_IOMMU_ACPI_IVMD_ALL_TYPE       0x20
+#define AMD_IOMMU_ACPI_IVMD_ONE_TYPE       0x21
+#define AMD_IOMMU_ACPI_IVMD_RANGE_TYPE     0x22
+#define AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE     0x23
+
+/* 4-byte Device Entries */
+#define AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD        0
+#define AMD_IOMMU_ACPI_IVHD_DEV_SELECT     2
+#define AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START    3
+#define AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END  4
+
+/* 8-byte Device Entries */
+#define AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD        64
+#define AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT   66
+#define AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE    67
+#define AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT 70
+#define AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE  71
+
+/* IVHD IOMMU Flags */
+#define AMD_IOMMU_ACPI_COHERENT_MASK       0x20
+#define AMD_IOMMU_ACPI_COHERENT_SHIFT      5
+#define AMD_IOMMU_ACPI_IOTLB_SUP_MASK      0x10
+#define AMD_IOMMU_ACPI_IOTLB_SUP_SHIFT     4
+#define AMD_IOMMU_ACPI_ISOC_MASK       0x08
+#define AMD_IOMMU_ACPI_ISOC_SHIFT      3
+#define AMD_IOMMU_ACPI_RES_PASS_PW_MASK        0x04
+#define AMD_IOMMU_ACPI_RES_PASS_PW_SHIFT   2
+#define AMD_IOMMU_ACPI_PASS_PW_MASK        0x02
+#define AMD_IOMMU_ACPI_PASS_PW_SHIFT       1
+#define AMD_IOMMU_ACPI_HT_TUN_ENB_MASK     0x01
+#define AMD_IOMMU_ACPI_HT_TUN_ENB_SHIFT        0
+
+/* IVHD Device Flags */
+#define AMD_IOMMU_ACPI_LINT1_PASS_MASK     0x80
+#define AMD_IOMMU_ACPI_LINT1_PASS_SHIFT        7
+#define AMD_IOMMU_ACPI_LINT0_PASS_MASK     0x40
+#define AMD_IOMMU_ACPI_LINT0_PASS_SHIFT        6
+#define AMD_IOMMU_ACPI_SYS_MGT_MASK        0x30
+#define AMD_IOMMU_ACPI_SYS_MGT_SHIFT       4
+#define AMD_IOMMU_ACPI_NMI_PASS_MASK       0x04
+#define AMD_IOMMU_ACPI_NMI_PASS_SHIFT      2
+#define AMD_IOMMU_ACPI_EINT_PASS_MASK      0x02
+#define AMD_IOMMU_ACPI_EINT_PASS_SHIFT     1
+#define AMD_IOMMU_ACPI_INIT_PASS_MASK      0x01
+#define AMD_IOMMU_ACPI_INIT_PASS_SHIFT     0
+
+/* IVHD Device Extended Flags */
+#define AMD_IOMMU_ACPI_ATS_DISABLED_MASK   0x80000000
+#define AMD_IOMMU_ACPI_ATS_DISABLED_SHIFT  31
+
+/* IVMD Device Flags */
+#define AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK    0x08
+#define AMD_IOMMU_ACPI_EXCLUSION_RANGE_SHIFT   3
+#define AMD_IOMMU_ACPI_IW_PERMISSION_MASK  0x04
+#define AMD_IOMMU_ACPI_IW_PERMISSION_SHIFT 2
+#define AMD_IOMMU_ACPI_IR_PERMISSION_MASK  0x02
+#define AMD_IOMMU_ACPI_IR_PERMISSION_SHIFT 1
+#define AMD_IOMMU_ACPI_UNITY_MAPPING_MASK  0x01
+#define AMD_IOMMU_ACPI_UNITY_MAPPING_SHIFT 0
+
+#define ACPI_OEM_ID_SIZE                6
+#define ACPI_OEM_TABLE_ID_SIZE          8
+
+#pragma pack(1)
+struct acpi_ivrs_table_header {
+   struct acpi_table_header acpi_header;
+   u32 io_info;
+   u8  reserved[8];
+};
+
+struct acpi_ivrs_block_header {
+   u8  type;
+   u8  flags;
+   u16 length;
+   u16 dev_id;
+};
+
+struct acpi_ivhd_block_header {
+   struct acpi_ivrs_block_header header;
+   u16 cap_offset;
+   u64 mmio_base;
+   u16 pci_segment;
+   u16 iommu_info;
+   u8 reserved[4];
+};
+
+struct acpi_ivhd_device_header {
+   u8  type;
+   u16 dev_id;
+   u8  flags;
+};
+
+struct acpi_ivhd_device_trailer {
+   u8  type;
+   u16 dev_id;
+   u8  reserved;
+};
+
+struct acpi_ivhd_device_range {
+   struct acpi_ivhd_device_header header;
+   struct acpi_ivhd_device_trailer trailer;
+};
+
+struct acpi_ivhd_device_alias {
+   struct acpi_ivhd_device_header header;
+   u8  reserved1;
+   u16 dev_id;
+   u8  reserved2;
+};
+
+struct acpi_ivhd_device_alias_range {
+   struct acpi_ivhd_device_alias alias;
+   struct acpi_ivhd_device_trailer trailer;
+};
+
+struct acpi_ivhd_device_extended {
+   struct acpi_ivhd_device_header header;
+   u32 ext_flags;
+};
+
+struct acpi_ivhd_device_extended_range {
+   struct acpi_ivhd_device_extended extended;
+   struct acpi_ivhd_device_trailer trailer;
+};
+
+union acpi_ivhd_device {
+   struct acpi_ivhd_device_header header;
+   struct acpi_ivhd_device_range range;
+   struct acpi_ivhd_device_alias alias;
+   struct acpi_ivhd_device_alias_range alias_range;
+   struct acpi_ivhd_device_extended extended;
+   struct acpi_ivhd_device_extended_range extended_range;
+};
+
+struct acpi_ivmd_block_header {
+   struct acpi_ivrs_block_header header;
+   union {
+       u16 last_dev_id;
+       u16 cap_offset;
+       u16 reserved1;
+   };
+   u64 reserved2;
+   u64 start_addr;
+   u64 mem_length;
+};
+#pragma pack()
+
+#endif /* _ASM_X86_64_AMD_IOMMU_ACPI_H */
diff -r 0b20ac6ec64a -r 71a8366fb212 
xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h      Fri Feb 29 09:18:01 
2008 -0700
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h      Fri Feb 29 09:19:58 
2008 -0700
@@ -117,6 +117,12 @@
 #define PCI_CAP_FIRST_DEVICE_SHIFT     16
 #define PCI_CAP_LAST_DEVICE_MASK       0xFF000000
 #define PCI_CAP_LAST_DEVICE_SHIFT      24
+
+#define PCI_CAP_UNIT_ID_MASK    0x0000001F
+#define PCI_CAP_UNIT_ID_SHIFT   0
+#define PCI_MISC_INFO_OFFSET    0x10
+#define PCI_CAP_MSI_NUMBER_MASK     0x0000001F
+#define PCI_CAP_MSI_NUMBER_SHIFT    0
 
 /* Device Table */
 #define IOMMU_DEV_TABLE_BASE_LOW_OFFSET                0x00
diff -r 0b20ac6ec64a -r 71a8366fb212 
xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h     Fri Feb 29 09:18:01 
2008 -0700
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h     Fri Feb 29 09:19:58 
2008 -0700
@@ -21,6 +21,7 @@
 #ifndef _ASM_X86_64_AMD_IOMMU_PROTO_H
 #define _ASM_X86_64_AMD_IOMMU_PROTO_H
 
+#include <xen/sched.h>
 #include <asm/amd-iommu.h>
 
 #define for_each_amd_iommu(amd_iommu) \
@@ -54,10 +55,12 @@ int amd_iommu_map_page(struct domain *d,
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn);
 int amd_iommu_unmap_page(struct domain *d, unsigned long gfn);
 void *amd_iommu_get_vptr_from_page_table_entry(u32 *entry);
+int amd_iommu_reserve_domain_unity_map(struct domain *domain,
+        unsigned long phys_addr, unsigned long size, int iw, int ir);
 
 /* device table functions */
-void amd_iommu_set_dev_table_entry(u32 *dte,
-        u64 root_ptr, u16 domain_id, u8 paging_mode);
+void amd_iommu_set_dev_table_entry(u32 *dte, u64 root_ptr,
+        u16 domain_id, u8 sys_mgt, u8 dev_ex, u8 paging_mode);
 int amd_iommu_is_dte_page_translation_valid(u32 *entry);
 void invalidate_dev_table_entry(struct amd_iommu *iommu,
             u16 device_id);
@@ -69,10 +72,13 @@ void flush_command_buffer(struct amd_iom
 /* iommu domain functions */
 int amd_iommu_domain_init(struct domain *domain);
 void amd_iommu_setup_domain_device(struct domain *domain,
-    struct amd_iommu *iommu, int requestor_id);
+    struct amd_iommu *iommu, int bdf);
 
 /* find iommu for bdf */
 struct amd_iommu *find_iommu_for_device(int bus, int devfn);
+
+/* amd-iommu-acpi functions */
+int __init parse_ivrs_table(unsigned long phys_addr, unsigned long size);
 
 static inline u32 get_field_from_reg_u32(u32 reg_value, u32 mask, u32 shift)
 {
@@ -91,4 +97,16 @@ static inline u32 set_field_in_reg_u32(u
     return reg_value;
 }
 
+static inline u8 get_field_from_byte(u8 value, u8 mask, u8 shift)
+{
+    u8 field;
+    field = (value & mask) >> shift;
+    return field;
+}
+
+static inline unsigned long region_to_pages(unsigned long addr, unsigned long 
size)
+{
+    return (PAGE_ALIGN(addr + size) - (addr & PAGE_MASK)) >> PAGE_SHIFT;
+}
+
 #endif /* _ASM_X86_64_AMD_IOMMU_PROTO_H */
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/asm-x86/perfc_defn.h  Fri Feb 29 09:19:58 2008 -0700
@@ -88,6 +88,11 @@ PERFCOUNTER(shadow_unshadow_bf,    "shad
 PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
 PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
 PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
+PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
+PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
+PERFCOUNTER(shadow_rm_write_flush_tlb,
+                                   "shadow flush tlb by removing write perm")
+
 PERFCOUNTER(shadow_invlpg,         "shadow emulates invlpg")
 PERFCOUNTER(shadow_invlpg_fault,   "shadow invlpg faults")
 
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/public/io/kbdif.h
--- a/xen/include/public/io/kbdif.h     Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/public/io/kbdif.h     Fri Feb 29 09:19:58 2008 -0700
@@ -65,7 +65,7 @@ struct xenkbd_position
     uint8_t type;        /* XENKBD_TYPE_POS */
     int32_t abs_x;       /* absolute X position (in FB pixels) */
     int32_t abs_y;       /* absolute Y position (in FB pixels) */
-    int32_t abs_z;       /* absolute Z position (wheel) */
+    int32_t rel_z;       /* relative Z motion (wheel) */
 };
 
 #define XENKBD_IN_EVENT_SIZE 40
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/public/kexec.h
--- a/xen/include/public/kexec.h        Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/public/kexec.h        Fri Feb 29 09:19:58 2008 -0700
@@ -126,9 +126,18 @@ typedef struct xen_kexec_load {
     xen_kexec_image_t image;
 } xen_kexec_load_t;
 
-#define KEXEC_RANGE_MA_CRASH 0   /* machine address and size of crash area */
-#define KEXEC_RANGE_MA_XEN   1   /* machine address and size of Xen itself */
-#define KEXEC_RANGE_MA_CPU   2   /* machine address and size of a CPU note */
+#define KEXEC_RANGE_MA_CRASH      0 /* machine address and size of crash area 
*/
+#define KEXEC_RANGE_MA_XEN        1 /* machine address and size of Xen itself 
*/
+#define KEXEC_RANGE_MA_CPU        2 /* machine address and size of a CPU note 
*/
+#define KEXEC_RANGE_MA_XENHEAP    3 /* machine address and size of xenheap
+                                     * Note that although this is adjacent
+                                     * to Xen it exists in a separate EFI
+                                     * region on ia64, and thus needs to be
+                                     * inserted into iomem_machine separately 
*/
+#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
+                                     * the ia64_boot_param */
+#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
+                                     * the EFI Memory Map */
 
 /*
  * Find the address and size of certain memory areas
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/xen/acpi.h
--- a/xen/include/xen/acpi.h    Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/xen/acpi.h    Fri Feb 29 09:19:58 2008 -0700
@@ -368,6 +368,7 @@ enum acpi_table_id {
        ACPI_HPET,
        ACPI_MCFG,
        ACPI_DMAR,
+       ACPI_IVRS,
        ACPI_TABLE_COUNT
 };
 
diff -r 0b20ac6ec64a -r 71a8366fb212 xen/include/xen/kexec.h
--- a/xen/include/xen/kexec.h   Fri Feb 29 09:18:01 2008 -0700
+++ b/xen/include/xen/kexec.h   Fri Feb 29 09:19:58 2008 -0700
@@ -31,6 +31,7 @@ void kexec_crash_save_cpu(void);
 void kexec_crash_save_cpu(void);
 crash_xen_info_t *kexec_crash_save_info(void);
 void machine_crash_shutdown(void);
+int machine_kexec_get(xen_kexec_range_t *range);
 
 #endif /* __XEN_KEXEC_H__ */
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.