[Xen-changelog] [qemu-xen staging-4.11] Merge tag 'v2.11.2' into staging-4.11
=== This changeset includes merge from high-traffic branch ===
Commits on that branch are not reported individually.
commit 20c76f9a5fbf16d58c6add2ace2ff0fabd785926
Merge: 43139135a8938de44f66333831d3a8655d07663a 0982a56a551556c704dc15752dabf57b4be1c640
Author: Anthony PERARD <anthony.perard@xxxxxxxxxx>
AuthorDate: Mon Jul 9 14:42:32 2018 +0100
Commit: Anthony PERARD <anthony.perard@xxxxxxxxxx>
CommitDate: Mon Jul 9 14:42:32 2018 +0100
Merge tag 'v2.11.2' into staging-4.11
2.11.2
MAINTAINERS | 6 +
VERSION | 2 +-
block.c | 17 +-
block/file-posix.c | 5 +-
block/gluster.c | 21 +-
block/nbd-client.c | 3 -
block/nbd.c | 2 +
block/qcow2.c | 2 +-
block/raw-format.c | 64 +++--
block/rbd.c | 3 +-
block/ssh.c | 1 +
block/throttle.c | 54 ++--
configure | 3 +
cpus.c | 10 +-
device_tree.c | 2 +-
docs/interop/qcow2.txt | 16 +-
exec.c | 92 ++++---
gdbstub.c | 3 +-
hw/block/pflash_cfi01.c | 10 +-
hw/block/pflash_cfi02.c | 9 +-
hw/char/cmsdk-apb-uart.c | 1 +
hw/core/loader.c | 20 +-
hw/core/qdev.c | 24 ++
hw/display/qxl-render.c | 3 +-
hw/display/vga.c | 2 +
hw/i386/acpi-build.c | 3 +-
hw/i386/intel_iommu.c | 489 +++++++++++++++++++++++++++----------
hw/i386/intel_iommu_internal.h | 43 ++--
hw/i386/multiboot.c | 85 ++++---
hw/i386/trace-events | 5 +-
hw/ide/ahci.c | 13 +-
hw/intc/arm_gicv3_common.c | 79 ++++++
hw/intc/arm_gicv3_cpuif.c | 12 +-
hw/intc/arm_gicv3_kvm.c | 57 ++++-
hw/intc/openpic_kvm.c | 4 -
hw/net/virtio-net.c | 11 +
hw/pci-bridge/i82801b11.c | 1 +
hw/ppc/spapr.c | 161 +++++++-----
hw/ppc/spapr_cpu_core.c | 9 +-
hw/s390x/ccw-device.c | 8 +
hw/s390x/css.c | 8 +
hw/s390x/s390-virtio-ccw.c | 30 ++-
hw/s390x/virtio-ccw.c | 54 ++--
hw/s390x/virtio-ccw.h | 3 +-
hw/tpm/tpm_emulator.c | 4 +-
hw/tpm/tpm_passthrough.c | 36 +--
hw/usb/dev-mtp.c | 6 +-
hw/usb/dev-smartcard-reader.c | 4 +-
hw/usb/redirect.c | 2 +-
hw/vfio/ccw.c | 2 +
hw/virtio/virtio-balloon.c | 1 +
include/block/block.h | 1 +
include/exec/cpu-all.h | 6 +-
include/exec/cpu_ldst.h | 16 +-
include/exec/memory-internal.h | 13 +-
include/exec/memory.h | 45 ++--
include/hw/i386/intel_iommu.h | 26 +-
include/hw/intc/arm_gicv3_common.h | 1 +
include/hw/ppc/spapr.h | 3 +-
include/hw/qdev-core.h | 14 +-
include/net/net.h | 1 +
include/qemu/iova-tree.h | 134 ++++++++++
linux-user/mmap.c | 26 +-
linux-user/syscall.c | 13 +-
memory.c | 30 ---
migration/block.c | 5 +-
nbd/client.c | 14 +-
net/net.c | 1 -
net/tap.c | 2 +
net/vhost-user.c | 11 +-
pc-bios/s390-ccw.img | Bin 26416 -> 26416 bytes
pc-bios/s390-ccw/bootmap.c | 7 +
pc-bios/s390-ccw/cio.h | 2 +-
pc-bios/s390-ccw/iplb.h | 16 +-
qemu-img.c | 29 ++-
qemu-io.c | 4 +-
scripts/qapi.py | 2 +-
scsi/qemu-pr-helper.c | 7 +-
target/arm/translate-a64.c | 6 +-
target/arm/translate.c | 17 +-
target/arm/translate.h | 2 +-
target/i386/cpu.c | 4 +-
target/i386/cpu.h | 3 +
target/i386/kvm.c | 16 +-
target/i386/machine.c | 20 ++
target/i386/translate.c | 2 +-
target/lm32/op_helper.c | 4 +
target/ppc/compat.c | 25 +-
target/ppc/cpu.h | 2 +-
target/ppc/machine.c | 5 +
target/sparc/translate.c | 5 +
target/xtensa/translate.c | 1 +
tcg/arm/tcg-target.inc.c | 4 +-
tcg/tcg-opc.h | 4 +-
tcg/tcg.h | 10 +
tests/boot-serial-test.c | 8 +-
tests/migration-test.c | 4 +-
tests/multiboot/.gitignore | 3 +
tests/multiboot/Makefile | 22 +-
tests/multiboot/aout_kludge.S | 138 +++++++++++
tests/multiboot/aout_kludge.out | 42 ++++
tests/multiboot/run_test.sh | 34 +--
tests/prom-env-test.c | 6 +-
tests/pxe-test.c | 10 +-
tests/qemu-iotests/024 | 82 ++++++-
tests/qemu-iotests/024.out | 30 +++
tests/qemu-iotests/060 | 30 +++
tests/qemu-iotests/060.out | 14 ++
tests/qemu-iotests/106 | 24 ++
tests/qemu-iotests/106.out | 10 +
tests/qemu-iotests/153 | 17 ++
tests/qemu-iotests/153.out | 16 ++
tests/qemu-iotests/221 | 60 +++++
tests/qemu-iotests/221.out | 16 ++
tests/qemu-iotests/group | 1 +
tests/test-crypto-tlssession.c | 1 +
tests/test-io-channel-tls.c | 1 +
ui/console.c | 5 +
util/Makefile.objs | 1 +
util/iova-tree.c | 114 +++++++++
120 files changed, 2141 insertions(+), 612 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 0255113470..a8e01de523 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1680,6 +1680,12 @@ F: include/sysemu/replay.h
F: docs/replay.txt
F: stubs/replay.c
+IOVA Tree
+M: Peter Xu <peterx@xxxxxxxxxx>
+S: Maintained
+F: include/qemu/iova-tree.h
+F: util/iova-tree.c
+
Usermode Emulation
------------------
Overall
diff --git a/VERSION b/VERSION
index 6ceb272eec..9e5bb77a3b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.11.1
+2.11.2
diff --git a/block.c b/block.c
index f236431da1..24dd28d51d 100644
--- a/block.c
+++ b/block.c
@@ -1596,13 +1596,24 @@ static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
/* Returns whether the image file can be written to after the reopen queue @q
* has been successfully applied, or right now if @q is NULL. */
-static bool bdrv_is_writable(BlockDriverState *bs, BlockReopenQueue *q)
+static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
+ BlockReopenQueue *q)
{
int flags = bdrv_reopen_get_flags(q, bs);
return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
}
+/*
+ * Return whether the BDS can be written to. This is not necessarily
+ * the same as !bdrv_is_read_only(bs), as inactivated images may not
+ * be written to but do not count as read-only images.
+ */
+bool bdrv_is_writable(BlockDriverState *bs)
+{
+ return bdrv_is_writable_after_reopen(bs, NULL);
+}
+
static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
BdrvChild *c, const BdrvChildRole *role,
BlockReopenQueue *reopen_queue,
@@ -1640,7 +1651,7 @@ static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
/* Write permissions never work with read-only images */
if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
- !bdrv_is_writable(bs, q))
+ !bdrv_is_writable_after_reopen(bs, q))
{
error_setg(errp, "Block node is read-only");
return -EPERM;
@@ -1930,7 +1941,7 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
&perm, &shared);
/* Format drivers may touch metadata even if the guest doesn't write */
- if (bdrv_is_writable(bs, reopen_queue)) {
+ if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
}
diff --git a/block/file-posix.c b/block/file-posix.c
index 36ee89e940..275953fdc6 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1694,6 +1694,7 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc,
case PREALLOC_MODE_FULL:
{
int64_t num = 0, left = offset - current_length;
+ off_t seek_result;
/*
* Knowing the final size from the beginning could allow the file
@@ -1708,8 +1709,8 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc,
buf = g_malloc0(65536);
- result = lseek(fd, current_length, SEEK_SET);
- if (result < 0) {
+ seek_result = lseek(fd, current_length, SEEK_SET);
+ if (seek_result < 0) {
result = -errno;
error_setg_errno(errp, -result,
"Failed to seek to the old end of file");
diff --git a/block/gluster.c b/block/gluster.c
index 0f4265a3a4..d09f4f2283 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -164,7 +164,12 @@ static QemuOptsList runtime_unix_opts = {
{
.name = GLUSTER_OPT_SOCKET,
.type = QEMU_OPT_STRING,
- .help = "socket file path)",
+ .help = "socket file path (legacy)",
+ },
+ {
+ .name = GLUSTER_OPT_PATH,
+ .type = QEMU_OPT_STRING,
+ .help = "socket file path (QAPI)",
},
{ /* end of list */ }
},
@@ -612,10 +617,18 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
goto out;
}
- ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET);
+ ptr = qemu_opt_get(opts, GLUSTER_OPT_PATH);
+ if (!ptr) {
+ ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET);
+ } else if (qemu_opt_get(opts, GLUSTER_OPT_SOCKET)) {
+ error_setg(&local_err,
+ "Conflicting parameters 'path' and 'socket'");
+ error_append_hint(&local_err, GERR_INDEX_HINT, i);
+ goto out;
+ }
if (!ptr) {
error_setg(&local_err, QERR_MISSING_PARAMETER,
- GLUSTER_OPT_SOCKET);
+ GLUSTER_OPT_PATH);
error_append_hint(&local_err, GERR_INDEX_HINT, i);
goto out;
}
@@ -680,7 +693,7 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
"file.server.0.host=1.2.3.4,"
"file.server.0.port=24007,"
"file.server.1.transport=unix,"
- "file.server.1.socket=/var/run/glusterd.socket
..."
+ "file.server.1.path=/var/run/glusterd.socket ..."
"\n");
errno = -ret;
return NULL;
diff --git a/block/nbd-client.c b/block/nbd-client.c
index 9206652e45..7b68499b76 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -846,9 +846,6 @@ int nbd_client_init(BlockDriverState *bs,
if (client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
}
- if (client->info.min_block > bs->bl.request_alignment) {
- bs->bl.request_alignment = client->info.min_block;
- }
qemu_co_mutex_init(&client->send_mutex);
qemu_co_queue_init(&client->free_sema);
diff --git a/block/nbd.c b/block/nbd.c
index 8b8ba56cdd..c32ea9fd73 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -474,8 +474,10 @@ static int nbd_co_flush(BlockDriverState *bs)
static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
{
NBDClientSession *s = nbd_get_client_session(bs);
+ uint32_t min = s->info.min_block;
uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);
+ bs->bl.request_alignment = min ? min : BDRV_SECTOR_SIZE;
bs->bl.max_pdiscard = max;
bs->bl.max_pwrite_zeroes = max;
bs->bl.max_transfer = max;
diff --git a/block/qcow2.c b/block/qcow2.c
index 1914a940e5..28f2d91797 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4235,7 +4235,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
char *message;
va_list ap;
- fatal = fatal && !bs->read_only;
+ fatal = fatal && bdrv_is_writable(bs);
if (s->signaled_corruption &&
(!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
diff --git a/block/raw-format.c b/block/raw-format.c
index ab552c0954..c77290b93f 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -167,16 +167,37 @@ static void raw_reopen_abort(BDRVReopenState *state)
state->opaque = NULL;
}
+/* Check and adjust the offset, against 'offset' and 'size' options. */
+static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset,
+ uint64_t bytes, bool is_write)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) {
+ /* There's not enough space for the write, or the read request is
+ * out-of-range. Don't read/write anything to prevent leaking out of
+ * the size specified in options. */
+ return is_write ? -ENOSPC : -EINVAL;
+ }
+
+ if (*offset > INT64_MAX - s->offset) {
+ return -EINVAL;
+ }
+ *offset += s->offset;
+
+ return 0;
+}
+
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
uint64_t bytes, QEMUIOVector *qiov,
int flags)
{
- BDRVRawState *s = bs->opaque;
+ int ret;
- if (offset > UINT64_MAX - s->offset) {
- return -EINVAL;
+ ret = raw_adjust_offset(bs, &offset, bytes, false);
+ if (ret) {
+ return ret;
}
- offset += s->offset;
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
@@ -186,23 +207,11 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
uint64_t bytes, QEMUIOVector *qiov,
int flags)
{
- BDRVRawState *s = bs->opaque;
void *buf = NULL;
BlockDriver *drv;
QEMUIOVector local_qiov;
int ret;
- if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
- /* There's not enough space for the data. Don't write anything and just
- * fail to prevent leaking out of the size specified in options. */
- return -ENOSPC;
- }
-
- if (offset > UINT64_MAX - s->offset) {
- ret = -EINVAL;
- goto fail;
- }
-
if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
/* Handling partial writes would be a pain - so we just
* require that guests have 512-byte request alignment if
@@ -237,7 +246,10 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
qiov = &local_qiov;
}
- offset += s->offset;
+ ret = raw_adjust_offset(bs, &offset, bytes, true);
+ if (ret) {
+ goto fail;
+ }
BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
@@ -267,22 +279,24 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes,
BdrvRequestFlags flags)
{
- BDRVRawState *s = bs->opaque;
- if (offset > UINT64_MAX - s->offset) {
- return -EINVAL;
+ int ret;
+
+ ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
+ if (ret) {
+ return ret;
}
- offset += s->offset;
return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
}
static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
int64_t offset, int bytes)
{
- BDRVRawState *s = bs->opaque;
- if (offset > UINT64_MAX - s->offset) {
- return -EINVAL;
+ int ret;
+
+ ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
+ if (ret) {
+ return ret;
}
- offset += s->offset;
return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
}
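The raw-format refactor above is worth pausing on: every request path (read, write, write-zeroes, discard) now funnels through one bounds-and-translate helper instead of four slightly different open-coded checks. A minimal standalone sketch of the same pattern, with hypothetical names standing in for the BDRVRawState fields:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Validate a request against an optional window (base/limit mirror the
     * driver's 'offset'/'size' options), then translate it into the file.
     * Out-of-range writes report -ENOSPC, reads -EINVAL, as in the patch. */
    static int adjust_offset(uint64_t *offset, uint64_t bytes,
                             uint64_t base, uint64_t limit, bool has_limit,
                             bool is_write)
    {
        if (has_limit && (*offset > limit || bytes > limit - *offset)) {
            return is_write ? -ENOSPC : -EINVAL;
        }
        if (*offset > INT64_MAX - base) {
            return -EINVAL;   /* translated offset would overflow */
        }
        *offset += base;
        return 0;
    }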
diff --git a/block/rbd.c b/block/rbd.c
index a76a5e8755..2de434dfdd 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -265,13 +265,14 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
key = qstring_get_str(name);
ret = rados_conf_set(cluster, key, qstring_get_str(value));
- QDECREF(name);
QDECREF(value);
if (ret < 0) {
error_setg_errno(errp, -ret, "invalid conf option %s", key);
+ QDECREF(name);
ret = -EINVAL;
break;
}
+ QDECREF(name);
}
QDECREF(keypairs);
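The reordering in this rbd.c hunk is a lifetime fix, not churn: key comes from qstring_get_str(name), so it points into storage owned by name, and dropping the reference before the error_setg_errno() that prints key would read freed memory. The rule in miniature (plain C, with hypothetical stand-ins for the QString refcounting):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* 'key' borrows the buffer owned by 'name'; 'name' must therefore
     * outlive every use of 'key', including the error path. */
    static int set_option(void)
    {
        char *name = strdup("rbd_cache");   /* stands in for the QString */
        const char *key = name;             /* borrowed pointer, not a copy */
        int ret = -22;                      /* pretend rados_conf_set() failed */

        if (ret < 0) {
            fprintf(stderr, "invalid conf option %s\n", key); /* key still live */
            free(name);                     /* release only after the last use */
            return -1;
        }
        free(name);
        return 0;
    }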
diff --git a/block/ssh.c b/block/ssh.c
index b049a16eb9..8890a0c4ba 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -556,6 +556,7 @@ static QemuOptsList ssh_runtime_opts = {
.type = QEMU_OPT_STRING,
.help = "Defines how and what to check the host key against",
},
+ { /* end of list */ }
},
};
diff --git a/block/throttle.c b/block/throttle.c
index 833175ac77..d5903784c0 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -35,9 +35,12 @@ static QemuOptsList throttle_opts = {
},
};
-static int throttle_configure_tgm(BlockDriverState *bs,
- ThrottleGroupMember *tgm,
- QDict *options, Error **errp)
+/*
+ * If this function succeeds then the throttle group name is stored in
+ * @group and must be freed by the caller.
+ * If there's an error then @group remains unmodified.
+ */
+static int throttle_parse_options(QDict *options, char **group, Error **errp)
{
int ret;
const char *group_name;
@@ -62,8 +65,7 @@ static int throttle_configure_tgm(BlockDriverState *bs,
goto fin;
}
- /* Register membership to group with name group_name */
- throttle_group_register_tgm(tgm, group_name, bdrv_get_aio_context(bs));
+ *group = g_strdup(group_name);
ret = 0;
fin:
qemu_opts_del(opts);
@@ -74,6 +76,8 @@ static int throttle_open(BlockDriverState *bs, QDict *options,
int flags, Error **errp)
{
ThrottleGroupMember *tgm = bs->opaque;
+ char *group;
+ int ret;
bs->file = bdrv_open_child(NULL, options, "file", bs,
&child_file, false, errp);
@@ -83,7 +87,14 @@ static int throttle_open(BlockDriverState *bs, QDict *options,
bs->supported_write_flags = bs->file->bs->supported_write_flags;
bs->supported_zero_flags = bs->file->bs->supported_zero_flags;
- return throttle_configure_tgm(bs, tgm, options, errp);
+ ret = throttle_parse_options(options, &group, errp);
+ if (ret == 0) {
+ /* Register membership to group with name group_name */
+ throttle_group_register_tgm(tgm, group, bdrv_get_aio_context(bs));
+ g_free(group);
+ }
+
+ return ret;
}
static void throttle_close(BlockDriverState *bs)
@@ -159,35 +170,36 @@ static void throttle_attach_aio_context(BlockDriverState *bs,
static int throttle_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue, Error **errp)
{
- ThrottleGroupMember *tgm;
+ int ret;
+ char *group = NULL;
assert(reopen_state != NULL);
assert(reopen_state->bs != NULL);
- reopen_state->opaque = g_new0(ThrottleGroupMember, 1);
- tgm = reopen_state->opaque;
-
- return throttle_configure_tgm(reopen_state->bs, tgm, reopen_state->options,
- errp);
+ ret = throttle_parse_options(reopen_state->options, &group, errp);
+ reopen_state->opaque = group;
+ return ret;
}
static void throttle_reopen_commit(BDRVReopenState *reopen_state)
{
- ThrottleGroupMember *old_tgm = reopen_state->bs->opaque;
- ThrottleGroupMember *new_tgm = reopen_state->opaque;
+ BlockDriverState *bs = reopen_state->bs;
+ ThrottleGroupMember *tgm = bs->opaque;
+ char *group = reopen_state->opaque;
+
+ assert(group);
- throttle_group_unregister_tgm(old_tgm);
- g_free(old_tgm);
- reopen_state->bs->opaque = new_tgm;
+ if (strcmp(group, throttle_group_get_name(tgm))) {
+ throttle_group_unregister_tgm(tgm);
+ throttle_group_register_tgm(tgm, group, bdrv_get_aio_context(bs));
+ }
+ g_free(reopen_state->opaque);
reopen_state->opaque = NULL;
}
static void throttle_reopen_abort(BDRVReopenState *reopen_state)
{
- ThrottleGroupMember *tgm = reopen_state->opaque;
-
- throttle_group_unregister_tgm(tgm);
- g_free(tgm);
+ g_free(reopen_state->opaque);
reopen_state->opaque = NULL;
}
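The throttle reopen rewrite restores the usual prepare/commit/abort discipline: prepare only parses options and stashes the group name in reopen_state->opaque, commit re-registers the member only if the group really changed, and abort frees the staged name without ever touching the live ThrottleGroupMember. A schematic of that split, with hypothetical types:

    #include <stdlib.h>
    #include <string.h>

    /* Prepare stages data with no side effects on live state, so abort can
     * always undo it by freeing the staged data; commit applies it. */
    struct reopen { char *staged_group; };

    static int prepare(struct reopen *st, const char *group)
    {
        st->staged_group = strdup(group);      /* stage, touch nothing live */
        return st->staged_group ? 0 : -1;
    }

    static void commit(struct reopen *st, char **live_group)
    {
        if (strcmp(st->staged_group, *live_group) != 0) {
            free(*live_group);                 /* re-register only on change */
            *live_group = st->staged_group;
            st->staged_group = NULL;
        }
        free(st->staged_group);                /* no-op if ownership moved */
        st->staged_group = NULL;
    }

    static void abort_reopen(struct reopen *st)
    {
        free(st->staged_group);                /* staged data only */
        st->staged_group = NULL;
    }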
diff --git a/configure b/configure
index 087a82ac50..ceec276693 100755
--- a/configure
+++ b/configure
@@ -930,6 +930,8 @@ for opt do
;;
--firmwarepath=*) firmwarepath="$optarg"
;;
+ --host=*|--build=*|\
+ --disable-dependency-tracking|\
--sbindir=*|--sharedstatedir=*|\
--oldincludedir=*|--datarootdir=*|--infodir=*|--localedir=*|\
--htmldir=*|--dvidir=*|--pdfdir=*|--psdir=*)
@@ -2788,6 +2790,7 @@ if test "$sdl" != "no" ; then
int main( void ) { return SDL_Init (SDL_INIT_VIDEO); }
EOF
sdl_cflags=$($sdlconfig --cflags 2>/dev/null)
+ sdl_cflags="$sdl_cflags -Wno-undef" # workaround 2.0.8 bug
if test "$static" = "yes" ; then
if $pkg_config $sdlname --exists; then
sdl_libs=$($pkg_config $sdlname --static --libs 2>/dev/null)
diff --git a/cpus.c b/cpus.c
index 114c29b6a0..96bb688d7b 100644
--- a/cpus.c
+++ b/cpus.c
@@ -843,11 +843,19 @@ void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
return;
}
- if (!qemu_in_vcpu_thread() && first_cpu) {
+ if (qemu_in_vcpu_thread()) {
+ /* A CPU is currently running; kick it back out to the
+ * tcg_cpu_exec() loop so it will recalculate its
+ * icount deadline immediately.
+ */
+ qemu_cpu_kick(current_cpu);
+ } else if (first_cpu) {
/* qemu_cpu_kick is not enough to kick a halted CPU out of
* qemu_tcg_wait_io_event. async_run_on_cpu, instead,
* causes cpu_thread_is_idle to return false. This way,
* handle_icount_deadline can run.
+ * If we have no CPUs at all for some reason, we don't
+ * need to do anything.
*/
async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
}
diff --git a/device_tree.c b/device_tree.c
index a24ddff02b..9eb5fae738 100644
--- a/device_tree.c
+++ b/device_tree.c
@@ -29,7 +29,7 @@
#include <libfdt.h>
-#define FDT_MAX_SIZE 0x10000
+#define FDT_MAX_SIZE 0x100000
void *create_device_tree(int *sizep)
{
diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index d7fdb1fee3..feb711fb6a 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -426,10 +426,20 @@ Standard Cluster Descriptor:
Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)):
- Bit 0 - x: Host cluster offset. This is usually _not_ aligned to a
- cluster boundary!
+ Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a
+ cluster or sector boundary!
- x+1 - 61: Compressed size of the images in sectors of 512 bytes
+ x - 61: Number of additional 512-byte sectors used for the
+ compressed data, beyond the sector containing the offset
+ in the previous field. Some of these sectors may reside
+ in the next contiguous host cluster.
+
+ Note that the compressed data does not necessarily occupy
+ all of the bytes in the final sector; rather, decompression
+ stops when it has produced a cluster of data.
+
+ Another compressed cluster may map to the tail of the final
+ sector used by this compressed cluster.
If a cluster is unallocated, read requests shall read the data from the backing
file (except if bit 0 in the Standard Cluster Descriptor is set). If there is
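To make the revised wording concrete: with the default 64 KiB clusters, cluster_bits = 16, so x = 62 - (16 - 8) = 54; bits 0-53 carry the host offset and bits 54-61 the count of additional 512-byte sectors. A small decoding sketch of that layout:

    #include <stdint.h>

    /* Decode a compressed-cluster descriptor per the text above (bits 62/63,
     * the L2 entry type flags, are assumed already stripped by the caller). */
    static void decode_compressed(uint64_t desc, unsigned cluster_bits,
                                  uint64_t *host_offset,
                                  unsigned *extra_sectors)
    {
        unsigned x = 62 - (cluster_bits - 8);        /* 54 for 64k clusters */
        *host_offset = desc & ((1ULL << x) - 1);     /* bits 0 .. x-1 */
        *extra_sectors = (desc >> x) & ((1u << (62 - x)) - 1); /* bits x .. 61 */
    }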
diff --git a/exec.c b/exec.c
index 03238a3449..e60ad94a42 100644
--- a/exec.c
+++ b/exec.c
@@ -1455,6 +1455,7 @@ static int find_max_supported_pagesize(Object *obj, void *opaque)
mem_path = object_property_get_str(obj, "mem-path", NULL);
if (mem_path) {
long hpsize = qemu_mempath_getpagesize(mem_path);
+ g_free(mem_path);
if (hpsize < *hpsize_min) {
*hpsize_min = hpsize;
}
@@ -2575,6 +2576,8 @@ static const MemoryRegionOps watch_mem_ops = {
},
};
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+ MemTxAttrs attrs, uint8_t *buf, int len);
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
const uint8_t *buf, int len);
static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
@@ -3005,6 +3008,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
return result;
}
+/* Called from RCU critical section. */
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
const uint8_t *buf, int len)
{
@@ -3013,25 +3017,14 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
MemoryRegion *mr;
MemTxResult result = MEMTX_OK;
- if (len > 0) {
- rcu_read_lock();
- l = len;
- mr = flatview_translate(fv, addr, &addr1, &l, true);
- result = flatview_write_continue(fv, addr, attrs, buf, len,
- addr1, l, mr);
- rcu_read_unlock();
- }
+ l = len;
+ mr = flatview_translate(fv, addr, &addr1, &l, true);
+ result = flatview_write_continue(fv, addr, attrs, buf, len,
+ addr1, l, mr);
return result;
}
-MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
- MemTxAttrs attrs,
- const uint8_t *buf, int len)
-{
- return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len);
-}
-
/* Called within RCU critical section. */
MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
MemTxAttrs attrs, uint8_t *buf,
@@ -3102,42 +3095,61 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
return result;
}
-MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
- MemTxAttrs attrs, uint8_t *buf, int len)
+/* Called from RCU critical section. */
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+ MemTxAttrs attrs, uint8_t *buf, int len)
{
hwaddr l;
hwaddr addr1;
MemoryRegion *mr;
+
+ l = len;
+ mr = flatview_translate(fv, addr, &addr1, &l, false);
+ return flatview_read_continue(fv, addr, attrs, buf, len,
+ addr1, l, mr);
+}
+
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+ MemTxAttrs attrs, uint8_t *buf, int len)
+{
MemTxResult result = MEMTX_OK;
+ FlatView *fv;
if (len > 0) {
rcu_read_lock();
- l = len;
- mr = flatview_translate(fv, addr, &addr1, &l, false);
- result = flatview_read_continue(fv, addr, attrs, buf, len,
- addr1, l, mr);
+ fv = address_space_to_flatview(as);
+ result = flatview_read(fv, addr, attrs, buf, len);
rcu_read_unlock();
}
return result;
}
-static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
- uint8_t *buf, int len, bool is_write)
+MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
+ MemTxAttrs attrs,
+ const uint8_t *buf, int len)
{
- if (is_write) {
- return flatview_write(fv, addr, attrs, (uint8_t *)buf, len);
- } else {
- return flatview_read(fv, addr, attrs, (uint8_t *)buf, len);
+ MemTxResult result = MEMTX_OK;
+ FlatView *fv;
+
+ if (len > 0) {
+ rcu_read_lock();
+ fv = address_space_to_flatview(as);
+ result = flatview_write(fv, addr, attrs, buf, len);
+ rcu_read_unlock();
}
+
+ return result;
}
-MemTxResult address_space_rw(AddressSpace *as, hwaddr addr,
- MemTxAttrs attrs, uint8_t *buf,
- int len, bool is_write)
+MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
+ uint8_t *buf, int len, bool is_write)
{
- return flatview_rw(address_space_to_flatview(as),
- addr, attrs, buf, len, is_write);
+ if (is_write) {
+ return address_space_write(as, addr, attrs, buf, len);
+ } else {
+ return address_space_read_full(as, addr, attrs, buf, len);
+ }
}
void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
@@ -3303,14 +3315,12 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
MemoryRegion *mr;
hwaddr l, xlat;
- rcu_read_lock();
while (len > 0) {
l = len;
mr = flatview_translate(fv, addr, &xlat, &l, is_write);
if (!memory_access_is_direct(mr, is_write)) {
l = memory_access_size(mr, l, addr);
if (!memory_region_access_valid(mr, xlat, l, is_write)) {
- rcu_read_unlock();
return false;
}
}
@@ -3318,15 +3328,20 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
len -= l;
addr += l;
}
- rcu_read_unlock();
return true;
}
bool address_space_access_valid(AddressSpace *as, hwaddr addr,
int len, bool is_write)
{
- return flatview_access_valid(address_space_to_flatview(as),
- addr, len, is_write);
+ FlatView *fv;
+ bool result;
+
+ rcu_read_lock();
+ fv = address_space_to_flatview(as);
+ result = flatview_access_valid(fv, addr, len, is_write);
+ rcu_read_unlock();
+ return result;
}
static hwaddr
@@ -3372,7 +3387,7 @@ void *address_space_map(AddressSpace *as,
hwaddr l, xlat;
MemoryRegion *mr;
void *ptr;
- FlatView *fv = address_space_to_flatview(as);
+ FlatView *fv;
if (len == 0) {
return NULL;
@@ -3380,6 +3395,7 @@ void *address_space_map(AddressSpace *as,
l = len;
rcu_read_lock();
+ fv = address_space_to_flatview(as);
mr = flatview_translate(fv, addr, &xlat, &l, is_write);
if (!memory_access_is_direct(mr, is_write)) {
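The common thread of the exec.c hunks: flatview_read()/flatview_write() and flatview_access_valid() now assume the caller already holds the RCU read lock, and the public address_space_*() entry points both take the lock and resolve the FlatView inside it, so the view can no longer be swapped out between lookup and use. Reduced to a sketch (hypothetical names; QEMU's actual RCU accessors differ slightly):

    /* Assumed liburcu-style primitives. */
    void rcu_read_lock(void);
    void rcu_read_unlock(void);

    struct view;
    struct space { struct view *current_view; };
    int do_access(struct view *v, void *buf, int len);

    /* Internal helper: caller must be inside an RCU read-side section,
     * which is what keeps 'v' alive for the duration of the call. */
    static int view_op(struct view *v, void *buf, int len)
    {
        return do_access(v, buf, len);
    }

    /* Public entry point: resolve the view *inside* the critical section.
     * (A real implementation would use an RCU-safe load here.) */
    int public_op(struct space *as, void *buf, int len)
    {
        int ret = 0;

        if (len > 0) {
            rcu_read_lock();
            ret = view_op(as->current_view, buf, len);
            rcu_read_unlock();
        }
        return ret;
    }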
diff --git a/gdbstub.c b/gdbstub.c
index 2a94030d3b..ca8433e1b6 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -515,6 +515,7 @@ static inline int tohex(int v)
return v - 10 + 'a';
}
+/* writes 2*len+1 bytes in buf */
static void memtohex(char *buf, const uint8_t *mem, int len)
{
int i, c;
@@ -970,8 +971,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
const char *p;
uint32_t thread;
int ch, reg_size, type, res;
- char buf[MAX_PACKET_LENGTH];
uint8_t mem_buf[MAX_PACKET_LENGTH];
+ char buf[sizeof(mem_buf) + 1 /* trailing NUL */];
uint8_t *registers;
target_ulong addr, len;
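The new comment on memtohex() is the whole story: it emits two hex digits per input byte plus a trailing NUL, so the reply buffer's size must be derived from the buffer it encodes rather than set independently. The sizing rule in isolation (a sketch, not the gdbstub code):

    #include <stdint.h>
    #include <stdio.h>

    /* Hex-encode: each input byte becomes two output chars, plus a NUL,
     * so the output buffer must hold 2 * len + 1 bytes. */
    static void to_hex(char *out, const uint8_t *in, int len)
    {
        for (int i = 0; i < len; i++) {
            sprintf(out + 2 * i, "%02x", in[i]);
        }
        out[2 * len] = '\0';
    }

    int main(void)
    {
        uint8_t mem_buf[4] = { 0xde, 0xad, 0xbe, 0xef };
        char buf[2 * sizeof(mem_buf) + 1];   /* sized from the input buffer */

        to_hex(buf, mem_buf, sizeof(mem_buf));
        puts(buf);                           /* prints deadbeef */
        return 0;
    }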
diff --git a/hw/block/pflash_cfi01.c b/hw/block/pflash_cfi01.c
index 1113ab1ccf..2e8284001d 100644
--- a/hw/block/pflash_cfi01.c
+++ b/hw/block/pflash_cfi01.c
@@ -90,7 +90,6 @@ struct pflash_t {
uint16_t ident1;
uint16_t ident2;
uint16_t ident3;
- uint8_t cfi_len;
uint8_t cfi_table[0x52];
uint64_t counter;
unsigned int writeblock_size;
@@ -153,7 +152,7 @@ static uint32_t pflash_cfi_query(pflash_t *pfl, hwaddr offset)
boff = offset >> (ctz32(pfl->bank_width) +
ctz32(pfl->max_device_width) - ctz32(pfl->device_width));
- if (boff > pfl->cfi_len) {
+ if (boff >= sizeof(pfl->cfi_table)) {
return 0;
}
/* Now we will construct the CFI response generated by a single
@@ -385,10 +384,10 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
boff = boff >> 2;
}
- if (boff > pfl->cfi_len) {
- ret = 0;
- } else {
+ if (boff < sizeof(pfl->cfi_table)) {
ret = pfl->cfi_table[boff];
+ } else {
+ ret = 0;
}
} else {
/* If we have a read larger than the bank_width, combine multiple
@@ -791,7 +790,6 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
pfl->cmd = 0;
pfl->status = 0;
/* Hardcoded CFI table */
- pfl->cfi_len = 0x52;
/* Standard "QRY" string */
pfl->cfi_table[0x10] = 'Q';
pfl->cfi_table[0x11] = 'R';
diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c
index c81ddd3a99..75d1ae1026 100644
--- a/hw/block/pflash_cfi02.c
+++ b/hw/block/pflash_cfi02.c
@@ -83,7 +83,6 @@ struct pflash_t {
uint16_t ident3;
uint16_t unlock_addr0;
uint16_t unlock_addr1;
- uint8_t cfi_len;
uint8_t cfi_table[0x52];
QEMUTimer *timer;
/* The device replicates the flash memory across its memory space. Emulate
@@ -235,10 +234,11 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
break;
case 0x98:
/* CFI query mode */
- if (boff > pfl->cfi_len)
- ret = 0;
- else
+ if (boff < sizeof(pfl->cfi_table)) {
ret = pfl->cfi_table[boff];
+ } else {
+ ret = 0;
+ }
break;
}
@@ -663,7 +663,6 @@ static void pflash_cfi02_realize(DeviceState *dev, Error **errp)
pfl->cmd = 0;
pfl->status = 0;
/* Hardcoded CFI table (mostly from SG29 Spansion flash) */
- pfl->cfi_len = 0x52;
/* Standard "QRY" string */
pfl->cfi_table[0x10] = 'Q';
pfl->cfi_table[0x11] = 'R';
diff --git a/hw/char/cmsdk-apb-uart.c b/hw/char/cmsdk-apb-uart.c
index 1ad1e14295..9c0929d8a2 100644
--- a/hw/char/cmsdk-apb-uart.c
+++ b/hw/char/cmsdk-apb-uart.c
@@ -274,6 +274,7 @@ static void uart_write(void *opaque, hwaddr offset, uint64_t value,
* is then reflected into the intstatus value by the update function).
*/
s->state &= ~(value & (R_INTSTATUS_TXO_MASK | R_INTSTATUS_RXO_MASK));
+ s->intstatus &= ~value;
cmsdk_apb_uart_update(s);
break;
case A_BAUDDIV:
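The added s->intstatus &= ~value; line gives this register write the usual write-one-to-clear behaviour: each bit written as 1 acknowledges (clears) that latched interrupt, and bits written as 0 are left untouched. The idiom in isolation:

    #include <stdint.h>

    /* Write-one-to-clear (W1C): clear exactly the status bits the guest
     * wrote as 1. E.g. status 0b1010, write 0b0010 -> status 0b1000. */
    static void intclear_write(uint32_t *intstatus, uint32_t value)
    {
        *intstatus &= ~value;
    }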
diff --git a/hw/core/loader.c b/hw/core/loader.c
index 91669d65aa..c08f130461 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -1104,20 +1104,22 @@ int rom_check_and_register_reset(void)
if (rom->fw_file) {
continue;
}
- if ((addr > rom->addr) && (as == rom->as)) {
- fprintf(stderr, "rom: requested regions overlap "
- "(rom %s. free=0x" TARGET_FMT_plx
- ", addr=0x" TARGET_FMT_plx ")\n",
- rom->name, addr, rom->addr);
- return -1;
+ if (!rom->mr) {
+ if ((addr > rom->addr) && (as == rom->as)) {
+ fprintf(stderr, "rom: requested regions overlap "
+ "(rom %s. free=0x" TARGET_FMT_plx
+ ", addr=0x" TARGET_FMT_plx ")\n",
+ rom->name, addr, rom->addr);
+ return -1;
+ }
+ addr = rom->addr;
+ addr += rom->romsize;
+ as = rom->as;
}
- addr = rom->addr;
- addr += rom->romsize;
section = memory_region_find(rom->mr ? rom->mr : get_system_memory(),
rom->addr, 1);
rom->isrom = int128_nz(section.size) &&
memory_region_is_rom(section.mr);
memory_region_unref(section.mr);
- as = rom->as;
}
qemu_register_reset(rom_reset, NULL);
roms_loaded = 1;
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index 11112951a5..a71cd264e2 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -1140,6 +1140,30 @@ static void device_class_init(ObjectClass *class, void *data)
dc->user_creatable = true;
}
+void device_class_set_parent_reset(DeviceClass *dc,
+ DeviceReset dev_reset,
+ DeviceReset *parent_reset)
+{
+ *parent_reset = dc->reset;
+ dc->reset = dev_reset;
+}
+
+void device_class_set_parent_realize(DeviceClass *dc,
+ DeviceRealize dev_realize,
+ DeviceRealize *parent_realize)
+{
+ *parent_realize = dc->realize;
+ dc->realize = dev_realize;
+}
+
+void device_class_set_parent_unrealize(DeviceClass *dc,
+ DeviceUnrealize dev_unrealize,
+ DeviceUnrealize *parent_unrealize)
+{
+ *parent_unrealize = dc->unrealize;
+ dc->unrealize = dev_unrealize;
+}
+
void device_reset(DeviceState *dev)
{
DeviceClass *klass = DEVICE_GET_CLASS(dev);
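The three device_class_set_parent_*() helpers codify the standard QOM idiom for overriding a class method while keeping a pointer to the original so the subclass can chain to it. A hedged usage sketch (the MyDevice names and the MY_DEVICE_GET_CLASS macro are hypothetical; the helpers and DEVICE_CLASS are the real API):

    /* Assumes the usual QOM/qdev headers (hw/qdev-core.h). */
    typedef struct MyDeviceClass {
        DeviceClass parent_class;
        DeviceRealize parent_realize;    /* saved parent method */
    } MyDeviceClass;

    static void my_device_realize(DeviceState *dev, Error **errp)
    {
        MyDeviceClass *mdc = MY_DEVICE_GET_CLASS(dev);   /* hypothetical */

        /* ... subclass-specific setup goes here ... */

        mdc->parent_realize(dev, errp);  /* chain to the parent's realize */
    }

    static void my_device_class_init(ObjectClass *klass, void *data)
    {
        DeviceClass *dc = DEVICE_CLASS(klass);
        MyDeviceClass *mdc = (MyDeviceClass *)klass;

        device_class_set_parent_realize(dc, my_device_realize,
                                        &mdc->parent_realize);
    }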
diff --git a/hw/display/qxl-render.c b/hw/display/qxl-render.c
index 90e0865618..9c1c44481f 100644
--- a/hw/display/qxl-render.c
+++ b/hw/display/qxl-render.c
@@ -169,7 +169,8 @@ void qxl_render_update(PCIQXLDevice *qxl)
qemu_mutex_lock(&qxl->ssd.lock);
- if (!runstate_is_running() || !qxl->guest_primary.commands) {
+ if (!runstate_is_running() || !qxl->guest_primary.commands ||
+ qxl->mode == QXL_MODE_UNDEFINED) {
qxl_render_update_area_unlocked(qxl);
qemu_mutex_unlock(&qxl->ssd.lock);
return;
diff --git a/hw/display/vga.c b/hw/display/vga.c
index d150a3a3eb..1fa66d597d 100644
--- a/hw/display/vga.c
+++ b/hw/display/vga.c
@@ -1489,6 +1489,8 @@ static void vga_draw_graphic(VGACommonState *s, int full_update)
region_start = (s->start_addr * 4);
region_end = region_start + (ram_addr_t)s->line_offset * height;
+ region_end += width * s->get_bpp(s) / 8; /* scanline length */
+ region_end -= s->line_offset;
if (region_end > s->vbe_size) {
/* wraps around (can happen with cirrus vbe modes) */
region_start = 0;
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 73519ab3ac..537957c89a 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2460,6 +2460,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker)
AcpiDmarDeviceScope *scope = NULL;
/* Root complex IOAPIC use one path[0] only */
size_t ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]);
+ IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu);
assert(iommu);
if (iommu->intr_supported) {
@@ -2467,7 +2468,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker)
}
dmar = acpi_data_push(table_data, sizeof(*dmar));
- dmar->host_address_width = VTD_HOST_ADDRESS_WIDTH - 1;
+ dmar->host_address_width = intel_iommu->aw_bits - 1;
dmar->flags = dmar_flags;
/* DMAR Remapping Hardware Unit Definition structure */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 3a5bb0bc2e..8e0c03e35d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -128,6 +128,22 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
return new_val;
}
+static inline void vtd_iommu_lock(IntelIOMMUState *s)
+{
+ qemu_mutex_lock(&s->iommu_lock);
+}
+
+static inline void vtd_iommu_unlock(IntelIOMMUState *s)
+{
+ qemu_mutex_unlock(&s->iommu_lock);
+}
+
+/* Whether the address space needs to notify new mappings */
+static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
+{
+ return as->notifier_flags & IOMMU_NOTIFIER_MAP;
+}
+
/* GHashTable functions */
static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
{
@@ -172,9 +188,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
}
/* Reset all the gen of VTDAddressSpace to zero and set the gen of
- * IntelIOMMUState to 1.
+ * IntelIOMMUState to 1. Must be called with IOMMU lock held.
*/
-static void vtd_reset_context_cache(IntelIOMMUState *s)
+static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
{
VTDAddressSpace *vtd_as;
VTDBus *vtd_bus;
@@ -197,12 +213,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s)
s->context_cache_gen = 1;
}
-static void vtd_reset_iotlb(IntelIOMMUState *s)
+/* Must be called with IOMMU lock held. */
+static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
{
assert(s->iotlb);
g_hash_table_remove_all(s->iotlb);
}
+static void vtd_reset_iotlb(IntelIOMMUState *s)
+{
+ vtd_iommu_lock(s);
+ vtd_reset_iotlb_locked(s);
+ vtd_iommu_unlock(s);
+}
+
static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
uint32_t level)
{
@@ -215,6 +239,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
}
+/* Must be called with IOMMU lock held */
static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
hwaddr addr)
{
@@ -235,6 +260,7 @@ out:
return entry;
}
+/* Must be called with IOMMU lock held */
static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
uint16_t domain_id, hwaddr addr, uint64_t slpte,
uint8_t access_flags, uint32_t level)
@@ -246,7 +272,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
trace_vtd_iotlb_reset("iotlb exceeds size limit");
- vtd_reset_iotlb(s);
+ vtd_reset_iotlb_locked(s);
}
entry->gfn = gfn;
@@ -521,9 +547,9 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
}
-static inline uint64_t vtd_get_slpte_addr(uint64_t slpte)
+static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
{
- return slpte & VTD_SL_PT_BASE_ADDR_MASK;
+ return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
}
/* Whether the pte indicates the address of the page frame */
@@ -608,35 +634,29 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
return true;
}
-static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
+static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
{
uint32_t ce_agaw = vtd_ce_get_agaw(ce);
- return 1ULL << MIN(ce_agaw, VTD_MGAW);
+ return 1ULL << MIN(ce_agaw, aw);
}
/* Return true if IOVA passes range check, otherwise false. */
-static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
+static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
+ uint8_t aw)
{
/*
* Check if @iova is above 2^X-1, where X is the minimum of MGAW
* in CAP_REG and AW in context-entry.
*/
- return !(iova & ~(vtd_iova_limit(ce) - 1));
-}
-
-static const uint64_t vtd_paging_entry_rsvd_field[] = {
- [0] = ~0ULL,
- /* For not large page */
- [1] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- [2] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- [3] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- [4] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- /* For large page */
- [5] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- [6] = 0x1ff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- [7] = 0x3ffff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
- [8] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-};
+ return !(iova & ~(vtd_iova_limit(ce, aw) - 1));
+}
+
+/*
+ * Rsvd field masks for spte:
+ * Index [1] to [4] 4k pages
+ * Index [5] to [8] large pages
+ */
+static uint64_t vtd_paging_entry_rsvd_field[9];
static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
{
@@ -676,7 +696,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
*/
static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
uint64_t *slptep, uint32_t *slpte_level,
- bool *reads, bool *writes)
+ bool *reads, bool *writes, uint8_t aw_bits)
{
dma_addr_t addr = vtd_ce_get_slpt_base(ce);
uint32_t level = vtd_ce_get_level(ce);
@@ -684,7 +704,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
uint64_t slpte;
uint64_t access_right_check;
- if (!vtd_iova_range_check(iova, ce)) {
+ if (!vtd_iova_range_check(iova, ce, aw_bits)) {
trace_vtd_err_dmar_iova_overflow(iova);
return -VTD_FR_ADDR_BEYOND_MGAW;
}
@@ -721,7 +741,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
*slpte_level = level;
return 0;
}
- addr = vtd_get_slpte_addr(slpte);
+ addr = vtd_get_slpte_addr(slpte, aw_bits);
level--;
}
}
@@ -729,21 +749,116 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
/**
+ * Constant information used during page walking
+ *
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @notify_unmap: whether we should notify invalid entries
+ * @as: VT-d address space of the device
+ * @aw: maximum address width
+ * @domain_id: domain ID of the page walk
+ */
+typedef struct {
+ VTDAddressSpace *as;
+ vtd_page_walk_hook hook_fn;
+ void *private;
+ bool notify_unmap;
+ uint8_t aw;
+ uint16_t domain_id;
+} vtd_page_walk_info;
+
+static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
+{
+ VTDAddressSpace *as = info->as;
+ vtd_page_walk_hook hook_fn = info->hook_fn;
+ void *private = info->private;
+ DMAMap target = {
+ .iova = entry->iova,
+ .size = entry->addr_mask,
+ .translated_addr = entry->translated_addr,
+ .perm = entry->perm,
+ };
+ DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+ if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
+ trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+ return 0;
+ }
+
+ assert(hook_fn);
+
+ /* Update local IOVA mapped ranges */
+ if (entry->perm) {
+ if (mapped) {
+ /* If it's exactly the same translation, skip */
+ if (!memcmp(mapped, &target, sizeof(target))) {
+ trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+ entry->translated_addr);
+ return 0;
+ } else {
+ /*
+ * Translation changed. Normally this should not
+ * happen, but it can happen with buggy guest
+ * OSes. Note that there will be a small window that
+ * we don't have map at all. But that's the best
+ * effort we can do. The ideal way to emulate this is
+ * atomically modify the PTE to follow what has
+ * changed, but we can't. One example is that vfio
+ * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+ * interface to modify a mapping (meanwhile it seems
+ * meaningless to even provide one). Anyway, let's
+ * mark this as a TODO in case one day we'll have
+ * a better solution.
+ */
+ IOMMUAccessFlags cache_perm = entry->perm;
+ int ret;
+
+ /* Emulate an UNMAP */
+ entry->perm = IOMMU_NONE;
+ trace_vtd_page_walk_one(info->domain_id,
+ entry->iova,
+ entry->translated_addr,
+ entry->addr_mask,
+ entry->perm);
+ ret = hook_fn(entry, private);
+ if (ret) {
+ return ret;
+ }
+ /* Drop any existing mapping */
+ iova_tree_remove(as->iova_tree, &target);
+ /* Recover the correct permission */
+ entry->perm = cache_perm;
+ }
+ }
+ iova_tree_insert(as->iova_tree, &target);
+ } else {
+ if (!mapped) {
+ /* Skip since we didn't map this range at all */
+ trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+ return 0;
+ }
+ iova_tree_remove(as->iova_tree, &target);
+ }
+
+ trace_vtd_page_walk_one(info->domain_id, entry->iova,
+ entry->translated_addr, entry->addr_mask,
+ entry->perm);
+ return hook_fn(entry, private);
+}
+
+/**
* vtd_page_walk_level - walk over specific level for IOVA range
*
* @addr: base GPA addr to start the walk
* @start: IOVA range start address
* @end: IOVA range end address (start <= addr < end)
- * @hook_fn: hook func to be called when detected page
- * @private: private data to be passed into hook func
* @read: whether parent level has read permission
* @write: whether parent level has write permission
- * @notify_unmap: whether we should notify invalid entries
+ * @info: constant information for the page walk
*/
static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
- uint64_t end, vtd_page_walk_hook hook_fn,
- void *private, uint32_t level,
- bool read, bool write, bool notify_unmap)
+ uint64_t end, uint32_t level, bool read,
+ bool write, vtd_page_walk_info *info)
{
bool read_cur, write_cur, entry_valid;
uint32_t offset;
@@ -786,37 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
*/
entry_valid = read_cur | write_cur;
- if (vtd_is_last_slpte(slpte, level)) {
+ if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
+ /*
+ * This is a valid PDE (or even bigger than PDE). We need
+ * to walk one further level.
+ */
+ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
+ iova, MIN(iova_next, end), level - 1,
+ read_cur, write_cur, info);
+ } else {
+ /*
+ * This means we are either:
+ *
+ * (1) the real page entry (either 4K page, or huge page)
+ * (2) the whole range is invalid
+ *
+ * In either case, we send an IOTLB notification down.
+ */
entry.target_as = &address_space_memory;
entry.iova = iova & subpage_mask;
- /* NOTE: this is only meaningful if entry_valid == true */
- entry.translated_addr = vtd_get_slpte_addr(slpte);
- entry.addr_mask = ~subpage_mask;
entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
- if (!entry_valid && !notify_unmap) {
- trace_vtd_page_walk_skip_perm(iova, iova_next);
- goto next;
- }
- trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
- entry.addr_mask, entry.perm);
- if (hook_fn) {
- ret = hook_fn(&entry, private);
- if (ret < 0) {
- return ret;
- }
- }
- } else {
- if (!entry_valid) {
- trace_vtd_page_walk_skip_perm(iova, iova_next);
- goto next;
- }
- ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
- MIN(iova_next, end), hook_fn, private,
- level - 1, read_cur, write_cur,
- notify_unmap);
- if (ret < 0) {
- return ret;
- }
+ entry.addr_mask = ~subpage_mask;
+ /* NOTE: this is only meaningful if entry_valid == true */
+ entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+ ret = vtd_page_walk_one(&entry, info);
+ }
+
+ if (ret < 0) {
+ return ret;
}
next:
@@ -832,27 +944,24 @@ next:
* @ce: context entry to walk upon
* @start: IOVA address to start the walk
* @end: IOVA range end address (start <= addr < end)
- * @hook_fn: the hook that to be called for each detected area
- * @private: private data for the hook function
+ * @info: page walking information struct
*/
static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
- vtd_page_walk_hook hook_fn, void *private,
- bool notify_unmap)
+ vtd_page_walk_info *info)
{
dma_addr_t addr = vtd_ce_get_slpt_base(ce);
uint32_t level = vtd_ce_get_level(ce);
- if (!vtd_iova_range_check(start, ce)) {
+ if (!vtd_iova_range_check(start, ce, info->aw)) {
return -VTD_FR_ADDR_BEYOND_MGAW;
}
- if (!vtd_iova_range_check(end, ce)) {
+ if (!vtd_iova_range_check(end, ce, info->aw)) {
/* Fix end so that it reaches the maximum */
- end = vtd_iova_limit(ce);
+ end = vtd_iova_limit(ce, info->aw);
}
- return vtd_page_walk_level(addr, start, end, hook_fn, private,
- level, true, true, notify_unmap);
+ return vtd_page_walk_level(addr, start, end, level, true, true, info);
}
/* Map a device to its corresponding domain (context-entry) */
@@ -874,7 +983,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
return -VTD_FR_ROOT_ENTRY_P;
}
- if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) {
+ if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) {
trace_vtd_re_invalid(re.rsvd, re.val);
return -VTD_FR_ROOT_ENTRY_RSVD;
}
@@ -891,7 +1000,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
}
if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
- (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
+ (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
trace_vtd_ce_invalid(ce->hi, ce->lo);
return -VTD_FR_CONTEXT_ENTRY_RSVD;
}
@@ -911,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
return 0;
}
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
+ void *private)
+{
+ memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
+ return 0;
+}
+
+/* If context entry is NULL, we'll try to fetch it on our own. */
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
+ VTDContextEntry *ce,
+ hwaddr addr, hwaddr size)
+{
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ vtd_page_walk_info info = {
+ .hook_fn = vtd_sync_shadow_page_hook,
+ .private = (void *)&vtd_as->iommu,
+ .notify_unmap = true,
+ .aw = s->aw_bits,
+ .as = vtd_as,
+ };
+ VTDContextEntry ce_cache;
+ int ret;
+
+ if (ce) {
+ /* If the caller provided context entry, use it */
+ ce_cache = *ce;
+ } else {
+ /* If the caller didn't provide ce, try to fetch */
+ ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce_cache);
+ if (ret) {
+ /*
+ * This should not really happen, but in case it happens,
+ * we just skip the sync for this time. After all we even
+ * don't have the root table pointer!
+ */
+ trace_vtd_err("Detected invalid context entry when "
+ "trying to sync shadow page table");
+ return 0;
+ }
+ }
+
+ info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
+
+ return vtd_page_walk(&ce_cache, addr, addr + size, &info);
+}
+
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
+{
+ return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
+}
+
/*
* Fetch translation type for specific device. Returns <0 if error
* happens, otherwise return the shifted type to check against
@@ -1092,7 +1253,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
IntelIOMMUState *s = vtd_as->iommu_state;
VTDContextEntry ce;
uint8_t bus_num = pci_bus_num(bus);
- VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry;
+ VTDContextCacheEntry *cc_entry;
uint64_t slpte, page_mask;
uint32_t level;
uint16_t source_id = vtd_make_source_id(bus_num, devfn);
@@ -1109,6 +1270,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
*/
assert(!vtd_is_interrupt_addr(addr));
+ vtd_iommu_lock(s);
+
+ cc_entry = &vtd_as->context_cache_entry;
+
/* Try to fetch slpte form IOTLB */
iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
if (iotlb_entry) {
@@ -1168,12 +1333,12 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
* IOMMU region can be swapped back.
*/
vtd_pt_enable_fast_path(s, source_id);
-
+ vtd_iommu_unlock(s);
return true;
}
ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
- &reads, &writes);
+ &reads, &writes, s->aw_bits);
if (ret_fr) {
ret_fr = -ret_fr;
if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
@@ -1189,13 +1354,15 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
access_flags, level);
out:
+ vtd_iommu_unlock(s);
entry->iova = addr & page_mask;
- entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask;
+ entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
entry->addr_mask = ~page_mask;
entry->perm = access_flags;
return true;
error:
+ vtd_iommu_unlock(s);
entry->iova = 0;
entry->translated_addr = 0;
entry->addr_mask = 0;
@@ -1207,7 +1374,7 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
{
s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
s->root_extended = s->root & VTD_RTADDR_RTT;
- s->root &= VTD_RTADDR_ADDR_MASK;
+ s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
trace_vtd_reg_dmar_root(s->root, s->root_extended);
}
@@ -1223,7 +1390,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
uint64_t value = 0;
value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
- s->intr_root = value & VTD_IRTA_ADDR_MASK;
+ s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
s->intr_eime = value & VTD_IRTA_EIME;
/* Notify global invalidation */
@@ -1234,20 +1401,23 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
static void vtd_iommu_replay_all(IntelIOMMUState *s)
{
- IntelIOMMUNotifierNode *node;
+ VTDAddressSpace *vtd_as;
- QLIST_FOREACH(node, &s->notifiers_list, next) {
- memory_region_iommu_replay_all(&node->vtd_as->iommu);
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+ vtd_sync_shadow_page_table(vtd_as);
}
}
static void vtd_context_global_invalidate(IntelIOMMUState *s)
{
trace_vtd_inv_desc_cc_global();
+ /* Protects context cache */
+ vtd_iommu_lock(s);
s->context_cache_gen++;
if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
- vtd_reset_context_cache(s);
+ vtd_reset_context_cache_locked(s);
}
+ vtd_iommu_unlock(s);
vtd_switch_address_space_all(s);
/*
* From VT-d spec 6.5.2.1, a global context entry invalidation
@@ -1299,7 +1469,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
VTD_PCI_FUNC(devfn_it));
+ vtd_iommu_lock(s);
vtd_as->context_cache_entry.context_cache_gen = 0;
+ vtd_iommu_unlock(s);
/*
* Do switch address space when needed, in case if the
* device passthrough bit is switched.
@@ -1307,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
vtd_switch_address_space(vtd_as);
/*
* So a device is moving out of (or moving into) a
- * domain, a replay() suites here to notify all the
- * IOMMU_NOTIFIER_MAP registers about this change.
+ * domain, resync the shadow page table.
* This won't bring bad even if we have no such
* notifier registered - the IOMMU notification
* framework will skip MAP notifications if that
* happened.
*/
- memory_region_iommu_replay_all(&vtd_as->iommu);
+ vtd_sync_shadow_page_table(vtd_as);
}
}
}
@@ -1358,48 +1529,60 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
{
- IntelIOMMUNotifierNode *node;
VTDContextEntry ce;
VTDAddressSpace *vtd_as;
trace_vtd_inv_desc_iotlb_domain(domain_id);
+ vtd_iommu_lock(s);
g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
&domain_id);
+ vtd_iommu_unlock(s);
- QLIST_FOREACH(node, &s->notifiers_list, next) {
- vtd_as = node->vtd_as;
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce) &&
domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
- memory_region_iommu_replay_all(&vtd_as->iommu);
+ vtd_sync_shadow_page_table(vtd_as);
}
}
}
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
- void *private)
-{
- memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
- return 0;
-}
-
static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
uint16_t domain_id, hwaddr addr,
uint8_t am)
{
- IntelIOMMUNotifierNode *node;
+ VTDAddressSpace *vtd_as;
VTDContextEntry ce;
int ret;
+ hwaddr size = (1 << am) * VTD_PAGE_SIZE;
- QLIST_FOREACH(node, &(s->notifiers_list), next) {
- VTDAddressSpace *vtd_as = node->vtd_as;
+ QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce);
if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
- vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
- vtd_page_invalidate_notify_hook,
- (void *)&vtd_as->iommu, true);
+ if (vtd_as_has_map_notifier(vtd_as)) {
+ /*
+ * As long as we have MAP notifications registered in
+ * any of our IOMMU notifiers, we need to sync the
+ * shadow page table.
+ */
+ vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
+ } else {
+ /*
+ * For UNMAP-only notifiers, we don't need to walk the
+ * page tables. We just deliver the PSI down to
+ * invalidate caches.
+ */
+ IOMMUTLBEntry entry = {
+ .target_as = &address_space_memory,
+ .iova = addr,
+ .translated_addr = 0,
+ .addr_mask = size - 1,
+ .perm = IOMMU_NONE,
+ };
+ memory_region_notify_iommu(&vtd_as->iommu, entry);
+ }
}
}
}
@@ -1415,7 +1598,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
info.domain_id = domain_id;
info.addr = addr;
info.mask = ~((1 << am) - 1);
+ vtd_iommu_lock(s);
g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+ vtd_iommu_unlock(s);
vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
}
@@ -1479,7 +1664,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
trace_vtd_inv_qi_enable(en);
if (en) {
- s->iq = iqa_val & VTD_IQA_IQA_MASK;
+ s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
/* 2^(x+8) entries */
s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
s->qi_enabled = true;
@@ -2323,8 +2508,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
{
VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
IntelIOMMUState *s = vtd_as->iommu_state;
- IntelIOMMUNotifierNode *node = NULL;
- IntelIOMMUNotifierNode *next_node = NULL;
if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
error_report("We need to set cache_mode=1 for intel-iommu to enable "
@@ -2332,22 +2515,13 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
exit(1);
}
- if (old == IOMMU_NOTIFIER_NONE) {
- node = g_malloc0(sizeof(*node));
- node->vtd_as = vtd_as;
- QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
- return;
- }
+ /* Update per-address-space notifier flags */
+ vtd_as->notifier_flags = new;
- /* update notifier node with new flags */
- QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
- if (node->vtd_as == vtd_as) {
- if (new == IOMMU_NOTIFIER_NONE) {
- QLIST_REMOVE(node, next);
- g_free(node);
- }
- return;
- }
+ if (old == IOMMU_NOTIFIER_NONE) {
+ QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
+ } else if (new == IOMMU_NOTIFIER_NONE) {
+ QLIST_REMOVE(vtd_as, next);
}
}
@@ -2410,6 +2584,8 @@ static Property vtd_properties[] = {
DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
ON_OFF_AUTO_AUTO),
DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+ DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits,
+ VTD_HOST_ADDRESS_WIDTH),
DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
DEFINE_PROP_END_OF_LIST(),
};
@@ -2714,6 +2890,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
vtd_dev_as->devfn = (uint8_t)devfn;
vtd_dev_as->iommu_state = s;
vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+ vtd_dev_as->iova_tree = iova_tree_new();
/*
* Memory region relationships looks like (Address range shows
@@ -2765,6 +2942,8 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
hwaddr size;
hwaddr start = n->start;
hwaddr end = n->end;
+ IntelIOMMUState *s = as->iommu_state;
+ DMAMap map;
/*
* Note: all the code in this function has an assumption that IOVA
@@ -2772,12 +2951,12 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
* VT-d spec), otherwise we need to consider overflow of 64 bits.
*/
- if (end > VTD_ADDRESS_SIZE) {
+ if (end > VTD_ADDRESS_SIZE(s->aw_bits)) {
/*
* Don't need to unmap regions that is bigger than the whole
* VT-d supported address space size
*/
- end = VTD_ADDRESS_SIZE;
+ end = VTD_ADDRESS_SIZE(s->aw_bits);
}
assert(start <= end);
@@ -2789,9 +2968,9 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
* suite the minimum available mask.
*/
int n = 64 - clz64(size);
- if (n > VTD_MGAW) {
+ if (n > s->aw_bits) {
/* should not happen, but in case it happens, limit it */
- n = VTD_MGAW;
+ n = s->aw_bits;
}
size = 1ULL << n;
}
@@ -2809,17 +2988,19 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
VTD_PCI_FUNC(as->devfn),
entry.iova, size);
+ map.iova = entry.iova;
+ map.size = entry.addr_mask;
+ iova_tree_remove(as->iova_tree, &map);
+
memory_region_notify_one(n, &entry);
}
static void vtd_address_space_unmap_all(IntelIOMMUState *s)
{
- IntelIOMMUNotifierNode *node;
VTDAddressSpace *vtd_as;
IOMMUNotifier *n;
- QLIST_FOREACH(node, &s->notifiers_list, next) {
- vtd_as = node->vtd_as;
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
vtd_address_space_unmap(vtd_as, n);
}
@@ -2851,7 +3032,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
PCI_FUNC(vtd_as->devfn),
VTD_CONTEXT_ENTRY_DID(ce.hi),
ce.hi, ce.lo);
- vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false);
+ if (vtd_as_has_map_notifier(vtd_as)) {
+ /* This is required only for MAP typed notifiers */
+ vtd_page_walk_info info = {
+ .hook_fn = vtd_replay_hook,
+ .private = (void *)n,
+ .notify_unmap = false,
+ .aw = s->aw_bits,
+ .as = vtd_as,
+ .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi),
+ };
+
+ vtd_page_walk(&ce, 0, ~0ULL, &info);
+ }
} else {
trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
PCI_FUNC(vtd_as->devfn));
@@ -2882,10 +3075,27 @@ static void vtd_init(IntelIOMMUState *s)
s->qi_enabled = false;
s->iq_last_desc_type = VTD_INV_DESC_NONE;
s->next_frcd_reg = 0;
- s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
- VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
+ s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
+ VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
+ VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
+ if (s->aw_bits == VTD_HOST_AW_48BIT) {
+ s->cap |= VTD_CAP_SAGAW_48bit;
+ }
s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
+ /*
+ * Rsvd field masks for spte
+ */
+ vtd_paging_entry_rsvd_field[0] = ~0ULL;
+ vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
+ vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits);
+
if (x86_iommu->intr_supported) {
s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -2906,8 +3116,10 @@ static void vtd_init(IntelIOMMUState *s)
s->cap |= VTD_CAP_CM;
}
- vtd_reset_context_cache(s);
- vtd_reset_iotlb(s);
+ vtd_iommu_lock(s);
+ vtd_reset_context_cache_locked(s);
+ vtd_reset_iotlb_locked(s);
+ vtd_iommu_unlock(s);
/* Define registers with default values and bit semantics */
vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
@@ -3021,6 +3233,14 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
}
}
+ /* Currently only address widths supported are 39 and 48 bits */
+ if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
+ (s->aw_bits != VTD_HOST_AW_48BIT)) {
+ error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
+ VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
+ return false;
+ }
+
return true;
}
@@ -3047,7 +3267,8 @@ static void vtd_realize(DeviceState *dev, Error **errp)
return;
}
- QLIST_INIT(&s->notifiers_list);
+ QLIST_INIT(&s->vtd_as_with_notifiers);
+ qemu_mutex_init(&s->iommu_lock);
memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
"intel_iommu", DMAR_REG_SIZE);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0e73a65bf2..d084099ed9 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -131,7 +131,7 @@
#define VTD_TLB_DID(val) (((val) >> 32) & VTD_DOMAIN_ID_MASK)
/* IVA_REG */
-#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL & ((1ULL << VTD_MGAW) - 1))
+#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL)
#define VTD_IVA_AM(val) ((val) & 0x3fULL)
/* GCMD_REG */
@@ -172,10 +172,10 @@
/* RTADDR_REG */
#define VTD_RTADDR_RTT (1ULL << 11)
-#define VTD_RTADDR_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_RTADDR_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)
/* IRTA_REG */
-#define VTD_IRTA_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_IRTA_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)
#define VTD_IRTA_EIME (1ULL << 11)
#define VTD_IRTA_SIZE_MASK (0xfULL)
@@ -197,9 +197,8 @@
#define VTD_DOMAIN_ID_SHIFT 16 /* 16-bit domain id for 64K domains */
#define VTD_DOMAIN_ID_MASK ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
#define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
-#define VTD_MGAW 39 /* Maximum Guest Address Width */
-#define VTD_ADDRESS_SIZE (1ULL << VTD_MGAW)
-#define VTD_CAP_MGAW (((VTD_MGAW - 1) & 0x3fULL) << 16)
+#define VTD_ADDRESS_SIZE(aw) (1ULL << (aw))
+#define VTD_CAP_MGAW(aw) ((((aw) - 1) & 0x3fULL) << 16)
#define VTD_MAMV 18ULL
#define VTD_CAP_MAMV (VTD_MAMV << 48)
#define VTD_CAP_PSI (1ULL << 39)
@@ -213,13 +212,12 @@
#define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT)
/* 48-bit AGAW, 4-level page-table */
#define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT)
-#define VTD_CAP_SAGAW VTD_CAP_SAGAW_39bit
/* IQT_REG */
#define VTD_IQT_QT(val) (((val) >> 4) & 0x7fffULL)
/* IQA_REG */
-#define VTD_IQA_IQA_MASK (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_IQA_IQA_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)
#define VTD_IQA_QS 0x7ULL
/* IQH_REG */
@@ -252,7 +250,7 @@
#define VTD_FRCD_SID_MASK 0xffffULL
#define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK)
/* For the low 64-bit of 128-bit */
-#define VTD_FRCD_FI(val) ((val) & (((1ULL << VTD_MGAW) - 1) ^ 0xfffULL))
+#define VTD_FRCD_FI(val) ((val) & ~0xfffULL)
/* DMA Remapping Fault Conditions */
typedef enum VTDFaultReason {
@@ -360,8 +358,7 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_IOTLB_DOMAIN (2ULL << 4)
#define VTD_INV_DESC_IOTLB_PAGE (3ULL << 4)
#define VTD_INV_DESC_IOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK)
-#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL & \
- ((1ULL << VTD_MGAW) - 1))
+#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL)
#define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL)
#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000ff00ULL
#define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL
@@ -373,6 +370,24 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL
#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8
+/* Rsvd field masks for spte */
+#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw) \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
+ (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \
+ (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw) \
+ (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \
+ (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+
/* Information about page-selective IOTLB invalidate */
struct VTDIOTLBPageInvInfo {
uint16_t domain_id;
@@ -403,7 +418,7 @@ typedef struct VTDRootEntry VTDRootEntry;
#define VTD_ROOT_ENTRY_CTP (~0xfffULL)
#define VTD_ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDRootEntry))
-#define VTD_ROOT_ENTRY_RSVD (0xffeULL | ~VTD_HAW_MASK)
+#define VTD_ROOT_ENTRY_RSVD(aw) (0xffeULL | ~VTD_HAW_MASK(aw))
/* Masks for struct VTDContextEntry */
/* lo */
@@ -415,7 +430,7 @@ typedef struct VTDRootEntry VTDRootEntry;
#define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2)
/* Second Level Page Translation Pointer*/
#define VTD_CONTEXT_ENTRY_SLPTPTR (~0xfffULL)
-#define VTD_CONTEXT_ENTRY_RSVD_LO (0xff0ULL | ~VTD_HAW_MASK)
+#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw))
/* hi */
#define VTD_CONTEXT_ENTRY_AW 7ULL /* Adjusted guest-address-width */
#define VTD_CONTEXT_ENTRY_DID(val) (((val) >> 8) & VTD_DOMAIN_ID_MASK)
@@ -439,7 +454,7 @@ typedef struct VTDRootEntry VTDRootEntry;
#define VTD_SL_RW_MASK 3ULL
#define VTD_SL_R 1ULL
#define VTD_SL_W (1ULL << 1)
-#define VTD_SL_PT_BASE_ADDR_MASK (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK)
+#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
#define VTD_SL_IGN_COM 0xbff0000000000000ULL
#endif
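
For reference, a standalone sketch (not part of the patch) of how the newly
parameterized macros evaluate for the two supported widths. VTD_HAW_MASK(aw)
itself is not visible in the hunks above, so the ((1ULL << (aw)) - 1)
definition used here is an assumption, chosen to be consistent with the masks
derived from it:

#include <stdio.h>
#include <stdint.h>

#define VTD_HAW_MASK(aw)         ((1ULL << (aw)) - 1)          /* assumed */
#define VTD_ADDRESS_SIZE(aw)     (1ULL << (aw))
#define VTD_CAP_MGAW(aw)         ((((aw) - 1) & 0x3fULL) << 16)
#define VTD_RTADDR_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)

int main(void)
{
    const int widths[] = { 39, 48 };   /* VTD_HOST_AW_39BIT / _48BIT */

    for (unsigned i = 0; i < 2; i++) {
        int aw = widths[i];
        printf("aw=%d size=0x%llx mgaw_field=0x%llx rtaddr_mask=0x%llx\n",
               aw,
               (unsigned long long)VTD_ADDRESS_SIZE(aw),
               (unsigned long long)VTD_CAP_MGAW(aw),
               (unsigned long long)VTD_RTADDR_ADDR_MASK(aw));
    }
    return 0;
}
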
diff --git a/hw/i386/multiboot.c b/hw/i386/multiboot.c
index c7b70c91d5..36b22832cd 100644
--- a/hw/i386/multiboot.c
+++ b/hw/i386/multiboot.c
@@ -31,12 +31,13 @@
#include "hw/loader.h"
#include "elf.h"
#include "sysemu/sysemu.h"
+#include "qemu/error-report.h"
/* Show multiboot debug output */
//#define DEBUG_MULTIBOOT
#ifdef DEBUG_MULTIBOOT
-#define mb_debug(a...) fprintf(stderr, ## a)
+#define mb_debug(a...) error_report(a)
#else
#define mb_debug(a...)
#endif
@@ -137,7 +138,7 @@ static void mb_add_mod(MultibootState *s,
stl_p(p + MB_MOD_END, end);
stl_p(p + MB_MOD_CMDLINE, cmdline_phys);
- mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx"\n",
+ mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx,
s->mb_mods_count, start, end);
s->mb_mods_count++;
@@ -179,12 +180,12 @@ int load_multiboot(FWCfgState *fw_cfg,
if (!is_multiboot)
return 0; /* no multiboot */
- mb_debug("qemu: I believe we found a multiboot image!\n");
+ mb_debug("qemu: I believe we found a multiboot image!");
memset(bootinfo, 0, sizeof(bootinfo));
memset(&mbs, 0, sizeof(mbs));
if (flags & 0x00000004) { /* MULTIBOOT_HEADER_HAS_VBE */
- fprintf(stderr, "qemu: multiboot knows VBE. we don't.\n");
+ error_report("qemu: multiboot knows VBE. we don't.");
}
if (!(flags & 0x00010000)) { /* MULTIBOOT_HEADER_HAS_ADDR */
uint64_t elf_entry;
@@ -193,7 +194,7 @@ int load_multiboot(FWCfgState *fw_cfg,
fclose(f);
if (((struct elf64_hdr*)header)->e_machine == EM_X86_64) {
- fprintf(stderr, "Cannot load x86-64 image, give a 32bit one.\n");
+ error_report("Cannot load x86-64 image, give a 32bit one.");
exit(1);
}
@@ -201,7 +202,7 @@ int load_multiboot(FWCfgState *fw_cfg,
&elf_low, &elf_high, 0, I386_ELF_MACHINE,
0, 0);
if (kernel_size < 0) {
- fprintf(stderr, "Error while loading elf kernel\n");
+ error_report("Error while loading elf kernel");
exit(1);
}
mh_load_addr = elf_low;
@@ -210,12 +211,13 @@ int load_multiboot(FWCfgState *fw_cfg,
mbs.mb_buf = g_malloc(mb_kernel_size);
if (rom_copy(mbs.mb_buf, mh_load_addr, mb_kernel_size) !=
mb_kernel_size) {
- fprintf(stderr, "Error while fetching elf kernel from rom\n");
+ error_report("Error while fetching elf kernel from rom");
exit(1);
}
- mb_debug("qemu: loading multiboot-elf kernel (%#x bytes) with entry
%#zx\n",
- mb_kernel_size, (size_t)mh_entry_addr);
+ mb_debug("qemu: loading multiboot-elf kernel "
+ "(%#x bytes) with entry %#zx",
+ mb_kernel_size, (size_t)mh_entry_addr);
} else {
/* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */
uint32_t mh_header_addr = ldl_p(header+i+12);
@@ -224,7 +226,11 @@ int load_multiboot(FWCfgState *fw_cfg,
mh_load_addr = ldl_p(header+i+16);
if (mh_header_addr < mh_load_addr) {
- fprintf(stderr, "invalid mh_load_addr address\n");
+ error_report("invalid load_addr address");
+ exit(1);
+ }
+ if (mh_header_addr - mh_load_addr > i) {
+ error_report("invalid header_addr address");
exit(1);
}
@@ -233,43 +239,43 @@ int load_multiboot(FWCfgState *fw_cfg,
mh_entry_addr = ldl_p(header+i+28);
if (mh_load_end_addr) {
- if (mh_bss_end_addr < mh_load_addr) {
- fprintf(stderr, "invalid mh_bss_end_addr address\n");
- exit(1);
- }
- mb_kernel_size = mh_bss_end_addr - mh_load_addr;
-
if (mh_load_end_addr < mh_load_addr) {
- fprintf(stderr, "invalid mh_load_end_addr address\n");
+ error_report("invalid load_end_addr address");
exit(1);
}
mb_load_size = mh_load_end_addr - mh_load_addr;
} else {
if (kernel_file_size < mb_kernel_text_offset) {
- fprintf(stderr, "invalid kernel_file_size\n");
+ error_report("invalid kernel_file_size");
exit(1);
}
- mb_kernel_size = kernel_file_size - mb_kernel_text_offset;
- mb_load_size = mb_kernel_size;
+ mb_load_size = kernel_file_size - mb_kernel_text_offset;
+ }
+ if (mb_load_size > UINT32_MAX - mh_load_addr) {
+ error_report("kernel does not fit in address space");
+ exit(1);
+ }
+ if (mh_bss_end_addr) {
+ if (mh_bss_end_addr < (mh_load_addr + mb_load_size)) {
+ error_report("invalid bss_end_addr address");
+ exit(1);
+ }
+ mb_kernel_size = mh_bss_end_addr - mh_load_addr;
+ } else {
+ mb_kernel_size = mb_load_size;
}
- /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_VBE.
- uint32_t mh_mode_type = ldl_p(header+i+32);
- uint32_t mh_width = ldl_p(header+i+36);
- uint32_t mh_height = ldl_p(header+i+40);
- uint32_t mh_depth = ldl_p(header+i+44); */
-
- mb_debug("multiboot: mh_header_addr = %#x\n", mh_header_addr);
- mb_debug("multiboot: mh_load_addr = %#x\n", mh_load_addr);
- mb_debug("multiboot: mh_load_end_addr = %#x\n", mh_load_end_addr);
- mb_debug("multiboot: mh_bss_end_addr = %#x\n", mh_bss_end_addr);
- mb_debug("qemu: loading multiboot kernel (%#x bytes) at %#x\n",
+ mb_debug("multiboot: header_addr = %#x", mh_header_addr);
+ mb_debug("multiboot: load_addr = %#x", mh_load_addr);
+ mb_debug("multiboot: load_end_addr = %#x", mh_load_end_addr);
+ mb_debug("multiboot: bss_end_addr = %#x", mh_bss_end_addr);
+ mb_debug("qemu: loading multiboot kernel (%#x bytes) at %#x",
mb_load_size, mh_load_addr);
mbs.mb_buf = g_malloc(mb_kernel_size);
fseek(f, mb_kernel_text_offset, SEEK_SET);
if (fread(mbs.mb_buf, 1, mb_load_size, f) != mb_load_size) {
- fprintf(stderr, "fread() failed\n");
+ error_report("fread() failed");
exit(1);
}
memset(mbs.mb_buf + mb_load_size, 0, mb_kernel_size - mb_load_size);
@@ -323,10 +329,10 @@ int load_multiboot(FWCfgState *fw_cfg,
hwaddr c = mb_add_cmdline(&mbs, tmpbuf);
if ((next_space = strchr(tmpbuf, ' ')))
*next_space = '\0';
- mb_debug("multiboot loading module: %s\n", tmpbuf);
+ mb_debug("multiboot loading module: %s", tmpbuf);
mb_mod_length = get_image_size(tmpbuf);
if (mb_mod_length < 0) {
- fprintf(stderr, "Failed to open file '%s'\n", tmpbuf);
+ error_report("Failed to open file '%s'", tmpbuf);
exit(1);
}
@@ -337,7 +343,7 @@ int load_multiboot(FWCfgState *fw_cfg,
mb_add_mod(&mbs, mbs.mb_buf_phys + offs,
mbs.mb_buf_phys + offs + mb_mod_length, c);
- mb_debug("mod_start: %p\nmod_end: %p\n cmdline:
"TARGET_FMT_plx"\n",
+ mb_debug("mod_start: %p\nmod_end: %p\n cmdline: "TARGET_FMT_plx,
(char *)mbs.mb_buf + offs,
(char *)mbs.mb_buf + offs + mb_mod_length, c);
initrd_filename = next_initrd+1;
@@ -365,10 +371,11 @@ int load_multiboot(FWCfgState *fw_cfg,
stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? */
stl_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP);
- mb_debug("multiboot: mh_entry_addr = %#x\n", mh_entry_addr);
- mb_debug(" mb_buf_phys = "TARGET_FMT_plx"\n", mbs.mb_buf_phys);
- mb_debug(" mod_start = "TARGET_FMT_plx"\n", mbs.mb_buf_phys
+ mbs.offset_mods);
- mb_debug(" mb_mods_count = %d\n", mbs.mb_mods_count);
+ mb_debug("multiboot: entry_addr = %#x", mh_entry_addr);
+ mb_debug(" mb_buf_phys = "TARGET_FMT_plx, mbs.mb_buf_phys);
+ mb_debug(" mod_start = "TARGET_FMT_plx,
+ mbs.mb_buf_phys + mbs.offset_mods);
+ mb_debug(" mb_mods_count = %d", mbs.mb_mods_count);
/* save bootinfo off the stack */
mb_bootinfo_data = g_memdup(bootinfo, sizeof(bootinfo));
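
To make the reworked size handling easier to follow, here is a standalone
sketch (not part of the patch) of the same checks: mb_load_size counts the
bytes actually read from the file, mb_kernel_size additionally covers BSS, and
the new UINT32_MAX comparison rejects kernels that would wrap past 4 GiB. The
mb_sizes() helper and its file_bytes parameter are local stand-ins, not QEMU
names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t mb_sizes(uint32_t load_addr, uint32_t load_end_addr,
                         uint32_t bss_end_addr, uint32_t file_bytes,
                         uint32_t *kernel_size)
{
    uint32_t load_size;

    if (load_end_addr) {
        if (load_end_addr < load_addr) {
            fprintf(stderr, "invalid load_end_addr address\n");
            exit(1);
        }
        load_size = load_end_addr - load_addr;
    } else {
        load_size = file_bytes;         /* rest of the kernel file */
    }
    if (load_size > UINT32_MAX - load_addr) {
        fprintf(stderr, "kernel does not fit in address space\n");
        exit(1);
    }
    if (bss_end_addr) {
        if (bss_end_addr < load_addr + load_size) {
            fprintf(stderr, "invalid bss_end_addr address\n");
            exit(1);
        }
        *kernel_size = bss_end_addr - load_addr;   /* loaded bytes + BSS */
    } else {
        *kernel_size = load_size;
    }
    return load_size;
}

int main(void)
{
    uint32_t kernel_size;
    uint32_t load_size = mb_sizes(0x100000, 0x180000, 0x200000, 0,
                                  &kernel_size);

    /* prints load=0x80000 kernel=0x100000 */
    printf("load=0x%x kernel=0x%x\n", load_size, kernel_size);
    return 0;
}
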
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index d43b4b6cd3..9e5dcb6b2b 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -39,9 +39,10 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry"
vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
-vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 373311f91a..0741f3405e 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -533,13 +533,6 @@ static void ahci_check_cmd_bh(void *opaque)
qemu_bh_delete(ad->check_bh);
ad->check_bh = NULL;
- if ((ad->busy_slot != -1) &&
- !(ad->port.ifs[0].status & (BUSY_STAT|DRQ_STAT))) {
- /* no longer busy */
- ad->port_regs.cmd_issue &= ~(1 << ad->busy_slot);
- ad->busy_slot = -1;
- }
-
check_cmd(ad->hba, ad->port_no);
}
@@ -1426,6 +1419,12 @@ static void ahci_cmd_done(IDEDMA *dma)
trace_ahci_cmd_done(ad->hba, ad->port_no);
+ /* no longer busy */
+ if (ad->busy_slot != -1) {
+ ad->port_regs.cmd_issue &= ~(1 << ad->busy_slot);
+ ad->busy_slot = -1;
+ }
+
/* update d2h status */
ahci_write_fis_d2h(ad);
diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index 7b54d52376..864b7c6515 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -27,6 +27,7 @@
#include "hw/intc/arm_gicv3_common.h"
#include "gicv3_internal.h"
#include "hw/arm/linux-boot-if.h"
+#include "sysemu/kvm.h"
static int gicv3_pre_save(void *opaque)
{
@@ -141,6 +142,79 @@ static const VMStateDescription vmstate_gicv3_cpu = {
}
};
+static int gicv3_gicd_no_migration_shift_bug_pre_load(void *opaque)
+{
+ GICv3State *cs = opaque;
+
+ /*
+ * The gicd_no_migration_shift_bug flag is used for migration compatibility
+ * for old version QEMU which may have the GICD bmp shift bug under KVM mode.
+ * Strictly, what we want to know is whether the migration source is using
+ * KVM. Since we don't have any way to determine that, we look at whether the
+ * destination is using KVM; this is close enough because for the older QEMU
+ * versions with this bug KVM -> TCG migration didn't work anyway. If the
+ * source is a newer QEMU without this bug it will transmit the migration
+ * subsection which sets the flag to true; otherwise it will remain set to
+ * the value we select here.
+ */
+ if (kvm_enabled()) {
+ cs->gicd_no_migration_shift_bug = false;
+ }
+
+ return 0;
+}
+
+static int gicv3_gicd_no_migration_shift_bug_post_load(void *opaque,
+ int version_id)
+{
+ GICv3State *cs = opaque;
+
+ if (cs->gicd_no_migration_shift_bug) {
+ return 0;
+ }
+
+ /* Older versions of QEMU had a bug in the handling of state save/restore
+ * to the KVM GICv3: they got the offset in the bitmap arrays wrong,
+ * so that instead of the data for external interrupts 32 and up
+ * starting at bit position 32 in the bitmap, it started at bit
+ * position 64. If we're receiving data from a QEMU with that bug,
+ * we must move the data down into the right place.
+ */
+ memmove(cs->group, (uint8_t *)cs->group + GIC_INTERNAL / 8,
+ sizeof(cs->group) - GIC_INTERNAL / 8);
+ memmove(cs->grpmod, (uint8_t *)cs->grpmod + GIC_INTERNAL / 8,
+ sizeof(cs->grpmod) - GIC_INTERNAL / 8);
+ memmove(cs->enabled, (uint8_t *)cs->enabled + GIC_INTERNAL / 8,
+ sizeof(cs->enabled) - GIC_INTERNAL / 8);
+ memmove(cs->pending, (uint8_t *)cs->pending + GIC_INTERNAL / 8,
+ sizeof(cs->pending) - GIC_INTERNAL / 8);
+ memmove(cs->active, (uint8_t *)cs->active + GIC_INTERNAL / 8,
+ sizeof(cs->active) - GIC_INTERNAL / 8);
+ memmove(cs->edge_trigger, (uint8_t *)cs->edge_trigger + GIC_INTERNAL / 8,
+ sizeof(cs->edge_trigger) - GIC_INTERNAL / 8);
+
+ /*
+ * This version of QEMU has the fix, so it sets the flag to true to
+ * indicate that; this is necessary for future migrations from this
+ * QEMU version to work.
+ */
+ cs->gicd_no_migration_shift_bug = true;
+
+ return 0;
+}
+
+const VMStateDescription vmstate_gicv3_gicd_no_migration_shift_bug = {
+ .name = "arm_gicv3/gicd_no_migration_shift_bug",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .pre_load = gicv3_gicd_no_migration_shift_bug_pre_load,
+ .post_load = gicv3_gicd_no_migration_shift_bug_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_BOOL(gicd_no_migration_shift_bug, GICv3State),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
static const VMStateDescription vmstate_gicv3 = {
.name = "arm_gicv3",
.version_id = 1,
@@ -165,6 +239,10 @@ static const VMStateDescription vmstate_gicv3 = {
VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, GICv3State, num_cpu,
vmstate_gicv3_cpu, GICv3CPUState),
VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * []) {
+ &vmstate_gicv3_gicd_no_migration_shift_bug,
+ NULL
}
};
@@ -364,6 +442,7 @@ static void arm_gicv3_common_reset(DeviceState *dev)
gicv3_gicd_group_set(s, i);
}
}
+ s->gicd_no_migration_shift_bug = true;
}
static void arm_gic_common_linux_init(ARMLinuxBootIf *obj,
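
A standalone sketch (not part of the patch) of what the post_load fixup above
does: a buggy sender stored the state for external interrupt 32 starting at
bit position 64 of each distributor bitmap, and moving the data down by
GIC_INTERNAL (32) bits puts it back at bit position 32:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GIC_INTERNAL 32

int main(void)
{
    /* data for irq 32 wrongly placed at bit 64 (word 2) */
    uint32_t bmp[4] = { 0, 0, 0xdeadbeef, 0 };

    memmove(bmp, (uint8_t *)bmp + GIC_INTERNAL / 8,
            sizeof(bmp) - GIC_INTERNAL / 8);

    printf("0x%x\n", bmp[1]);   /* 0xdeadbeef: now at bit 32 (word 1) */
    return 0;
}
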
diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 5cbafaf497..519d581bb6 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -431,7 +431,7 @@ static uint64_t icv_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
{
GICv3CPUState *cs = icc_cs_from_env(env);
int regno = ri->opc2 & 3;
- int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+ int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;
uint64_t value = cs->ich_apr[grp][regno];
trace_gicv3_icv_ap_read(ri->crm & 1, regno, gicv3_redist_affid(cs), value);
@@ -443,7 +443,7 @@ static void icv_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
{
GICv3CPUState *cs = icc_cs_from_env(env);
int regno = ri->opc2 & 3;
- int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+ int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;
trace_gicv3_icv_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs),
value);
@@ -1465,7 +1465,7 @@ static uint64_t icc_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
uint64_t value;
int regno = ri->opc2 & 3;
- int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1;
+ int grp = (ri->crm & 1) ? GICV3_G1 : GICV3_G0;
if (icv_access(env, grp == GICV3_G0 ? HCR_FMO : HCR_IMO)) {
return icv_ap_read(env, ri);
@@ -1487,7 +1487,7 @@ static void icc_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
GICv3CPUState *cs = icc_cs_from_env(env);
int regno = ri->opc2 & 3;
- int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1;
+ int grp = (ri->crm & 1) ? GICV3_G1 : GICV3_G0;
if (icv_access(env, grp == GICV3_G0 ? HCR_FMO : HCR_IMO)) {
icv_ap_write(env, ri, value);
@@ -2296,7 +2296,7 @@ static uint64_t ich_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
{
GICv3CPUState *cs = icc_cs_from_env(env);
int regno = ri->opc2 & 3;
- int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+ int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;
uint64_t value;
value = cs->ich_apr[grp][regno];
@@ -2309,7 +2309,7 @@ static void ich_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
{
GICv3CPUState *cs = icc_cs_from_env(env);
int regno = ri->opc2 & 3;
- int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+ int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;
trace_gicv3_ich_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs),
value);
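
A standalone sketch (not part of the patch) of the decode these hunks correct:
AP0R<n> registers (crm bit 0 clear) hold group-0 active priorities and
AP1R<n> (crm bit 0 set) group-1, whereas the old code returned the two groups
swapped. The crm values 8 and 9 follow the ARM system-register encoding for
AP0R<n>/AP1R<n> and are used here purely for illustration:

#include <stdio.h>

enum { GICV3_G0, GICV3_G1NS };

static int ap_group(int crm)
{
    return (crm & 1) ? GICV3_G1NS : GICV3_G0;   /* the corrected mapping */
}

int main(void)
{
    printf("AP0R -> group %d\n", ap_group(8));  /* group 0 */
    printf("AP1R -> group %d\n", ap_group(9));  /* group 1 */
    return 0;
}
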
diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c
index 481fe5405a..3fff4687ee 100644
--- a/hw/intc/arm_gicv3_kvm.c
+++ b/hw/intc/arm_gicv3_kvm.c
@@ -135,7 +135,14 @@ static void kvm_dist_get_priority(GICv3State *s, uint32_t offset, uint8_t *bmp)
uint32_t reg, *field;
int irq;
- field = (uint32_t *)bmp;
+ /* For the KVM GICv3, affinity routing is always enabled, and the first 8
+ * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding
+ * functionality is replaced by GICR_IPRIORITYR<n>, so there is no need
+ * to sync them. Skip the first GIC_INTERNAL irqs in both bmp and the
+ * offset.
+ */
+ field = (uint32_t *)(bmp + GIC_INTERNAL);
+ offset += (GIC_INTERNAL * 8) / 8;
for_each_dist_irq_reg(irq, s->num_irq, 8) {
kvm_gicd_access(s, offset, &reg, false);
*field = reg;
@@ -149,7 +156,14 @@ static void kvm_dist_put_priority(GICv3State *s, uint32_t offset, uint8_t *bmp)
uint32_t reg, *field;
int irq;
- field = (uint32_t *)bmp;
+ /* For the KVM GICv3, affinity routing is always enabled, and the first 8
+ * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding
+ * functionality is replaced by GICR_IPRIORITYR<n>, so there is no need
+ * to sync them. Skip the first GIC_INTERNAL irqs in both bmp and the
+ * offset.
+ */
+ field = (uint32_t *)(bmp + GIC_INTERNAL);
+ offset += (GIC_INTERNAL * 8) / 8;
for_each_dist_irq_reg(irq, s->num_irq, 8) {
reg = *field;
kvm_gicd_access(s, offset, &reg, true);
@@ -164,6 +178,14 @@ static void kvm_dist_get_edge_trigger(GICv3State *s, uint32_t offset,
uint32_t reg;
int irq;
+ /* For the KVM GICv3, affinity routing is always enabled, and the first 2
+ * GICD_ICFGR<n> registers are always RAZ/WI. The corresponding
+ * functionality is replaced by GICR_ICFGR<n>, so there is no need to sync
+ * them. Increase the offset to skip the GIC_INTERNAL irqs.
+ * This matches the for_each_dist_irq_reg() macro which also skips the
+ * first GIC_INTERNAL irqs.
+ */
+ offset += (GIC_INTERNAL * 2) / 8;
for_each_dist_irq_reg(irq, s->num_irq, 2) {
kvm_gicd_access(s, offset, &reg, false);
reg = half_unshuffle32(reg >> 1);
@@ -181,6 +203,14 @@ static void kvm_dist_put_edge_trigger(GICv3State *s, uint32_t offset,
uint32_t reg;
int irq;
+ /* For the KVM GICv3, affinity routing is always enabled, and the first 2
+ * GICD_ICFGR<n> registers are always RAZ/WI. The corresponding
+ * functionality is replaced by GICR_ICFGR<n>, so there is no need to sync
+ * them. Increase the offset to skip the GIC_INTERNAL irqs.
+ * This matches the for_each_dist_irq_reg() macro which also skips the
+ * first GIC_INTERNAL irqs.
+ */
+ offset += (GIC_INTERNAL * 2) / 8;
for_each_dist_irq_reg(irq, s->num_irq, 2) {
reg = *gic_bmp_ptr32(bmp, irq);
if (irq % 32 != 0) {
@@ -222,6 +252,15 @@ static void kvm_dist_getbmp(GICv3State *s, uint32_t offset, uint32_t *bmp)
uint32_t reg;
int irq;
+ /* For the KVM GICv3, affinity routing is always enabled, and the
+ * GICD_IGROUPR0/GICD_IGRPMODR0/GICD_ISENABLER0/GICD_ISPENDR0/
+ * GICD_ISACTIVER0 registers are always RAZ/WI. The corresponding
+ * functionality is replaced by the GICR registers, so there is no need to
+ * sync them. Increase the offset to skip the GIC_INTERNAL irqs.
+ * This matches the for_each_dist_irq_reg() macro which also skips the
+ * first GIC_INTERNAL irqs.
+ */
+ offset += (GIC_INTERNAL * 1) / 8;
for_each_dist_irq_reg(irq, s->num_irq, 1) {
kvm_gicd_access(s, offset, &reg, false);
*gic_bmp_ptr32(bmp, irq) = reg;
@@ -235,6 +274,19 @@ static void kvm_dist_putbmp(GICv3State *s, uint32_t offset,
uint32_t reg;
int irq;
+ /* For the KVM GICv3, affinity routing is always enabled, and the
+ * GICD_IGROUPR0/GICD_IGRPMODR0/GICD_ISENABLER0/GICD_ISPENDR0/
+ * GICD_ISACTIVER0 registers are always RAZ/WI. The corresponding
+ * functionality is replaced by the GICR registers, so there is no need to
+ * sync them. Increase the offset and clroffset to skip the GIC_INTERNAL
+ * irqs. This matches the for_each_dist_irq_reg() macro which also skips the
+ * first GIC_INTERNAL irqs.
+ */
+ offset += (GIC_INTERNAL * 1) / 8;
+ if (clroffset != 0) {
+ clroffset += (GIC_INTERNAL * 1) / 8;
+ }
+
for_each_dist_irq_reg(irq, s->num_irq, 1) {
/* If this bitmap is a set/clear register pair, first write to the
* clear-reg to clear all bits before using the set-reg to write
@@ -243,6 +295,7 @@ static void kvm_dist_putbmp(GICv3State *s, uint32_t offset,
if (clroffset != 0) {
reg = 0;
kvm_gicd_access(s, clroffset, &reg, true);
+ clroffset += 4;
}
reg = *gic_bmp_ptr32(bmp, irq);
kvm_gicd_access(s, offset, &reg, true);
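
The skip arithmetic added throughout this file reduces to a constant number of
bytes per register bank; a standalone sketch (not part of the patch):

#include <stdio.h>

#define GIC_INTERNAL 32

int main(void)
{
    /* bytes skipped for 8-, 2- and 1-bit-per-interrupt register banks */
    printf("IPRIORITYR skip: %d bytes\n", (GIC_INTERNAL * 8) / 8);  /* 32 */
    printf("ICFGR      skip: %d bytes\n", (GIC_INTERNAL * 2) / 8);  /*  8 */
    printf("IGROUPR    skip: %d bytes\n", (GIC_INTERNAL * 1) / 8);  /*  4 */
    return 0;
}
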
diff --git a/hw/intc/openpic_kvm.c b/hw/intc/openpic_kvm.c
index fa83420254..39a6f369c5 100644
--- a/hw/intc/openpic_kvm.c
+++ b/hw/intc/openpic_kvm.c
@@ -124,10 +124,6 @@ static void kvm_openpic_region_add(MemoryListener *listener,
uint64_t reg_base;
int ret;
- if (section->fv != address_space_to_flatview(&address_space_memory)) {
- abort();
- }
-
/* Ignore events on regions that are not us */
if (section->mr != &opp->mem) {
return;
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 38674b08aa..6bdef38ceb 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -422,6 +422,7 @@ static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
static void virtio_net_reset(VirtIODevice *vdev)
{
VirtIONet *n = VIRTIO_NET(vdev);
+ int i;
/* Reset back to compatibility mode */
n->promisc = 1;
@@ -445,6 +446,16 @@ static void virtio_net_reset(VirtIODevice *vdev)
memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
memset(n->vlans, 0, MAX_VLAN >> 3);
+
+ /* Flush any async TX */
+ for (i = 0; i < n->max_queues; i++) {
+ NetClientState *nc = qemu_get_subqueue(n->nic, i);
+
+ if (nc->peer) {
+ qemu_flush_or_purge_queued_packets(nc->peer, true);
+ assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
+ }
+ }
}
static void peer_test_vnet_hdr(VirtIONet *n)
diff --git a/hw/pci-bridge/i82801b11.c b/hw/pci-bridge/i82801b11.c
index cb522bf30c..ebf7f5f0e8 100644
--- a/hw/pci-bridge/i82801b11.c
+++ b/hw/pci-bridge/i82801b11.c
@@ -98,6 +98,7 @@ static void i82801b11_bridge_class_init(ObjectClass *klass, void *data)
k->realize = i82801b11_bridge_realize;
k->config_write = pci_bridge_write_config;
dc->vmsd = &i82801b11_bridge_dev_vmstate;
+ dc->reset = pci_bridge_reset;
set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
}
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index b57528baf4..a74eb2dc68 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -100,6 +100,21 @@
#define PHANDLE_XICP 0x00001111
+/* These two functions implement the VCPU id numbering: one to compute them
+ * all and one to identify thread 0 of a VCORE. Any change to the first one
+ * is likely to have an impact on the second one, so let's keep them close.
+ */
+static int spapr_vcpu_id(sPAPRMachineState *spapr, int cpu_index)
+{
+ return
+ (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
+}
+static bool spapr_is_thread0_in_vcore(sPAPRMachineState *spapr,
+ PowerPCCPU *cpu)
+{
+ return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
+}
+
static ICSState *spapr_ics_create(sPAPRMachineState *spapr,
const char *type_ics,
int nr_irqs, Error **errp)
@@ -161,15 +176,14 @@ static void pre_2_10_vmstate_unregister_dummy_icp(int i)
(void *)(uintptr_t) i);
}
-static inline int xics_max_server_number(void)
+static int xics_max_server_number(sPAPRMachineState *spapr)
{
- return DIV_ROUND_UP(max_cpus * kvmppc_smt_threads(), smp_threads);
+ return DIV_ROUND_UP(max_cpus * spapr->vsmt, smp_threads);
}
static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
- sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
if (kvm_enabled()) {
if (machine_kernel_irqchip_allowed(machine) &&
@@ -191,17 +205,6 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
return;
}
}
-
- if (smc->pre_2_10_has_unused_icps) {
- int i;
-
- for (i = 0; i < xics_max_server_number(); i++) {
- /* Dummy entries get deregistered when real ICPState objects
- * are registered during CPU core hotplug.
- */
- pre_2_10_vmstate_register_dummy_icp(i);
- }
- }
}
static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
@@ -210,7 +213,7 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
int i, ret = 0;
uint32_t servers_prop[smt_threads];
uint32_t gservers_prop[smt_threads * 2];
- int index = spapr_vcpu_id(cpu);
+ int index = spapr_get_vcpu_id(cpu);
if (cpu->compat_pvr) {
ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
@@ -239,7 +242,7 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu)
{
- int index = spapr_vcpu_id(cpu);
+ int index = spapr_get_vcpu_id(cpu);
uint32_t associativity[] = {cpu_to_be32(0x5),
cpu_to_be32(0x0),
cpu_to_be32(0x0),
@@ -338,16 +341,15 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
int ret = 0, offset, cpus_offset;
CPUState *cs;
char cpu_model[32];
- int smt = kvmppc_smt_threads();
uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
CPU_FOREACH(cs) {
PowerPCCPU *cpu = POWERPC_CPU(cs);
DeviceClass *dc = DEVICE_GET_CLASS(cs);
- int index = spapr_vcpu_id(cpu);
- int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
+ int index = spapr_get_vcpu_id(cpu);
+ int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
- if ((index % smt) != 0) {
+ if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
continue;
}
@@ -493,7 +495,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
PowerPCCPU *cpu = POWERPC_CPU(cs);
CPUPPCState *env = &cpu->env;
PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
- int index = spapr_vcpu_id(cpu);
+ int index = spapr_get_vcpu_id(cpu);
uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
0xffffffff, 0xffffffff};
uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
@@ -503,7 +505,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
size_t page_sizes_prop_size;
uint32_t vcpus_per_socket = smp_threads * smp_cores;
uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
- int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
+ int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
sPAPRDRConnector *drc;
int drc_index;
uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
@@ -614,7 +616,6 @@ static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr)
CPUState *cs;
int cpus_offset;
char *nodename;
- int smt = kvmppc_smt_threads();
cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
_FDT(cpus_offset);
@@ -628,11 +629,11 @@ static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr)
*/
CPU_FOREACH_REVERSE(cs) {
PowerPCCPU *cpu = POWERPC_CPU(cs);
- int index = spapr_vcpu_id(cpu);
+ int index = spapr_get_vcpu_id(cpu);
DeviceClass *dc = DEVICE_GET_CLASS(cs);
int offset;
- if ((index % smt) != 0) {
+ if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
continue;
}
@@ -1105,7 +1106,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr,
_FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));
/* /interrupt controller */
- spapr_dt_xics(xics_max_server_number(), fdt, PHANDLE_XICP);
+ spapr_dt_xics(xics_max_server_number(spapr), fdt, PHANDLE_XICP);
ret = spapr_populate_memory(spapr, fdt);
if (ret < 0) {
@@ -2197,8 +2198,8 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
{
MachineState *machine = MACHINE(spapr);
MachineClass *mc = MACHINE_GET_CLASS(machine);
+ sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
const char *type = spapr_get_cpu_core_type(machine->cpu_type);
- int smt = kvmppc_smt_threads();
const CPUArchIdList *possible_cpus;
int boot_cores_nr = smp_cpus / smp_threads;
int i;
@@ -2228,12 +2229,23 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
boot_cores_nr = possible_cpus->len;
}
+ if (smc->pre_2_10_has_unused_icps) {
+ int i;
+
+ for (i = 0; i < xics_max_server_number(spapr); i++) {
+ /* Dummy entries get deregistered when real ICPState objects
+ * are registered during CPU core hotplug.
+ */
+ pre_2_10_vmstate_register_dummy_icp(i);
+ }
+ }
+
for (i = 0; i < possible_cpus->len; i++) {
int core_id = i * smp_threads;
if (mc->has_hotpluggable_cpus) {
spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
- (core_id / smp_threads) * smt);
+ spapr_vcpu_id(spapr, core_id));
}
if (i < boot_cores_nr) {
@@ -2282,26 +2294,43 @@ static void spapr_set_vsmt_mode(sPAPRMachineState *spapr, Error **errp)
}
/* In this case, spapr->vsmt has been set by the command line */
} else {
- /* Choose a VSMT mode that may be higher than necessary but is
- * likely to be compatible with hosts that don't have VSMT. */
- spapr->vsmt = MAX(kvm_smt, smp_threads);
+ /*
+ * Default VSMT value is tricky, because we need it to be as
+ * consistent as possible (for migration), but this requires
+ * changing it for at least some existing cases. We pick 8 as
+ * the value that we'd get with KVM on POWER8, the
+ * overwhelmingly common case in production systems.
+ */
+ spapr->vsmt = MAX(8, smp_threads);
}
/* KVM: If necessary, set the SMT mode: */
if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
ret = kvmppc_set_smt_threads(spapr->vsmt);
if (ret) {
+ /* Looks like KVM isn't able to change VSMT mode */
error_setg(&local_err,
"Failed to set KVM's VSMT mode to %d (errno %d)",
spapr->vsmt, ret);
- if (!vsmt_user) {
- error_append_hint(&local_err, "On PPC, a VM with %d threads/"
- "core on a host with %d threads/core requires "
- " the use of VSMT mode %d.\n",
- smp_threads, kvm_smt, spapr->vsmt);
+ /* We can live with that if the default one is big enough
+ * for the number of threads, and a submultiple of the one
+ * we want. In this case we'll waste some vcpu ids, but
+ * behaviour will be correct */
+ if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
+ warn_report_err(local_err);
+ local_err = NULL;
+ goto out;
+ } else {
+ if (!vsmt_user) {
+ error_append_hint(&local_err,
+ "On PPC, a VM with %d threads/core"
+ " on a host with %d threads/core"
+ " requires the use of VSMT mode %d.\n",
+ smp_threads, kvm_smt, spapr->vsmt);
+ }
+ kvmppc_hint_smt_possible(&local_err);
+ goto out;
}
- kvmppc_hint_smt_possible(&local_err);
- goto out;
}
}
/* else TCG: nothing to do currently */
@@ -2327,6 +2356,7 @@ static void ppc_spapr_init(MachineState *machine)
long load_limit, fw_size;
char *filename;
Error *resize_hpt_err = NULL;
+ PowerPCCPU *first_ppc_cpu;
msi_nonbroken = true;
@@ -2419,11 +2449,6 @@ static void ppc_spapr_init(MachineState *machine)
}
spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
- if (!kvm_enabled() || kvmppc_has_cap_mmu_radix()) {
- /* KVM and TCG always allow GTSE with radix... */
- spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
- }
- /* ... but not with hash (currently). */
/* advertise support for dedicated HP event source to guests */
if (spapr->use_hotplug_event_source) {
@@ -2440,6 +2465,15 @@ static void ppc_spapr_init(MachineState *machine)
spapr_init_cpus(spapr);
+ first_ppc_cpu = POWERPC_CPU(first_cpu);
+ if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
+ ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
+ spapr->max_compat_pvr)) {
+ /* KVM and TCG always allow GTSE with radix... */
+ spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+ }
+ /* ... but not with hash (currently). */
+
if (kvm_enabled()) {
/* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
kvmppc_enable_logical_ci_hcalls();
@@ -3199,7 +3233,7 @@ static void *spapr_populate_hotplug_cpu_dt(CPUState *cs, int *fdt_offset,
{
PowerPCCPU *cpu = POWERPC_CPU(cs);
DeviceClass *dc = DEVICE_GET_CLASS(cs);
- int id = spapr_vcpu_id(cpu);
+ int id = spapr_get_vcpu_id(cpu);
void *fdt;
int offset, fdt_size;
char *nodename;
@@ -3245,10 +3279,10 @@ static
void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
Error **errp)
{
+ sPAPRMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
int index;
sPAPRDRConnector *drc;
CPUCore *cc = CPU_CORE(dev);
- int smt = kvmppc_smt_threads();
if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
error_setg(errp, "Unable to find CPU core with core-id: %d",
@@ -3260,7 +3294,8 @@ void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
return;
}
- drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index * smt);
+ drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
+ spapr_vcpu_id(spapr, cc->core_id));
g_assert(drc);
spapr_drc_detach(drc);
@@ -3279,7 +3314,6 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
CPUState *cs = CPU(core->threads);
sPAPRDRConnector *drc;
Error *local_err = NULL;
- int smt = kvmppc_smt_threads();
CPUArchId *core_slot;
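
A standalone sketch (not part of the patch) of the VCPU id numbering these
spapr.c changes introduce: each core occupies a stride of spapr->vsmt ids, so
with smp_threads=4 and vsmt=8, core 1's thread 0 gets id 8 rather than 4, and
spapr_is_thread0_in_vcore() holds exactly for ids that are multiples of vsmt.
The values chosen below are illustrative:

#include <stdbool.h>
#include <stdio.h>

static const int smp_threads = 4;   /* guest threads per core */
static const int vsmt = 8;          /* id stride per core */

static int vcpu_id(int cpu_index)
{
    return (cpu_index / smp_threads) * vsmt + cpu_index % smp_threads;
}

static bool is_thread0_in_vcore(int id)
{
    return id % vsmt == 0;
}

int main(void)
{
    for (int i = 0; i < 8; i++) {
        int id = vcpu_id(i);
        printf("cpu_index=%d -> vcpu_id=%d thread0=%d\n",
               i, id, (int)is_thread0_in_vcore(id));
    }
    return 0;
}
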
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog