[Xen-devel] [PATCH 3 of 4] libxl: Add support for passing in the machine's E820 for PCI passthrough in libxl_device_pci_parse_bdf
# HG changeset patch
# User Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
# Date 1302556434 14400
# Node ID 75e24fb720fa9a1c529bc7e6b6eb9e1afc554130
# Parent  e7057fec103ba69776d157d470d630ce99dbc540
libxl: Add support for passing in the machine's E820 for PCI passthrough in libxl_device_pci_parse_bdf.

The code that populates the E820 is triggered unconditionally when the guest
configuration contains "pci=['<BDF>,..']" and the guest is a PV guest.

libxl_device_pci_parse_bdf calls libxl__e820_alloc once it has parsed one BDF.
Subsequent calls to libxl_device_pci_parse_bdf will not trigger
libxl__e820_alloc again (unless the first call to libxl__e820_alloc failed).

libxl__e820_alloc calls xc_get_machine_memory_map to retrieve the system's
E820. The E820 is then sanitized to weed out entries below 16MB and to remove
any E820_RAM or E820_UNUSED regions, as the guest does not need to know about
them. The guest only needs the E820_ACPI, E820_NVS and E820_RESERVED entries
to get an idea of where the PCI I/O space is. Mostly: the Linux kernel assumes
that any gap in the E820 is PCI I/O space, which means that if we pass the
guest 2GB, and the E820_ACPI and its friends start at 3GB, the gap between 2GB
and 3GB will be considered PCI I/O space. To guard against that we also create
an E820_UNUSABLE entry covering the region from 'target_kb' (called ram_end in
the code) up to the first E820_[ACPI,NVS,RESERVED] region.

When tested with another PV guest (NetBSD 5.1) the modified E820 gave it no
trouble. The code has also been tested with older "classic" Xen Linux and with
the newer "pvops" kernels with success (SLES11, RHEL5, Ubuntu Lucid, Debian
Squeeze, 2.6.37, 2.6.38, 2.6.39).

Memory that is slack or reserved for ballooning (so 'maxmem' in the guest
configuration) is put behind the machine E820, which in most cases means
above 4GB.

The reason for fetching the E820 via this hypercall in the toolstack (instead
of the guest doing it) is that a guest issuing the 'XENMEM_machine_memory_map'
hypercall would retrieve an E820 with I/O range caps added in, meaning that
the region from 4GB up to the end of possible memory would be marked as
unusable and the kernel would not have any space in which to allocate a
balloon region.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>

diff -r e7057fec103b -r 75e24fb720fa tools/libxl/libxl.h
--- a/tools/libxl/libxl.h Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/libxl/libxl.h Mon Apr 11 17:13:54 2011 -0400
@@ -204,6 +204,14 @@ typedef struct {
 } libxl_file_reference;
 void libxl_file_reference_destroy(libxl_file_reference *p);
 
+#define E820MAX (128)
+typedef struct {
+    uint32_t nr_entries;
+    struct e820entry *entry;
+} libxl_e820;
+
+void libxl_e820_destroy(libxl_e820 *p);
+
 /* libxl_cpuid_policy_list is a dynamic array storing CPUID policies
  * for multiple leafs. It is terminated with an entry holding
  * XEN_CPUID_INPUT_UNUSED in input[0]
@@ -452,7 +460,8 @@ int libxl_device_pci_remove(libxl_ctx *c
 int libxl_device_pci_shutdown(libxl_ctx *ctx, uint32_t domid);
 int libxl_device_pci_list_assigned(libxl_ctx *ctx, libxl_device_pci **list, uint32_t domid, int *num);
 int libxl_device_pci_list_assignable(libxl_ctx *ctx, libxl_device_pci **list, int *num);
-int libxl_device_pci_parse_bdf(libxl_ctx *ctx, libxl_device_pci *pcidev, const char *str);
+int libxl_device_pci_parse_bdf(libxl_ctx *ctx, libxl_device_pci *pcidev, const char *str,
+                               libxl_domain_config *d_config);
 int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str);
 int libxl_cpuid_parse_config_xend(libxl_cpuid_policy_list *cpuid, const char* str);
diff -r e7057fec103b -r 75e24fb720fa tools/libxl/libxl.idl
--- a/tools/libxl/libxl.idl Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/libxl/libxl.idl Mon Apr 11 17:13:54 2011 -0400
@@ -22,6 +22,7 @@ libxl_file_reference = Builtin("file_ref
 libxl_hwcap = Builtin("hwcap")
 
+libxl_e820 = Builtin("e820", destructor_fn="libxl_e820_destroy", passby=PASS_BY_REFERENCE)
 #
 # Complex libxl types
 #
@@ -112,6 +113,7 @@ libxl_domain_build_info = Struct("domain
                                        ])),
                  ("pv", "!%s", Struct(None,
                                        [("slack_memkb", uint32),
+                                        ("e820", libxl_e820),
                                         ("bootloader", string),
                                         ("bootloader_args", string),
                                         ("cmdline", string),
diff -r e7057fec103b -r 75e24fb720fa tools/libxl/libxl_dom.c
--- a/tools/libxl/libxl_dom.c Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/libxl/libxl_dom.c Mon Apr 11 17:13:54 2011 -0400
@@ -72,9 +72,17 @@ int libxl__build_pre(libxl__gc *gc, uint
     libxl_ctx *ctx = libxl__gc_owner(gc);
     xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus);
     xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT);
-    if (!info->hvm)
+    if (!info->hvm) {
+        if (info->u.pv.e820.nr_entries) {
+            xc_domain_set_memory_map(ctx->xch, domid,
+                                     info->u.pv.e820.entry,
+                                     info->u.pv.e820.nr_entries);
+        }
+        else {
         xc_domain_set_memmap_limit(ctx->xch, domid,
                                    (info->max_memkb + info->u.pv.slack_memkb));
+        }
+    }
     xc_domain_set_tsc_info(ctx->xch, domid, info->tsc_mode, 0, 0, 0);
     if ( info->disable_migrate )
         xc_domain_disable_migrate(ctx->xch, domid);
diff -r e7057fec103b -r 75e24fb720fa tools/libxl/libxl_internal.h
--- a/tools/libxl/libxl_internal.h Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/libxl/libxl_internal.h Mon Apr 11 17:13:54 2011 -0400
@@ -335,4 +335,5 @@ _hidden int libxl__error_set(libxl__gc *
 _hidden int libxl__file_reference_map(libxl_file_reference *f);
 _hidden int libxl__file_reference_unmap(libxl_file_reference *f);
 
+_hidden int libxl__e820_alloc(libxl_ctx *ctx, libxl_domain_build_info *b_info);
 #endif
diff -r e7057fec103b -r 75e24fb720fa tools/libxl/libxl_pci.c
--- a/tools/libxl/libxl_pci.c Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/libxl/libxl_pci.c Mon Apr 11 17:13:54 2011 -0400
@@ -87,7 +87,8 @@ static int hex_convert(const char *str,
 #define STATE_OPTIONS_K 6
 #define STATE_OPTIONS_V 7
 #define STATE_TERMINAL 8
-int libxl_device_pci_parse_bdf(libxl_ctx *ctx, libxl_device_pci *pcidev, const char *str)
+int libxl_device_pci_parse_bdf(libxl_ctx *ctx, libxl_device_pci *pcidev, const char *str,
+                               libxl_domain_config *d_config)
 {
     unsigned state = STATE_DOMAIN;
     unsigned dom, bus, dev, func, vslot = 0;
@@ -202,6 +203,25 @@ int libxl_device_pci_parse_bdf(libxl_ctx
 
     pcidev_init(pcidev, dom, bus, dev, func, vslot << 3);
 
+    if (d_config && !d_config->c_info.hvm) {
+        libxl_domain_build_info *b_info = &d_config->b_info;
+        if (b_info->u.pv.e820.entry == NULL) {
+            /* Note: libxl_init_build_info sets slack_memkb, and the other
+             * values, max_memkb and target_memkb, are set by parse_config_data. */
+            if (b_info->max_memkb == 0 || b_info->target_memkb == 0 ||
+                b_info->u.pv.slack_memkb == 0) {
+                LIBXL__LOG(ctx, LIBXL__LOG_WARNING,
+                           "Failed to construct E820: no memory data!");
+            } else {
+                int rc = 0;
+                rc = libxl__e820_alloc(ctx, b_info);
+                if (rc)
+                    LIBXL__LOG(ctx, LIBXL__LOG_WARNING,
+                               "Failed while collecting E820 with: %d (errno:%d)\n",
+                               rc, errno);
+            }
+        }
+    }
     return 0;

parse_error:
@@ -1047,3 +1067,164 @@ int libxl_device_pci_shutdown(libxl_ctx
     free(pcidevs);
     return 0;
 }
+
+static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
+                         uint32_t *nr_entries,
+                         unsigned long map_limitkb,
+                         unsigned long balloon_kb)
+{
+    uint64_t delta_kb = 0, start = 0, start_kb = 0, last = 0, ram_end;
+    uint32_t i, idx = 0, nr;
+    struct e820entry e820[E820MAX];
+
+    if (!src || !map_limitkb || !balloon_kb || !nr_entries)
+        return ERROR_INVAL;
+
+    nr = *nr_entries;
+    if (!nr)
+        return ERROR_INVAL;
+
+    if (nr > E820MAX)
+        return ERROR_NOMEM;
+
+    /* Weed out anything under 16MB */
+    for (i = 0; i < nr; i++) {
+        if (src[i].addr > 0x100000)
+            continue;
+
+        src[i].type = 0;
+        src[i].size = 0;
+        src[i].addr = -1ULL;
+    }
+
+    /* Find the lowest and highest entry in the E820, skipping over
+     * undesired entries. */
+    start = -1ULL;
+    last = 0;
+    for (i = 0; i < nr; i++) {
+        if ((src[i].type == E820_RAM) ||
+            (src[i].type == E820_UNUSABLE) ||
+            (src[i].type == 0))
+            continue;
+
+        start = src[i].addr < start ? src[i].addr : start;
+        last = src[i].addr + src[i].size > last ?
+               src[i].addr + src[i].size : last;
+    }
+    if (start > 1024)
+        start_kb = start >> 10;
+
+    /* Add the memory RAM region for the guest */
+    e820[idx].addr = 0;
+    e820[idx].size = (uint64_t)map_limitkb << 10;
+    e820[idx].type = E820_RAM;
+
+    /* .. and trim if necessary */
+    if (start_kb && map_limitkb > start_kb) {
+        delta_kb = map_limitkb - start_kb;
+        if (delta_kb)
+            e820[idx].size -= (uint64_t)(delta_kb << 10);
+    }
+    /* Note: We don't touch balloon_kb here. Will add it at the end. */
+    ram_end = e820[idx].addr + e820[idx].size;
+    idx++;
+
+    LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "Memory: %ldkB End of RAM: 0x%lx (PFN) " \
+               "Delta: %ldkB, PCI start: %ldkB (0x%lx PFN), Balloon %ldkB\n",
+               map_limitkb, ram_end >> 12, delta_kb, start_kb, start >> 12,
+               balloon_kb);
+
+    /* Check if there is a region between ram_end and start. */
+    if (start > ram_end) {
+        /* .. and if not present, add it in. This is to guard against
+           the Linux guest assuming that the gap between the end of the
+           RAM region and the start of the E820_[ACPI,NVS,RESERVED]
+           is PCI I/O space. Which it certainly is _not_. */
+        e820[idx].type = E820_UNUSABLE;
+        e820[idx].addr = ram_end;
+        e820[idx].size = start - ram_end;
+        idx++;
+    }
+    /* Almost done: copy them over, ignoring the undesirable ones */
+    for (i = 0; i < nr; i++) {
+        if ((src[i].type == E820_RAM) ||
+            (src[i].type == E820_UNUSABLE) ||
+            (src[i].type == 0))
+            continue;
+
+        e820[idx].type = src[i].type;
+        e820[idx].addr = src[i].addr;
+        e820[idx].size = src[i].size;
+        idx++;
+    }
+
+    /* At this point we have the mapped RAM + E820 entries from src. */
+    if (balloon_kb) {
+        /* and if we truncated the RAM region, then add it to the end. */
+        e820[idx].type = E820_RAM;
+        e820[idx].addr = (uint64_t)(1ULL << 32) > last ?
+                         (uint64_t)(1ULL << 32) : last;
+        /* also add the balloon memory to the end. */
+        e820[idx].size = (uint64_t)(delta_kb << 10) + (uint64_t)(balloon_kb << 10);
+        idx++;
+    }
+    nr = idx;
+
+    for (i = 0; i < nr; i++) {
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, ":%s\t[%lx -> %lx]",
+                   e820[i].type == E820_RAM ? "RAM " :
+                   (e820[i].type == E820_RESERVED ? "RSV " :
+                    e820[i].type == E820_ACPI ? "ACPI" :
+                    (e820[i].type == E820_NVS ? "NVS " :
+                     (e820[i].type == E820_UNUSABLE ? "UNU " : "----"))),
+                   e820[i].addr >> 12,
+                   (e820[i].addr + e820[i].size) >> 12);
+    }
+
+    /* Done: copy the sanitized version. */
+    *nr_entries = nr;
+    memcpy(src, e820, nr * sizeof(struct e820entry));
+    return 0;
+}
+
+int libxl__e820_alloc(libxl_ctx *ctx, libxl_domain_build_info *b_info)
+{
+    int rc;
+    uint32_t nr;
+    libxl_e820 *p;
+    struct e820entry map[E820MAX];
+
+    if (b_info == NULL || b_info->hvm)
+        return ERROR_INVAL;
+
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+    if (rc < 0) {
+        errno = rc;
+        return ERROR_FAIL;
+    }
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc)
+        return ERROR_FAIL;
+
+    p = &b_info->u.pv.e820;
+    p->nr_entries = nr;
+    p->entry = calloc(nr, sizeof(struct e820entry));
+    if (!p->entry)
+        return ERROR_NOMEM;
+
+    memcpy(p->entry, map, nr * sizeof(struct e820entry));
+    return 0;
+}
+
+void libxl_e820_destroy(libxl_e820 *p)
+{
+    if (p) {
+        if (p->entry)
+            free(p->entry);
+        p->entry = NULL;
+        p->nr_entries = 0;
+    }
+}
diff -r e7057fec103b -r 75e24fb720fa tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/libxl/xl_cmdimpl.c Mon Apr 11 17:13:54 2011 -0400
@@ -1022,7 +1022,7 @@ skip_vfb:
             pcidev->msitranslate = pci_msitranslate;
             pcidev->power_mgmt = pci_power_mgmt;
-            if (!libxl_device_pci_parse_bdf(&ctx, pcidev, buf))
+            if (!libxl_device_pci_parse_bdf(&ctx, pcidev, buf, d_config))
                 d_config->num_pcidevs++;
         }
     }
@@ -2120,7 +2120,7 @@ static void pcidetach(const char *dom, c
     find_domain(dom);
 
     memset(&pcidev, 0x00, sizeof(pcidev));
-    if (libxl_device_pci_parse_bdf(&ctx, &pcidev, bdf)) {
+    if (libxl_device_pci_parse_bdf(&ctx, &pcidev, bdf, NULL)) {
         fprintf(stderr, "pci-detach: malformed BDF specification \"%s\"\n", bdf);
         exit(2);
     }
@@ -2165,7 +2165,7 @@ static void pciattach(const char *dom, c
     find_domain(dom);
 
     memset(&pcidev, 0x00, sizeof(pcidev));
-    if (libxl_device_pci_parse_bdf(&ctx, &pcidev, bdf)) {
+    if (libxl_device_pci_parse_bdf(&ctx, &pcidev, bdf, NULL)) {
         fprintf(stderr, "pci-attach: malformed BDF specification \"%s\"\n", bdf);
         exit(2);
     }
diff -r e7057fec103b -r 75e24fb720fa tools/python/xen/lowlevel/xl/xl.c
--- a/tools/python/xen/lowlevel/xl/xl.c Fri Apr 08 16:38:28 2011 -0400
+++ b/tools/python/xen/lowlevel/xl/xl.c Mon Apr 11 17:13:54 2011 -0400
@@ -544,7 +544,7 @@ static PyObject *pyxl_pci_parse(XlObject
         return NULL;
     }
 
-    if ( libxl_device_pci_parse_bdf(&self->ctx, &pci->obj, str) ) {
+    if ( libxl_device_pci_parse_bdf(&self->ctx, &pci->obj, str, NULL) ) {
         PyErr_SetString(xl_error_obj, "cannot parse pci device spec (BDF)");
         Py_DECREF(pci);
         return NULL;
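
For readers following the sanitisation logic above, here is a minimal,
self-contained sketch (not part of the patch) of the kind of guest-visible
E820 that e820_sanitize() is meant to build. The host layout and guest sizes
are invented for illustration: a target of 2GB, roughly 1GB of balloon/slack
space, and a host whose first E820_ACPI/E820_RESERVED region starts at 3GB.
The struct and type constants are spelled out locally so the sketch compiles
standalone, without pulling in the Xen headers.

/* Illustrative only: invented host layout and invented guest sizes,
 * showing the sanitized guest E820 described in the commit message. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define E820_RAM       1
#define E820_RESERVED  2
#define E820_ACPI      3
#define E820_NVS       4
#define E820_UNUSABLE  5

struct e820entry {
    uint64_t addr;
    uint64_t size;
    uint32_t type;
};

static const char *e820_type_name(uint32_t type)
{
    switch (type) {
    case E820_RAM:      return "RAM";
    case E820_RESERVED: return "RESERVED";
    case E820_ACPI:     return "ACPI";
    case E820_NVS:      return "NVS";
    case E820_UNUSABLE: return "UNUSABLE";
    default:            return "----";
    }
}

int main(void)
{
    /* Hypothetical guest: target_memkb of 2GB, maxmem of 3GB (so about
     * 1GB of balloon space), on a host whose first E820_ACPI/RESERVED
     * region starts at 3GB and runs up to 4GB. */
    static const struct e820entry guest_e820[] = {
        { 0x000000000ULL, 0x80000000ULL, E820_RAM      }, /* 2GB guest RAM */
        { 0x080000000ULL, 0x40000000ULL, E820_UNUSABLE }, /* gap up to 3GB, so the
                                                             guest does not treat it
                                                             as PCI I/O space */
        { 0x0C0000000ULL, 0x00010000ULL, E820_ACPI     }, /* copied from the host */
        { 0x0C0010000ULL, 0x3FFF0000ULL, E820_RESERVED }, /* copied from the host */
        { 0x100000000ULL, 0x40000000ULL, E820_RAM      }, /* balloon/slack RAM,
                                                             placed above 4GB */
    };
    unsigned int i;

    for (i = 0; i < sizeof(guest_e820) / sizeof(guest_e820[0]); i++)
        printf("[%010" PRIx64 " -> %010" PRIx64 "] %s\n",
               guest_e820[i].addr,
               guest_e820[i].addr + guest_e820[i].size,
               e820_type_name(guest_e820[i].type));
    return 0;
}

The two interesting entries are the E820_UNUSABLE one filling the 2GB-3GB
hole, so a Linux guest does not claim it as PCI I/O space, and the final
E820_RAM entry at 4GB that leaves the kernel room to balloon up to maxmem.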
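
A second, equally hypothetical sketch of the caller's side: only
libxl_e820_destroy() and the fields it touches come from the patch; the
helper name and the idea of calling it from a domain-config teardown path
are assumptions for illustration.

#include "libxl.h"

/* Hypothetical cleanup helper: frees the E820 map that
 * libxl_device_pci_parse_bdf() may have attached to a PV guest's build
 * info.  libxl_e820_destroy() frees the entry array and zeroes
 * nr_entries, so it is safe even if no map was ever populated. */
static void release_guest_e820(libxl_domain_config *d_config)
{
    if (!d_config->c_info.hvm)
        libxl_e820_destroy(&d_config->b_info.u.pv.e820);
}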