Discussion:
[Qemu-devel] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
Shameer Kolothum
2018-05-16 15:20:20 UTC
When the kernel reports the valid iova ranges as non-contiguous,
memory should be allocated to the Guest in such a way that the
reserved regions (holes) are not visible to the Guest.

This series retrieves the valid iova ranges based on the newly
proposed VFIO kernel interface [1]. If the valid iova ranges turn
out to be non-contiguous, it models the first 1GB of RAM as a
non-pluggable dimm and the rest as a cold-plugged pc-dimm.
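
As a purely hypothetical illustration (all addresses below are invented
for the example), a host whose IOMMU reserves a window inside the guest
RAM region could end up with a memory model like:

  valid iova ranges : [0x40000000 - 0x7fffffff], [0x90000000 - ...]
  guest memory model: 1GB non-pluggable RAM at 0x40000000
                      remaining RAM as a cold-plugged pc-dimm at 0x90000000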

Patch #3 of this series is loosely based on an earlier attempt
by Kwangwoo Lee to add hotplug/pc-dimm support to arm64 [2].

RFC v1 [3] --> RFC v2
- Based on the new VFIO kernel interface
- Part of the memory is modeled as a pc-dimm
- Rebased to qemu 2.12.0

[1] https://lkml.org/lkml/2018/4/18/293
[2] https://lists.gnu.org/archive/html/qemu-devel/2016-07/msg04600.html
[3] https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg02412.html

Shameer Kolothum (6):
hw/vfio: Retrieve valid iova ranges from kernel
hw/arm/virt: Enable dynamic generation of guest RAM memory regions
hw/arm/virt: Add pc-dimm mem hotplug framework
hw/arm: Changes required to accommodate non-contiguous DT mem nodes
hw/arm: ACPI SRAT changes to accommodate non-contiguous mem
hw/arm: Populate non-contiguous memory regions

default-configs/aarch64-softmmu.mak | 1 +
hw/arm/boot.c | 91 ++++++---
hw/arm/virt-acpi-build.c | 24 ++-
hw/arm/virt.c | 367 +++++++++++++++++++++++++++++++++++-
hw/vfio/common.c | 108 ++++++++++-
include/hw/arm/arm.h | 12 ++
include/hw/arm/virt.h | 3 +
include/hw/vfio/vfio-common.h | 7 +
linux-headers/linux/vfio.h | 23 +++
9 files changed, 589 insertions(+), 47 deletions(-)
--
2.7.4
Shameer Kolothum
2018-05-16 15:20:22 UTC
Register a ram_memory_region_init machine-init-done notifier and move
the allocation of the main RAM memory region from machvirt_init() into
it, so that later patches can take the valid iova ranges (known only
once the devices are realized) into account when placing guest RAM.

Signed-off-by: Zhu Yijun <***@huawei.com>
Signed-off-by: Shameer Kolothum <***@huawei.com>
---
hw/arm/virt.c | 28 ++++++++++++++++++++++------
include/hw/arm/virt.h | 1 +
2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 94dcb12..05fcb62 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1171,6 +1171,19 @@ void virt_machine_done(Notifier *notifier, void *data)
virt_build_smbios(vms);
}

+static void virt_ram_memory_region_init(Notifier *notifier, void *data)
+{
+ MemoryRegion *sysmem = get_system_memory();
+ MemoryRegion *ram = g_new(MemoryRegion, 1);
+ VirtMachineState *vms = container_of(notifier, VirtMachineState,
+ ram_memory_region_init);
+ MachineState *machine = MACHINE(vms);
+
+ memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram",
+ machine->ram_size);
+ memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
+}
+
static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
{
uint8_t clustersz = ARM_DEFAULT_CPUS_PER_CLUSTER;
@@ -1204,7 +1217,6 @@ static void machvirt_init(MachineState *machine)
MemoryRegion *sysmem = get_system_memory();
MemoryRegion *secure_sysmem = NULL;
int n, virt_max_cpus;
- MemoryRegion *ram = g_new(MemoryRegion, 1);
bool firmware_loaded = bios_name || drive_get(IF_PFLASH, 0, 0);

/* We can probe only here because during property set
@@ -1361,10 +1373,6 @@ static void machvirt_init(MachineState *machine)
fdt_add_timer_nodes(vms);
fdt_add_cpu_nodes(vms);

- memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram",
- machine->ram_size);
- memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
-
create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem);

create_gic(vms, pic);
@@ -1405,15 +1413,23 @@ static void machvirt_init(MachineState *machine)
vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base;
vms->bootinfo.get_dtb = machvirt_dtb;
vms->bootinfo.firmware_loaded = firmware_loaded;
+
+ /* Register notifiers. They are executed in reverse registration order */
arm_load_kernel(ARM_CPU(first_cpu), &vms->bootinfo);

/*
* arm_load_kernel machine init done notifier registration must
* happen before the platform_bus_create call. In this latter,
* another notifier is registered which adds platform bus nodes.
- * Notifiers are executed in registration reverse order.
*/
create_platform_bus(vms, pic);
+
+ /*
+ * Register memory region notifier last as this has to be executed
+ * first.
+ */
+ vms->ram_memory_region_init.notify = virt_ram_memory_region_init;
+ qemu_add_machine_init_done_notifier(&vms->ram_memory_region_init);
}

static bool virt_get_secure(Object *obj, Error **errp)
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index ba0c1a4..fc24f3a 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -91,6 +91,7 @@ typedef struct {
typedef struct {
MachineState parent;
Notifier machine_done;
+ Notifier ram_memory_region_init;
FWCfgState *fw_cfg;
bool secure;
bool highmem;
--
2.7.4
Shameer Kolothum
2018-05-16 15:20:26 UTC
If the valid iova regions are non-contiguous, split the guest RAM
into a 1GB non-pluggable dimm and model the remaining memory as a
single cold-plugged pc-dimm.
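
For reference, the memory backend and pc-dimm device that
create_pcdimms() below synthesizes through qdicts are roughly what the
following command line options would create (the size here is only an
example):

  -object memory-backend-ram,id=mem1,size=3072M
  -device pc-dimm,id=dimm1,memdev=mem1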

Signed-off-by: Shameer Kolothum <***@huawei.com>
---
hw/arm/virt.c | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 256 insertions(+), 5 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index be3ad14..562e389 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -58,6 +58,12 @@
#include "hw/smbios/smbios.h"
#include "qapi/visitor.h"
#include "standard-headers/linux/input.h"
+#include "hw/vfio/vfio-common.h"
+#include "qemu/config-file.h"
+#include "monitor/qdev.h"
+#include "qom/object_interfaces.h"
+#include "qapi/qmp/qdict.h"
+#include "qemu/option.h"

#define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
@@ -110,7 +116,10 @@ static ARMPlatformBusSystemParams platform_bus_params;
* terabyte of physical address space.)
*/
#define RAMLIMIT_GB 255
-#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024)
+#define SZ_1G (1024ULL * 1024 * 1024)
+#define RAMLIMIT_BYTES (RAMLIMIT_GB * SZ_1G)
+
+#define ALIGN_1G (1ULL << 30)

/* Addresses and sizes of our components.
* 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
@@ -1171,6 +1180,236 @@ void virt_machine_done(Notifier *notifier, void *data)
virt_build_smbios(vms);
}

+static void free_iova_copy(struct vfio_iova_head *iova_copy)
+{
+ VFIOIovaRange *iova, *tmp;
+
+ QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) {
+ QLIST_REMOVE(iova, next);
+ g_free(iova);
+ }
+}
+
+static void get_iova_copy(struct vfio_iova_head *iova_copy)
+{
+ VFIOIovaRange *iova, *new, *prev_iova = NULL;
+
+ QLIST_FOREACH(iova, &vfio_iova_regions, next) {
+ new = g_malloc0(sizeof(*iova));
+ new->start = iova->start;
+ new->end = iova->end;
+
+ if (prev_iova) {
+ QLIST_INSERT_AFTER(prev_iova, new, next);
+ } else {
+ QLIST_INSERT_HEAD(iova_copy, new, next);
+ }
+ prev_iova = new;
+ }
+}
+
+static hwaddr find_memory_chunk(VirtMachineState *vms,
+ struct vfio_iova_head *iova_copy,
+ hwaddr req_size, bool pcdimm)
+{
+ VFIOIovaRange *iova, *tmp;
+ hwaddr new_start, new_size, sz_align;
+ hwaddr virt_start = vms->memmap[VIRT_MEM].base;
+ hwaddr addr_align = ALIGN_1G; /* Set to max ARM64 hugepage size */
+
+ /* Size alignment */
+ sz_align = (pcdimm) ? MAX(TARGET_PAGE_SIZE, QEMU_VMALLOC_ALIGN) :
+ TARGET_PAGE_SIZE;
+
+ QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) {
+ if (virt_start >= iova->end) {
+ continue;
+ }
+
+ /* Align addr */
+ new_start = ROUND_UP(MAX(virt_start, iova->start), addr_align);
+ if (new_start >= iova->end) {
+ continue;
+ }
+
+ if (req_size > iova->end - new_start + 1) {
+ continue;
+ }
+
+ /*
+ * Check that the size-aligned request still fits within the region.
+ */
+ new_size = QEMU_ALIGN_UP(req_size, sz_align);
+
+ if ((new_start + new_size - 1 > iova->end) ||
+ (new_start + new_size >= virt_start + RAMLIMIT_BYTES)) {
+ continue;
+ }
+
+ /*
+ * Modify the iova list entry in the non pc-dimm case so that it
+ * is not used again for the pc-dimm allocation.
+ */
+ if (!pcdimm) {
+ if (new_size - req_size) {
+ iova->start = new_start + new_size;
+ } else {
+ QLIST_REMOVE(iova, next);
+ }
+ }
+ return new_start;
+ }
+
+ return -1;
+}
+
+static void update_memory_regions(VirtMachineState *vms)
+{
+ hwaddr addr;
+ VFIOIovaRange *iova;
+ MachineState *machine = MACHINE(vms);
+ hwaddr virt_start = vms->memmap[VIRT_MEM].base;
+ hwaddr req_size, ram_size = machine->ram_size;
+ struct vfio_iova_head iova_copy = QLIST_HEAD_INITIALIZER(iova_copy);
+
+ /* No valid iova regions, use default */
+ if (QLIST_EMPTY(&vfio_iova_regions)) {
+ vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base;
+ vms->bootinfo.ram_size = ram_size;
+ return;
+ }
+
+ /*
+ * If the valid iova list has only one entry, check that the requested
+ * size fits in it and the loader start can be below 4GB. This makes
+ * sure platforms with no holes in memory keep the same model as before.
+ */
+ req_size = ram_size;
+ iova = QLIST_NEXT(QLIST_FIRST(&vfio_iova_regions), next);
+ if (!iova) {
+ iova = QLIST_FIRST(&vfio_iova_regions);
+ addr = ROUND_UP(MAX(virt_start, iova->start), ALIGN_1G);
+ if ((addr < 4 * SZ_1G) && (ram_size <= iova->end - addr + 1) &&
+ (addr + ram_size < virt_start + RAMLIMIT_BYTES)) {
+ vms->bootinfo.loader_start = addr;
+ vms->bootinfo.ram_size = ram_size;
+ return;
+ }
+ }
+
+ /* Get a copy of valid iovas and work on it */
+ get_iova_copy(&iova_copy);
+
+ /* Split the mem: the first 1GB as non-pluggable, the rest as a pc-dimm */
+ req_size = MIN(ram_size, SZ_1G);
+ addr = find_memory_chunk(vms, &iova_copy, req_size, false);
+ if (addr == -1 || addr >= 4 * SZ_1G) {
+ goto out;
+ }
+
+ /* Update non-pluggable mem details */
+ machine->ram_size = req_size;
+ vms->bootinfo.loader_start = addr;
+ vms->bootinfo.ram_size = machine->ram_size;
+
+ req_size = ram_size - req_size;
+ if (!req_size) {
+ goto done;
+ }
+
+ /* Remaining memory is modeled as a pc-dimm. */
+ addr = find_memory_chunk(vms, &iova_copy, req_size, true);
+ if (addr == -1) {
+ goto out;
+ }
+
+ /* Update pc-dimm mem details */
+ vms->bootinfo.dimm_mem = g_new(struct dimm_mem_info, 1);
+ vms->bootinfo.dimm_mem->base = addr;
+ vms->bootinfo.dimm_mem->size = req_size;
+ machine->maxram_size = machine->ram_size + req_size;
+ machine->ram_slots += 1;
+
+done:
+ free_iova_copy(&iova_copy);
+ return;
+
+out:
+ free_iova_copy(&iova_copy);
+ error_report("mach-virt: Not enough contiguous memory to model ram");
+ exit(1);
+}
+
+static void create_pcdimms(VirtMachineState *vms,
+ MemoryRegion *sysmem,
+ MemoryRegion *ram)
+{
+ hwaddr addr, size;
+ Error *local_err = NULL;
+ QDict *qdict;
+ QemuOpts *opts;
+ char *tmp;
+
+ if (!vms->bootinfo.dimm_mem) {
+ return;
+ }
+
+ addr = vms->bootinfo.dimm_mem->base;
+ size = vms->bootinfo.dimm_mem->size;
+
+ /* Create hotplug address space */
+ vms->hotplug_memory.base = ROUND_UP(addr, ALIGN_1G);
+ size = ROUND_UP(size, MAX(TARGET_PAGE_SIZE, QEMU_VMALLOC_ALIGN));
+
+ memory_region_init(&vms->hotplug_memory.mr, OBJECT(vms),
+ "hotplug-memory", size);
+ memory_region_add_subregion(sysmem, vms->hotplug_memory.base,
+ &vms->hotplug_memory.mr);
+ /* Create backend mem object */
+ qdict = qdict_new();
+ qdict_put_str(qdict, "qom-type", "memory-backend-ram");
+ qdict_put_str(qdict, "id", "mem1");
+ tmp = g_strdup_printf("%"PRIu64 "M", size / (1024 * 1024));
+ qdict_put_str(qdict, "size", tmp);
+ g_free(tmp);
+
+ opts = qemu_opts_from_qdict(qemu_find_opts("object"), qdict, &local_err);
+ if (local_err) {
+ goto err;
+ }
+
+ user_creatable_add_opts(opts, &local_err);
+ qemu_opts_del(opts);
+ QDECREF(qdict);
+ if (local_err) {
+ goto err;
+ }
+
+ /* Create pc-dimm dev */
+ qdict = qdict_new();
+ qdict_put_str(qdict, "driver", "pc-dimm");
+ qdict_put_str(qdict, "id", "dimm1");
+ qdict_put_str(qdict, "memdev", "mem1");
+
+ opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, &local_err);
+ if (local_err) {
+ goto err;
+ }
+
+ qdev_device_add(opts, &local_err);
+ qemu_opts_del(opts);
+ QDECREF(qdict);
+ if (local_err) {
+ goto err;
+ }
+
+ return;
+
+err:
+ error_report_err(local_err);
+ exit(1);
+}
+
static void virt_ram_memory_region_init(Notifier *notifier, void *data)
{
MemoryRegion *sysmem = get_system_memory();
@@ -1179,9 +1418,14 @@ static void virt_ram_memory_region_init(Notifier *notifier, void *data)
ram_memory_region_init);
MachineState *machine = MACHINE(vms);

+ update_memory_regions(vms);
memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram",
machine->ram_size);
- memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
+ memory_region_add_subregion(sysmem, vms->bootinfo.loader_start, ram);
+
+ if (vms->bootinfo.dimm_mem) {
+ create_pcdimms(vms, sysmem, ram);
+ }
}

static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
@@ -1404,13 +1648,11 @@ static void machvirt_init(MachineState *machine)
vms->machine_done.notify = virt_machine_done;
qemu_add_machine_init_done_notifier(&vms->machine_done);

- vms->bootinfo.ram_size = machine->ram_size;
vms->bootinfo.kernel_filename = machine->kernel_filename;
vms->bootinfo.kernel_cmdline = machine->kernel_cmdline;
vms->bootinfo.initrd_filename = machine->initrd_filename;
vms->bootinfo.nb_cpus = smp_cpus;
vms->bootinfo.board_id = -1;
- vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base;
vms->bootinfo.get_dtb = machvirt_dtb;
vms->bootinfo.firmware_loaded = firmware_loaded;

@@ -1559,7 +1801,7 @@ static void virt_dimm_plug(HotplugHandler *hotplug_dev,
PCDIMMDevice *dimm = PC_DIMM(dev);
PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
MemoryRegion *mr;
- uint64_t align;
+ uint64_t align, addr;
Error *local_err = NULL;

mr = ddc->get_memory_region(dimm, &local_err);
@@ -1573,6 +1815,15 @@ static void virt_dimm_plug(HotplugHandler *hotplug_dev,
goto out;
}

+ addr = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
+ &error_fatal);
+ /* Assign the node for pc-dimm initial ram */
+ if (vms->bootinfo.dimm_mem && (addr == vms->bootinfo.dimm_mem->base)
+ && (nb_numa_nodes > 0)) {
+ vms->bootinfo.dimm_mem->node = object_property_get_uint(OBJECT(dev),
+ PC_DIMM_NODE_PROP, &error_fatal);
+ }
+
out:
error_propagate(errp, local_err);
}
--
2.7.4
Shameer Kolothum
2018-05-16 15:20:24 UTC
This reworks the DT memory node creation so that it is easier to
later add non-contiguous memory modeled as a non-pluggable region
plus a pc-dimm region.
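
To make the intent concrete, assuming a hypothetical split into a
non-pluggable chunk at 0x40000000 and a pc-dimm chunk at 0x90000000
(addresses invented for the example, NUMA configured), the generated
DT would carry one memory node per chunk along these lines:

  /memory@40000000 {
      device_type = "memory";
      reg = <0x0 0x40000000 0x0 0x40000000>;
      numa-node-id = <0x0>;
  };

  /memory@90000000 {
      device_type = "memory";
      reg = <0x0 0x90000000 0x0 0xc0000000>;
      numa-node-id = <0x0>;
  };

(numa-node-id is only emitted when NUMA nodes are configured.)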

Signed-off-by: Shameer Kolothum <***@huawei.com>
---
hw/arm/boot.c | 91 ++++++++++++++++++++++++++++++++++++----------------
include/hw/arm/arm.h | 12 +++++++
2 files changed, 75 insertions(+), 28 deletions(-)

diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 26184bc..73db0aa 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -486,6 +486,27 @@ static void fdt_add_psci_node(void *fdt)
qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn);
}

+static char *create_memory_fdt(void *fdt, uint32_t acells, hwaddr mem_base,
+ uint32_t scells, hwaddr mem_len)
+{
+ char *nodename = NULL;
+ int rc;
+
+ nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
+ qemu_fdt_add_subnode(fdt, nodename);
+ qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
+ rc = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg", acells, mem_base,
+ scells, mem_len);
+ if (rc < 0) {
+ fprintf(stderr, "couldn't set %s/reg\n", nodename);
+ g_free(nodename);
+ return NULL;
+ }
+
+ return nodename;
+}
+
+
/**
* load_dtb() - load a device tree binary image into memory
* @addr: the address to load the image at
@@ -567,50 +588,64 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
goto fail;
}

+ /*
+ * Turn the /memory node created before into a NOP node, then create
+ * /memory@addr nodes for all numa nodes respectively.
+ */
+ qemu_fdt_nop_node(fdt, "/memory");
+
if (nb_numa_nodes > 0) {
- /*
- * Turn the /memory node created before into a NOP node, then create
- * /memory@addr nodes for all numa nodes respectively.
- */
- qemu_fdt_nop_node(fdt, "/memory");
+ hwaddr mem_sz;
+
mem_base = binfo->loader_start;
+ mem_sz = binfo->ram_size;
for (i = 0; i < nb_numa_nodes; i++) {
- mem_len = numa_info[i].node_mem;
- nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
- qemu_fdt_add_subnode(fdt, nodename);
- qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
- rc = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg",
- acells, mem_base,
+ mem_len = MIN(numa_info[i].node_mem, mem_sz);
+
+ nodename = create_memory_fdt(fdt, acells, mem_base,
scells, mem_len);
- if (rc < 0) {
- fprintf(stderr, "couldn't set %s/reg for node %d\n", nodename,
- i);
+ if (!nodename) {
goto fail;
}

qemu_fdt_setprop_cell(fdt, nodename, "numa-node-id", i);
- mem_base += mem_len;
g_free(nodename);
+ mem_base += mem_len;
+ mem_sz -= mem_len;
+ if (!mem_sz) {
+ break;
+ }
}
- } else {
- Error *err = NULL;

- rc = fdt_path_offset(fdt, "/memory");
- if (rc < 0) {
- qemu_fdt_add_subnode(fdt, "/memory");
- }
+ /* Create the node for initial pc-dimm ram, if any */
+ if (binfo->dimm_mem) {

- if (!qemu_fdt_getprop(fdt, "/memory", "device_type", NULL, &err)) {
- qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory");
+ nodename = create_memory_fdt(fdt, acells, binfo->dimm_mem->base,
+ scells, binfo->dimm_mem->size);
+ if (!nodename) {
+ goto fail;
+ }
+ qemu_fdt_setprop_cell(fdt, nodename, "numa-node-id",
+ binfo->dimm_mem->node);
+ g_free(nodename);
}

- rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
- acells, binfo->loader_start,
- scells, binfo->ram_size);
- if (rc < 0) {
- fprintf(stderr, "couldn't set /memory/reg\n");
+ } else {
+
+ nodename = create_memory_fdt(fdt, acells, binfo->loader_start,
+ scells, binfo->ram_size);
+ if (!nodename) {
goto fail;
}
+
+ if (binfo->dimm_mem) {
+ nodename = create_memory_fdt(fdt, acells, binfo->dimm_mem->base,
+ scells, binfo->dimm_mem->size);
+ if (!nodename) {
+ goto fail;
+ }
+ g_free(nodename);
+ }
}

rc = fdt_path_offset(fdt, "/chosen");
diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
index ce769bd..0ee3b4e 100644
--- a/include/hw/arm/arm.h
+++ b/include/hw/arm/arm.h
@@ -48,6 +48,12 @@ typedef struct {
ARMCPU *cpu; /* handle to the first cpu object */
} ArmLoadKernelNotifier;

+struct dimm_mem_info {
+ int node;
+ hwaddr base;
+ hwaddr size;
+};
+
/* arm_boot.c */
struct arm_boot_info {
uint64_t ram_size;
@@ -124,6 +130,12 @@ struct arm_boot_info {
bool secure_board_setup;

arm_endianness endianness;
+
+ /* This is used to model a pc-dimm based mem if the valid iova regions
+ * are non-contiguous.
+ */
+ struct dimm_mem_info *dimm_mem;
+
};

/**
--
2.7.4
Shameer Kolothum
2018-05-16 15:20:25 UTC
This is in preparation for the next patch, where the initial RAM is
split into a non-pluggable chunk and a pc-dimm modeled chunk if the
valid iova regions are non-contiguous.
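
Assuming the same hypothetical split used in the cover letter (a
non-pluggable chunk at 0x40000000 and a pc-dimm chunk at 0x90000000),
the SRAT would then end up with one memory affinity structure per
chunk, roughly:

  Memory Affinity: base 0x40000000, length 0x40000000, proximity domain 0, enabled
  Memory Affinity: base 0x90000000, length 0xc0000000, proximity domain 0, enabled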

Signed-off-by: Shameer Kolothum <***@huawei.com>
---
hw/arm/virt-acpi-build.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index c7c6a57..8d17b40 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -488,7 +488,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
AcpiSratProcessorGiccAffinity *core;
AcpiSratMemoryAffinity *numamem;
int i, srat_start;
- uint64_t mem_base;
+ uint64_t mem_base, mem_sz, mem_len;
MachineClass *mc = MACHINE_GET_CLASS(vms);
const CPUArchIdList *cpu_list = mc->possible_cpu_arch_ids(MACHINE(vms));

@@ -505,12 +505,28 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
core->flags = cpu_to_le32(1);
}

- mem_base = vms->memmap[VIRT_MEM].base;
+ mem_base = vms->bootinfo.loader_start;
+ mem_sz = vms->bootinfo.ram_size;
for (i = 0; i < nb_numa_nodes; ++i) {
numamem = acpi_data_push(table_data, sizeof(*numamem));
- build_srat_memory(numamem, mem_base, numa_info[i].node_mem, i,
+ mem_len = MIN(numa_info[i].node_mem, mem_sz);
+ build_srat_memory(numamem, mem_base, mem_len, i,
MEM_AFFINITY_ENABLED);
- mem_base += numa_info[i].node_mem;
+ mem_base += mem_len;
+ mem_sz -= mem_len;
+ if (!mem_sz) {
+ break;
+ }
+ }
+
+ /* Create the mem affinity entry for the initial pc-dimm ram, if any */
+ if (vms->bootinfo.dimm_mem) {
+ numamem = acpi_data_push(table_data, sizeof(*numamem));
+ build_srat_memory(numamem, vms->bootinfo.dimm_mem->base,
+ vms->bootinfo.dimm_mem->size,
+ vms->bootinfo.dimm_mem->node,
+ MEM_AFFINITY_ENABLED);
+
}

build_header(linker, table_data, (void *)(table_data->data + srat_start),
--
2.7.4
Shameer Kolothum
2018-05-16 15:20:21 UTC
This makes use of the newly introduced iova capability chains added
to the type1 VFIO_IOMMU_GET_INFO ioctl.

The retrieved valid iova ranges are stored in a list for later use.
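
Consumers elsewhere in QEMU can then walk the resulting list with the
usual QLIST helpers, along the lines of this sketch:

  VFIOIovaRange *range;

  QLIST_FOREACH(range, &vfio_iova_regions, next) {
      /* [range->start, range->end] is a usable iova window */
  }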

Signed-off-by: Shameer Kolothum <***@huawei.com>
---
hw/vfio/common.c | 108 +++++++++++++++++++++++++++++++++++++++---
include/hw/vfio/vfio-common.h | 7 +++
linux-headers/linux/vfio.h | 23 +++++++++
3 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 07ffa0b..94d7b24 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -40,6 +40,8 @@ struct vfio_group_head vfio_group_list =
QLIST_HEAD_INITIALIZER(vfio_group_list);
struct vfio_as_head vfio_address_spaces =
QLIST_HEAD_INITIALIZER(vfio_address_spaces);
+struct vfio_iova_head vfio_iova_regions =
+ QLIST_HEAD_INITIALIZER(vfio_iova_regions);

#ifdef CONFIG_KVM
/*
@@ -1030,6 +1032,85 @@ static void vfio_put_address_space(VFIOAddressSpace *space)
}
}

+static void vfio_iommu_get_iova_ranges(struct vfio_iommu_type1_info *info)
+{
+ struct vfio_info_cap_header *hdr;
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iova;
+ VFIOIovaRange *iova, *tmp, *prev = NULL;
+ void *ptr = info;
+ bool found = false;
+ int i;
+
+ if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
+ return;
+ }
+
+ for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
+ if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ return;
+ }
+
+ /* purge the current iova list, if any */
+ QLIST_FOREACH_SAFE(iova, &vfio_iova_regions, next, tmp) {
+ QLIST_REMOVE(iova, next);
+ g_free(iova);
+ }
+
+ cap_iova = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range,
+ header);
+
+ /* populate the list */
+ for (i = 0; i < cap_iova->nr_iovas; i++) {
+ iova = g_malloc0(sizeof(*iova));
+ iova->start = cap_iova->iova_ranges[i].start;
+ iova->end = cap_iova->iova_ranges[i].end;
+
+ if (prev) {
+ QLIST_INSERT_AFTER(prev, iova, next);
+ } else {
+ QLIST_INSERT_HEAD(&vfio_iova_regions, iova, next);
+ }
+ prev = iova;
+ }
+
+ return;
+}
+
+static int vfio_get_iommu_info(VFIOContainer *container,
+ struct vfio_iommu_type1_info **info)
+{
+
+ size_t argsz = sizeof(struct vfio_iommu_type1_info);
+
+
+ *info = g_malloc0(argsz);
+
+retry:
+ (*info)->argsz = argsz;
+
+ if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
+ g_free(*info);
+ *info = NULL;
+ return -errno;
+ }
+
+ if (((*info)->argsz > argsz)) {
+ argsz = (*info)->argsz;
+ *info = g_realloc(*info, argsz);
+ goto retry;
+ }
+
+ vfio_iommu_get_iova_ranges(*info);
+
+ return 0;
+}
+
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
@@ -1044,6 +1125,15 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
vfio_kvm_device_add_group(group);
+
+ /* A new group might change the valid iovas. Get the updated list */
+ if ((container->iommu_type == VFIO_TYPE1_IOMMU) ||
+ (container->iommu_type == VFIO_TYPE1v2_IOMMU)) {
+ struct vfio_iommu_type1_info *info;
+
+ vfio_get_iommu_info(container, &info);
+ g_free(info);
+ }
return 0;
}
}
@@ -1071,7 +1161,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
- struct vfio_iommu_type1_info info;
+ struct vfio_iommu_type1_info *info;

ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
if (ret) {
@@ -1095,14 +1185,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
* existing Type1 IOMMUs generally support any IOVA we're
* going to actually try in practice.
*/
- info.argsz = sizeof(info);
- ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
+ ret = vfio_get_iommu_info(container, &info);
/* Ignore errors */
- if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
+ if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
/* Assume 4k IOVA page size */
- info.iova_pgsizes = 4096;
+ info->iova_pgsizes = 4096;
}
- vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes);
+ vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
+ g_free(info);
} else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) ||
ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) {
struct vfio_iommu_spapr_tce_info info;
@@ -1256,6 +1346,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
if (QLIST_EMPTY(&container->group_list)) {
VFIOAddressSpace *space = container->space;
VFIOGuestIOMMU *giommu, *tmp;
+ VFIOIovaRange *iova, *next_iova;

QLIST_REMOVE(container, next);

@@ -1266,6 +1357,11 @@ static void vfio_disconnect_container(VFIOGroup *group)
g_free(giommu);
}

+ QLIST_FOREACH_SAFE(iova, &vfio_iova_regions, next, next_iova) {
+ QLIST_REMOVE(iova, next);
+ g_free(iova);
+ }
+
trace_vfio_disconnect_container(container->fd);
close(container->fd);
g_free(container);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index d936014..874fe2c 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -164,6 +164,12 @@ typedef struct VFIODisplay {
} dmabuf;
} VFIODisplay;

+typedef struct VFIOIovaRange {
+ uint64_t start;
+ uint64_t end;
+ QLIST_ENTRY(VFIOIovaRange) next;
+} VFIOIovaRange;
+
void vfio_put_base_device(VFIODevice *vbasedev);
void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
@@ -187,6 +193,7 @@ int vfio_get_device(VFIOGroup *group, const char *name,
extern const MemoryRegionOps vfio_region_ops;
extern QLIST_HEAD(vfio_group_head, VFIOGroup) vfio_group_list;
extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces;
+extern QLIST_HEAD(vfio_iova_head, VFIOIovaRange) vfio_iova_regions;

#ifdef CONFIG_LINUX
int vfio_get_region_info(VFIODevice *vbasedev, int index,
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 3a0a305..117341d 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -589,7 +589,30 @@ struct vfio_iommu_type1_info {
__u32 argsz;
__u32 flags;
#define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
+#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
__u64 iova_pgsizes; /* Bitmap of supported page sizes */
+ __u32 cap_offset; /* Offset within info struct of first cap */
+};
+
+/*
+ * The IOVA capability allows to report the valid IOVA range(s)
+ * excluding any reserved regions associated with dev group. Any dma
+ * map attempt outside the valid iova range will return error.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE 1
+
+struct vfio_iova_range {
+ __u64 start;
+ __u64 end;
+};
+
+struct vfio_iommu_type1_info_cap_iova_range {
+ struct vfio_info_cap_header header;
+ __u32 nr_iovas;
+ __u32 reserved;
+ struct vfio_iova_range iova_ranges[];
};

#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
--
2.7.4
Shameer Kolothum
2018-05-16 15:20:23 UTC
This will be used in subsequent patches to model a chunk of memory
as a pc-dimm (cold plug) if the valid iova regions are non-contiguous.
This is not yet full hotplug support.
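
For orientation only: once full hotplug support is eventually in place,
a cold-plugged dimm would presumably be described the same way it is on
x86, e.g.

  -m 1G,slots=2,maxmem=4G \
  -object memory-backend-ram,id=mem1,size=1G \
  -device pc-dimm,id=dimm1,memdev=mem1

This patch only wires up the hotplug handler plumbing; in this series
the dimm itself is created internally by the machine code.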

Signed-off-by: Shameer Kolothum <***@huawei.com>
---
default-configs/aarch64-softmmu.mak | 1 +
hw/arm/virt.c | 82 +++++++++++++++++++++++++++++++++++++
include/hw/arm/virt.h | 2 +
3 files changed, 85 insertions(+)

diff --git a/default-configs/aarch64-softmmu.mak b/default-configs/aarch64-softmmu.mak
index 9ddccf8..7a82ed8 100644
--- a/default-configs/aarch64-softmmu.mak
+++ b/default-configs/aarch64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_DDC=y
CONFIG_DPCD=y
CONFIG_XLNX_ZYNQMP=y
CONFIG_XLNX_ZYNQMP_ARM=y
+CONFIG_MEM_HOTPLUG=y
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 05fcb62..be3ad14 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1552,9 +1552,82 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms)
return ms->possible_cpus;
}

+static void virt_dimm_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
+ PCDIMMDevice *dimm = PC_DIMM(dev);
+ PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+ MemoryRegion *mr;
+ uint64_t align;
+ Error *local_err = NULL;
+
+ mr = ddc->get_memory_region(dimm, &local_err);
+ if (local_err) {
+ goto out;
+ }
+
+ align = memory_region_get_alignment(mr);
+ pc_dimm_memory_plug(dev, &vms->hotplug_memory, mr, align, &local_err);
+ if (local_err) {
+ goto out;
+ }
+
+out:
+ error_propagate(errp, local_err);
+}
+
+static void virt_dimm_unplug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
+ PCDIMMDevice *dimm = PC_DIMM(dev);
+ PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+ MemoryRegion *mr;
+ Error *local_err = NULL;
+
+ mr = ddc->get_memory_region(dimm, &local_err);
+ pc_dimm_memory_unplug(dev, &vms->hotplug_memory, mr);
+ object_unparent(OBJECT(dev));
+
+ error_propagate(errp, local_err);
+}
+
+static void virt_machinedevice_plug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ virt_dimm_plug(hotplug_dev, dev, errp);
+ } else {
+ error_setg(errp, "device plug request for not supported device"
+ " type: %s", object_get_typename(OBJECT(dev)));
+ }
+}
+
+static void virt_machinedevice_unplug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ virt_dimm_unplug(hotplug_dev, dev, errp);
+ } else {
+ error_setg(errp, "device unplug for not supported device"
+ " type: %s", object_get_typename(OBJECT(dev)));
+ }
+}
+
+static HotplugHandler *virt_get_hotplug_handler(MachineState *machine,
+ DeviceState *dev)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ return HOTPLUG_HANDLER(machine);
+ }
+ return NULL;
+}
+
static void virt_machine_class_init(ObjectClass *oc, void *data)
{
MachineClass *mc = MACHINE_CLASS(oc);
+ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);

mc->init = machvirt_init;
/* Start max_cpus at the maximum QEMU supports. We'll further restrict
@@ -1573,6 +1646,11 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
mc->cpu_index_to_instance_props = virt_cpu_index_to_props;
mc->default_cpu_type = ARM_CPU_TYPE_NAME("cortex-a15");
mc->get_default_cpu_node_id = virt_get_default_cpu_node_id;
+
+ mc->get_hotplug_handler = virt_get_hotplug_handler;
+ hc->plug = virt_machinedevice_plug_cb;
+ hc->unplug = virt_machinedevice_unplug_cb;
+
}

static const TypeInfo virt_machine_info = {
@@ -1582,6 +1660,10 @@ static const TypeInfo virt_machine_info = {
.instance_size = sizeof(VirtMachineState),
.class_size = sizeof(VirtMachineClass),
.class_init = virt_machine_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_HOTPLUG_HANDLER },
+ { }
+ },
};

static void machvirt_machine_init(void)
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index fc24f3a..a39f29e 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -35,6 +35,7 @@
#include "qemu/notify.h"
#include "hw/boards.h"
#include "hw/arm/arm.h"
+#include "hw/mem/pc-dimm.h"

#define NUM_GICV2M_SPIS 64
#define NUM_VIRTIO_TRANSPORTS 32
@@ -108,6 +109,7 @@ typedef struct {
uint32_t gic_phandle;
uint32_t msi_phandle;
int psci_conduit;
+ MemoryHotplugState hotplug_memory;
} VirtMachineState;

#define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt")
--
2.7.4