[RFC PATCH RDMA support v4: 00/10] cleaner ramblocks and documentation

From: "Michael R. Hines" <***@us.ibm.com>

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
include/qemu/sockets.h | 1 +
util/qemu-sockets.c | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index ae5c21c..5066fca 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -48,6 +48,7 @@ typedef void NonBlockingConnectHandler(int fd, void *opaque);
int inet_listen_opts(QemuOpts *opts, int port_offset, Error **errp);
int inet_listen(const char *str, char *ostr, int olen,
int socktype, int port_offset, Error **errp);
+InetSocketAddress *inet_parse(const char *str, Error **errp);
int inet_connect_opts(QemuOpts *opts, Error **errp,
NonBlockingConnectHandler *callback, void *opaque);
int inet_connect(const char *str, Error **errp);
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 83e4e08..6b60b63 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -485,7 +485,7 @@ err:
}

/* compatibility wrapper */
-static InetSocketAddress *inet_parse(const char *str, Error **errp)
+InetSocketAddress *inet_parse(const char *str, Error **errp)
{
InetSocketAddress *addr;
const char *optstr, *h;

--
1.7.10.4

m***@linux.vnet.ibm.com

2013-03-18 03:19:00 UTC

From: "Michael R. Hines" <***@us.ibm.com>

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
migration-rdma.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 205 insertions(+)
create mode 100644 migration-rdma.c

diff --git a/migration-rdma.c b/migration-rdma.c
new file mode 100644
index 0000000..e1ea055
--- /dev/null
+++ b/migration-rdma.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2013 Michael R. Hines <***@us.ibm.com>
+ * Copyright (C) 2013 Jiuxing Liu <***@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "migration/rdma.h"
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+//#define DEBUG_MIGRATION_RDMA
+
+#ifdef DEBUG_MIGRATION_RDMA
+#define DPRINTF(fmt, ...) \
+ do { printf("migration-rdma: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+ do { } while (0)
+#endif
+
+static int rdma_accept_incoming_migration(RDMAData *rdma, Error **errp)
+{
+ int ret;
+
+ ret = qemu_rdma_migrate_listen(rdma, rdma->host, rdma->port);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error listening!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_alloc_qp(&rdma->rdma_ctx);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error allocating qp!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_migrate_accept(&rdma->rdma_ctx, NULL, NULL, NULL, 0);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error accepting connection!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_post_recv_qemu_file(rdma);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error posting second qemu file recv!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_post_send_remote_info(rdma);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error sending remote info!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_wait_for_wrid(rdma, RDMA_WRID_SEND_REMOTE_INFO);
+ if (ret < 0) {
+ qemu_rdma_print("rdma migration: polling remote info error!");
+ goto err_rdma_server_wait;
+ }
+
+ rdma->total_bytes = 0;
+ rdma->enabled = 1;
+ qemu_rdma_dump_gid("server_connect", rdma->rdma_ctx.cm_id);
+ return 0;
+
+err_rdma_server_wait:
+ qemu_rdma_cleanup(rdma);
+ return -1;
+
+}
+
+int rdma_start_incoming_migration(const char * host_port, Error **errp)
+{
+ RDMAData *rdma = g_malloc0(sizeof(RDMAData));
+ QEMUFile *f;
+ int ret;
+
+ if ((ret = qemu_rdma_data_init(rdma, host_port, errp)) < 0)
+ return ret;
+
+ ret = qemu_rdma_server_init(rdma, NULL);
+
+ DPRINTF("Starting RDMA-based incoming migration\n");
+
+ if (!ret) {
+ DPRINTF("qemu_rdma_server_init success\n");
+ ret = qemu_rdma_server_prepare(rdma, NULL);
+
+ if (!ret) {
+ DPRINTF("qemu_rdma_server_prepare success\n");
+
+ ret = rdma_accept_incoming_migration(rdma, NULL);
+ if(!ret)
+ DPRINTF("qemu_rdma_accept_incoming_migration success\n");
+ f = qemu_fopen_rdma(rdma, "rb");
+ if (f == NULL) {
+ fprintf(stderr, "could not qemu_fopen RDMA\n");
+ ret = -EIO;
+ }
+
+ process_incoming_migration(f);
+ }
+ }
+
+ return ret;
+}
+
+void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **errp)
+{
+ RDMAData *rdma = g_malloc0(sizeof(RDMAData));
+ MigrationState *s = opaque;
+ int ret;
+
+ if (qemu_rdma_data_init(rdma, host_port, errp) < 0)
+ return;
+
+ ret = qemu_rdma_client_init(rdma, NULL);
+ if(!ret) {
+ DPRINTF("qemu_rdma_client_init success\n");
+ ret = qemu_rdma_client_connect(rdma, NULL);
+
+ if(!ret) {
+ s->file = qemu_fopen_rdma(rdma, "wb");
+ DPRINTF("qemu_rdma_client_connect success\n");
+ migrate_fd_connect(s);
+ return;
+ }
+ }
+
+ migrate_fd_error(s);
+}
+
+size_t save_rdma_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, int cont, size_t size)
+{
+ int ret;
+ size_t bytes_sent = 0;
+ ram_addr_t current_addr;
+ RDMAData * rdma = migrate_use_rdma(f);
+
+ current_addr = block_offset + offset;
+
+ /*
+ * Add this page to the current 'chunk'. If the chunk
+ * is full, an actual RDMA write will occur.
+ */
+ if ((ret = qemu_rdma_write(rdma, current_addr, size)) < 0) {
+ fprintf(stderr, "rdma migration: write error! %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Drain the Completion Queue if possible.
+ * If not, the end of the iteration will do this
+ * again to make sure we don't overflow the
+ * request queue.
+ */
+ while (1) {
+ int ret = qemu_rdma_poll(rdma);
+ if (ret == RDMA_WRID_NONE) {
+ break;
+ }
+ if (ret < 0) {
+ fprintf(stderr, "rdma migration: polling error! %d\n", ret);
+ return ret;
+ }
+ }
+
+ bytes_sent += size;
+ return bytes_sent;
+}
+
+size_t qemu_rdma_fill(void * opaque, uint8_t *buf, int size)
+{
+ RDMAData * rdma = opaque;
+ size_t len = 0;
+
+ if(rdma->qemu_file_len) {
+ DPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
+ rdma->qemu_file_len, size);
+
+ len = MIN(size, rdma->qemu_file_len);
+ memcpy(buf, rdma->qemu_file_curr, len);
+ rdma->qemu_file_curr += len;
+ rdma->qemu_file_len -= len;
+ }
+
+ return len;
+}

--
1.7.10.4

Paolo Bonzini

2013-03-18 08:56:40 UTC

Post by m***@linux.vnet.ibm.com
---
migration-rdma.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 205 insertions(+)
create mode 100644 migration-rdma.c
diff --git a/migration-rdma.c b/migration-rdma.c
new file mode 100644
index 0000000..e1ea055
--- /dev/null
+++ b/migration-rdma.c
@@ -0,0 +1,205 @@
+/*
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "migration/rdma.h"
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+//#define DEBUG_MIGRATION_RDMA
+
+#ifdef DEBUG_MIGRATION_RDMA
+#define DPRINTF(fmt, ...) \
+ do { printf("migration-rdma: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+ do { } while (0)
+#endif
+
+static int rdma_accept_incoming_migration(RDMAData *rdma, Error **errp)
+{
+ int ret;
+
+ ret = qemu_rdma_migrate_listen(rdma, rdma->host, rdma->port);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error listening!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_alloc_qp(&rdma->rdma_ctx);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error allocating qp!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_migrate_accept(&rdma->rdma_ctx, NULL, NULL, NULL, 0);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error accepting connection!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_post_recv_qemu_file(rdma);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error posting second qemu file recv!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_post_send_remote_info(rdma);
+ if (ret) {
+ qemu_rdma_print("rdma migration: error sending remote info!");
+ goto err_rdma_server_wait;
+ }
+
+ ret = qemu_rdma_wait_for_wrid(rdma, RDMA_WRID_SEND_REMOTE_INFO);
+ if (ret < 0) {
+ qemu_rdma_print("rdma migration: polling remote info error!");
+ goto err_rdma_server_wait;
+ }

In a "socket-like" abstraction, all of these steps except the initial
listen are part of "accept". Please move them to
qemu_rdma_migrate_accept (possibly renaming the existing
qemu_rdma_migrate_accept to a different name).

Similarly, perhaps you can merge qemu_rdma_server_prepare and
qemu_rdma_migrate_listen.

Try to make the public API between modules as small as possible (but not
smaller :)), so that you can easily document it without too many
references to RDMA concepts.

Thanks,

Paolo

Post by m***@linux.vnet.ibm.com
+ rdma->total_bytes = 0;
+ rdma->enabled = 1;
+ qemu_rdma_dump_gid("server_connect", rdma->rdma_ctx.cm_id);
+ return 0;
+
+ qemu_rdma_cleanup(rdma);
+ return -1;
+
+}
+
+int rdma_start_incoming_migration(const char * host_port, Error **errp)
+{
+ RDMAData *rdma = g_malloc0(sizeof(RDMAData));
+ QEMUFile *f;
+ int ret;
+
+ if ((ret = qemu_rdma_data_init(rdma, host_port, errp)) < 0)
+ return ret;
+
+ ret = qemu_rdma_server_init(rdma, NULL);
+
+ DPRINTF("Starting RDMA-based incoming migration\n");
+
+ if (!ret) {
+ DPRINTF("qemu_rdma_server_init success\n");
+ ret = qemu_rdma_server_prepare(rdma, NULL);
+
+ if (!ret) {
+ DPRINTF("qemu_rdma_server_prepare success\n");
+
+ ret = rdma_accept_incoming_migration(rdma, NULL);
+ if(!ret)
+ DPRINTF("qemu_rdma_accept_incoming_migration success\n");
+ f = qemu_fopen_rdma(rdma, "rb");
+ if (f == NULL) {
+ fprintf(stderr, "could not qemu_fopen RDMA\n");
+ ret = -EIO;
+ }
+
+ process_incoming_migration(f);
+ }
+ }
+
+ return ret;
+}
+
+void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **errp)
+{
+ RDMAData *rdma = g_malloc0(sizeof(RDMAData));
+ MigrationState *s = opaque;
+ int ret;
+
+ if (qemu_rdma_data_init(rdma, host_port, errp) < 0)
+ return;
+
+ ret = qemu_rdma_client_init(rdma, NULL);
+ if(!ret) {
+ DPRINTF("qemu_rdma_client_init success\n");
+ ret = qemu_rdma_client_connect(rdma, NULL);
+
+ if(!ret) {
+ s->file = qemu_fopen_rdma(rdma, "wb");
+ DPRINTF("qemu_rdma_client_connect success\n");
+ migrate_fd_connect(s);
+ return;
+ }
+ }
+
+ migrate_fd_error(s);
+}
+
+size_t save_rdma_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, int cont, size_t size)
+{
+ int ret;
+ size_t bytes_sent = 0;
+ ram_addr_t current_addr;
+ RDMAData * rdma = migrate_use_rdma(f);
+
+ current_addr = block_offset + offset;
+
+ /*
+ * Add this page to the current 'chunk'. If the chunk
+ * is full, an actual RDMA write will occur.
+ */
+ if ((ret = qemu_rdma_write(rdma, current_addr, size)) < 0) {
+ fprintf(stderr, "rdma migration: write error! %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Drain the Completion Queue if possible.
+ * If not, the end of the iteration will do this
+ * again to make sure we don't overflow the
+ * request queue.
+ */
+ while (1) {
+ int ret = qemu_rdma_poll(rdma);
+ if (ret == RDMA_WRID_NONE) {
+ break;
+ }
+ if (ret < 0) {
+ fprintf(stderr, "rdma migration: polling error! %d\n", ret);
+ return ret;
+ }
+ }
+
+ bytes_sent += size;
+ return bytes_sent;
+}
+
+size_t qemu_rdma_fill(void * opaque, uint8_t *buf, int size)
+{
+ RDMAData * rdma = opaque;
+ size_t len = 0;
+
+ if(rdma->qemu_file_len) {
+ DPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
+ rdma->qemu_file_len, size);
+
+ len = MIN(size, rdma->qemu_file_len);
+ memcpy(buf, rdma->qemu_file_curr, len);
+ rdma->qemu_file_curr += len;
+ rdma->qemu_file_len -= len;
+ }
+
+ return len;
+}

Michael R. Hines

2013-03-18 20:26:27 UTC

Acknowledged.

Post by Paolo Bonzini
In a "socket-like" abstraction, all of these steps except the initial
listen are part of "accept". Please move them to
qemu_rdma_migrate_accept (possibly renaming the existing
qemu_rdma_migrate_accept to a different name). Similarly, perhaps you
can merge qemu_rdma_server_prepare and qemu_rdma_migrate_listen. Try
to make the public API between modules as small as possible (but not
smaller :)), so that you can easily document it without too many
references to RDMA concepts. Thanks, Paolo

m***@linux.vnet.ibm.com

2013-03-18 03:18:55 UTC

From: "Michael R. Hines" <***@us.ibm.com>

Make both rdma.c and migration-rdma.c conditionally built.

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
Makefile.objs | 1 +
1 file changed, 1 insertion(+)

diff --git a/Makefile.objs b/Makefile.objs
index f99841c..d12208b 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -58,6 +58,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o
common-obj-$(CONFIG_LINUX) += fsdev/

common-obj-y += migration.o migration-tcp.o
+common-obj-$(CONFIG_RDMA) += migration-rdma.o rdma.o
common-obj-y += qemu-char.o #aio.o
common-obj-y += block-migration.o
common-obj-y += page_cache.o xbzrle.o

--
1.7.10.4

m***@linux.vnet.ibm.com

2013-03-18 03:18:54 UTC

From: "Michael R. Hines" <***@us.ibm.com>

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
configure | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)

diff --git a/configure b/configure
index 46a7594..bdc6b13 100755
--- a/configure
+++ b/configure
@@ -170,6 +170,7 @@ xfs=""

vhost_net="no"
kvm="no"
+rdma="no"
gprof="no"
debug_tcg="no"
debug="no"
@@ -904,6 +905,10 @@ for opt do
;;
--enable-gtk) gtk="yes"
;;
+ --enable-rdma) rdma="yes"
+ ;;
+ --disable-rdma) rdma="no"
+ ;;
--with-gtkabi=*) gtkabi="$optarg"
;;
--enable-tpm) tpm="yes"
@@ -1104,6 +1109,8 @@ echo " --enable-bluez enable bluez stack connectivity"
echo " --disable-slirp disable SLIRP userspace network connectivity"
echo " --disable-kvm disable KVM acceleration support"
echo " --enable-kvm enable KVM acceleration support"
+echo " --disable-rdma disable RDMA-based migration support"
+echo " --enable-rdma enable RDMA-based migration support"
echo " --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)"
echo " --disable-nptl disable usermode NPTL support"
echo " --enable-nptl enable usermode NPTL support"
@@ -1766,6 +1773,18 @@ EOF
libs_softmmu="$sdl_libs $libs_softmmu"
fi

+if test "$rdma" = "yes" ; then
+ cat > $TMPC <<EOF
+#include <rdma/rdma_cma.h>
+int main(void) { return 0; }
+EOF
+ rdma_libs="-lrdmacm -libverbs"
+ if ! compile_prog "" "$rdma_libs" ; then
+ feature_not_found "rdma"
+ fi
+
+fi
+
##########################################
# VNC TLS/WS detection
if test "$vnc" = "yes" -a $ "$vnc_tls" != "no" -o "$vnc_ws" != "no" $ ; then
@@ -3412,6 +3431,7 @@ echo "Linux AIO support $linux_aio"
echo "ATTR/XATTR support $attr"
echo "Install blobs $blobs"
echo "KVM support $kvm"
+echo "RDMA support $rdma"
echo "TCG interpreter $tcg_interpreter"
echo "fdt support $fdt"
echo "preadv support $preadv"
@@ -4384,6 +4404,11 @@ if [ "$pixman" = "internal" ]; then
echo "config-host.h: subdir-pixman" >> $config_host_mak
fi

+if test "$rdma" = "yes" ; then
+echo "CONFIG_RDMA=y" >> $config_host_mak
+echo "LIBS+=$rdma_libs" >> $config_host_mak
+fi
+
# build tree in object directory in case the source is not in the current directory
DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32"
DIRS="$DIRS pc-bios/optionrom pc-bios/spapr-rtas"

--
1.7.10.4

m***@linux.vnet.ibm.com

2013-03-18 03:19:02 UTC

From: "Michael R. Hines" <***@us.ibm.com>

Since we're not using TCP anymore, we skip these calls.

Also print a little extra text while debugging, like "gbps"
which is helpful to know how the link is being utilized.

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
include/migration/migration.h | 3 +++
migration.c | 19 +++++++++++++------
2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index bb617fd..88ab5f6 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -20,6 +20,7 @@
#include "qemu/notify.h"
#include "qapi/error.h"
#include "migration/vmstate.h"
+#include "migration/rdma.h"
#include "qapi-types.h"

struct MigrationParams {
@@ -102,6 +103,7 @@ uint64_t xbzrle_mig_bytes_transferred(void);
uint64_t xbzrle_mig_pages_transferred(void);
uint64_t xbzrle_mig_pages_overflow(void);
uint64_t xbzrle_mig_pages_cache_miss(void);
+uint64_t delta_norm_mig_bytes_transferred(void);

/**
* @migrate_add_blocker - prevent migration from proceeding
@@ -122,6 +124,7 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);

int migrate_use_xbzrle(void);
+void *migrate_use_rdma(QEMUFile *f);
int64_t migrate_xbzrle_cache_size(void);

int64_t xbzrle_cache_resize(int64_t new_size);
diff --git a/migration.c b/migration.c
index 185d112..634437a 100644
--- a/migration.c
+++ b/migration.c
@@ -15,6 +15,7 @@

#include "qemu-common.h"
#include "migration/migration.h"
+#include "migration/rdma.h"
#include "monitor/monitor.h"
#include "migration/qemu-file.h"
#include "sysemu/sysemu.h"
@@ -77,6 +78,8 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)

if (strstart(uri, "tcp:", &p))
tcp_start_incoming_migration(p, errp);
+ else if (strstart(uri, "rdma:", &p))
+ rdma_start_incoming_migration(p, errp);
#if !defined(WIN32)
else if (strstart(uri, "exec:", &p))
exec_start_incoming_migration(p, errp);
@@ -118,10 +121,11 @@ static void process_incoming_migration_co(void *opaque)
void process_incoming_migration(QEMUFile *f)
{
Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
- int fd = qemu_get_fd(f);
-
- assert(fd != -1);
- socket_set_nonblock(fd);
+ if(!migrate_use_rdma(f)) {
+ int fd = qemu_get_fd(f);
+ assert(fd != -1);
+ socket_set_nonblock(fd);
+ }
qemu_coroutine_enter(co, f);
}

@@ -404,6 +408,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,

if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
+ } else if (strstart(uri, "rdma:", &p)) {
+ rdma_start_outgoing_migration(s, p, &local_err);
#if !defined(WIN32)
} else if (strstart(uri, "exec:", &p)) {
exec_start_outgoing_migration(s, p, &local_err);
@@ -545,8 +551,9 @@ static void *migration_thread(void *opaque)
max_size = bandwidth * migrate_max_downtime() / 1000000;

DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
- " bandwidth %g max_size %" PRId64 "\n",
- transferred_bytes, time_spent, bandwidth, max_size);
+ " bandwidth %g (%0.2f mbps) max_size %" PRId64 "\n",
+ transferred_bytes, time_spent,
+ bandwidth, Gbps(transferred_bytes, time_spent), max_size);
/* if we haven't sent anything, we don't want to recalculate
10000 is a small enough number for our purposes */
if (s->dirty_bytes_rate && transferred_bytes > 10000) {

--
1.7.10.4

Paolo Bonzini

2013-03-18 08:47:36 UTC

Post by m***@linux.vnet.ibm.com
Since we're not using TCP anymore, we skip these calls.
Also print a little extra text while debugging, like "gbps"
which is helpful to know how the link is being utilized.
---
include/migration/migration.h | 3 +++
migration.c | 19 +++++++++++++------
2 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/include/migration/migration.h b/include/migration/migration.h
index bb617fd..88ab5f6 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -20,6 +20,7 @@
#include "qemu/notify.h"
#include "qapi/error.h"
#include "migration/vmstate.h"
+#include "migration/rdma.h"
#include "qapi-types.h"
struct MigrationParams {
@@ -102,6 +103,7 @@ uint64_t xbzrle_mig_bytes_transferred(void);
uint64_t xbzrle_mig_pages_transferred(void);
uint64_t xbzrle_mig_pages_overflow(void);
uint64_t xbzrle_mig_pages_cache_miss(void);
+uint64_t delta_norm_mig_bytes_transferred(void);

Please add the protocol under the

Post by m***@linux.vnet.ibm.com
/**
@@ -122,6 +124,7 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
int migrate_use_xbzrle(void);
+void *migrate_use_rdma(QEMUFile *f);

Perhaps you can add a new function to QEMUFile send_page? And if it
returns -ENOTSUP, proceed with the normal is_dup_page + put_buffer. I
wonder if that lets use remove migrate_use_rdma() completely.

Also, if QEMUFileRDMA is moved to rdma.c, the number of public and
stubbed functions should decrease noticeably. There is a patch on the
list to move QEMUFile to its own source file. You could incorporate it
in your series.

Post by m***@linux.vnet.ibm.com
int64_t migrate_xbzrle_cache_size(void);
int64_t xbzrle_cache_resize(int64_t new_size);
diff --git a/migration.c b/migration.c
index 185d112..634437a 100644
--- a/migration.c
+++ b/migration.c
@@ -15,6 +15,7 @@
#include "qemu-common.h"
#include "migration/migration.h"
+#include "migration/rdma.h"
#include "monitor/monitor.h"
#include "migration/qemu-file.h"
#include "sysemu/sysemu.h"
@@ -77,6 +78,8 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
if (strstart(uri, "tcp:", &p))
tcp_start_incoming_migration(p, errp);
+ else if (strstart(uri, "rdma:", &p))
+ rdma_start_incoming_migration(p, errp);
#if !defined(WIN32)
else if (strstart(uri, "exec:", &p))
exec_start_incoming_migration(p, errp);
@@ -118,10 +121,11 @@ static void process_incoming_migration_co(void *opaque)
void process_incoming_migration(QEMUFile *f)
{
Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
- int fd = qemu_get_fd(f);
-
- assert(fd != -1);
- socket_set_nonblock(fd);
+ if(!migrate_use_rdma(f)) {
+ int fd = qemu_get_fd(f);
+ assert(fd != -1);
+ socket_set_nonblock(fd);

Is this because qemu_get_fd(f) returns -1 for RDMA? Then, you can
instead put socket_set_nonblock under an if(fd != -1).

Post by m***@linux.vnet.ibm.com
+ }
qemu_coroutine_enter(co, f);
}
@@ -404,6 +408,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
+ } else if (strstart(uri, "rdma:", &p)) {
+ rdma_start_outgoing_migration(s, p, &local_err);
#if !defined(WIN32)
} else if (strstart(uri, "exec:", &p)) {
exec_start_outgoing_migration(s, p, &local_err);
@@ -545,8 +551,9 @@ static void *migration_thread(void *opaque)
max_size = bandwidth * migrate_max_downtime() / 1000000;
DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
- " bandwidth %g max_size %" PRId64 "\n",
- transferred_bytes, time_spent, bandwidth, max_size);
+ " bandwidth %g (%0.2f mbps) max_size %" PRId64 "\n",
+ transferred_bytes, time_spent,
+ bandwidth, Gbps(transferred_bytes, time_spent), max_size);
/* if we haven't sent anything, we don't want to recalculate
10000 is a small enough number for our purposes */
if (s->dirty_bytes_rate && transferred_bytes > 10000) {

Otherwise looks good.

Michael R. Hines

2013-03-18 20:37:48 UTC

Comments inline.......

Post by m***@linux.vnet.ibm.com
int migrate_use_xbzrle(void);
+void *migrate_use_rdma(QEMUFile *f);
Perhaps you can add a new function to QEMUFile send_page? And if it
returns -ENOTSUP, proceed with the normal is_dup_page + put_buffer. I
wonder if that lets use remove migrate_use_rdma() completely.

That's great - I'll make the modification......

Post by m***@linux.vnet.ibm.com
void process_incoming_migration(QEMUFile *f)
{
Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
- int fd = qemu_get_fd(f);
-
- assert(fd != -1);
- socket_set_nonblock(fd);
+ if(!migrate_use_rdma(f)) {
+ int fd = qemu_get_fd(f);
+ assert(fd != -1);
+ socket_set_nonblock(fd);
Is this because qemu_get_fd(f) returns -1 for RDMA? Then, you can
instead put socket_set_nonblock under an if(fd != -1).

Yes, I proposed doing that check (for -1) in a previous RFC,
but you told me to remove it and make a separate patch =)

Is it OK to keep it in this patch?

Post by m***@linux.vnet.ibm.com
Otherwise looks good.

Thanks for taking the time =)

Paolo Bonzini

2013-03-19 09:23:50 UTC

Post by m***@linux.vnet.ibm.com
+ if(!migrate_use_rdma(f)) {
+ int fd = qemu_get_fd(f);
+ assert(fd != -1);
+ socket_set_nonblock(fd);
Is this because qemu_get_fd(f) returns -1 for RDMA? Then, you can
instead put socket_set_nonblock under an if(fd != -1).

Yes, I proposed doing that check (for -1) in a previous RFC,
but you told me to remove it and make a separate patch =)
Is it OK to keep it in this patch?

Yes---this is a separate patch. Apologies if you had the if(fd != -1)
before. :) In fact, both the if(fd != -1) and the
if(!migrate_use_rdma(f)) are bad, but I prefer to eliminate as many uses
as possible of migrate_use_rdma.

The reason why they are bad, is that we try to operate on the socket in
a non-blocking manner, so that the monitor keeps working during incoming
migration. We do it with non-blocking sockets because incoming
migration does not (yet?) have a separate thread, and is not a
bottleneck (the VM is not running, so it's not a problem to hold the big
QEMU lock for extended periods of time).

Does librdmacm support non-blocking operation, similar to select() or
poll()? Perhaps we can add support for that later.

Paolo

Michael R. Hines

2013-03-19 13:08:46 UTC

Post by Paolo Bonzini
Yes---this is a separate patch. Apologies if you had the if(fd != -1)
before. :) In fact, both the if(fd != -1) and the
if(!migrate_use_rdma(f)) are bad, but I prefer to eliminate as many uses
as possible of migrate_use_rdma.

I agree. In my current patch I've eliminated all of them.

Post by Paolo Bonzini
Does librdmacm support non-blocking operation, similar to select() or
poll()? Perhaps we can add support for that later.

Yes, it does, actually. The library provides what is called an "event
channel".
(This term is overloaded by other technologies, but that's OK).

An event channel is a file descriptor provided by (I believe) the rdma_cm
kernel module driver.

When you poll on this file descriptor, it can tell you all sorts of things
just like other files or sockets like when data is ready or when
events of interest have completed (like the completion queue has elements).

In my current patch, I'm using this during
"rdma_accept_incoming_connection()",
but I'm not currently using it for the rest of the coroutine on the
receiver side.

Paolo Bonzini

2013-03-19 13:20:47 UTC

I agree. In my current patch I've eliminated all of them.

Very nice. It remains to be seen how many are replaced by checks on the
QEMUFileOps :) but it cannot be worse!

Post by Paolo Bonzini
Does librdmacm support non-blocking operation, similar to select() or
poll()? Perhaps we can add support for that later.

Yes, it does, actually. The library provides what is called an "event
channel".
(This term is overloaded by other technologies, but that's OK).
An event channel is a file descriptor provided by (I believe) the rdma_cm
kernel module driver.
When you poll on this file descriptor, it can tell you all sorts of things
just like other files or sockets like when data is ready or when
events of interest have completed (like the completion queue has elements).
In my current patch, I'm using this during
"rdma_accept_incoming_connection()",
but I'm not currently using it for the rest of the coroutine on the
receiver side.

Ok, this can be added later.

Paolo

m***@linux.vnet.ibm.com

2013-03-18 03:19:03 UTC

From: "Michael R. Hines" <***@us.ibm.com>

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
arch_init.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch_init.c b/arch_init.c
index 98e2bc6..b013cc8 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -45,6 +45,7 @@
#include "exec/address-spaces.h"
#include "hw/pcspk.h"
#include "migration/page_cache.h"
+#include "migration/rdma.h"
#include "qemu/config-file.h"
#include "qmp-commands.h"
#include "trace.h"
@@ -225,6 +226,18 @@ static void acct_clear(void)
memset(&acct_info, 0, sizeof(acct_info));
}

+/*
+ * RDMA pc.ram doesn't go through QEMUFile directly,
+ * but still needs to be accounted for...
+ */
+uint64_t delta_norm_mig_bytes_transferred(void)
+{
+ static uint64_t last_norm_pages = 0;
+ uint64_t delta_bytes = (acct_info.norm_pages - last_norm_pages) * TARGET_PAGE_SIZE;
+ last_norm_pages = acct_info.norm_pages;
+ return delta_bytes;
+}
+
uint64_t dup_mig_bytes_transferred(void)
{
return acct_info.dup_pages * TARGET_PAGE_SIZE;
@@ -463,7 +476,11 @@ static int ram_save_block(QEMUFile *f, bool last_stage)

/* In doubt sent page as normal */
bytes_sent = -1;
- if (is_dup_page(p)) {
+ if (migrate_use_rdma(f)) {
+ /* for now, mapping the page is slower than RDMA */
+ acct_info.norm_pages++;
+ bytes_sent = save_rdma_page(f, block->offset, offset, cont, TARGET_PAGE_SIZE);
+ } else if (is_dup_page(p)) {
acct_info.dup_pages++;
bytes_sent = save_block_hdr(f, block, offset, cont,
RAM_SAVE_FLAG_COMPRESS);
@@ -648,6 +665,15 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)

qemu_mutex_unlock_ramlist();

+ /*
+ * Don't go to the next iteration without
+ * ensuring RDMA transfers have completed.
+ */
+ if ((ret = qemu_drain(f)) < 0) {
+ fprintf(stderr, "failed to drain RDMA first!\n");
+ return ret;
+ }
+
if (ret < 0) {
bytes_transferred += total_sent;
return ret;

--
1.7.10.4

m***@linux.vnet.ibm.com

2013-03-18 03:19:01 UTC

From: "Michael R. Hines" <***@us.ibm.com>

This compiles with and without --enable-rdma.

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
include/migration/qemu-file.h | 10 +++
savevm.c | 172 ++++++++++++++++++++++++++++++++++++++---
2 files changed, 172 insertions(+), 10 deletions(-)

diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index df81261..9046751 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -51,23 +51,33 @@ typedef int (QEMUFileCloseFunc)(void *opaque);
*/
typedef int (QEMUFileGetFD)(void *opaque);

+/*
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists). (Only used by RDMA right now)
+ */
+typedef int (QEMUFileDrainFunc)(void *opaque);
+
typedef struct QEMUFileOps {
QEMUFilePutBufferFunc *put_buffer;
QEMUFileGetBufferFunc *get_buffer;
QEMUFileCloseFunc *close;
QEMUFileGetFD *get_fd;
+ QEMUFileDrainFunc *drain;
} QEMUFileOps;

QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
QEMUFile *qemu_fopen(const char *filename, const char *mode);
QEMUFile *qemu_fdopen(int fd, const char *mode);
QEMUFile *qemu_fopen_socket(int fd, const char *mode);
+QEMUFile *qemu_fopen_rdma(void *opaque, const char *mode);
QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
int qemu_get_fd(QEMUFile *f);
int qemu_fclose(QEMUFile *f);
int64_t qemu_ftell(QEMUFile *f);
void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
void qemu_put_byte(QEMUFile *f, int v);
+int qemu_drain(QEMUFile *f);

static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
{
diff --git a/savevm.c b/savevm.c
index 35c8d1e..9b90b7f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -32,6 +32,7 @@
#include "qemu/timer.h"
#include "audio/audio.h"
#include "migration/migration.h"
+#include "migration/rdma.h"
#include "qemu/sockets.h"
#include "qemu/queue.h"
#include "sysemu/cpus.h"
@@ -143,6 +144,13 @@ typedef struct QEMUFileSocket
QEMUFile *file;
} QEMUFileSocket;

+typedef struct QEMUFileRDMA
+{
+ void *rdma;
+ size_t len;
+ QEMUFile *file;
+} QEMUFileRDMA;
+
typedef struct {
Coroutine *co;
int fd;
@@ -178,6 +186,66 @@ static int socket_get_fd(void *opaque)
return s->fd;
}

+/*
+ * SEND messages for none-live state only.
+ * pc.ram is handled elsewhere...
+ */
+static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
+{
+ QEMUFileRDMA *r = opaque;
+ size_t remaining = size;
+ uint8_t * data = (void *) buf;
+
+ /*
+ * Although we're sending non-live
+ * state here, push out any writes that
+ * we're queued up for pc.ram anyway.
+ */
+ if (qemu_rdma_write_flush(r->rdma) < 0)
+ return -EIO;
+
+ while(remaining) {
+ r->len = MIN(remaining, RDMA_SEND_INCREMENT);
+ remaining -= r->len;
+
+ if(qemu_rdma_exchange_send(r->rdma, data, r->len) < 0)
+ return -EINVAL;
+
+ data += r->len;
+ }
+
+ return size;
+}
+
+/*
+ * RDMA links don't use bytestreams, so we have to
+ * return bytes to QEMUFile opportunistically.
+ */
+static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+ QEMUFileRDMA *r = opaque;
+
+ /*
+ * First, we hold on to the last SEND message we
+ * were given and dish out the bytes until we run
+ * out of bytes.
+ */
+ if((r->len = qemu_rdma_fill(r->rdma, buf, size)))
+ return r->len;
+
+ /*
+ * Once we run out, we block and wait for another
+ * SEND message to arrive.
+ */
+ if(qemu_rdma_exchange_recv(r->rdma) < 0)
+ return -EINVAL;
+
+ /*
+ * SEND was received with new bytes, now try again.
+ */
+ return qemu_rdma_fill(r->rdma, buf, size);
+}
+
static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
{
QEMUFileSocket *s = opaque;
@@ -390,16 +458,24 @@ static const QEMUFileOps socket_write_ops = {
.close = socket_close
};

-QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+static bool qemu_mode_is_not_valid(const char * mode)
{
- QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
-
if (mode == NULL ||
(mode[0] != 'r' && mode[0] != 'w') ||
mode[1] != 'b' || mode[2] != 0) {
fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
- return NULL;
+ return true;
}
+
+ return false;
+}
+
+QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+{
+ QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
+
+ if(qemu_mode_is_not_valid(mode))
+ return NULL;

s->fd = fd;
if (mode[0] == 'w') {
@@ -411,16 +487,66 @@ QEMUFile *qemu_fopen_socket(int fd, const char *mode)
return s->file;
}

+static int qemu_rdma_close(void *opaque)
+{
+ QEMUFileRDMA *r = opaque;
+ if(r->rdma) {
+ qemu_rdma_cleanup(r->rdma);
+ g_free(r->rdma);
+ }
+ g_free(r);
+ return 0;
+}
+
+void * migrate_use_rdma(QEMUFile *f)
+{
+ QEMUFileRDMA *r = f->opaque;
+
+ return qemu_rdma_enabled(r->rdma) ? r->rdma : NULL;
+}
+
+static int qemu_rdma_drain_completion(void *opaque)
+{
+ QEMUFileRDMA *r = opaque;
+ r->len = 0;
+ return qemu_rdma_drain_cq(r->rdma);
+}
+
+static const QEMUFileOps rdma_read_ops = {
+ .get_buffer = qemu_rdma_get_buffer,
+ .close = qemu_rdma_close,
+};
+
+static const QEMUFileOps rdma_write_ops = {
+ .put_buffer = qemu_rdma_put_buffer,
+ .close = qemu_rdma_close,
+ .drain = qemu_rdma_drain_completion,
+};
+
+QEMUFile *qemu_fopen_rdma(void *opaque, const char * mode)
+{
+ QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
+
+ if(qemu_mode_is_not_valid(mode))
+ return NULL;
+
+ r->rdma = opaque;
+
+ if (mode[0] == 'w') {
+ r->file = qemu_fopen_ops(r, &rdma_write_ops);
+ } else {
+ r->file = qemu_fopen_ops(r, &rdma_read_ops);
+ }
+
+ return r->file;
+}
+
QEMUFile *qemu_fopen(const char *filename, const char *mode)
{
QEMUFileStdio *s;

- if (mode == NULL ||
- (mode[0] != 'r' && mode[0] != 'w') ||
- mode[1] != 'b' || mode[2] != 0) {
- fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
- return NULL;
- }
+ if(qemu_mode_is_not_valid(mode))
+ return NULL;

s = g_malloc0(sizeof(QEMUFileStdio));

@@ -497,6 +623,24 @@ static void qemu_file_set_error(QEMUFile *f, int ret)
}
}

+/*
+ * Called only for RDMA right now at the end
+ * of each live iteration of memory.
+ *
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists).
+ *
+ * For RDMA, this means to make sure we've
+ * received completion queue (CQ) messages
+ * successfully for all of the RDMA writes
+ * that we requested.
+ */
+int qemu_drain(QEMUFile *f)
+{
+ return f->ops->drain ? f->ops->drain(f->opaque) : 0;
+}
+
/** Flushes QEMUFile buffer
*
*/
@@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
int64_t qemu_ftell(QEMUFile *f)
{
qemu_fflush(f);
+ if(migrate_use_rdma(f))
+ return delta_norm_mig_bytes_transferred();
return f->pos;
}

@@ -1737,6 +1883,12 @@ void qemu_savevm_state_complete(QEMUFile *f)
}
}

+ if ((ret = qemu_drain(f)) < 0) {
+ fprintf(stderr, "failed to drain RDMA first!\n");
+ qemu_file_set_error(f, ret);
+ return;
+ }
+
QTAILQ_FOREACH(se, &savevm_handlers, entry) {
int len;

--
1.7.10.4

Paolo Bonzini

2013-03-18 09:09:19 UTC

Post by m***@linux.vnet.ibm.com
This compiles with and without --enable-rdma.
---
include/migration/qemu-file.h | 10 +++
savevm.c | 172 ++++++++++++++++++++++++++++++++++++++---
2 files changed, 172 insertions(+), 10 deletions(-)
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index df81261..9046751 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -51,23 +51,33 @@ typedef int (QEMUFileCloseFunc)(void *opaque);
*/
typedef int (QEMUFileGetFD)(void *opaque);
+/*
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists). (Only used by RDMA right now)
+ */
+typedef int (QEMUFileDrainFunc)(void *opaque);
+
typedef struct QEMUFileOps {
QEMUFilePutBufferFunc *put_buffer;
QEMUFileGetBufferFunc *get_buffer;
QEMUFileCloseFunc *close;
QEMUFileGetFD *get_fd;
+ QEMUFileDrainFunc *drain;
} QEMUFileOps;
QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
QEMUFile *qemu_fopen(const char *filename, const char *mode);
QEMUFile *qemu_fdopen(int fd, const char *mode);
QEMUFile *qemu_fopen_socket(int fd, const char *mode);
+QEMUFile *qemu_fopen_rdma(void *opaque, const char *mode);
QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
int qemu_get_fd(QEMUFile *f);
int qemu_fclose(QEMUFile *f);
int64_t qemu_ftell(QEMUFile *f);
void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
void qemu_put_byte(QEMUFile *f, int v);
+int qemu_drain(QEMUFile *f);
static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
{
diff --git a/savevm.c b/savevm.c
index 35c8d1e..9b90b7f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -32,6 +32,7 @@
#include "qemu/timer.h"
#include "audio/audio.h"
#include "migration/migration.h"
+#include "migration/rdma.h"
#include "qemu/sockets.h"
#include "qemu/queue.h"
#include "sysemu/cpus.h"
@@ -143,6 +144,13 @@ typedef struct QEMUFileSocket
QEMUFile *file;
} QEMUFileSocket;
+typedef struct QEMUFileRDMA
+{
+ void *rdma;

This is an RDMAData *. Please avoid using void * as much as possible.

Post by m***@linux.vnet.ibm.com
+ size_t len;
+ QEMUFile *file;
+} QEMUFileRDMA;
+
typedef struct {
Coroutine *co;
int fd;
@@ -178,6 +186,66 @@ static int socket_get_fd(void *opaque)
return s->fd;
}
+/*
+ * SEND messages for none-live state only.
+ * pc.ram is handled elsewhere...
+ */
+static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
+{
+ QEMUFileRDMA *r = opaque;
+ size_t remaining = size;
+ uint8_t * data = (void *) buf;
+
+ /*
+ * Although we're sending non-live
+ * state here, push out any writes that
+ * we're queued up for pc.ram anyway.
+ */
+ if (qemu_rdma_write_flush(r->rdma) < 0)
+ return -EIO;
+
+ while(remaining) {
+ r->len = MIN(remaining, RDMA_SEND_INCREMENT);
+ remaining -= r->len;
+
+ if(qemu_rdma_exchange_send(r->rdma, data, r->len) < 0)
+ return -EINVAL;
+
+ data += r->len;
+ }
+
+ return size;
+}
+
+/*
+ * RDMA links don't use bytestreams, so we have to
+ * return bytes to QEMUFile opportunistically.
+ */
+static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+ QEMUFileRDMA *r = opaque;
+
+ /*
+ * First, we hold on to the last SEND message we
+ * were given and dish out the bytes until we run
+ * out of bytes.
+ */
+ if((r->len = qemu_rdma_fill(r->rdma, buf, size)))
+ return r->len;
+
+ /*
+ * Once we run out, we block and wait for another
+ * SEND message to arrive.
+ */
+ if(qemu_rdma_exchange_recv(r->rdma) < 0)
+ return -EINVAL;
+
+ /*
+ * SEND was received with new bytes, now try again.
+ */
+ return qemu_rdma_fill(r->rdma, buf, size);
+}

Please move these functions closer to qemu_fopen_rdma (or better, to an
RDMA-specific file altogether). Also, using qemu_rdma_fill introduces a
dependency of savevm.c on migration-rdma.c. There should be no such
dependency; migration-rdma.c should be used only by migration.c.

Post by m***@linux.vnet.ibm.com
static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
{
QEMUFileSocket *s = opaque;
@@ -390,16 +458,24 @@ static const QEMUFileOps socket_write_ops = {
.close = socket_close
};
-QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+static bool qemu_mode_is_not_valid(const char * mode)
{
- QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
-
if (mode == NULL ||
(mode[0] != 'r' && mode[0] != 'w') ||
mode[1] != 'b' || mode[2] != 0) {
fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
- return NULL;
+ return true;
}
+
+ return false;
+}
+
+QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+{
+ QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
+
+ if(qemu_mode_is_not_valid(mode))
+ return NULL;
s->fd = fd;
if (mode[0] == 'w') {
@@ -411,16 +487,66 @@ QEMUFile *qemu_fopen_socket(int fd, const char *mode)
return s->file;
}
+static int qemu_rdma_close(void *opaque)
+{
+ QEMUFileRDMA *r = opaque;
+ if(r->rdma) {
+ qemu_rdma_cleanup(r->rdma);
+ g_free(r->rdma);
+ }
+ g_free(r);
+ return 0;
+}
+
+void * migrate_use_rdma(QEMUFile *f)
+{
+ QEMUFileRDMA *r = f->opaque;
+
+ return qemu_rdma_enabled(r->rdma) ? r->rdma : NULL;

You cannot be sure that f->opaque->rdma is a valid pointer. For
example, the first field in a socket QEMUFile's is a file descriptor.

Instead, you could use a qemu_file_ops_are(const QEMUFile *, const
QEMUFileOps *) function that checks if the file uses the given ops.
Then, migrate_use_rdma can simply check if the QEMUFile is using the
RDMA ops structure.

With this change, the "enabled" field of RDMAData should go.

Post by m***@linux.vnet.ibm.com
+}
+
+static int qemu_rdma_drain_completion(void *opaque)
+{
+ QEMUFileRDMA *r = opaque;
+ r->len = 0;
+ return qemu_rdma_drain_cq(r->rdma);
+}
+
+static const QEMUFileOps rdma_read_ops = {
+ .get_buffer = qemu_rdma_get_buffer,
+ .close = qemu_rdma_close,
+};
+
+static const QEMUFileOps rdma_write_ops = {
+ .put_buffer = qemu_rdma_put_buffer,
+ .close = qemu_rdma_close,
+ .drain = qemu_rdma_drain_completion,
+};
+
+QEMUFile *qemu_fopen_rdma(void *opaque, const char * mode)
+{
+ QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
+
+ if(qemu_mode_is_not_valid(mode))
+ return NULL;
+
+ r->rdma = opaque;
+
+ if (mode[0] == 'w') {
+ r->file = qemu_fopen_ops(r, &rdma_write_ops);
+ } else {
+ r->file = qemu_fopen_ops(r, &rdma_read_ops);
+ }
+
+ return r->file;
+}
+
QEMUFile *qemu_fopen(const char *filename, const char *mode)
{
QEMUFileStdio *s;
- if (mode == NULL ||
- (mode[0] != 'r' && mode[0] != 'w') ||
- mode[1] != 'b' || mode[2] != 0) {
- fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
- return NULL;
- }
+ if(qemu_mode_is_not_valid(mode))
+ return NULL;
s = g_malloc0(sizeof(QEMUFileStdio));
@@ -497,6 +623,24 @@ static void qemu_file_set_error(QEMUFile *f, int ret)
}
}
+/*
+ * Called only for RDMA right now at the end
+ * of each live iteration of memory.
+ *
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists).
+ *
+ * For RDMA, this means to make sure we've
+ * received completion queue (CQ) messages
+ * successfully for all of the RDMA writes
+ * that we requested.
+ */
+int qemu_drain(QEMUFile *f)
+{
+ return f->ops->drain ? f->ops->drain(f->opaque) : 0;
+}

Hmm, this is very similar to qemu_fflush, but not quite. :/

Why exactly is this needed?

Post by m***@linux.vnet.ibm.com
/** Flushes QEMUFile buffer
*
*/
@@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
int64_t qemu_ftell(QEMUFile *f)
{
qemu_fflush(f);
+ if(migrate_use_rdma(f))
+ return delta_norm_mig_bytes_transferred();

Not needed, and another undesirable dependency (savevm.c ->
arch_init.c). Just update f->pos in save_rdma_page.

This is taking shape. Thanks for persevering!

Paolo

Post by m***@linux.vnet.ibm.com
return f->pos;
}
@@ -1737,6 +1883,12 @@ void qemu_savevm_state_complete(QEMUFile *f)
}
}
+ if ((ret = qemu_drain(f)) < 0) {
+ fprintf(stderr, "failed to drain RDMA first!\n");
+ qemu_file_set_error(f, ret);
+ return;
+ }
+
QTAILQ_FOREACH(se, &savevm_handlers, entry) {
int len;

Michael R. Hines

2013-03-18 20:33:42 UTC

Comments inline - tell me what you think.......

Post by m***@linux.vnet.ibm.com
+typedef struct QEMUFileRDMA
+{
+ void *rdma;
This is an RDMAData *. Please avoid using void * as much as possible.

Acknowledged - forgot to move this to rdma.c, so it doesn't have to be
void anymore.

Post by m***@linux.vnet.ibm.com
*/
+ return qemu_rdma_fill(r->rdma, buf, size);
+}
Please move these functions closer to qemu_fopen_rdma (or better, to an
RDMA-specific file altogether). Also, using qemu_rdma_fill introduces a
dependency of savevm.c on migration-rdma.c. There should be no such
dependency; migration-rdma.c should be used only by migration.c.

Acknowledged......

Post by m***@linux.vnet.ibm.com
+void * migrate_use_rdma(QEMUFile *f)
+{
+ QEMUFileRDMA *r = f->opaque;
+
+ return qemu_rdma_enabled(r->rdma) ? r->rdma : NULL;
You cannot be sure that f->opaque->rdma is a valid pointer. For
example, the first field in a socket QEMUFile's is a file descriptor.
Instead, you could use a qemu_file_ops_are(const QEMUFile *, const
QEMUFileOps *) function that checks if the file uses the given ops.
Then, migrate_use_rdma can simply check if the QEMUFile is using the
RDMA ops structure.
With this change, the "enabled" field of RDMAData should go.

Great - I like that...... will do....

Post by m***@linux.vnet.ibm.com
+/*
+ * Called only for RDMA right now at the end
+ * of each live iteration of memory.
+ *
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists).
+ *
+ * For RDMA, this means to make sure we've
+ * received completion queue (CQ) messages
+ * successfully for all of the RDMA writes
+ * that we requested.
+ */
+int qemu_drain(QEMUFile *f)
+{
+ return f->ops->drain ? f->ops->drain(f->opaque) : 0;
+}
Hmm, this is very similar to qemu_fflush, but not quite. :/
Why exactly is this needed?

Good idea - I'll replace drain with flush once I added
the "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
that you recommended......

Not needed, and another undesirable dependency (savevm.c ->
arch_init.c). Just update f->pos in save_rdma_page.

f->pos isn't good enough because save_rdma_page does not
go through QEMUFile directly - only non-live state goes
through QEMUFile ....... pc.ram uses direct RDMA writes.

As a result, the position pointer does not get updated
and the accounting is missed........

- Michael

Paolo Bonzini

2013-03-19 09:18:48 UTC

Post by m***@linux.vnet.ibm.com
+int qemu_drain(QEMUFile *f)
+{
+ return f->ops->drain ? f->ops->drain(f->opaque) : 0;
+}
Hmm, this is very similar to qemu_fflush, but not quite. :/
Why exactly is this needed?

Good idea - I'll replace drain with flush once I added
the "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
that you recommended......

If I understand correctly, the problem is that save_rdma_page is
asynchronous and you have to wait for pending operations to do the
put_buffer protocol correctly.

Would it work to just do the "drain" in the put_buffer operation, if and
only if it was preceded by a save_rdma_page operation?

Not needed, and another undesirable dependency (savevm.c ->
arch_init.c). Just update f->pos in save_rdma_page.

f->pos isn't good enough because save_rdma_page does not
go through QEMUFile directly - only non-live state goes
through QEMUFile ....... pc.ram uses direct RDMA writes.
As a result, the position pointer does not get updated
and the accounting is missed........

Yes, I am suggesting to modify f->pos in save_rdma_page instead.

Paolo

Michael R. Hines

2013-03-19 13:12:31 UTC

Good idea - I'll replace drain with flush once I added
the "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
that you recommended......

If I understand correctly, the problem is that save_rdma_page is
asynchronous and you have to wait for pending operations to do the
put_buffer protocol correctly.
Would it work to just do the "drain" in the put_buffer operation, if and
only if it was preceded by a save_rdma_page operation?

Yes, the drain needs to happen in a few places already:

1. During save_rdma_page (if the current "chunk" is full of pages)
2. During the end of each iteration (now using qemu_fflush in my current
patch)
3. And also during qemu_savem_state_complete(), also using qemu_fflush.

Not needed, and another undesirable dependency (savevm.c ->
arch_init.c). Just update f->pos in save_rdma_page.

f->pos isn't good enough because save_rdma_page does not
go through QEMUFile directly - only non-live state goes
through QEMUFile ....... pc.ram uses direct RDMA writes.
As a result, the position pointer does not get updated
and the accounting is missed........

Yes, I am suggesting to modify f->pos in save_rdma_page instead.
Paolo

Would that not confuse the other QEMUFile users?
If I change that pointer (without actually putting bytes
in into QEMUFile), won't the f->pos pointer be
incorrectly updated?

Paolo Bonzini

2013-03-19 13:25:07 UTC

Good idea - I'll replace drain with flush once I added
the "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
that you recommended......

If I understand correctly, the problem is that save_rdma_page is
asynchronous and you have to wait for pending operations to do the
put_buffer protocol correctly.
Would it work to just do the "drain" in the put_buffer operation, if and
only if it was preceded by a save_rdma_page operation?

1. During save_rdma_page (if the current "chunk" is full of pages)

Ok, this is internal to RDMA so no problem.

Post by Michael R. Hines
2. During the end of each iteration (now using qemu_fflush in my current
patch)

Why?

Post by Michael R. Hines
3. And also during qemu_savem_state_complete(), also using qemu_fflush.

This would be caught by put_buffer, but (2) would not.

Not needed, and another undesirable dependency (savevm.c ->
arch_init.c). Just update f->pos in save_rdma_page.

f->pos isn't good enough because save_rdma_page does not
go through QEMUFile directly - only non-live state goes
through QEMUFile ....... pc.ram uses direct RDMA writes.
As a result, the position pointer does not get updated
and the accounting is missed........

Yes, I am suggesting to modify f->pos in save_rdma_page instead.
Paolo

Would that not confuse the other QEMUFile users?
If I change that pointer (without actually putting bytes
in into QEMUFile), won't the f->pos pointer be
incorrectly updated?

f->pos is never used directly by QEMUFile, it is almost an opaque value.
It is accumulated on every qemu_fflush (so that it can be passed to the
->put_buffer function), and returned by qemu_ftell; nothing else.

If you make somehow save_rdma_page a new op, returning a value from that
op and adding it to f->pos would be a good way to achieve this.

Paolo

Michael R. Hines

2013-03-19 13:40:32 UTC

Post by Michael R. Hines
1. During save_rdma_page (if the current "chunk" is full of pages)
Ok, this is internal to RDMA so no problem.

Post by Michael R. Hines
2. During the end of each iteration (now using qemu_fflush in my current
patch)

Why?

This is because of downtime: You have to drain the queue anyway at the
very end, and if you don't drain it in advance after each iteration, then
the queue will have lots of bytes in it waiting for transmission and the
Virtual Machine will be stopped for a much longer period of time during
the last iteration waiting for RDMA card to finish transmission of all those
bytes.

If you wait till the last iteration to do this, then all of that waiting
time gets
counted as downtime, causing the VCPUs to be unnecessarily stopped.

Post by Michael R. Hines
3. And also during qemu_savem_state_complete(), also using qemu_fflush.

This would be caught by put_buffer, but (2) would not.

I'm not sure this is good enough either - we don't want to flush
the queue *frequently*..... only when it's necessary for performance
.... we do want the queue to have some meat to it so the hardware
can write bytes as fast as possible.....

If we flush inside put_buffer (which is called very frequently): then
we have no way to distinquish *where* put buffer was called from
(either from qemu_savevm_state_complete() or from a device-level
function call that's using QEMUFile).

Post by Paolo Bonzini
Yes, I am suggesting to modify f->pos in save_rdma_page instead.
Paolo

Would that not confuse the other QEMUFile users?
If I change that pointer (without actually putting bytes
in into QEMUFile), won't the f->pos pointer be
incorrectly updated?

f->pos is never used directly by QEMUFile, it is almost an opaque value.
It is accumulated on every qemu_fflush (so that it can be passed to the
->put_buffer function), and returned by qemu_ftell; nothing else.
If you make somehow save_rdma_page a new op, returning a value from that
op and adding it to f->pos would be a good way to achieve this.

Ok, great - I'll take advantage of that........Thanks.

Paolo Bonzini

2013-03-19 13:45:49 UTC

Post by Michael R. Hines
1. During save_rdma_page (if the current "chunk" is full of pages)
Ok, this is internal to RDMA so no problem.

Post by Michael R. Hines
2. During the end of each iteration (now using qemu_fflush in my current
patch)

Why?

Shouldn't the "current chunk full" case take care of it too?

Of course if you disable chunking you have to add a different condition,
perhaps directly into save_rdma_page.

Post by Michael R. Hines
If you wait till the last iteration to do this, then all of that waiting time gets
counted as downtime, causing the VCPUs to be unnecessarily stopped.

Post by Michael R. Hines
3. And also during qemu_savem_state_complete(), also using qemu_fflush.

This would be caught by put_buffer, but (2) would not.

Is it called at any time during RAM migration?

Post by Michael R. Hines
then we have no way to distinquish *where* put buffer was called from
(either from qemu_savevm_state_complete() or from a device-level
function call that's using QEMUFile).

Can you make drain a no-op if there is nothing in flight? Then every
call to put_buffer after the first should not have any overhead.

Paolo

Post by Paolo Bonzini
Yes, I am suggesting to modify f->pos in save_rdma_page instead.
Paolo

Would that not confuse the other QEMUFile users?
If I change that pointer (without actually putting bytes
in into QEMUFile), won't the f->pos pointer be
incorrectly updated?

f->pos is never used directly by QEMUFile, it is almost an opaque value.
It is accumulated on every qemu_fflush (so that it can be passed to the
->put_buffer function), and returned by qemu_ftell; nothing else.
If you make somehow save_rdma_page a new op, returning a value from that
op and adding it to f->pos would be a good way to achieve this.

Ok, great - I'll take advantage of that........Thanks.

Michael R. Hines

2013-03-19 14:10:10 UTC

Post by Michael R. Hines
This is because of downtime: You have to drain the queue anyway at the
very end, and if you don't drain it in advance after each iteration, then
the queue will have lots of bytes in it waiting for transmission and the
Virtual Machine will be stopped for a much longer period of time during
the last iteration waiting for RDMA card to finish transmission of all those
bytes.
Shouldn't the "current chunk full" case take care of it too?
Of course if you disable chunking you have to add a different condition,
perhaps directly into save_rdma_page.

No, we don't want to flush on "chunk full" - that has a different meaning.
We want to have as many chunks submitted to the hardware for transmission
as possible to keep the bytes moving.

Post by Michael R. Hines
3. And also during qemu_savem_state_complete(), also using qemu_fflush.

This would be caught by put_buffer, but (2) would not.

Is it called at any time during RAM migration?

I don't understand the question: the flushing we've been discussing
is *only* for RAM migration - not for the non-live state.

I haven't introduced any "new" flushes for non-live state other than
when it's absolutely necessary to flush for RAM migration.

Post by Michael R. Hines
then we have no way to distinquish *where* put buffer was called from
(either from qemu_savevm_state_complete() or from a device-level
function call that's using QEMUFile).

Can you make drain a no-op if there is nothing in flight? Then every
call to put_buffer after the first should not have any overhead.
Paolo

That still doesn't solve the problem: If there is nothing in flight,
then there is no reason to call qemu_fflush() in the first place.

This is why I avoided using fflush() in the beginning, because it
sort of "confuses" who is using it: from the perspective of fflush(),
you can't tell if the user calling it for RAM or for non-live state.

The flushes we need are only for RAM, not the rest of it......

Make sense?

Paolo Bonzini

2013-03-19 14:22:04 UTC

No, we don't want to flush on "chunk full" - that has a different meaning.
We want to have as many chunks submitted to the hardware for transmission
as possible to keep the bytes moving.

That however gives me an idea... Instead of the full drain at the end
of an iteration, does it make sense to do a "partial" drain at every
chunk full, so that you don't have > N bytes pending and the downtime is
correspondingly limited?

Post by Michael R. Hines
3. And also during qemu_savem_state_complete(), also using
qemu_fflush.

This would be caught by put_buffer, but (2) would not.

Is it called at any time during RAM migration?

I don't understand the question: the flushing we've been discussing
is *only* for RAM migration - not for the non-live state.

Yes. But I would like to piggyback the final, full drain on the switch
from RAM migration to device migration.

Post by Michael R. Hines
Can you make drain a no-op if there is nothing in flight? Then every
call to put_buffer after the first should not have any overhead.

That still doesn't solve the problem: If there is nothing in flight,
then there is no reason to call qemu_fflush() in the first place.

If there is no RAM migration in flight. So you have

migrate RAM
...
RAM migration finished, device migration start
put_buffer <<<<< QEMUFileRDMA triggers drain
put_buffer
put_buffer
put_buffer
...

Post by Michael R. Hines
The flushes we need are only for RAM, not the rest of it......
Make sense?

Paolo

Michael R. Hines

2013-03-19 15:02:12 UTC

Consider the following sequence:

1. Boot fresh VM (say, a boring 1GB vm) => Resident
set is small, say 100M
2. Touch all the memory (with a utility or something) => Resident set is ~1G
3. Send QMP "balloon 500" => Resident set is ~500M
4. Now, migrate the VM => Resident set is 1G again

This suggests to me that migration is not accounting for
what memory was ballooned.

I suspect this is because the migration_bitmap does not coordinate
with the list of ballooned-out memory that was MADVISED().

This affects RDMA as well as TCP on the sender side.

Is there any hard reason why we're not validating migration_bitmap against
the memory that was MADVISED()'d?

- Michael R. Hines

Michael R. Hines

2013-03-19 15:12:50 UTC

Actually, you don't even need ballooning to reproduce this behavior.

Is this a known issue?

- Michael

Post by Michael R. Hines
1. Boot fresh VM (say, a boring 1GB vm) => Resident
set is small, say 100M
2. Touch all the memory (with a utility or something) => Resident set is ~1G
3. Send QMP "balloon 500" => Resident set is ~500M
4. Now, migrate the VM => Resident set is 1G again
This suggests to me that migration is not accounting for
what memory was ballooned.
I suspect this is because the migration_bitmap does not coordinate
with the list of ballooned-out memory that was MADVISED().
This affects RDMA as well as TCP on the sender side.
Is there any hard reason why we're not validating migration_bitmap against
the memory that was MADVISED()'d?
- Michael R. Hines

Michael S. Tsirkin

2013-03-19 15:17:34 UTC

Post by Michael R. Hines
Actually, you don't even need ballooning to reproduce this behavior.
Is this a known issue?
- Michael

Yes.

Michael R. Hines

2013-03-19 18:27:59 UTC

No, we don't want to flush on "chunk full" - that has a different meaning.
We want to have as many chunks submitted to the hardware for transmission
as possible to keep the bytes moving.

Sure, you could do that, but it seems overly complex just to avoid
a single flush() call at the end of each iteration, right?

Post by Paolo Bonzini
If there is no RAM migration in flight. So you have
migrate RAM
...
RAM migration finished, device migration start
put_buffer <<<<< QEMUFileRDMA triggers drain
put_buffer
put_buffer
put_buffer
...

Ah, yes, ok. Very simple modification......

Paolo Bonzini

2013-03-19 18:40:55 UTC

Post by Paolo Bonzini
That however gives me an idea... Instead of the full drain at the end
of an iteration, does it make sense to do a "partial" drain at every
chunk full, so that you don't have > N bytes pending and the downtime is
correspondingly limited?

Sure, you could do that, but it seems overly complex just to avoid
a single flush() call at the end of each iteration, right?

Would it really be that complex? Not having an extra QEMUFile op
perhaps balances that complexity (and the complexity remains hidden in
rdma.c, which is an advantage).

You could alternatively drain every N megabytes sent, or something like
that. But a partial drain would help obeying the maximum downtime
limitations.

Paolo

Paolo Bonzini

2013-03-20 15:20:16 UTC

Sure, you could do that, but it seems overly complex just to avoid
a single flush() call at the end of each iteration, right?

Would it really be that complex? Not having an extra QEMUFile op
perhaps balances that complexity (and the complexity remains hidden in
rdma.c, which is an advantage).
You could alternatively drain every N megabytes sent, or something like
that. But a partial drain would help obeying the maximum downtime
limitations.

On second thought: just keep the drain operation, but make it clear that
it is related to the new save_ram_page QEMUFileOps field. You could
call it flush_ram_pages or something like that.

Paolo

Michael R. Hines

2013-03-20 16:09:16 UTC

Sure, you could do that, but it seems overly complex just to avoid
a single flush() call at the end of each iteration, right?

Would it really be that complex? Not having an extra QEMUFile op
perhaps balances that complexity (and the complexity remains hidden in
rdma.c, which is an advantage).
You could alternatively drain every N megabytes sent, or something like
that. But a partial drain would help obeying the maximum downtime
limitations.

On second thought: just keep the drain operation, but make it clear that
it is related to the new save_ram_page QEMUFileOps field. You could
call it flush_ram_pages or something like that.
Paolo

Acknowledged. This helps a lot, thank you. I'll be sure to
clearly conditionalize everything in the next RFC.

m***@linux.vnet.ibm.com

2013-03-18 03:18:56 UTC

From: "Michael R. Hines" <***@us.ibm.com>

This tries to cover all the questions I got the last time.

Please do tell me what is not clear, and I'll revise again.

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
docs/rdma.txt | 208 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 208 insertions(+)
create mode 100644 docs/rdma.txt

diff --git a/docs/rdma.txt b/docs/rdma.txt
new file mode 100644
index 0000000..2a48ab0
--- /dev/null
+++ b/docs/rdma.txt
@@ -0,0 +1,208 @@
+Changes since v3:
+
+- Compile-tested with and without --enable-rdma is working.
+- Updated docs/rdma.txt (included below)
+- Merged with latest pull queue from Paolo
+- Implemented qemu_ram_foreach_block()
+
+***@mrhinesdev:~/qemu$ git diff --stat master
+Makefile.objs | 1 +
+arch_init.c | 28 +-
+configure | 25 ++
+docs/rdma.txt | 190 +++++++++++
+exec.c | 21 ++
+include/exec/cpu-common.h | 6 +
+include/migration/migration.h | 3 +
+include/migration/qemu-file.h | 10 +
+include/migration/rdma.h | 269 ++++++++++++++++
+include/qemu/sockets.h | 1 +
+migration-rdma.c | 205 ++++++++++++
+migration.c | 19 +-
+rdma.c | 1511 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+savevm.c | 172 +++++++++-
+util/qemu-sockets.c | 2 +-
+15 files changed, 2445 insertions(+), 18 deletions(-)
+
+QEMUFileRDMA:
+==================================
+
+QEMUFileRDMA introduces a couple of new functions:
+
+1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
+2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
+
+These two functions provide an RDMA transport
+(not a protocol) without changing the upper-level
+users of QEMUFile that depend on a bytstream abstraction.
+
+In order to provide the same bytestream interface
+for RDMA, we use SEND messages instead of sockets.
+The operations themselves and the protocol built on
+top of QEMUFile used throughout the migration
+process do not change whatsoever.
+
+An infiniband SEND message is the standard ibverbs
+message used by applications of infiniband hardware.
+The only difference between a SEND message and an RDMA
+message is that SEND message cause completion notifications
+to be posted to the completion queue (CQ) on the
+infiniband receiver side, whereas RDMA messages (used
+for pc.ram) do not (to behave like an actual DMA).
+
+Messages in infiniband require two things:
+
+1. registration of the memory that will be transmitted
+2. (SEND only) work requests to be posted on both
+ sides of the network before the actual transmission
+ can occur.
+
+RDMA messages much easier to deal with. Once the memory
+on the receiver side is registed and pinned, we're
+basically done. All that is required is for the sender
+side to start dumping bytes onto the link.
+
+SEND messages require more coordination because the
+receiver must have reserved space (using a receive
+work request) on the receive queue (RQ) before QEMUFileRDMA
+can start using them to carry all the bytes as
+a transport for migration of device state.
+
+After the initial connection setup (migration-rdma.c),
+this coordination starts by having both sides post
+a single work request to the RQ before any users
+of QEMUFile are activated.
+
+Once an initial receive work request is posted,
+we have a put_buffer()/get_buffer() implementation
+that looks like this:
+
+Logically:
+
+qemu_rdma_get_buffer():
+
+1. A user on top of QEMUFile calls ops->get_buffer(),
+ which calls us.
+2. We transmit an empty SEND to let the sender know that
+ we are *ready* to receive some bytes from QEMUFileRDMA.
+ These bytes will come in the form of a another SEND.
+3. Before attempting to receive that SEND, we post another
+ RQ work request to replace the one we just used up.
+4. Block on a CQ event channel and wait for the SEND
+ to arrive.
+5. When the send arrives, librdmacm will unblock us
+ and we can consume the bytes (described later).
+
+qemu_rdma_put_buffer():
+
+1. A user on top of QEMUFile calls ops->put_buffer(),
+ which calls us.
+2. Block on the CQ event channel waiting for a SEND
+ from the receiver to tell us that the receiver
+ is *ready* for us to transmit some new bytes.
+3. When the "ready" SEND arrives, librdmacm will
+ unblock us and we immediately post a RQ work request
+ to replace the one we just used up.
+4. Now, we can actually deliver the bytes that
+ put_buffer() wants and return.
+
+NOTE: This entire sequents of events is designed this
+way to mimic the operations of a bytestream and is not
+typical of an infiniband application. (Something like MPI
+would not 'ping-pong' messages like this and would not
+block after every request, which would normally defeat
+the purpose of using zero-copy infiniband in the first place).
+
+Finally, how do we handoff the actual bytes to get_buffer()?
+
+Again, because we're trying to "fake" a bytestream abstraction
+using an analogy not unlike individual UDP frames, we have
+to hold on to the bytes received from SEND in memory.
+
+Each time we get to "Step 5" above for get_buffer(),
+the bytes from SEND are copied into a local holding buffer.
+
+Then, we return the number of bytes requested by get_buffer()
+and leave the remaining bytes in the buffer until get_buffer()
+comes around for another pass.
+
+If the buffer is empty, then we follow the same steps
+listed above for qemu_rdma_get_buffer() and block waiting
+for another SEND message to re-fill the buffer.
+
+Migration of pc.ram:
+===============================
+
+At the beginning of the migration, (migration-rdma.c),
+the sender and the receiver populate the list of RAMBlocks
+to be registered with each other into a structure.
+
+Then, using a single SEND message, they exchange this
+structure with each other, to be used later during the
+iteration of main memory. This structure includes a list
+of all the RAMBlocks, their offsets and lengths.
+
+Main memory is not migrated with SEND infiniband
+messages, but is instead migrated with RDMA infiniband
+messages.
+
+Messages are migrated in "chunks" (about 64 pages right now).
+Chunk size is not dynamic, but it could be in a future
+implementation.
+
+When a total of 64 pages (or a flush()) are aggregated,
+the memory backed by the chunk on the sender side is
+registered with librdmacm and pinned in memory.
+
+After pinning, an RDMA send is generated and tramsmitted
+for the entire chunk.
+
+Error-handling:
+===============================
+
+Infiniband has what is called a "Reliable, Connected"
+link (one of 4 choices). This is the mode in which
+we use for RDMA migration.
+
+If a *single* message fails,
+the decision is to abort the migration entirely and
+cleanup all the RDMA descriptors and unregister all
+the memory.
+
+After cleanup, the Virtual Machine is returned to normal
+operation the same way that would happen if the TCP
+socket is broken during a non-RDMA based migration.
+
+USAGE
+===============================
+
+Compiling:
+
+$ ./configure --enable-rdma --target-list=x86_64-softmmu
+
+$ make
+
+Command-line on the Source machine AND Destination:
+
+$ virsh qemu-monitor-command --hmp --cmd "migrate_set_speed 40g" # or whatever is the MAX of your RDMA device
+
+Finally, perform the actual migration:
+
+$ virsh migrate domain rdma:xx.xx.xx.xx:port
+
+PERFORMANCE
+===================
+
+Using a 40gbps infinband link performing a worst-case stress test:
+
+RDMA Throughput With $ stress --vm-bytes 1024M --vm 1 --vm-keep
+Approximately 30 gpbs (little better than the paper)
+1. Average worst-case throughput
+TCP Throughput With $ stress --vm-bytes 1024M --vm 1 --vm-keep
+2. Approximately 8 gpbs (using IPOIB IP over Infiniband)
+
+Average downtime (stop time) ranges between 28 and 33 milliseconds.
+
+An *exhaustive* paper (2010) shows additional performance details
+linked on the QEMU wiki:
+
+http://wiki.qemu.org/Features/RDMALiveMigration

--
1.7.10.4

Michael S. Tsirkin

2013-03-18 10:40:13 UTC

Post by m***@linux.vnet.ibm.com
This tries to cover all the questions I got the last time.
Please do tell me what is not clear, and I'll revise again.
---
docs/rdma.txt | 208 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 208 insertions(+)
create mode 100644 docs/rdma.txt
diff --git a/docs/rdma.txt b/docs/rdma.txt
new file mode 100644
index 0000000..2a48ab0
--- /dev/null
+++ b/docs/rdma.txt
@@ -0,0 +1,208 @@
+
+- Compile-tested with and without --enable-rdma is working.
+- Updated docs/rdma.txt (included below)
+- Merged with latest pull queue from Paolo
+- Implemented qemu_ram_foreach_block()
+
+Makefile.objs | 1 +
+arch_init.c | 28 +-
+configure | 25 ++
+docs/rdma.txt | 190 +++++++++++
+exec.c | 21 ++
+include/exec/cpu-common.h | 6 +
+include/migration/migration.h | 3 +
+include/migration/qemu-file.h | 10 +
+include/migration/rdma.h | 269 ++++++++++++++++
+include/qemu/sockets.h | 1 +
+migration-rdma.c | 205 ++++++++++++
+migration.c | 19 +-
+rdma.c | 1511 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+savevm.c | 172 +++++++++-
+util/qemu-sockets.c | 2 +-
+15 files changed, 2445 insertions(+), 18 deletions(-)

Above looks strange :)
I think there are two things here, API documentation
and protocol documentation, protocol documentation
still needs some more work. Also if what I understand
from this document is correct this breaks memory overcommit
on destination which needs to be fixed.

Post by m***@linux.vnet.ibm.com
+==================================
+
+
+1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
+2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
+
+These two functions provide an RDMA transport
+(not a protocol) without changing the upper-level
+users of QEMUFile that depend on a bytstream abstraction.
+
+In order to provide the same bytestream interface
+for RDMA, we use SEND messages instead of sockets.
+The operations themselves and the protocol built on
+top of QEMUFile used throughout the migration
+process do not change whatsoever.
+
+An infiniband SEND message is the standard ibverbs
+message used by applications of infiniband hardware.
+The only difference between a SEND message and an RDMA
+message is that SEND message cause completion notifications
+to be posted to the completion queue (CQ) on the
+infiniband receiver side, whereas RDMA messages (used
+for pc.ram) do not (to behave like an actual DMA).
+
+
+1. registration of the memory that will be transmitted
+2. (SEND only) work requests to be posted on both
+ sides of the network before the actual transmission
+ can occur.
+
+RDMA messages much easier to deal with. Once the memory
+on the receiver side is registed and pinned, we're
+basically done. All that is required is for the sender
+side to start dumping bytes onto the link.
+
+SEND messages require more coordination because the
+receiver must have reserved space (using a receive
+work request) on the receive queue (RQ) before QEMUFileRDMA
+can start using them to carry all the bytes as
+a transport for migration of device state.
+
+After the initial connection setup (migration-rdma.c),

Is there any feature and/or version negotiation? How are we going to
handle compatibility when we extend the protocol?

Post by m***@linux.vnet.ibm.com
+this coordination starts by having both sides post
+a single work request to the RQ before any users
+of QEMUFile are activated.

So how does destination know it's ok to send anything
to source?
I suspect this is wrong. When using CM you must post
on RQ before completing the connection negotiation,
not after it's done.

Post by m***@linux.vnet.ibm.com
+
+Once an initial receive work request is posted,
+we have a put_buffer()/get_buffer() implementation
+
+
+
+1. A user on top of QEMUFile calls ops->get_buffer(),
+ which calls us.
+2. We transmit an empty SEND to let the sender know that
+ we are *ready* to receive some bytes from QEMUFileRDMA.
+ These bytes will come in the form of a another SEND.
+3. Before attempting to receive that SEND, we post another
+ RQ work request to replace the one we just used up.
+4. Block on a CQ event channel and wait for the SEND
+ to arrive.
+5. When the send arrives, librdmacm will unblock us
+ and we can consume the bytes (described later).

Using an empty message seems somewhat hacky, a fixed header in the
message would let you do more things if protocol is ever extended.

Post by m***@linux.vnet.ibm.com
+
+1. A user on top of QEMUFile calls ops->put_buffer(),
+ which calls us.
+2. Block on the CQ event channel waiting for a SEND
+ from the receiver to tell us that the receiver
+ is *ready* for us to transmit some new bytes.
+3. When the "ready" SEND arrives, librdmacm will
+ unblock us and we immediately post a RQ work request
+ to replace the one we just used up.
+4. Now, we can actually deliver the bytes that
+ put_buffer() wants and return.

OK to summarize flow control: at any time there's
either 0 or 1 outstanding buffers in RQ.
At each time only one side can talk.
Destination always goes first, then source, etc.
At each time a single send message can be passed.

Just FYI, this means you are often at 0 buffers in RQ and IIRC 0 buffers
is a worst-case path for infiniband. It's better to keep at least 1
buffers in RQ at all times, so prepost 2 initially so it would fluctuate
between 1 and 2.

Post by m***@linux.vnet.ibm.com
+
+NOTE: This entire sequents of events is designed this
+way to mimic the operations of a bytestream and is not
+typical of an infiniband application. (Something like MPI
+would not 'ping-pong' messages like this and would not
+block after every request, which would normally defeat
+the purpose of using zero-copy infiniband in the first place).
+
+Finally, how do we handoff the actual bytes to get_buffer()?
+
+Again, because we're trying to "fake" a bytestream abstraction
+using an analogy not unlike individual UDP frames, we have
+to hold on to the bytes received from SEND in memory.
+
+Each time we get to "Step 5" above for get_buffer(),
+the bytes from SEND are copied into a local holding buffer.
+
+Then, we return the number of bytes requested by get_buffer()
+and leave the remaining bytes in the buffer until get_buffer()
+comes around for another pass.
+
+If the buffer is empty, then we follow the same steps
+listed above for qemu_rdma_get_buffer() and block waiting
+for another SEND message to re-fill the buffer.
+
+===============================
+
+At the beginning of the migration, (migration-rdma.c),
+the sender and the receiver populate the list of RAMBlocks
+to be registered with each other into a structure.

Could you add the packet format here as well please?
Need to document endian-ness etc.

Post by m***@linux.vnet.ibm.com
+Then, using a single SEND message, they exchange this
+structure with each other, to be used later during the
+iteration of main memory. This structure includes a list
+of all the RAMBlocks, their offsets and lengths.

This basically means that all memort on destination has to be registered
upfront. A typical guest has gigabytes of memory, IMHO that's too much
memory to have pinned.

Post by m***@linux.vnet.ibm.com
+
+Main memory is not migrated with SEND infiniband
+messages, but is instead migrated with RDMA infiniband
+messages.
+
+Messages are migrated in "chunks" (about 64 pages right now).
+Chunk size is not dynamic, but it could be in a future
+implementation.
+
+When a total of 64 pages (or a flush()) are aggregated,
+the memory backed by the chunk on the sender side is
+registered with librdmacm and pinned in memory.
+
+After pinning, an RDMA send is generated and tramsmitted
+for the entire chunk.

I think something chunk-based on the destination side is required
as well. You also can't trust the source to tell you
the chunk size it could be malicious and ask for too much.
Maybe source gives chunk size hint and destination responds
with what it wants to use.

Post by m***@linux.vnet.ibm.com
+===============================
+
+Infiniband has what is called a "Reliable, Connected"
+link (one of 4 choices). This is the mode in which
+we use for RDMA migration.
+
+If a *single* message fails,
+the decision is to abort the migration entirely and
+cleanup all the RDMA descriptors and unregister all
+the memory.
+
+After cleanup, the Virtual Machine is returned to normal
+operation the same way that would happen if the TCP
+socket is broken during a non-RDMA based migration.

Yes but we also need to report errors detected during migration.
Need to document how this is done.
We also need to report success.

Post by m***@linux.vnet.ibm.com
+
+USAGE
+===============================
+
+
+$ ./configure --enable-rdma --target-list=x86_64-softmmu
+
+$ make
+
+
+$ virsh qemu-monitor-command --hmp --cmd "migrate_set_speed 40g" # or whatever is the MAX of your RDMA device
+
+
+$ virsh migrate domain rdma:xx.xx.xx.xx:port
+
+PERFORMANCE
+===================
+
+
+RDMA Throughput With $ stress --vm-bytes 1024M --vm 1 --vm-keep
+Approximately 30 gpbs (little better than the paper)
+1. Average worst-case throughput
+TCP Throughput With $ stress --vm-bytes 1024M --vm 1 --vm-keep
+2. Approximately 8 gpbs (using IPOIB IP over Infiniband)
+
+Average downtime (stop time) ranges between 28 and 33 milliseconds.
+
+An *exhaustive* paper (2010) shows additional performance details
+
+http://wiki.qemu.org/Features/RDMALiveMigration
--
1.7.10.4

Michael R. Hines

2013-03-18 20:24:44 UTC

I think there are two things here, API documentation and protocol
documentation, protocol documentation still needs some more work. Also
if what I understand from this document is correct this breaks memory
overcommit on destination which needs to be fixed.
I think something chunk-based on the destination side is required as
well. You also can't trust the source to tell you the chunk size it
could be malicious and ask for too much. Maybe source gives chunk size
hint and destination responds with what it wants to use.

Do we allow ballooning *during* the live migration? Is that necessary?

Would it be sufficient to inform the destination which pages are ballooned
and then only register the ones that the VM actually owns?

Is there any feature and/or version negotiation? How are we going to
handle compatibility when we extend the protocol?

You mean, on top of the protocol versioning that's already
builtin to QEMUFile? inside qemu_savevm_state_begin()?

Should I piggy-back and additional protocol version number
before QEMUFile sends it's version number?

So how does destination know it's ok to send anything to source? I
suspect this is wrong. When using CM you must post on RQ before
completing the connection negotiation, not after it's done.

This is already handled by the RDMA connection manager (librdmacm).

The library already has functions like listen() and accept() the same
way that TCP does.

Once these functions return success, we have a gaurantee that both
sides of the connection have already posted the appropriate work
requests sufficient for driving the migration.

Post by m***@linux.vnet.ibm.com
+2. We transmit an empty SEND to let the sender know that
+ we are *ready* to receive some bytes from QEMUFileRDMA.
+ These bytes will come in the form of a another SEND.

Using an empty message seems somewhat hacky, a fixed header in the
message would let you do more things if protocol is ever extended.

Great idea....... I'll add a struct RDMAHeader to each send
message in the next RFC which includes a version number.

(Until now, there were *only* QEMUFile bytes, nothing else,
so I didn't have any reason for a formal structure.)

OK to summarize flow control: at any time there's either 0 or 1
outstanding buffers in RQ. At each time only one side can talk.
Destination always goes first, then source, etc. At each time a single
send message can be passed. Just FYI, this means you are often at 0
buffers in RQ and IIRC 0 buffers is a worst-case path for infiniband.
It's better to keep at least 1 buffers in RQ at all times, so prepost
2 initially so it would fluctuate between 1 and 2.

That's correct. Having 0 buffers is not possible - sending
a message with 0 buffers would throw an error. The "protocol"
as I described ensures that there is always one buffer posted
before waiting for another message to arrive.

I avoided "better" flow control because the non-live state
is so small in comparison to the pc.ram contents that would be sent.
The non-live state is in the range of kilobytes, so it seemed silly to
have more rigorous flow control....

Post by m***@linux.vnet.ibm.com
+===============================
+
+At the beginning of the migration, (migration-rdma.c),
+the sender and the receiver populate the list of RAMBlocks
+to be registered with each other into a structure.

Could you add the packet format here as well please?
Need to document endian-ness etc.

There is no packet format for pc.ram. It's just bytes - raw RDMA
writes of each 4K page, because the memory must be registered
before the RDMA write can begin.

(As discussed, there will be a format for SEND, though - so I'll
take care of that in my next RFC).

Yes but we also need to report errors detected during migration. Need
to document how this is done. We also need to report success.

Acknowledged - I'll add more verbosity to the different error conditions.

- Michael R. Hines

Michael S. Tsirkin

2013-03-18 21:26:46 UTC

I think there are two things here, API documentation and protocol
documentation, protocol documentation still needs some more work.
Also if what I understand from this document is correct this
breaks memory overcommit on destination which needs to be fixed.
I think something chunk-based on the destination side is required
as well. You also can't trust the source to tell you the chunk
size it could be malicious and ask for too much. Maybe source
gives chunk size hint and destination responds with what it wants
to use.

Do we allow ballooning *during* the live migration? Is that necessary?

Probably but I haven't mentioned ballooning at all.

memory overcommit != ballooning

Post by Michael R. Hines
Would it be sufficient to inform the destination which pages are ballooned
and then only register the ones that the VM actually owns?

I haven't thought about it.

Is there any feature and/or version negotiation? How are we going to
handle compatibility when we extend the protocol?

You mean, on top of the protocol versioning that's already
builtin to QEMUFile? inside qemu_savevm_state_begin()?

I mean for protocol things like credit negotiation, which are unrelated
to high level QEMUFile.

Post by Michael R. Hines
Should I piggy-back and additional protocol version number
before QEMUFile sends it's version number?

CM can exchange a bit of data during connection setup, maybe use that?

So how does destination know it's ok to send anything to source? I
suspect this is wrong. When using CM you must post on RQ before
completing the connection negotiation, not after it's done.

This is already handled by the RDMA connection manager (librdmacm).
The library already has functions like listen() and accept() the same
way that TCP does.
Once these functions return success, we have a gaurantee that both
sides of the connection have already posted the appropriate work
requests sufficient for driving the migration.

Not if you don't post anything. librdmacm does not post requests. So
everyone posts 1 buffer on RQ during connection setup?
OK though this is not what the document said, I was under the impression
this is done after connection setup.

Using an empty message seems somewhat hacky, a fixed header in the
message would let you do more things if protocol is ever extended.

Great idea....... I'll add a struct RDMAHeader to each send
message in the next RFC which includes a version number.
(Until now, there were *only* QEMUFile bytes, nothing else,
so I didn't have any reason for a formal structure.)

OK to summarize flow control: at any time there's either 0 or 1
outstanding buffers in RQ. At each time only one side can talk.
Destination always goes first, then source, etc. At each time a
single send message can be passed. Just FYI, this means you are
often at 0 buffers in RQ and IIRC 0 buffers is a worst-case path
for infiniband. It's better to keep at least 1 buffers in RQ at
all times, so prepost 2 initially so it would fluctuate between 1
and 2.

So # of buffers goes 0 -> 1 -> 0 -> 1.
What I am saying is you should have an extra buffer
so it goes 1 -> 2 -> 1 -> 2
otherwise you keep hitting slow path in RQ processing:
each time you consume the last buffer, IIRC receiver sends
and ACK to sender saying "hey this is the last buffer, slow down".
You don't want that.

Post by Michael R. Hines
I avoided "better" flow control because the non-live state
is so small in comparison to the pc.ram contents that would be sent.
The non-live state is in the range of kilobytes, so it seemed silly to
have more rigorous flow control....

I think it's good enough, just add an extra unused buffer to make
hardware happy.

Could you add the packet format here as well please?
Need to document endian-ness etc.

There is no packet format for pc.ram.

The 'structure' above is passed using SEND so there is
a format.

Post by Michael R. Hines
It's just bytes - raw RDMA
writes of each 4K page, because the memory must be registered
before the RDMA write can begin.
(As discussed, there will be a format for SEND, though - so I'll
take care of that in my next RFC).

Yes but we also need to report errors detected during migration.
Need to document how this is done. We also need to report success.

Acknowledged - I'll add more verbosity to the different error conditions.
- Michael R. Hines

Michael R. Hines

2013-03-18 23:23:53 UTC

Post by Michael S. Tsirkin
Probably but I haven't mentioned ballooning at all.
memory overcommit != ballooning

Sure, then setting ballooning aside for the moment,
then let's just consider regular (unused) virtual memory.

In this case, what's wrong with the destination mapping
and pinning all the memory if it is not being ballooned?

If the guest touches all the memory during normal operation
before migration begins (which would be the common case),
then overcommit is irrelevant, no?

Post by Michael S. Tsirkin
This is already handled by the RDMA connection manager (librdmacm).
The library already has functions like listen() and accept() the same
way that TCP does.
Once these functions return success, we have a gaurantee that both
sides of the connection have already posted the appropriate work
requests sufficient for driving the migration.
Not if you don't post anything. librdmacm does not post requests. So
everyone posts 1 buffer on RQ during connection setup?
OK though this is not what the document said, I was under the impression
this is done after connection setup.

Sorry, I wasn't being clear. Here's the existing sequence
that I've already coded and validated:

1. Receiver and Sender are started (command line):
(The receiver has to be running before QMP migrate
can connect, of course or this all falls apart.)

2. Both sides post RQ work requests (or multiple ones)
3. Receiver does listen()
4. Sender does connect()
At this point both sides have already posted
work requests as stated before.
5. Receiver accept() => issue first SEND message

At this point the sequence of events I describe in the
documentation for put_buffer() / get_buffer() all kick
in and everything is normal.

I'll be sure to post an extra few work requests as suggested.

Post by Michael S. Tsirkin
So # of buffers goes 0 -> 1 -> 0 -> 1.
What I am saying is you should have an extra buffer
so it goes 1 -> 2 -> 1 -> 2
each time you consume the last buffer, IIRC receiver sends
and ACK to sender saying "hey this is the last buffer, slow down".
You don't want that.

No problem - I'll take care of it.......

Michael S. Tsirkin

2013-03-19 08:19:39 UTC

Post by Michael S. Tsirkin
Probably but I haven't mentioned ballooning at all.
memory overcommit != ballooning

Sure, then setting ballooning aside for the moment,
then let's just consider regular (unused) virtual memory.
In this case, what's wrong with the destination mapping
and pinning all the memory if it is not being ballooned?
If the guest touches all the memory during normal operation
before migration begins (which would be the common case),
then overcommit is irrelevant, no?

We have ways (e.g. cgroups) to limit what a VM can do. If it tries to
use more RAM than we let it, it will swap, still making progress, just
slower. OTOH it looks like pinning more memory than allowed by the
cgroups limit will just get stuck forever (probably a bug,
should fail instead? but does not help your protocol
which needs it all pinned at all times).

There are also per-task resource limits. If you exceed this
registration will fail, so not good either.

I just don't see why do registration by chunks
on source but not on destination.

--
MST

Michael R. Hines

2013-03-19 13:21:18 UTC

Post by Michael S. Tsirkin
We have ways (e.g. cgroups) to limit what a VM can do. If it tries to
use more RAM than we let it, it will swap, still making progress, just
slower. OTOH it looks like pinning more memory than allowed by the
cgroups limit will just get stuck forever (probably a bug, should fail
instead? but does not help your protocol which needs it all pinned at
all times). There are also per-task resource limits. If you exceed
this registration will fail, so not good either. I just don't see why
do registration by chunks on source but not on destination.

Would this a hard requirement for an initial version?

I do understand how and why this makes things more flexible during
the long run, but it does have the potential to slow down the RDMA
protocol significantly.

The way its implemented now, the sender can dump bytes
onto the wire at full speed (up to 30gbps last time I measured it),
but if we insert a round-trip message + registration on the
destination side before we're allowed to push more bytes out,
we'll have to introduce more complex flow control only for
the benefit of making the destination side have the flexibility
that you described.

Michael R. Hines

2013-03-19 15:08:24 UTC

This is actual a much bigger problem that I thought, not just for RDMA:

Currently the *sender* side is does not support overcommit
during a regular TCP migration.......I assume because the
migration_bitmap does not know which memory is mapped or
unmapped by the host kernel.

Is this a known issue?

- Michael

Post by Michael S. Tsirkin
Probably but I haven't mentioned ballooning at all.
memory overcommit != ballooning

Sure, then setting ballooning aside for the moment,
then let's just consider regular (unused) virtual memory.
In this case, what's wrong with the destination mapping
and pinning all the memory if it is not being ballooned?
If the guest touches all the memory during normal operation
before migration begins (which would be the common case),
then overcommit is irrelevant, no?

We have ways (e.g. cgroups) to limit what a VM can do. If it tries to
use more RAM than we let it, it will swap, still making progress, just
slower. OTOH it looks like pinning more memory than allowed by the
cgroups limit will just get stuck forever (probably a bug,
should fail instead? but does not help your protocol
which needs it all pinned at all times).
There are also per-task resource limits. If you exceed this
registration will fail, so not good either.
I just don't see why do registration by chunks
on source but not on destination.

Michael S. Tsirkin

2013-03-19 15:16:07 UTC

Post by Michael R. Hines
Currently the *sender* side is does not support overcommit
during a regular TCP migration.......I assume because the
migration_bitmap does not know which memory is mapped or
unmapped by the host kernel.
Is this a known issue?
- Michael

I don't really understand what you are saying here.
Do you see some bug with migration where we might use
more memory than allowed by cgroups?

Michael R. Hines

2013-03-19 15:32:49 UTC

I don't really understand what you are saying here.
Do you see some bug with migration where we might use
more memory than allowed by cgroups?

Yes: cgroups does not coordinate with the list of pages
that have "not yet been mapped" or touched by the
virtual machine, right?

I may be missing something here from what I read in
the code, but even if I set a cgroups limit on memory,
QEMU will still attempt to access that memory if the
migration_bitmap tells it to, as far as I can tell.

Is this an accurate observation?

A simple solution would be to just have QEMU consult with /dev/pagemap, no?

- Michael

Michael S. Tsirkin

2013-03-19 15:36:58 UTC

I don't really understand what you are saying here.
Do you see some bug with migration where we might use
more memory than allowed by cgroups?

Yes: cgroups does not coordinate with the list of pages
that have "not yet been mapped" or touched by the
virtual machine, right?
I may be missing something here from what I read in
the code, but even if I set a cgroups limit on memory,
QEMU will still attempt to access that memory if the
migration_bitmap tells it to, as far as I can tell.
Is this an accurate observation?

Yes but this simply means QEMU will hit swap.

Post by Michael R. Hines
A simple solution would be to just have QEMU consult with /dev/pagemap, no?
- Michael

Michael R. Hines

2013-03-19 17:09:23 UTC

Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?

I'm trying to keep an open mind, but that would kill the migration time.....

- Michael

I don't really understand what you are saying here.
Do you see some bug with migration where we might use
more memory than allowed by cgroups?

Yes: cgroups does not coordinate with the list of pages
that have "not yet been mapped" or touched by the
virtual machine, right?
I may be missing something here from what I read in
the code, but even if I set a cgroups limit on memory,
QEMU will still attempt to access that memory if the
migration_bitmap tells it to, as far as I can tell.
Is this an accurate observation?

Yes but this simply means QEMU will hit swap.

Post by Michael R. Hines
A simple solution would be to just have QEMU consult with /dev/pagemap, no?
- Michael

Paolo Bonzini

2013-03-19 17:14:45 UTC

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.

Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.

Paolo

Michael S. Tsirkin

2013-03-19 17:23:33 UTC

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Maybe not if you have a fast SSD, or are using swap in RAM or compressed
swap or ...

Post by Paolo Bonzini
Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.
Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.
Paolo

AFAIK for zero pages, yes. I'm not sure what the problem is either.

--
MST

Michael R. Hines

2013-03-19 17:40:06 UTC

OK, so I did a quick test and the cgroup does appear to be working
correctly for zero pages.

Nevertheless, this still doesn't solve the chunk registration problem
for RDMA.

Even with a cgroup on the sender *or* receiver side, there is no API
that I know
that would correctly indicate to the migration process which pages are
safe to register
or not with the hardware. Without such an API, even a "smarter" chunked
memory
registration scheme would not work with cgroups because we would be
attempting
to pin zero pages (for no reason) that cgroups has already kicked out,
which would
defeat the purpose of using cgroups.

So, if I submit a separate patch to fix this, would you guys review it?
(Using /dev/pagemap).

Unless there is a better idea? Does KVM expose the necessary mappings?

- Michael

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.
Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.
Paolo

Paolo Bonzini

2013-03-19 17:52:59 UTC

Post by Michael R. Hines
registration scheme would not work with cgroups because we would be
attempting to pin zero pages (for no reason) that cgroups has already
kicked out, which would defeat the purpose of using cgroups.

Yeah, pinning would be a problem.

Post by Michael R. Hines
So, if I submit a separate patch to fix this, would you guys review it?
(Using /dev/pagemap).

Sorry about the ignorance, but what is /dev/pagemap? :)

Post by Michael R. Hines
Unless there is a better idea? Does KVM expose the necessary mappings?

We could have the balloon driver track the pages. I and Michael had
some initial work a few months ago on extending the virtio-balloon spec
to allow this. It went nowhere, though.

Still, at this point this is again an RDMA-specific problem, I don't
think it would be that bad if the first iterations of RDMA didn't
support ballooning/overcommit.

Paolo

Michael R. Hines

2013-03-19 18:04:44 UTC

Post by Michael R. Hines
So, if I submit a separate patch to fix this, would you guys review it?
(Using /dev/pagemap).
Sorry about the ignorance, but what is /dev/pagemap? :)

/dev/pagemap is a recent interface for eserland accesses to the pagetables.

https://www.kernel.org/doc/Documentation/vm/pagemap.txt

It would very easily tell you (without extra tracking) which pages
were mapped and which were not mapped.

It should work for both cgroups and ballooning. We've used it before.

- Michael

Michael S. Tsirkin

2013-03-20 13:07:54 UTC

Yeah, pinning would be a problem.

Post by Michael R. Hines
So, if I submit a separate patch to fix this, would you guys review it?
(Using /dev/pagemap).

Sorry about the ignorance, but what is /dev/pagemap? :)

Post by Michael R. Hines
Unless there is a better idea? Does KVM expose the necessary mappings?

We could have the balloon driver track the pages. I and Michael had
some initial work a few months ago on extending the virtio-balloon spec
to allow this. It went nowhere, though.
Still, at this point this is again an RDMA-specific problem, I don't
think it would be that bad if the first iterations of RDMA didn't
support ballooning/overcommit.
Paolo

My problem is with the protocol. If it assumes at the protocol level
that everything is pinned down on the destination, we'll have to rework
it all to make it really useful.

--
MST

Michael R. Hines

2013-03-20 15:15:48 UTC

OK, can we make a deal? =)

I'm willing to put in the work to perform the dynamic registration on
the destination side,
but let's go a step further and piggy-back on the effort:

We need to couple this registration with a very small modification to
save_ram_block():

Currently, save_ram_block does:

1. is RDMA turned on? if yes, unconditionally add to next chunk
(will be made to dynamically
register on destination)
2. is_dup_page() ? if yes, skip
3. in xbzrle cache? if yes, skip
4. still not sent? if yes, transmit

I propose adding a "stub" function that adds:

0. is page mapped? if yes, skip (always returns true for now)
1. same
2. same
3. same
4. same

Then, later, in a separate patch, I can implement /dev/pagemap support.

When that's done, RDMA dynamic registration will actually take effect and
benefit from actually verifying that the page is mapped or not.

- Michael

Post by Michael S. Tsirkin
My problem is with the protocol. If it assumes at the protocol level
that everything is pinned down on the destination, we'll have to
rework it all to make it really useful.

Michael R. Hines

2013-03-20 15:22:36 UTC

s / is page mapped?/ is page unmapped?/ g

Post by Michael R. Hines
OK, can we make a deal? =)
I'm willing to put in the work to perform the dynamic registration on
the destination side,
We need to couple this registration with a very small modification to
1. is RDMA turned on? if yes, unconditionally add to next chunk
(will be made to dynamically
register on destination)
2. is_dup_page() ? if yes, skip
3. in xbzrle cache? if yes, skip
4. still not sent? if yes, transmit
0. is page mapped? if yes, skip (always returns true for now)
1. same
2. same
3. same
4. same
Then, later, in a separate patch, I can implement /dev/pagemap support.
When that's done, RDMA dynamic registration will actually take effect and
benefit from actually verifying that the page is mapped or not.
- Michael

Michael S. Tsirkin

2013-03-20 15:55:15 UTC

Post by Michael R. Hines
OK, can we make a deal? =)
I'm willing to put in the work to perform the dynamic registration
on the destination side,
We need to couple this registration with a very small modification
1. is RDMA turned on? if yes, unconditionally add to next chunk
(will be made to
dynamically register on destination)
2. is_dup_page() ? if yes, skip
3. in xbzrle cache? if yes, skip
4. still not sent? if yes, transmit
0. is page mapped? if yes, skip (always returns true for now)
1. same
2. same
3. same
4. same
Then, later, in a separate patch, I can implement /dev/pagemap support.
When that's done, RDMA dynamic registration will actually take effect and
benefit from actually verifying that the page is mapped or not.
- Michael

Mapped into guest? You mean e.g. for ballooning?

Michael R. Hines

2013-03-20 16:08:40 UTC

Post by Michael R. Hines
OK, can we make a deal? =)
I'm willing to put in the work to perform the dynamic registration
on the destination side,
We need to couple this registration with a very small modification
1. is RDMA turned on? if yes, unconditionally add to next chunk
(will be made to
dynamically register on destination)
2. is_dup_page() ? if yes, skip
3. in xbzrle cache? if yes, skip
4. still not sent? if yes, transmit
0. is page mapped? if yes, skip (always returns true for now)
1. same
2. same
3. same
4. same
Then, later, in a separate patch, I can implement /dev/pagemap support.
When that's done, RDMA dynamic registration will actually take effect and
benefit from actually verifying that the page is mapped or not.
- Michael

Mapped into guest? You mean e.g. for ballooning?

No, not just ballooning. Overcommit (i.e. cgroups).

Anytime cgroups kicks out a page (or anytime the balloon kicks in),
the page would become unmapped.

The make dynamic registration useful, we have to actually have something
in place in the future that knows how to *check* if a page is unmapped
from the virtual machine, either because it has never been dirtied before
(and might be pointing to the zero page) or because it has been madvised()
out or has been detatched because of a cgroup limit.

- Michael

Michael S. Tsirkin

2013-03-20 19:06:34 UTC

Post by Michael R. Hines
OK, can we make a deal? =)
I'm willing to put in the work to perform the dynamic registration
on the destination side,
We need to couple this registration with a very small modification
1. is RDMA turned on? if yes, unconditionally add to next chunk
(will be made to
dynamically register on destination)
2. is_dup_page() ? if yes, skip
3. in xbzrle cache? if yes, skip
4. still not sent? if yes, transmit
0. is page mapped? if yes, skip (always returns true for now)
1. same
2. same
3. same
4. same
Then, later, in a separate patch, I can implement /dev/pagemap support.
When that's done, RDMA dynamic registration will actually take effect and
benefit from actually verifying that the page is mapped or not.
- Michael

Mapped into guest? You mean e.g. for ballooning?

No, not just ballooning. Overcommit (i.e. cgroups).
Anytime cgroups kicks out a page (or anytime the balloon kicks in),
the page would become unmapped.

OK but we still need to send that page to remote.
It's in swap but has guest data in there, you can't
just ignore it.

Post by Michael R. Hines
The make dynamic registration useful, we have to actually have something
in place in the future that knows how to *check* if a page is unmapped
from the virtual machine, either because it has never been dirtied before
(and might be pointing to the zero page) or because it has been madvised()
out or has been detatched because of a cgroup limit.
- Michael

Michael R. Hines

2013-03-20 20:20:06 UTC

Yes, absolutely: https://www.kernel.org/doc/Documentation/vm/pagemap.txt

The pagemap will tell you that.

In fact the pagemap ideally would *only* be used for the 1st migration
round.

The rest of them would depend exclusively on the dirty bitmap as they do.

Basically, we could use the pagemap as first-time "hint" for the bulk of
the memory that costs the most to transmit.

Michael S. Tsirkin

2013-03-20 20:31:19 UTC

Post by Michael R. Hines
No, not just ballooning. Overcommit (i.e. cgroups).
Anytime cgroups kicks out a page (or anytime the balloon kicks in),
the page would become unmapped.
OK but we still need to send that page to remote.
It's in swap but has guest data in there, you can't
just ignore it.
Yes, absolutely: https://www.kernel.org/doc/Documentation/vm/pagemap.txt
The pagemap will tell you that.
In fact the pagemap ideally would *only* be used for the 1st migration round.
The rest of them would depend exclusively on the dirty bitmap as they do.
Basically, we could use the pagemap as first-time "hint" for the bulk of
the memory that costs the most to transmit.

OK sure, this could be useful to detect pages deduplicated by KSM and only
transmit one copy. There's still the question of creating same
duplicate mappings on destination - do you just do data copy on destination?

Not sure why you talk about unmapped pages above though, it seems
not really relevant...

There's also the matter of KSM not touching pinned pages,
that's another good reason not to pin all pages on destination,
they won't be deduplicated.

Michael R. Hines

2013-03-20 20:39:00 UTC

Agreed. Very useful for KSM.

Unmapped virtual addresses cannot be pinned for RDMA (the hardware will
break),
but there's no way to know they are unmapped without checking another
data structure.

- Michael

Post by Michael S. Tsirkin
OK sure, this could be useful to detect pages deduplicated by KSM and only
transmit one copy. There's still the question of creating same
duplicate mappings on destination - do you just do data copy on destination?
Not sure why you talk about unmapped pages above though, it seems
not really relevant...
There's also the matter of KSM not touching pinned pages,
that's another good reason not to pin all pages on destination,
they won't be deduplicated.

Michael S. Tsirkin

2013-03-20 20:46:16 UTC

Post by Michael R. Hines
Unmapped virtual addresses cannot be pinned for RDMA (the hardware
will break),
but there's no way to know they are unmapped without checking
another data structure.

So for RDMA, when you try to register them, this will fault them in.
For regular migration we really should try using vmsplice. Anyone up to
it? If we do this TCP could outperform RDMA for some workloads ...

--
MST

Michael R. Hines

2013-03-20 20:56:01 UTC

Forgive me, vmsplice system call? Or some other interface?

I'm not following......

Post by Michael R. Hines
Unmapped virtual addresses cannot be pinned for RDMA (the hardware
will break),
but there's no way to know they are unmapped without checking
another data structure.

So for RDMA, when you try to register them, this will fault them in.

Michael S. Tsirkin

2013-03-21 05:20:34 UTC

Post by Michael R. Hines
Forgive me, vmsplice system call? Or some other interface?
I'm not following......

Post by Michael R. Hines
Unmapped virtual addresses cannot be pinned for RDMA (the hardware
will break),
but there's no way to know they are unmapped without checking
another data structure.

So for RDMA, when you try to register them, this will fault them in.

I'm just saying get_user_pages brings pages back in from swap.

Michael R. Hines

2013-03-20 20:24:14 UTC

Post by Michael R. Hines
Then, later, in a separate patch, I can implement /dev/pagemap support.
When that's done, RDMA dynamic registration will actually take effect and
benefit from actually verifying that the page is mapped or not.
- Michael
Mapped into guest? You mean e.g. for ballooning?

Three scenarios are candidates for mapped checking:

1. anytime the virtual machine has not yet accessed a page (usually
during the 1st-time boot)
2. Anytime madvise(DONTNEED) happens (for ballooning)
3. Anytime cgroups kicks out a zero page that was accessed and faulted
but not dirty that is a clean candidate for unmapping.
(I did a test that seems to confirm that cgroups is pretty
"smart" about that)

Basically, anytime the pagemap says "this page is *not* swap and *not*
mapped
- then the page is not important during the 1st iteration.

On the subsequent iterations, we come along as normal checking the dirty
bitmap as usual.

- Michael

Michael S. Tsirkin

2013-03-20 20:37:34 UTC

1. anytime the virtual machine has not yet accessed a page (usually
during the 1st-time boot)

So migrating booting machines is faster now? Why is this worth
optimizing for?

Post by Michael R. Hines
2. Anytime madvise(DONTNEED) happens (for ballooning)

This is likely worth optimizing.
I think a better the way to handling this one is by tracking
ballooned state. Just mark these pages as unused in qemu.

Post by Michael R. Hines
3. Anytime cgroups kicks out a zero page that was accessed and
faulted but not dirty that is a clean candidate for unmapping.
(I did a test that seems to confirm that cgroups is pretty
"smart" about that)
Basically, anytime the pagemap says "this page is *not* swap and
*not* mapped
- then the page is not important during the 1st iteration.
On the subsequent iterations, we come along as normal checking the
dirty bitmap as usual.
- Michael

If it will never be dirty you will never migrate it?
Seems wrong - it could have guest data on disk - AFAIK clean does not
mean no data, it means disk is in sync with memory.

Michael R. Hines

2013-03-20 20:45:05 UTC

1. anytime the virtual machine has not yet accessed a page (usually
during the 1st-time boot)

So migrating booting machines is faster now? Why is this worth
optimizing for?

Yes, it helps both the TCP migration and RDMA migration simultaneously.

Post by Michael R. Hines
2. Anytime madvise(DONTNEED) happens (for ballooning)

This is likely worth optimizing.
I think a better the way to handling this one is by tracking
ballooned state. Just mark these pages as unused in qemu.

Paolo said somebody attempted that, but stopped work on it for some reason?

If it will never be dirty you will never migrate it?
Seems wrong - it could have guest data on disk - AFAIK clean does not
mean no data, it means disk is in sync with memory.

Sorry, yes - that was a mis-statement: clean pages are always mapped (or
swapped) and would have to
be transmitted at least once.

- Michael

Michael S. Tsirkin

2013-03-20 20:52:21 UTC

1. anytime the virtual machine has not yet accessed a page (usually
during the 1st-time boot)

So migrating booting machines is faster now? Why is this worth
optimizing for?

Yes, it helps both the TCP migration and RDMA migration simultaneously.

But for a class of VMs that is only common when you want to
run a benchmark. People do live migration precisely to
avoid the need to reboot the VM.

Post by Michael R. Hines
2. Anytime madvise(DONTNEED) happens (for ballooning)

This is likely worth optimizing.
I think a better the way to handling this one is by tracking
ballooned state. Just mark these pages as unused in qemu.

Paolo said somebody attempted that, but stopped work on it for some reason?

If it will never be dirty you will never migrate it?
Seems wrong - it could have guest data on disk - AFAIK clean does not
mean no data, it means disk is in sync with memory.

Sorry, yes - that was a mis-statement: clean pages are always mapped
(or swapped) and would have to
be transmitted at least once.
- Michael

Right so maybe my idea of looking at the PFNs in pagemap and transmitting
only once could help some VMs (and it would cover the booting VMs as a
partial case), and it could be a useful though linux-specific
optimization, but I don't see how looking at whether page is
mapped would help for TCP.

--
MST

Michael R. Hines

2013-03-19 17:49:34 UTC

I also did a test using RDMA + cgroup, and the kernel killed my QEMU :)

So, infiniband is not smart enough to know how to avoid pinning a zero
page, I guess.

- Michael

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.
Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.
Paolo

Michael S. Tsirkin

2013-03-21 06:11:59 UTC

Post by Michael R. Hines
I also did a test using RDMA + cgroup, and the kernel killed my QEMU :)
So, infiniband is not smart enough to know how to avoid pinning a
zero page, I guess.
- Michael

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.
Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.
Paolo

I really shouldn't break COW if you don't request LOCAL_WRITE.
I think it's a kernel bug, and apparently has been there in the code since the
first version: get_user_pages parameters swapped.

I'll send a patch. If it's applied, you should also
change your code from

+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_WRITE |
+ IBV_ACCESS_REMOTE_READ);

to

+ IBV_ACCESS_REMOTE_READ);

on send side.
Then, each time we detect a page has changed we must make sure to
unregister and re-register it. Or if you want to be very
smart, check that the PFN didn't change and reregister
if it did.

This will make overcommit work.

--
MST

Michael R. Hines

2013-03-21 15:22:00 UTC

Very nice catch. Yes, I didn't think about that.

Thanks.

Post by Michael S. Tsirkin
I really shouldn't break COW if you don't request LOCAL_WRITE.
I think it's a kernel bug, and apparently has been there in the code since the
first version: get_user_pages parameters swapped.
I'll send a patch. If it's applied, you should also
change your code from
+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_WRITE |
+ IBV_ACCESS_REMOTE_READ);
to
+ IBV_ACCESS_REMOTE_READ);
on send side.
Then, each time we detect a page has changed we must make sure to
unregister and re-register it. Or if you want to be very
smart, check that the PFN didn't change and reregister
if it did.
This will make overcommit work.

Michael R. Hines

2013-04-05 20:46:57 UTC

FYI, I used the following redhat cgroups instructions, to test if
overcommit + RDMA was working:

https://access.redhat.com/site/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Resource_Management_Guide/sec-memory.html

- Michael

Post by Michael R. Hines
I also did a test using RDMA + cgroup, and the kernel killed my QEMU :)
So, infiniband is not smart enough to know how to avoid pinning a
zero page, I guess.
- Michael

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.
Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.
Paolo

I really shouldn't break COW if you don't request LOCAL_WRITE.
I think it's a kernel bug, and apparently has been there in the code since the
first version: get_user_pages parameters swapped.
I'll send a patch. If it's applied, you should also
change your code from
+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_WRITE |
+ IBV_ACCESS_REMOTE_READ);
to
+ IBV_ACCESS_REMOTE_READ);
on send side.
Then, each time we detect a page has changed we must make sure to
unregister and re-register it. Or if you want to be very
smart, check that the PFN didn't change and reregister
if it did.
This will make overcommit work.

Michael R. Hines

2013-04-05 20:45:34 UTC

Post by Michael R. Hines
I also did a test using RDMA + cgroup, and the kernel killed my QEMU :)
So, infiniband is not smart enough to know how to avoid pinning a
zero page, I guess.
- Michael

Post by Michael R. Hines
Allowing QEMU to swap due to a cgroup limit during migration is a viable
overcommit option?
I'm trying to keep an open mind, but that would kill the migration time.....

Would it swap? Doesn't the kernel back all zero pages with a single
copy-on-write page? If that still accounts towards cgroup limits, it
would be a bug.
Old kernels do not have a shared zero hugepage, and that includes some
distro kernels. Perhaps that's the problem.
Paolo

I really shouldn't break COW if you don't request LOCAL_WRITE.
I think it's a kernel bug, and apparently has been there in the code since the
first version: get_user_pages parameters swapped.
I'll send a patch. If it's applied, you should also
change your code from
+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_WRITE |
+ IBV_ACCESS_REMOTE_READ);
to
+ IBV_ACCESS_REMOTE_READ);
on send side.
Then, each time we detect a page has changed we must make sure to
unregister and re-register it. Or if you want to be very
smart, check that the PFN didn't change and reregister
if it did.
This will make overcommit work.

Unfortunately RDMA + cgroups still kills QEMU:

I removed the *_WRITE flags and did a test like this:

1. Start QEMU with 2GB ram configured

$ cd /sys/fs/cgroup/memory/libvirt/qemu
$ echo "-1" > memory.memsw.limit_in_bytes
$ echo "-1" > memory.limit_in_bytes
$ echo $(pidof qemu-system-x86_64) > tasks
$ echo 512M > memory.limit_in_bytes # maximum RSS
$ echo 3G > memory.memsw.limit_in_bytes # maximum RSS + swap, extra
1G to be safe

2. Start RDMA migration

3. RSS of 512M is reached
4. swap starts filling up
5. the kernel kills QEMU
6. dmesg:

[ 2981.657135] Task in /libvirt/qemu killed as a result of limit of
/libvirt/qemu
[ 2981.657140] memory: usage 524288kB, limit 524288kB, failcnt 18031
[ 2981.657143] memory+swap: usage 525460kB, limit 3145728kB, failcnt 0
[ 2981.657146] Mem-Info:
[ 2981.657148] Node 0 DMA per-cpu:
[ 2981.657152] CPU 0: hi: 0, btch: 1 usd: 0
[ 2981.657155] CPU 1: hi: 0, btch: 1 usd: 0
[ 2981.657157] CPU 2: hi: 0, btch: 1 usd: 0
[ 2981.657160] CPU 3: hi: 0, btch: 1 usd: 0
[ 2981.657163] CPU 4: hi: 0, btch: 1 usd: 0
[ 2981.657165] CPU 5: hi: 0, btch: 1 usd: 0
[ 2981.657167] CPU 6: hi: 0, btch: 1 usd: 0
[ 2981.657170] CPU 7: hi: 0, btch: 1 usd: 0
[ 2981.657172] Node 0 DMA32 per-cpu:
[ 2981.657176] CPU 0: hi: 186, btch: 31 usd: 160
[ 2981.657178] CPU 1: hi: 186, btch: 31 usd: 22
[ 2981.657181] CPU 2: hi: 186, btch: 31 usd: 179
[ 2981.657184] CPU 3: hi: 186, btch: 31 usd: 6
[ 2981.657186] CPU 4: hi: 186, btch: 31 usd: 21
[ 2981.657189] CPU 5: hi: 186, btch: 31 usd: 15
[ 2981.657191] CPU 6: hi: 186, btch: 31 usd: 19
[ 2981.657194] CPU 7: hi: 186, btch: 31 usd: 22
[ 2981.657196] Node 0 Normal per-cpu:
[ 2981.657200] CPU 0: hi: 186, btch: 31 usd: 44
[ 2981.657202] CPU 1: hi: 186, btch: 31 usd: 58
[ 2981.657205] CPU 2: hi: 186, btch: 31 usd: 156
[ 2981.657207] CPU 3: hi: 186, btch: 31 usd: 107
[ 2981.657210] CPU 4: hi: 186, btch: 31 usd: 44
[ 2981.657213] CPU 5: hi: 186, btch: 31 usd: 70
[ 2981.657215] CPU 6: hi: 186, btch: 31 usd: 76
[ 2981.657218] CPU 7: hi: 186, btch: 31 usd: 173
[ 2981.657223] active_anon:181703 inactive_anon:68856 isolated_anon:0
[ 2981.657224] active_file:66881 inactive_file:141056 isolated_file:0
[ 2981.657225] unevictable:2174 dirty:6 writeback:0 unstable:0
[ 2981.657226] free:4058168 slab_reclaimable:5152 slab_unreclaimable:10785
[ 2981.657227] mapped:7709 shmem:192 pagetables:1913 bounce:0
[ 2981.657230] Node 0 DMA free:15896kB min:56kB low:68kB high:84kB
active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB
unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15672kB
mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB
slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB
pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0
all_unreclaimable? no
[ 2981.657242] lowmem_reserve[]: 0 1966 18126 18126
[ 2981.657249] Node 0 DMA32 free:1990652kB min:7324kB low:9152kB
high:10984kB active_anon:0kB inactive_anon:0kB active_file:0kB
inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB
present:2013280kB mlocked:0kB dirty:0kB writeback:0kB mapped:4kB
shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB
pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0
all_unreclaimable? no
[ 2981.657260] lowmem_reserve[]: 0 0 16160 16160
[ 2981.657268] Node 0 Normal free:14226124kB min:60200kB low:75248kB
high:90300kB active_anon:726812kB inactive_anon:275424kB
active_file:267524kB inactive_file:564224kB unevictable:8696kB
isolated(anon):0kB isolated(file):0kB present:16547840kB mlocked:6652kB
dirty:24kB writeback:0kB mapped:30832kB shmem:768kB
slab_reclaimable:20608kB slab_unreclaimable:43140kB kernel_stack:1784kB
pagetables:7652kB unstable:0kB bounce:0kB writeback_tmp:0kB
pages_scanned:0 all_unreclaimable? no
[ 2981.657281] lowmem_reserve[]: 0 0 0 0
[ 2981.657289] Node 0 DMA: 0*4kB 1*8kB 1*16kB 0*32kB 2*64kB 1*128kB
1*256kB 0*512kB 1*1024kB 1*2048kB 3*4096kB = 15896kB
[ 2981.657307] Node 0 DMA32: 17*4kB 9*8kB 7*16kB 4*32kB 8*64kB 5*128kB
6*256kB 4*512kB 3*1024kB 6*2048kB 481*4096kB = 1990652kB
[ 2981.657325] Node 0 Normal: 2*4kB 1*8kB 991*16kB 893*32kB 271*64kB
50*128kB 50*256kB 12*512kB 5*1024kB 1*2048kB 3450*4096kB = 14225504kB
[ 2981.657343] 277718 total pagecache pages
[ 2981.657345] 68816 pages in swap cache
[ 2981.657348] Swap cache stats: add 656848, delete 588032, find 19850/22338
[ 2981.657350] Free swap = 15288376kB
[ 2981.657353] Total swap = 15564796kB
[ 2981.706982] 4718576 pages RAM

m***@linux.vnet.ibm.com

2013-03-18 03:18:57 UTC

From: "Michael R. Hines" <***@us.ibm.com>

This introduces:
1. qemu_ram_foreach_block
2. qemu_ram_count_blocks

Both used in communicating the RAMBlocks
to each side for later memory registration.

Signed-off-by: Michael R. Hines <***@us.ibm.com>
---
exec.c | 21 +++++++++++++++++++++
include/exec/cpu-common.h | 6 ++++++
2 files changed, 27 insertions(+)

diff --git a/exec.c b/exec.c
index 8a6aac3..a985da8 100644
--- a/exec.c
+++ b/exec.c
@@ -2629,3 +2629,24 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
memory_region_is_romd(section->mr));
}
#endif
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
+{
+ RAMBlock *block;
+
+ QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+ func(block->host, block->offset, block->length, opaque);
+ }
+}
+
+int qemu_ram_count_blocks(void)
+{
+ RAMBlock *block;
+ int total = 0;
+
+ QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+ total++;
+ }
+
+ return total;
+}
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 2e5f11f..aea3fe0 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -119,6 +119,12 @@ extern struct MemoryRegion io_mem_rom;
extern struct MemoryRegion io_mem_unassigned;
extern struct MemoryRegion io_mem_notdirty;

+typedef void (RAMBlockIterFunc)(void *host_addr,
+ ram_addr_t offset, ram_addr_t length, void *opaque);
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
+int qemu_ram_count_blocks(void);
+
#endif

#endif /* !CPU_COMMON_H */

--
1.7.10.4

Paolo Bonzini

2013-03-18 08:48:38 UTC

Post by m***@linux.vnet.ibm.com
1. qemu_ram_foreach_block
2. qemu_ram_count_blocks
Both used in communicating the RAMBlocks
to each side for later memory registration.
---
exec.c | 21 +++++++++++++++++++++
include/exec/cpu-common.h | 6 ++++++
2 files changed, 27 insertions(+)
diff --git a/exec.c b/exec.c
index 8a6aac3..a985da8 100644
--- a/exec.c
+++ b/exec.c
@@ -2629,3 +2629,24 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
memory_region_is_romd(section->mr));
}
#endif
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
+{
+ RAMBlock *block;
+
+ QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+ func(block->host, block->offset, block->length, opaque);
+ }
+}
+
+int qemu_ram_count_blocks(void)
+{
+ RAMBlock *block;
+ int total = 0;
+
+ QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+ total++;
+ }

Please move this to rdma.c, and implement it using qemu_ram_foreach_block.

Otherwise looks good.

Paolo

Post by m***@linux.vnet.ibm.com
+ return total;
+}
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 2e5f11f..aea3fe0 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -119,6 +119,12 @@ extern struct MemoryRegion io_mem_rom;
extern struct MemoryRegion io_mem_unassigned;
extern struct MemoryRegion io_mem_notdirty;
+typedef void (RAMBlockIterFunc)(void *host_addr,
+ ram_addr_t offset, ram_addr_t length, void *opaque);
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
+int qemu_ram_count_blocks(void);
+
#endif
#endif /* !CPU_COMMON_H */

Michael R. Hines

2013-03-18 20:25:31 UTC

Acnkowledged.

Please move this to rdma.c, and implement it using qemu_ram_foreach_block.
Otherwise looks good.
Paolo

m***@linux.vnet.ibm.com

2013-03-18 03:18:59 UTC