Live Block Migration using Mirroring

Discussion:

Live Block Migration using Mirroring

Federico Simoncelli

2012-02-22 17:13:32 UTC

Hi,
recently I've been working on live block migration combining the live
snapshots and the blkmirror patch sent by Marcelo Tosatti few months ago.

The design is summarized at this url as "Mirrored-Snapshot":

http://www.ovirt.org/wiki/Features/Design/StorageLiveMigration

The design assumes that the qemu process can reach both the source and
destination storages and no real VM migration between hosts is involved.
The principal problem that it tries to solve is moving a VM to a new
reachable storage (more space, faster) without temporarily disrupting its
services.

The following set of patches are implementing the required changes in
QEMU.

Here it is a quick example of the use case (for consistency with the
design at the url above I will use the same step numbers):

Preparation
===========

$ mkdir /tmp/{src/dst}
$ qemu-img create -f qcow2 /tmp/src/hd0base.qcow2 20G
Formatting '/tmp/src/hd0base.qcow2', fmt=qcow2 size=21474836480
encryption=off cluster_size=65536

Step 1 - Initital Scenario
==========================
VM1 is running on the src/hd0base. (Where "<=" stands for "uses")

[src/hd0base] <= VM1(read-write)

$ qemu-system-x86_64 -hda /tmp/src/hd0base.qcow2 -monitor stdio
QEMU 1.0.50 monitor - type 'help' for more information
(qemu)

Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and dst/hd0snap1 as
image files. (Where "<-" stands for "has backing file")

[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
... <- [dst/hd0snap1] <= VM1(write-only)

$ qemu-img create -f qcow2 \
-b /tmp/src/hd0base.qcow2 /tmp/src/hd0snap1.qcow2 20G
Formatting '/tmp/src/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536

$ qemu-img create -f qcow2 \
-b /tmp/dst/hd0base.qcow2 /tmp/dst/hd0snap1.qcow2 20G
Formatting '/tmp/dst/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536

(qemu) snapshot_blkdev -n ide0-hd0 \
blkmirror:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2 blkmirror

Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination dst/hd0base.

[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base] <- [dst/hd0snap1] <= VM1(write-only)

$ cp -a /tmp/src/hd0base.qcow2 /tmp/dst/hd0base.qcow2

Step 5 - Final Switch to Destination
====================================
VM1 is now able to switch to the destination for both read and write
operations.

[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)

(qemu) snapshot_blkdev -n ide0-hd0 /tmp/dst/hd0snap1.qcow2

--
Federico

Federico Simoncelli

2012-02-22 17:13:34 UTC

Signed-off-by: Federico Simoncelli <***@redhat.com>
---
block/blkmirror.c | 26 +++++++++++++-------------
1 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/block/blkmirror.c b/block/blkmirror.c
index 1c02710..1cfd2fb 100644
--- a/block/blkmirror.c
+++ b/block/blkmirror.c
@@ -46,7 +46,7 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
filename += strlen("blkmirror:");

/* Parse the raw image filename */
- filename2 = qemu_malloc(strlen(filename)+1);
+ filename2 = qemu_vmalloc(strlen(filename)+1);
escape = 0;
for (i = n = 0; i < strlen(filename); i++) {
if (!escape && filename[i] == ':') {
@@ -66,11 +66,11 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)

m->bs[0] = bdrv_new("");
if (m->bs[0] == NULL) {
- free(filename2);
+ qemu_vfree(filename2);
return -ENOMEM;
}
ret = bdrv_open(m->bs[0], filename2, flags, NULL);
- free(filename2);
+ qemu_vfree(filename2);
if (ret < 0) {
return ret;
}
@@ -81,7 +81,7 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
bdrv_delete(m->bs[0]);
return -ENOMEM;
}
- ret = bdrv_open(m->bs[1], filename, flags, NULL);
+ ret = bdrv_open(m->bs[1], filename, flags | BDRV_O_NO_BACKING, NULL);
if (ret < 0) {
bdrv_delete(m->bs[0]);
return ret;
@@ -101,17 +101,17 @@ static void blkmirror_close(BlockDriverState *bs)
}
}

-static int blkmirror_flush(BlockDriverState *bs)
+static coroutine_fn int blkmirror_co_flush(BlockDriverState *bs)
{
BdrvMirrorState *m = bs->opaque;
int ret;

- ret = bdrv_flush(m->bs[0]);
+ ret = bdrv_co_flush(m->bs[0]);
if (ret < 0) {
return ret;
}

- return bdrv_flush(m->bs[1]);
+ return bdrv_co_flush(m->bs[1]);
}

static int64_t blkmirror_getlength(BlockDriverState *bs)
@@ -242,18 +242,18 @@ static BlockDriverAIOCB *blkmirror_aio_flush(BlockDriverState *bs,
return &dcb->common;
}

-static int blkmirror_discard(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors)
+static coroutine_fn int blkmirror_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
{
BdrvMirrorState *m = bs->opaque;
int ret;

- ret = bdrv_discard(m->bs[0], sector_num, nb_sectors);
+ ret = bdrv_co_discard(m->bs[0], sector_num, nb_sectors);
if (ret < 0) {
return ret;
}

- return bdrv_discard(m->bs[1], sector_num, nb_sectors);
+ return bdrv_co_discard(m->bs[1], sector_num, nb_sectors);
}

@@ -266,8 +266,8 @@ static BlockDriver bdrv_blkmirror = {

.bdrv_file_open = blkmirror_open,
.bdrv_close = blkmirror_close,
- .bdrv_flush = blkmirror_flush,
- .bdrv_discard = blkmirror_discard,
+ .bdrv_co_flush_to_disk = blkmirror_co_flush,
+ .bdrv_co_discard = blkmirror_co_discard,

.bdrv_aio_readv = blkmirror_aio_readv,
.bdrv_aio_writev = blkmirror_aio_writev,

--
1.7.1

Paolo Bonzini

2012-02-23 07:18:28 UTC

Post by Federico Simoncelli
@@ -46,7 +46,7 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
filename += strlen("blkmirror:");
/* Parse the raw image filename */
- filename2 = qemu_malloc(strlen(filename)+1);
+ filename2 = qemu_vmalloc(strlen(filename)+1);
escape = 0;
for (i = n = 0; i < strlen(filename); i++) {
if (!escape && filename[i] == ':') {
@@ -66,11 +66,11 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
m->bs[0] = bdrv_new("");
if (m->bs[0] == NULL) {
- free(filename2);
+ qemu_vfree(filename2);
return -ENOMEM;
}
ret = bdrv_open(m->bs[0], filename2, flags, NULL);
- free(filename2);
+ qemu_vfree(filename2);
if (ret < 0) {
return ret;
}

These should be g_malloc and g_free.

Also, please squash this patch in part 1.

Paolo

Federico Simoncelli

2012-02-23 09:44:25 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 8:18:28 AM
Subject: Re: [PATCH 2/3] Update the blkmirror block driver

Post by Federico Simoncelli
@@ -46,7 +46,7 @@ static int blkmirror_open(BlockDriverState *bs,
const char *filename, int flags)
filename += strlen("blkmirror:");
/* Parse the raw image filename */
- filename2 = qemu_malloc(strlen(filename)+1);
+ filename2 = qemu_vmalloc(strlen(filename)+1);
escape = 0;
for (i = n = 0; i < strlen(filename); i++) {
if (!escape && filename[i] == ':') {
@@ -66,11 +66,11 @@ static int blkmirror_open(BlockDriverState *bs,
const char *filename, int flags)
m->bs[0] = bdrv_new("");
if (m->bs[0] == NULL) {
- free(filename2);
+ qemu_vfree(filename2);
return -ENOMEM;
}
ret = bdrv_open(m->bs[0], filename2, flags, NULL);
- free(filename2);
+ qemu_vfree(filename2);
if (ret < 0) {
return ret;
}

These should be g_malloc and g_free.

Thanks!

Also, please squash this patch in part 1.

Yes, I sent it as a separate patch to make it easier to review my
changes (if someone already reviewed Marcelo's patch).
Any comment on the BDRV_O_NO_BACKING flag I added? That's the real
trick I'm pulling here. It basically allows to open the second image
only for writing and it doesn't complain if the backing file is not
there yet (it will be copied during step 4).

--
Federico

Paolo Bonzini

2012-02-23 09:45:51 UTC

Post by Federico Simoncelli
Any comment on the BDRV_O_NO_BACKING flag I added? That's the real
trick I'm pulling here. It basically allows to open the second image
only for writing and it doesn't complain if the backing file is not
there yet (it will be copied during step 4).

Yes, it makes sense to me.

Paolo

Federico Simoncelli

2012-02-22 17:13:35 UTC

Signed-off-by: Federico Simoncelli <***@redhat.com>
---
blockdev.c | 14 ++++++++------
hmp-commands.hx | 16 ++++++++++------
hmp.c | 4 +++-
qapi-schema.json | 8 +++++++-
qmp-commands.hx | 2 +-
5 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 7a6613a..2c3e10a 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -647,7 +647,7 @@ void do_commit(Monitor *mon, const QDict *qdict)

void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
bool has_format, const char *format,
- Error **errp)
+ bool has_nocreate, bool nocreate, Error **errp)
{
BlockDriverState *bs;
BlockDriver *drv, *old_drv, *proto_drv;
@@ -686,11 +686,13 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
return;
}

- ret = bdrv_img_create(snapshot_file, format, bs->filename,
- bs->drv->format_name, NULL, -1, flags);
- if (ret) {
- error_set(errp, QERR_UNDEFINED_ERROR);
- return;
+ if (!(has_nocreate && nocreate)) {
+ ret = bdrv_img_create(snapshot_file, format, bs->filename,
+ bs->drv->format_name, NULL, -1, flags);
+ if (ret) {
+ error_set(errp, QERR_UNDEFINED_ERROR);
+ return;
+ }
}

bdrv_drain_all();
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..ff49412 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -868,14 +868,18 @@ ETEXI

{
.name = "snapshot_blkdev",
- .args_type = "device:B,snapshot-file:s?,format:s?",
- .params = "device [new-image-file] [format]",
+ .args_type = "nocreate:-n,device:B,snapshot-file:s?,format:s?",
+ .params = "[-n] device [new-image-file] [format]",
.help = "initiates a live snapshot\n\t\t\t"
"of device. If a new image file is specified, the\n\t\t\t"
- "new image file will become the new root image.\n\t\t\t"
- "If format is specified, the snapshot file will\n\t\t\t"
- "be created in that format. Otherwise the\n\t\t\t"
- "snapshot will be internal! (currently unsupported)",
+ "new image file will be created (unless -n is\n\t\t\t"
+ "specified) and will become the new root image.\n\t\t\t"
+ "The -n (nocreate) option is potentially dangerous\n\t\t\t"
+ "if the image wasn't prepared in the correct way\n\t\t\t"
+ "by an external manager. If format is specified,\n\t\t\t"
+ "the snapshot file will be created in that format.\n\t\t\t"
+ "Otherwise the snapshot will be internal!\n\t\t\t"
+ "(currently unsupported)",
.mhandler.cmd = hmp_snapshot_blkdev,
},

diff --git a/hmp.c b/hmp.c
index 8ff8c94..44868c7 100644
--- a/hmp.c
+++ b/hmp.c
@@ -684,6 +684,7 @@ void hmp_block_resize(Monitor *mon, const QDict *qdict)

void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
{
+ int nocreate = qdict_get_try_bool(qdict, "nocreate", 0);
const char *device = qdict_get_str(qdict, "device");
const char *filename = qdict_get_try_str(qdict, "snapshot-file");
const char *format = qdict_get_try_str(qdict, "format");
@@ -697,7 +698,8 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
return;
}

- qmp_blockdev_snapshot_sync(device, filename, !!format, format, &errp);
+ qmp_blockdev_snapshot_sync(device, filename,
+ !!format, format, true, nocreate, &errp);
hmp_handle_error(mon, &errp);
}

diff --git a/qapi-schema.json b/qapi-schema.json
index d02ee86..ac652c2 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1116,9 +1116,14 @@
# @snapshot-file: the target of the new image. If the file exists, or if it
# is a device, the snapshot will be created in the existing
# file/device. If does not exist, a new file will be created.
+# This behavior can be overridden using the nocreate option.
#
# @format: #optional the format of the snapshot image, default is 'qcow2'.
#
+# @nocreate: #optional use the target image avoiding the creation. This is
+# a potentially dangerous option if the image wasn't prepared
+# in the correct way by an external manager.
+#
# Returns: nothing on success
# If @device is not a valid block device, DeviceNotFound
# If @snapshot-file can't be opened, OpenFileFailed
@@ -1133,7 +1138,8 @@
# Since 0.14.0
##
{ 'command': 'blockdev-snapshot-sync',
- 'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }
+ 'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str',
+ '*nocreate': 'bool' } }

##
# @human-monitor-command:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index b5e2ab8..e717826 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -668,7 +668,7 @@ EQMP

{
.name = "blockdev-snapshot-sync",
- .args_type = "device:B,snapshot-file:s,format:s?",
+ .args_type = "nocreate:-n,device:B,snapshot-file:s,format:s?",
.mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync,
},

--
1.7.1

Paolo Bonzini

2012-02-23 07:19:41 UTC

What is the usecase, and how can this be tested?

Paolo

Paolo Bonzini

2012-02-23 07:38:55 UTC

Post by Paolo Bonzini
What is the usecase, and how can this be tested?

Oops, you explained it in part 0.

Post by Paolo Bonzini
Step 5 - Final Switch to Destination
====================================
VM1 is now able to switch to the destination for both read and write
operations.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
(qemu) snapshot_blkdev -n ide0-hd0 /tmp/dst/hd0snap1.qcow2

It seems to me that reusing the snapshot_blkdev command is a bit of a
misnomer. It also forces you to use format autodetection for the
destination in the blkmirror phase.

I see two possibilities:

1) you can make a new command block_reopen; this fixes only the naming
problem.

2) you can hide the new format behind a new command to be used like this:

block_migrate --mirror ide-hd0 /tmp/dst/hd0snap1.qcow2
...
block_migrate --switch ide-hd0

This leave the possibility to specify the format in the future:

block_migrate --mirror ide-hd0 /tmp/dst/hd0snap1.qcow2 qcow2

And we could have another sub-command

block_mirror --stream ide-hd0 /tmp/dst/hd0.raw

to migrate block devices without shared storage and without an
intermediate snapshot.

Paolo

Federico Simoncelli

2012-02-23 09:39:04 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 8:38:55 AM
Subject: Re: [PATCH 3/3] Add nocreate option to snapshot_blkdev

Post by Paolo Bonzini
What is the usecase, and how can this be tested?

Oops, you explained it in part 0.

Post by Paolo Bonzini
Step 5 - Final Switch to Destination
====================================
VM1 is now able to switch to the destination for both read and write
operations.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
(qemu) snapshot_blkdev -n ide0-hd0 /tmp/dst/hd0snap1.qcow2

It seems to me that reusing the snapshot_blkdev command is a bit of a
misnomer. It also forces you to use format autodetection for the
destination in the blkmirror phase.
1) you can make a new command block_reopen; this fixes only the naming
problem.
block_migrate --mirror ide-hd0 /tmp/dst/hd0snap1.qcow2
...
block_migrate --switch ide-hd0
block_migrate --mirror ide-hd0 /tmp/dst/hd0snap1.qcow2 qcow2
And we could have another sub-command
block_mirror --stream ide-hd0 /tmp/dst/hd0.raw
to migrate block devices without shared storage and without an
intermediate snapshot.

I agree on the fact that using snapshot_blkdev is a misnomer but at
the same time I like very much how blkmirror is modular and it can
be used as a regular image filename.
I'm worried that making it explicit with a "--mirror" option would
take away its flexibility.
What about adding block_reopen and extending the blkmirror string:

(qemu) block_reopen ide-hd0 \
blkmirror:qcow2:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2

so that the format is passed as first argument (and if it's empty we
would use the auto-detection).

--
Federico

Paolo Bonzini

2012-02-23 09:48:59 UTC

Post by Federico Simoncelli
I agree on the fact that using snapshot_blkdev is a misnomer but at
the same time I like very much how blkmirror is modular and it can
be used as a regular image filename.

True, on the other hand if we add this we will have to keep it forever.

I'm worried that blkmirror does not satisfy _all_ mirroring needs, for
example you cannot use block_stream to copy from the source to the
destination, so perhaps in the future we want to change it to something
else.

So while I appreciate the easier debugging, I'm afraid that we'll regret
adding a command-line-visible feature.

Post by Federico Simoncelli
I'm worried that making it explicit with a "--mirror" option would
take away its flexibility.
(qemu) block_reopen ide-hd0 \
blkmirror:qcow2:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2
so that the format is passed as first argument (and if it's empty we
would use the auto-detection).

No, the format of the source and destination should be independent. It
would be nice to have something like this:

blkmirror:src=...,dst=...,srcformat=...,dstformat=...

but commas are a pain because they need escaping.

Paolo

Federico Simoncelli

2012-02-23 10:19:08 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 10:48:59 AM
Subject: Re: [PATCH 3/3] Add nocreate option to snapshot_blkdev

Post by Federico Simoncelli
I agree on the fact that using snapshot_blkdev is a misnomer but at
the same time I like very much how blkmirror is modular and it can
be used as a regular image filename.

True, on the other hand if we add this we will have to keep it
forever.
I'm worried that blkmirror does not satisfy _all_ mirroring needs, for
example you cannot use block_stream to copy from the source to the
destination, so perhaps in the future we want to change it to
something
else.

Are you talking about a mirroring where you block_stream the missing
clusters in the destination from the source?
I believe that it could be done without losing the blkmirror
modularity probably extending the BlockDriver structure with some
additional concepts.

So while I appreciate the easier debugging, I'm afraid that we'll regret
adding a command-line-visible feature.

Post by Federico Simoncelli
I'm worried that making it explicit with a "--mirror" option would
take away its flexibility.
(qemu) block_reopen ide-hd0 \
blkmirror:qcow2:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2
so that the format is passed as first argument (and if it's empty we
would use the auto-detection).

No, the format of the source and destination should be independent.
It
blkmirror:src=...,dst=...,srcformat=...,dstformat=...
but commas are a pain because they need escaping.

I like that, one limitation we need to keep in mind is that it should
fit into the hard-coded filename limit of 1024 characters that (sadly) we
have in multiple places. Another thing is that at this stage the mirroring
is more an original/copy concept rather than source/destination.
What about:

blkmirror:...,copy=...,fmt=... (both images uses the same fmt)
blkmirror:...,copy=...,fmt=...,copyfmt=...

Or, eventually if you feel like that source/destination is more appropriate
for the future, then:

blkmirror:...,dst=...,fmt=...,dstfmt=...

--
Federico

Paolo Bonzini

2012-02-23 11:30:42 UTC

Post by Federico Simoncelli

Post by Paolo Bonzini
I'm worried that blkmirror does not satisfy _all_ mirroring
needs, for example you cannot use block_stream to copy from the
source to the destination, so perhaps in the future we want to
change it to something else.

Are you talking about a mirroring where you block_stream the missing
clusters in the destination from the source? I believe that it could
be done without losing the blkmirror modularity probably extending
the BlockDriver structure with some additional concepts.

Yes, agreed. It's not the idea of a driver that I don't like; it is
making it available through the command-line that worries me. This
would be unlike all other "filtering" options (copy-on-read, I/O limits,
dirty sector tracking). True, there are blkdebug and blkverify but
those are for debugging only.

Paolo

Federico Simoncelli

2012-02-22 17:13:33 UTC

From: Marcelo Tosatti <***@redhat.com>

Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <***@redhat.com>
---
Makefile.objs | 2 +-
block/blkmirror.c | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++++
docs/blkmirror.txt | 15 +++
3 files changed, 298 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt

diff --git a/Makefile.objs b/Makefile.objs
index 391e524..cd65e1b 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,7 +34,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..1c02710
--- /dev/null
+++ b/block/blkmirror.c
@@ -0,0 +1,282 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+ BlockDriverAIOCB *aiocb;
+ int finished;
+ DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+ BlockDriverAIOCB common;
+ int count;
+
+ BlockDriverCompletionFunc *cb;
+ SingleAIOCB aios[2];
+ int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:path/to/image1:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret, escape, i, n;
+ char *filename2;
+
+ /* Parse the blkmirror: prefix */
+ if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
+ return -EINVAL;
+ }
+ filename += strlen("blkmirror:");
+
+ /* Parse the raw image filename */
+ filename2 = qemu_malloc(strlen(filename)+1);
+ escape = 0;
+ for (i = n = 0; i < strlen(filename); i++) {
+ if (!escape && filename[i] == ':') {
+ break;
+ }
+ if (!escape && filename[i] == '\\') {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ filename2[n++] = filename[i];
+ }
+ }
+ filename2[n] = '\0';
+
+ m->bs[0] = bdrv_new("");
+ if (m->bs[0] == NULL) {
+ free(filename2);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[0], filename2, flags, NULL);
+ free(filename2);
+ if (ret < 0) {
+ return ret;
+ }
+ filename += i + 1;
+
+ m->bs[1] = bdrv_new("");
+ if (m->bs[1] == NULL) {
+ bdrv_delete(m->bs[0]);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[1], filename, flags, NULL);
+ if (ret < 0) {
+ bdrv_delete(m->bs[0]);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+}
+
+static int blkmirror_flush(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_flush(m->bs[0]);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_flush(m->bs[1]);
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+
+ return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+ int i;
+
+ for (i = 0 ; i < 2; i++) {
+ if (!acb->aios[i].finished) {
+ bdrv_aio_cancel(acb->aios[i].aiocb);
+ }
+ }
+ qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+ .aiocb_size = sizeof(DupAIOCB),
+ .cancel = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+ SingleAIOCB *scb = opaque;
+ DupAIOCB *dcb = scb->parent;
+
+ scb->finished = 1;
+ dcb->count--;
+ assert(dcb->count >= 0);
+ if (ret < 0) {
+ dcb->ret = ret;
+ }
+ if (dcb->count == 0) {
+ dcb->common.cb(dcb->common.opaque, dcb->ret);
+ qemu_aio_release(dcb);
+ }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ DupAIOCB *dcb;
+ int i;
+
+ dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+ if (!dcb) {
+ return NULL;
+ }
+ dcb->count = 2;
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].parent = dcb;
+ dcb->aios[i].finished = 0;
+ }
+ dcb->ret = 0;
+
+ return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+ nb_sectors, &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ if (!dcb->aios[i].aiocb) {
+ int a;
+
+ for (a = 0; a < i; a++) {
+ bdrv_aio_cancel(dcb->aios[i].aiocb);
+ }
+ qemu_aio_release(dcb);
+ return NULL;
+ }
+ }
+
+ return &dcb->common;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_flush(m->bs[i], &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ if (!dcb->aios[i].aiocb) {
+ int a;
+
+ for (a = 0; a < i; a++) {
+ bdrv_aio_cancel(dcb->aios[i].aiocb);
+ }
+ qemu_aio_release(dcb);
+ return NULL;
+ }
+ }
+
+ return &dcb->common;
+}
+
+static int blkmirror_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_discard(m->bs[0], sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+ .format_name = "blkmirror",
+ .protocol_name = "blkmirror",
+ .instance_size = sizeof(BdrvMirrorState),
+
+ .bdrv_getlength = blkmirror_getlength,
+
+ .bdrv_file_open = blkmirror_open,
+ .bdrv_close = blkmirror_close,
+ .bdrv_flush = blkmirror_flush,
+ .bdrv_discard = blkmirror_discard,
+
+ .bdrv_aio_readv = blkmirror_aio_readv,
+ .bdrv_aio_writev = blkmirror_aio_writev,
+ .bdrv_aio_flush = blkmirror_aio_flush,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+ bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..202be7d
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,15 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+Its used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+
+

--
1.7.1

Stefan Hajnoczi

2012-02-23 16:14:09 UTC

On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Mirrored writes are used by live block copy.

I think the right approach is to create a single blkmirror driver that
also includes blkverify functionality. The code is basically the same
except blkverify also compares reads - just use a flag to
enable/disable that behavior.

Feel free to rename the blkverify driver to blkmirror if you wish.

By the way, this code does not build against qemu.git/master. It uses
interfaces which have been dropped.

Stefan

Stefan Hajnoczi

2012-02-23 16:18:41 UTC

By the way, this code does not build against qemu.git/master. It uses
interfaces which have been dropped.

Ah, I see you changed that in Patch 2/3. Please don't do that, it's
hard to review and breaks git-bisect.

Stefan

Federico Simoncelli

2012-02-23 16:20:24 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 5:18:41 PM
Subject: Re: [Qemu-devel] [PATCH 1/3] Add blkmirror block driver

By the way, this code does not build against qemu.git/master. It uses
interfaces which have been dropped.

Ah, I see you changed that in Patch 2/3. Please don't do that, it's
hard to review and breaks git-bisect.

Squashing is easy (anyone could do it at the commit phase), reviewing
is hard (if you hide your changes in someone else's code).

--
Federico

Stefan Hajnoczi

2012-02-23 16:28:23 UTC

On Thu, Feb 23, 2012 at 4:20 PM, Federico Simoncelli

Post by Federico Simoncelli
----- Original Message -----

Sent: Thursday, February 23, 2012 5:18:41 PM
Subject: Re: [Qemu-devel] [PATCH 1/3] Add blkmirror block driver

By the way, this code does not build against qemu.git/master. It uses
interfaces which have been dropped.

Ah, I see you changed that in Patch 2/3. Please don't do that, it's
hard to review and breaks git-bisect.

Squashing is easy (anyone could do it at the commit phase), reviewing
is hard (if you hide your changes in someone else's code).

I don't care if you or Marcelo wrote it, I want to see a coherent piece of code.

The way to do it is to merge the g_malloc() and BlockDriver interface
changes into this path. Then you have not snuck any significant
changes into Marcelo's code.

Keep the BDRV_O_NO_BACKING separate.

Stefan

Federico Simoncelli

2012-02-23 16:51:37 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 5:28:23 PM
Subject: Re: [Qemu-devel] [PATCH 1/3] Add blkmirror block driver
On Thu, Feb 23, 2012 at 4:20 PM, Federico Simoncelli

Post by Federico Simoncelli
----- Original Message -----

Sent: Thursday, February 23, 2012 5:18:41 PM
Subject: Re: [Qemu-devel] [PATCH 1/3] Add blkmirror block driver
On Thu, Feb 23, 2012 at 4:14 PM, Stefan Hajnoczi

Post by Stefan Hajnoczi
By the way, this code does not build against qemu.git/master.
It
uses
interfaces which have been dropped.

Ah, I see you changed that in Patch 2/3. Please don't do that, it's
hard to review and breaks git-bisect.

Squashing is easy (anyone could do it at the commit phase),
reviewing
is hard (if you hide your changes in someone else's code).

I don't care if you or Marcelo wrote it, I want to see a coherent piece of code.
The way to do it is to merge the g_malloc() and BlockDriver interface
changes into this path. Then you have not snuck any significant
changes into Marcelo's code.

Well for example I'd have snuck the qemu_vmalloc/qemu_vfree mistake.

Keep the BDRV_O_NO_BACKING separate.

Ok, thanks.

--
Federico

Federico Simoncelli

2012-02-23 16:18:54 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 5:14:09 PM
Subject: Re: [Qemu-devel] [PATCH 1/3] Add blkmirror block driver
On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Mirrored writes are used by live block copy.

I think the right approach is to create a single blkmirror driver that
also includes blkverify functionality. The code is basically the same
except blkverify also compares reads - just use a flag to
enable/disable that behavior.
Feel free to rename the blkverify driver to blkmirror if you wish.
By the way, this code does not build against qemu.git/master. It uses
interfaces which have been dropped.

Yes you also need:

[PATCH 2/3] Update the blkmirror block driver

Which was sent separately to facilitate the review for people who
already reviewed this.

--
Federico

Stefan Hajnoczi

2012-02-27 09:23:38 UTC

Post by Stefan Hajnoczi
On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Mirrored writes are used by live block copy.

I think the right approach is to create a single blkmirror driver that
also includes blkverify functionality. The code is basically the same
except blkverify also compares reads - just use a flag to
enable/disable that behavior.
Feel free to rename the blkverify driver to blkmirror if you wish.

Federico: Any response to this? I still think blkmirror and blkverify
do essentially the same thing and should be unified.

Stefan

Paolo Bonzini

2012-02-27 11:37:34 UTC

Post by Stefan Hajnoczi

Post by Stefan Hajnoczi
I think the right approach is to create a single blkmirror driver that
also includes blkverify functionality. The code is basically the same
except blkverify also compares reads - just use a flag to
enable/disable that behavior.
Feel free to rename the blkverify driver to blkmirror if you wish.

Federico: Any response to this? I still think blkmirror and blkverify
do essentially the same thing and should be unified.

Once non-incremental mode is added, I suspect blkmirror will diverge
from blkverify significantly. In particular, we would need to track
where have writes been done in the destination. We also would need to
hooks for block/stream.c, or perhaps a completely separate
implementation of streaming.

Also, blkverify doesn't support cancellation. I know we do quite poorly
in this area, but I'd rather not make it worse...

Paolo

Stefan Hajnoczi

2012-02-27 11:42:35 UTC

Post by Paolo Bonzini

Post by Stefan Hajnoczi
I think the right approach is to create a single blkmirror driver that
also includes blkverify functionality. The code is basically the same
except blkverify also compares reads - just use a flag to
enable/disable that behavior.
Feel free to rename the blkverify driver to blkmirror if you wish.

Federico: Any response to this? I still think blkmirror and blkverify
do essentially the same thing and should be unified.

Once non-incremental mode is added, I suspect blkmirror will diverge
from blkverify significantly. In particular, we would need to track
where have writes been done in the destination. We also would need to
hooks for block/stream.c, or perhaps a completely separate
implementation of streaming.

I'm not familiar with non-incremental mode, could you please explain
it or link to it?

Post by Paolo Bonzini
Also, blkverify doesn't support cancellation. I know we do quite poorly
in this area, but I'd rather not make it worse...

That's why I suggested unifying them - take the best of both approaches.

Stefan

Paolo Bonzini

2012-02-27 11:48:41 UTC

Post by Stefan Hajnoczi

Post by Paolo Bonzini
Once non-incremental mode is added, I suspect blkmirror will diverge
from blkverify significantly. In particular, we would need to track
where have writes been done in the destination. We also would need to
hooks for block/stream.c, or perhaps a completely separate
implementation of streaming.

I'm not familiar with non-incremental mode, could you please explain
it or link to it?

Non-incremental mode is "pre-copy" migration. It would stream in the
background from the source to the destination. In this case:

- you need to differentiate streaming writes from other writes. When
streaming, you do not want to issue spurious writes to the source;

- you can skip streaming anything that has been written to the
destination already. This means that you need: 1) a bitmap similar to
is_allocated; 2) to widen writes to a cluster; 3) serialization similar
to copy-on-read.

I'm not yet sure how much of this will be generalized in the block layer
and block/stream.c, and how much will be specific to blkmirror, but in
general none of this applies to blkverify.

Paolo

Stefan Hajnoczi

2012-02-27 13:09:00 UTC

Post by Stefan Hajnoczi

Post by Paolo Bonzini
Once non-incremental mode is added, I suspect blkmirror will diverge
from blkverify significantly. In particular, we would need to track
where have writes been done in the destination. We also would need to
hooks for block/stream.c, or perhaps a completely separate
implementation of streaming.

I'm not familiar with non-incremental mode, could you please explain
it or link to it?

Non-incremental mode is "pre-copy" migration. It would stream in the
- you need to differentiate streaming writes from other writes. When
streaming, you do not want to issue spurious writes to the source;
- you can skip streaming anything that has been written to the
destination already. This means that you need: 1) a bitmap similar to
is_allocated; 2) to widen writes to a cluster; 3) serialization similar
to copy-on-read.
I'm not yet sure how much of this will be generalized in the block layer
and block/stream.c, and how much will be specific to blkmirror, but in
general none of this applies to blkverify.

Agreed but I'm not sure it has to do with blkmirror either. blkmirror
and blkverify perform the same function except that blkverify mirrors
reads too and checks that they match.

Who knows when and if pre-copy will be (re)implemented, it's not a
good argument to say let's duplicate block mirroring because we're not
sure about the design of a feature feature yet which might want to
hook in here.

Stefan

Paolo Bonzini

2012-02-27 13:47:03 UTC

Post by Stefan Hajnoczi

Post by Paolo Bonzini
Non-incremental mode is "pre-copy" migration. It would stream in the
- you need to differentiate streaming writes from other writes. When
streaming, you do not want to issue spurious writes to the source;
- you can skip streaming anything that has been written to the
destination already. This means that you need: 1) a bitmap similar to
is_allocated; 2) to widen writes to a cluster; 3) serialization similar
to copy-on-read.
I'm not yet sure how much of this will be generalized in the block layer
and block/stream.c, and how much will be specific to blkmirror, but in
general none of this applies to blkverify.

Agreed but I'm not sure it has to do with blkmirror either. blkmirror
and blkverify perform the same function except that blkverify mirrors
reads too and checks that they match.
Who knows when and if pre-copy will be (re)implemented, it's not a
good argument to say let's duplicate block mirroring because we're not
sure about the design of a feature feature yet which might want to
hook in here.

It's not a duplicate, it just happens that two very simple drivers share
1 operation out of 4 (open, read, write, flush). There are other
differences, for example:

1) blkverify hardcodes raw for one format and auto-detects the other.
blkmirror needs to have a specified format for both, and I don't think
starting another bikeshedding discussion on blkverify backwards
compatibility is a good idea;

2) blkverify doesn't flush the raw file;

3) blkverify in the future could add callbacks to create snapshots or
load/save imgstate, and forward them to the test file; this doesn't make
sense for blkmirror.

Paolo

Stefan Hajnoczi

2012-02-27 14:49:11 UTC

Post by Paolo Bonzini

Non-incremental mode is "pre-copy" migration. It would stream in the
- you need to differentiate streaming writes from other writes. When
streaming, you do not want to issue spurious writes to the source;
- you can skip streaming anything that has been written to the
destination already. This means that you need: 1) a bitmap similar to
is_allocated; 2) to widen writes to a cluster; 3) serialization similar
to copy-on-read.
I'm not yet sure how much of this will be generalized in the block layer
and block/stream.c, and how much will be specific to blkmirror, but in
general none of this applies to blkverify.

Agreed but I'm not sure it has to do with blkmirror either. blkmirror
and blkverify perform the same function except that blkverify mirrors
reads too and checks that they match.
Who knows when and if pre-copy will be (re)implemented, it's not a
good argument to say let's duplicate block mirroring because we're not
sure about the design of a feature feature yet which might want to
hook in here.

It's not a duplicate, it just happens that two very simple drivers share
1 operation out of 4 (open, read, write, flush). There are other
1) blkverify hardcodes raw for one format and auto-detects the other.
blkmirror needs to have a specified format for both, and I don't think
starting another bikeshedding discussion on blkverify backwards
compatibility is a good idea;

Backwards compatibility isn't needed here, it's a debugging tool and
some distros even disable it.

Allowing another format for the reference image is a feature that
would be nice to have. It allows direct qcow2 to vmdk comparison, for
example, if we ever hit a bug that benefits from this type of
comparison.

Post by Paolo Bonzini
2) blkverify doesn't flush the raw file;

It's fine to flush the reference file. This is an accidental difference :).

Post by Paolo Bonzini
3) blkverify in the future could add callbacks to create snapshots or
load/save imgstate, and forward them to the test file; this doesn't make
sense for blkmirror.

I guess what I'm saying is, if we need to copy-paste in order to fork
them in the future that's fine, but why maintain duplicates in the
mean-time? Please make the codebase nice today. We can always extend
it in the future, we're not forced to keep them unified forever if it
turns out we want to split them. But as it stands they are
essentially the same thing.

Stefan

Stefan Hajnoczi

2012-02-27 14:59:55 UTC

Post by Stefan Hajnoczi

Post by Paolo Bonzini
3) blkverify in the future could add callbacks to create snapshots or
load/save imgstate, and forward them to the test file; this doesn't make
sense for blkmirror.

I guess what I'm saying is, if we need to copy-paste in order to fork
them in the future that's fine, but why maintain duplicates in the
mean-time? Please make the codebase nice today. We can always extend
it in the future, we're not forced to keep them unified forever if it
turns out we want to split them. But as it stands they are
essentially the same thing.

BTW, I feel that I may just be missing context on what the plans are
around mirroring and block migration. Is there a plan or discussions
that haven't hit qemu-devel (maybe they were done in ovirt or
internally)?

Stefan

Paolo Bonzini

2012-02-27 15:08:20 UTC

Post by Stefan Hajnoczi

Post by Stefan Hajnoczi
I guess what I'm saying is, if we need to copy-paste in order to fork
them in the future that's fine, but why maintain duplicates in the
mean-time? Please make the codebase nice today. We can always extend
it in the future, we're not forced to keep them unified forever if it
turns out we want to split them. But as it stands they are
essentially the same thing.

BTW, I feel that I may just be missing context on what the plans are
around mirroring and block migration. Is there a plan or discussions
that haven't hit qemu-devel (maybe they were done in ovirt or
internally)?

Not that I know of. Me, I'm not on any oVirt list. :)

Paolo

Stefan Hajnoczi

2012-02-23 15:47:38 UTC

On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Preparation
===========
$ mkdir /tmp/{src/dst}
$ qemu-img create -f qcow2 /tmp/src/hd0base.qcow2 20G
Formatting '/tmp/src/hd0base.qcow2', fmt=qcow2 size=21474836480
encryption=off cluster_size=65536
Step 1 - Initital Scenario
==========================
VM1 is running on the src/hd0base. (Where "<=" stands for "uses")
[src/hd0base] <= VM1(read-write)
$ qemu-system-x86_64 -hda /tmp/src/hd0base.qcow2 -monitor stdio
QEMU 1.0.50 monitor - type 'help' for more information
(qemu)
Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and dst/hd0snap1 as
image files. (Where "<-" stands for "has backing file")
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
... <- [dst/hd0snap1] <= VM1(write-only)
$ qemu-img create -f qcow2 \
-b /tmp/src/hd0base.qcow2 /tmp/src/hd0snap1.qcow2 20G
Formatting '/tmp/src/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536
$ qemu-img create -f qcow2 \
-b /tmp/dst/hd0base.qcow2 /tmp/dst/hd0snap1.qcow2 20G
Formatting '/tmp/dst/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536

At this stage /tmp/dst/hd0base.qcow2 does not exist yet. The qemu-img
output you pasted shows /tmp/src/hd0base.qcow2 was actually used.
Typo?

Post by Federico Simoncelli
(qemu) snapshot_blkdev -n ide0-hd0 \
blkmirror:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2 blkmirror
Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination dst/hd0base.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base] <- [dst/hd0snap1] <= VM1(write-only)
$ cp -a /tmp/src/hd0base.qcow2 /tmp/dst/hd0base.qcow2

Are we missing a fixup step that changes backing_file in
dst/hd0snap1.qcow2 to point at dst/hd0base.qcow2?

Post by Federico Simoncelli
Step 5 - Final Switch to Destination
====================================
VM1 is now able to switch to the destination for both read and write
operations.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
(qemu) snapshot_blkdev -n ide0-hd0 /tmp/dst/hd0snap1.qcow2
--
Federico

Federico Simoncelli

2012-02-23 16:10:52 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 4:47:38 PM
Subject: Re: [Qemu-devel] Live Block Migration using Mirroring
On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and
dst/hd0snap1 as
image files. (Where "<-" stands for "has backing file")
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
... <- [dst/hd0snap1] <= VM1(write-only)
$ qemu-img create -f qcow2 \
-b /tmp/src/hd0base.qcow2 /tmp/src/hd0snap1.qcow2 20G
Formatting '/tmp/src/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off
cluster_size=65536
$ qemu-img create -f qcow2 \
-b /tmp/dst/hd0base.qcow2 /tmp/dst/hd0snap1.qcow2 20G
Formatting '/tmp/dst/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off
cluster_size=65536

At this stage /tmp/dst/hd0base.qcow2 does not exist yet. The
qemu-img
output you pasted shows /tmp/src/hd0base.qcow2 was actually used.
Typo?

No that's part of the flag used in [PATCH 2/3] (Update the blkmirror
block driver): BDRV_O_NO_BACKING

It's also documented in the design:

Loading Image...

Post by Federico Simoncelli
(qemu) snapshot_blkdev -n ide0-hd0 \
blkmirror:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2
blkmirror
Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination
dst/hd0base.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base] <- [dst/hd0snap1] <= VM1(write-only)
$ cp -a /tmp/src/hd0base.qcow2 /tmp/dst/hd0base.qcow2

Are we missing a fixup step that changes backing_file in
dst/hd0snap1.qcow2 to point at dst/hd0base.qcow2?

See above.

--
Federico

Stefan Hajnoczi

2012-02-23 16:35:23 UTC

On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

recently I've been working on live block migration combining the live
snapshots and the blkmirror patch sent by Marcelo Tosatti few months ago.
http://www.ovirt.org/wiki/Features/Design/StorageLiveMigration

After mirrored-snapshot completes we're left with the base and the
snapshot. Is the idea to implement live snapshot merge next? Or do
you have something else planned to avoid growing the backing file
chain each time mirrored-snapshot is used?

Stefan

Federico Simoncelli

2012-02-23 17:06:34 UTC

----- Original Message -----

Sent: Thursday, February 23, 2012 5:35:23 PM
Subject: Re: [Qemu-devel] Live Block Migration using Mirroring
On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

recently I've been working on live block migration combining the
live
snapshots and the blkmirror patch sent by Marcelo Tosatti few months ago.
http://www.ovirt.org/wiki/Features/Design/StorageLiveMigration

After mirrored-snapshot completes we're left with the base and the
snapshot. Is the idea to implement live snapshot merge next? Or do
you have something else planned to avoid growing the backing file
chain each time mirrored-snapshot is used?

The general idea is that I don't expect the need to migrate to a new
storage to be very frequent. Being able to live merge the new snapshot
would be great.

--
Federico

Federico Simoncelli

2012-02-24 11:37:38 UTC

From: Marcelo Tosatti <***@redhat.com>

Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <***@redhat.com>
Signed-off-by: Federico Simoncelli <***@redhat.com>
---
Makefile.objs | 2 +-
block/blkmirror.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++
docs/blkmirror.txt | 16 ++++
3 files changed, 264 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt

diff --git a/Makefile.objs b/Makefile.objs
index 67ee3df..6020308 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,7 +34,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..39927c8
--- /dev/null
+++ b/block/blkmirror.c
@@ -0,0 +1,247 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+ BlockDriverAIOCB *aiocb;
+ int finished;
+ DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+ BlockDriverAIOCB common;
+ int count;
+
+ BlockDriverCompletionFunc *cb;
+ SingleAIOCB aios[2];
+ int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:path/to/image1:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret, escape, i, n;
+ char *filename2;
+
+ /* Parse the blkmirror: prefix */
+ if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
+ return -EINVAL;
+ }
+ filename += strlen("blkmirror:");
+
+ /* Parse the raw image filename */
+ filename2 = g_malloc(strlen(filename)+1);
+ escape = 0;
+ for (i = n = 0; i < strlen(filename); i++) {
+ if (!escape && filename[i] == ':') {
+ break;
+ }
+ if (!escape && filename[i] == '\\') {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ filename2[n++] = filename[i];
+ }
+ }
+ filename2[n] = '\0';
+
+ m->bs[0] = bdrv_new("");
+ if (m->bs[0] == NULL) {
+ g_free(filename2);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[0], filename2, flags, NULL);
+ g_free(filename2);
+ if (ret < 0) {
+ return ret;
+ }
+ filename += i + 1;
+
+ m->bs[1] = bdrv_new("");
+ if (m->bs[1] == NULL) {
+ bdrv_delete(m->bs[0]);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[1], filename, flags, NULL);
+ if (ret < 0) {
+ bdrv_delete(m->bs[0]);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+}
+
+static coroutine_fn int blkmirror_co_flush(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_flush(m->bs[0]);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_flush(m->bs[1]);
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+
+ return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+ int i;
+
+ for (i = 0 ; i < 2; i++) {
+ if (!acb->aios[i].finished) {
+ bdrv_aio_cancel(acb->aios[i].aiocb);
+ }
+ }
+ qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+ .aiocb_size = sizeof(DupAIOCB),
+ .cancel = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+ SingleAIOCB *scb = opaque;
+ DupAIOCB *dcb = scb->parent;
+
+ scb->finished = 1;
+ dcb->count--;
+ assert(dcb->count >= 0);
+ if (ret < 0) {
+ dcb->ret = ret;
+ }
+ if (dcb->count == 0) {
+ dcb->common.cb(dcb->common.opaque, dcb->ret);
+ qemu_aio_release(dcb);
+ }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ DupAIOCB *dcb;
+ int i;
+
+ dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+ if (!dcb) {
+ return NULL;
+ }
+ dcb->count = 2;
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].parent = dcb;
+ dcb->aios[i].finished = 0;
+ }
+ dcb->ret = 0;
+
+ return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+ nb_sectors, &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ }
+
+ return &dcb->common;
+}
+
+static coroutine_fn int blkmirror_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_discard(m->bs[0], sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+ .format_name = "blkmirror",
+ .protocol_name = "blkmirror",
+ .instance_size = sizeof(BdrvMirrorState),
+
+ .bdrv_getlength = blkmirror_getlength,
+
+ .bdrv_file_open = blkmirror_open,
+ .bdrv_close = blkmirror_close,
+ .bdrv_co_flush_to_disk = blkmirror_co_flush,
+ .bdrv_co_discard = blkmirror_co_discard,
+
+ .bdrv_aio_readv = blkmirror_aio_readv,
+ .bdrv_aio_writev = blkmirror_aio_writev,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+ bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..cf73f3f
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,16 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+Backslashes themselves also can be escaped as '\\'.
+
+

--
1.7.1

Federico Simoncelli

2012-02-24 11:37:39 UTC

Signed-off-by: Federico Simoncelli <***@redhat.com>
---
block/blkmirror.c | 2 +-
blockdev.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++------
hmp-commands.hx | 36 +++++++++++++++++
hmp.c | 30 ++++++++++++++
hmp.h | 2 +
qapi-schema.json | 63 ++++++++++++++++++++++++++++++
6 files changed, 229 insertions(+), 13 deletions(-)

diff --git a/block/blkmirror.c b/block/blkmirror.c
index 39927c8..49e3381 100644
--- a/block/blkmirror.c
+++ b/block/blkmirror.c
@@ -81,7 +81,7 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
bdrv_delete(m->bs[0]);
return -ENOMEM;
}
- ret = bdrv_open(m->bs[1], filename, flags, NULL);
+ ret = bdrv_open(m->bs[1], filename, flags | BDRV_O_NO_BACKING, NULL);
if (ret < 0) {
bdrv_delete(m->bs[0]);
return ret;
diff --git a/blockdev.c b/blockdev.c
index 2c132a3..1df2542 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -646,9 +646,8 @@ void do_commit(Monitor *mon, const QDict *qdict)
}
}

-void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
- bool has_format, const char *format,
- Error **errp)
+static void change_blockdev_image(const char *device, const char *new_image_file,
+ const char *format, bool create, Error **errp)
{
BlockDriverState *bs;
BlockDriver *drv, *old_drv, *proto_drv;
@@ -671,7 +670,7 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
old_drv = bs->drv;
flags = bs->open_flags;

- if (!has_format) {
+ if (!format) {
format = "qcow2";
}

@@ -681,24 +680,26 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
return;
}

- proto_drv = bdrv_find_protocol(snapshot_file);
+ proto_drv = bdrv_find_protocol(new_image_file);
if (!proto_drv) {
error_set(errp, QERR_INVALID_BLOCK_FORMAT, format);
return;
}

- ret = bdrv_img_create(snapshot_file, format, bs->filename,
- bs->drv->format_name, NULL, -1, flags);
- if (ret) {
- error_set(errp, QERR_UNDEFINED_ERROR);
- return;
+ if (create) {
+ ret = bdrv_img_create(new_image_file, format, bs->filename,
+ bs->drv->format_name, NULL, -1, flags);
+ if (ret) {
+ error_set(errp, QERR_UNDEFINED_ERROR);
+ return;
+ }
}

bdrv_drain_all();
bdrv_flush(bs);

bdrv_close(bs);
- ret = bdrv_open(bs, snapshot_file, flags, drv);
+ ret = bdrv_open(bs, new_image_file, flags, drv);
/*
* If reopening the image file we just created fails, fall back
* and try to re-open the original image. If that fails too, we
@@ -709,11 +710,95 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
if (ret != 0) {
error_set(errp, QERR_OPEN_FILE_FAILED, old_filename);
} else {
- error_set(errp, QERR_OPEN_FILE_FAILED, snapshot_file);
+ error_set(errp, QERR_OPEN_FILE_FAILED, new_image_file);
}
}
}

+void qmp_blockdev_reopen(const char *device, const char *new_image_file,
+ bool has_format, const char *format, Error **errp)
+{
+ change_blockdev_image(device, new_image_file,
+ has_format ? format : NULL, false, errp);
+}
+
+void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
+ bool has_format, const char *format,
+ Error **errp)
+{
+ change_blockdev_image(device, snapshot_file,
+ has_format ? format : NULL, true, errp);
+}
+
+void qmp_blockdev_migrate(const char *device, BlockMigrateOp mode,
+ const char *destination, bool has_new_image_file,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriverState *bs;
+ BlockDriver *drv;
+ int i, j, escape;
+ char filename[2048];
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (mode == BLOCK_MIGRATE_OP_MIRROR) {
+ drv = bdrv_find_format("blkmirror");
+ if (!drv) {
+ error_set(errp, QERR_INVALID_BLOCK_FORMAT, "blkmirror");
+ return;
+ }
+
+ if (!has_new_image_file) {
+ new_image_file = bs->filename;
+ }
+
+ pstrcpy(filename, sizeof(filename), "blkmirror:");
+ i = strlen("blkmirror:");
+
+ escape = 0;
+ for (j = 0; j < strlen(new_image_file); j++) {
+ loop:
+ if (!(i < sizeof(filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {
+ if (!escape) {
+ filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ filename[i++] = new_image_file[j];
+ }
+
+ if (i + strlen(destination) + 2 > sizeof(filename)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "destination", "shorter filename");
+ return;
+ }
+
+ filename[i++] = ':';
+ pstrcpy(filename + i, sizeof(filename) - i - 2, destination);
+
+ change_blockdev_image(device, filename, "blkmirror", false, errp);
+ } else if (mode == BLOCK_MIGRATE_OP_STREAM) {
+ error_set(errp, QERR_NOT_SUPPORTED);
+ }
+}
+
static void eject_device(BlockDriverState *bs, int force, Error **errp)
{
if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..ccb1f62 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -886,6 +886,42 @@ Snapshot device, using snapshot file as target if provided
ETEXI

{
+ .name = "blkdev_reopen",
+ .args_type = "device:B,new-image-file:s?,format:s?",
+ .params = "device [new-image-file] [format]",
+ .help = "Assigns a new image file to a device.\n\t\t\t"
+ "The image will be opened using the format\n\t\t\t"
+ "specified or 'qcow2' by default.",
+ .mhandler.cmd = hmp_blkdev_reopen,
+ },
+
+STEXI
+@item blkdev_reopen
+@findex blkdev_reopen
+Assigns a new image file to a device.
+ETEXI
+
+ {
+ .name = "blkdev_migrate",
+ .args_type = "mirror:-m,device:B,destination:s,new-image-file:s?",
+ .params = "device mode destination [new-image-file]",
+ .help = "Migrates the device to a new destination.\n\t\t\t"
+ "The default migration method is 'stream',\n\t\t\t"
+ "the option -m is used to select 'mirror'\n\t\t\t"
+ "(currently the only method supported).\n\t\t\t"
+ "An optional existing image file that will\n\t\t\t"
+ "replace the current one in the device\n\t\t\t"
+ "can be specified.",
+ .mhandler.cmd = hmp_blkdev_migrate,
+ },
+
+STEXI
+@item blkdev_migrate
+@findex blkdev_migrate
+Migrates the device to a new destination.
+ETEXI
+
+ {
.name = "drive_add",
.args_type = "pci_addr:s,opts:s",
.params = "[[<domain>:]<bus>:]<slot>\n"
diff --git a/hmp.c b/hmp.c
index 8ff8c94..b687b8f 100644
--- a/hmp.c
+++ b/hmp.c
@@ -701,6 +701,36 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
hmp_handle_error(mon, &errp);
}

+void hmp_blkdev_reopen(Monitor *mon, const QDict *qdict)
+{
+ const char *device = qdict_get_str(qdict, "device");
+ const char *filename = qdict_get_str(qdict, "new-image-file");
+ const char *format = qdict_get_try_str(qdict, "format");
+ Error *errp = NULL;
+
+ qmp_blockdev_reopen(device, filename, !!format, format, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict)
+{
+ bool mirror = qdict_get_try_bool(qdict, "mirror", 0);
+ const char *device = qdict_get_str(qdict, "device");
+ const char *destination = qdict_get_str(qdict, "destination");
+ const char *new_image_file = qdict_get_try_str(qdict, "new-image-file");
+ BlockMigrateOp mode;
+ Error *errp = NULL;
+
+ if (mirror)
+ mode = BLOCK_MIGRATE_OP_MIRROR;
+ else
+ mode = BLOCK_MIGRATE_OP_STREAM;
+
+ qmp_blockdev_migrate(device, mode, destination,
+ !!new_image_file, new_image_file, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict)
{
qmp_migrate_cancel(NULL);
diff --git a/hmp.h b/hmp.h
index 18eecbd..f4e802b 100644
--- a/hmp.h
+++ b/hmp.h
@@ -47,6 +47,8 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict);
void hmp_balloon(Monitor *mon, const QDict *qdict);
void hmp_block_resize(Monitor *mon, const QDict *qdict);
void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict);
+void hmp_blkdev_reopen(Monitor *mon, const QDict *qdict);
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict);
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
diff --git a/qapi-schema.json b/qapi-schema.json
index d02ee86..c86796a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1136,6 +1136,69 @@
'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }

##
+# @blockdev-reopen
+#
+# Assigns a new image file to a device.
+#
+# @device: the name of the device for which we are chainging the image file.
+#
+# @new-image-file: the target of the new image. If the file doesn't exists the
+# command will fail.
+#
+# @format: #optional the format of the new image, default is 'qcow2'.
+#
+# Returns: nothing on success
+# If @device is not a valid block device, DeviceNotFound
+# If @new-image-file can't be opened, OpenFileFailed
+# If @format is invalid, InvalidBlockFormat
+#
+# Since 1.1
+##
+
+{ 'command': 'blockdev-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }
+
+##
+# @BlockMigrateOp:
+#
+# An enumeration of block migration methods.
+#
+# @mirror: Only mirror future writes to the destination
+#
+# @stream: Stream the image to the destination
+#
+# Since 1.1
+##
+{ 'enum' : 'BlockMigrateOp', 'data': [ 'mirror', 'stream' ] }
+
+
+##
+# @blockdev-migrate
+#
+# Migrates the device to a new destination.
+#
+# @device: the name of the block device to migrate.
+#
+# @mode: the block migration method to be used.
+#
+# @destination: the destination of the block migration.
+#
+# @new-image-file: #optional an existing image file that will replace
+# the current one in the device.
+#
+# Returns: nothing on success
+# If @device is not a valid block device, DeviceNotFound
+# If @mode is not a valid migration mode, InvalidParameterValue
+# If @destination can't be opened, OpenFileFailed
+# If @new-image-file can't be opened, OpenFileFailed
+#
+# Since 1.1
+##
+{ 'command': 'blockdev-migrate',
+ 'data': { 'device': 'str', 'mode' : 'BlockMigrateOp',
+ 'destination': 'str', '*new-image-file': 'str' } }
+
+##
# @human-monitor-command:
#
# Execute a command on the human monitor and return the output.

--
1.7.1

Kevin Wolf

2012-02-24 12:03:08 UTC

Post by Federico Simoncelli
---
block/blkmirror.c | 2 +-
blockdev.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++------
hmp-commands.hx | 36 +++++++++++++++++
hmp.c | 30 ++++++++++++++
hmp.h | 2 +
qapi-schema.json | 63 ++++++++++++++++++++++++++++++
6 files changed, 229 insertions(+), 13 deletions(-)
diff --git a/block/blkmirror.c b/block/blkmirror.c
index 39927c8..49e3381 100644
--- a/block/blkmirror.c
+++ b/block/blkmirror.c
@@ -81,7 +81,7 @@ static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
bdrv_delete(m->bs[0]);
return -ENOMEM;
}
- ret = bdrv_open(m->bs[1], filename, flags, NULL);
+ ret = bdrv_open(m->bs[1], filename, flags | BDRV_O_NO_BACKING, NULL);
if (ret < 0) {
bdrv_delete(m->bs[0]);
return ret;

Was this hunk meant to be in patch 1?

Post by Federico Simoncelli
diff --git a/blockdev.c b/blockdev.c
index 2c132a3..1df2542 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -646,9 +646,8 @@ void do_commit(Monitor *mon, const QDict *qdict)
}
}
-void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
- bool has_format, const char *format,
- Error **errp)
+static void change_blockdev_image(const char *device, const char *new_image_file,
+ const char *format, bool create, Error **errp)
{
BlockDriverState *bs;
BlockDriver *drv, *old_drv, *proto_drv;
@@ -671,7 +670,7 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
old_drv = bs->drv;
flags = bs->open_flags;
- if (!has_format) {
+ if (!format) {
format = "qcow2";
}
@@ -681,24 +680,26 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
return;
}
- proto_drv = bdrv_find_protocol(snapshot_file);
+ proto_drv = bdrv_find_protocol(new_image_file);
if (!proto_drv) {
error_set(errp, QERR_INVALID_BLOCK_FORMAT, format);
return;
}
- ret = bdrv_img_create(snapshot_file, format, bs->filename,
- bs->drv->format_name, NULL, -1, flags);
- if (ret) {
- error_set(errp, QERR_UNDEFINED_ERROR);
- return;
+ if (create) {
+ ret = bdrv_img_create(new_image_file, format, bs->filename,
+ bs->drv->format_name, NULL, -1, flags);
+ if (ret) {
+ error_set(errp, QERR_UNDEFINED_ERROR);
+ return;
+ }
}
bdrv_drain_all();
bdrv_flush(bs);
bdrv_close(bs);
- ret = bdrv_open(bs, snapshot_file, flags, drv);
+ ret = bdrv_open(bs, new_image_file, flags, drv);
/*
* If reopening the image file we just created fails, fall back
* and try to re-open the original image. If that fails too, we
@@ -709,11 +710,95 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
if (ret != 0) {
error_set(errp, QERR_OPEN_FILE_FAILED, old_filename);
} else {
- error_set(errp, QERR_OPEN_FILE_FAILED, snapshot_file);
+ error_set(errp, QERR_OPEN_FILE_FAILED, new_image_file);
}
}
}
+void qmp_blockdev_reopen(const char *device, const char *new_image_file,
+ bool has_format, const char *format, Error **errp)
+{
+ change_blockdev_image(device, new_image_file,
+ has_format ? format : NULL, false, errp);
+}
+
+void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
+ bool has_format, const char *format,
+ Error **errp)
+{
+ change_blockdev_image(device, snapshot_file,
+ has_format ? format : NULL, true, errp);
+}
+
+void qmp_blockdev_migrate(const char *device, BlockMigrateOp mode,
+ const char *destination, bool has_new_image_file,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriverState *bs;
+ BlockDriver *drv;
+ int i, j, escape;
+ char filename[2048];
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (mode == BLOCK_MIGRATE_OP_MIRROR) {

Move into a separate function?

Post by Federico Simoncelli
+ drv = bdrv_find_format("blkmirror");
+ if (!drv) {
+ error_set(errp, QERR_INVALID_BLOCK_FORMAT, "blkmirror");
+ return;
+ }
+
+ if (!has_new_image_file) {
+ new_image_file = bs->filename;
+ }
+
+ pstrcpy(filename, sizeof(filename), "blkmirror:");
+ i = strlen("blkmirror:");
+
+ escape = 0;
+ for (j = 0; j < strlen(new_image_file); j++) {
+ if (!(i < sizeof(filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {

Markus suggested that using comma for the separator is better even
though it requires escaping. It would allow to parse the option string
with QemuOpts.

Post by Federico Simoncelli
+ if (!escape) {
+ filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ filename[i++] = new_image_file[j];
+ }

Looks like a string helper for qemu-option.c (it contains the parser, so
keeping the escaping nearby would make sense).

Post by Federico Simoncelli
+
+ if (i + strlen(destination) + 2 > sizeof(filename)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "destination", "shorter filename");
+ return;
+ }
+
+ filename[i++] = ':';
+ pstrcpy(filename + i, sizeof(filename) - i - 2, destination);
+
+ change_blockdev_image(device, filename, "blkmirror", false, errp);
+ } else if (mode == BLOCK_MIGRATE_OP_STREAM) {
+ error_set(errp, QERR_NOT_SUPPORTED);

Why even define it then?

Post by Federico Simoncelli
+ }
+}
+
static void eject_device(BlockDriverState *bs, int force, Error **errp)
{
if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..ccb1f62 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -886,6 +886,42 @@ Snapshot device, using snapshot file as target if provided
ETEXI
{
+ .name = "blkdev_reopen",

NACK on the name. Let's reserve blkdev_*/blockdev_* for the proper API
(that we'll release with the qemu version that comes with Hurd 1.0).

drive_* would align well with the existing drive_add/del commands.

Post by Federico Simoncelli
+ .args_type = "device:B,new-image-file:s?,format:s?",
+ .params = "device [new-image-file] [format]",
+ .help = "Assigns a new image file to a device.\n\t\t\t"
+ "The image will be opened using the format\n\t\t\t"
+ "specified or 'qcow2' by default.",
+ .mhandler.cmd = hmp_blkdev_reopen,
+ },
+
+STEXI
+Assigns a new image file to a device.
+ETEXI
+
+ {
+ .name = "blkdev_migrate",
+ .args_type = "mirror:-m,device:B,destination:s,new-image-file:s?",
+ .params = "device mode destination [new-image-file]",
+ .help = "Migrates the device to a new destination.\n\t\t\t"
+ "The default migration method is 'stream',\n\t\t\t"
+ "the option -m is used to select 'mirror'\n\t\t\t"
+ "(currently the only method supported).\n\t\t\t"
+ "An optional existing image file that will\n\t\t\t"
+ "replace the current one in the device\n\t\t\t"
+ "can be specified.",
+ .mhandler.cmd = hmp_blkdev_migrate,
+ },
+
+STEXI
+Migrates the device to a new destination.
+ETEXI
+
+ {
.name = "drive_add",
.args_type = "pci_addr:s,opts:s",
.params = "[[<domain>:]<bus>:]<slot>\n"
diff --git a/hmp.c b/hmp.c
index 8ff8c94..b687b8f 100644
--- a/hmp.c
+++ b/hmp.c
@@ -701,6 +701,36 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
hmp_handle_error(mon, &errp);
}
+void hmp_blkdev_reopen(Monitor *mon, const QDict *qdict)
+{
+ const char *device = qdict_get_str(qdict, "device");
+ const char *filename = qdict_get_str(qdict, "new-image-file");
+ const char *format = qdict_get_try_str(qdict, "format");
+ Error *errp = NULL;
+
+ qmp_blockdev_reopen(device, filename, !!format, format, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict)
+{
+ bool mirror = qdict_get_try_bool(qdict, "mirror", 0);
+ const char *device = qdict_get_str(qdict, "device");
+ const char *destination = qdict_get_str(qdict, "destination");
+ const char *new_image_file = qdict_get_try_str(qdict, "new-image-file");
+ BlockMigrateOp mode;
+ Error *errp = NULL;
+
+ if (mirror)
+ mode = BLOCK_MIGRATE_OP_MIRROR;
+ else
+ mode = BLOCK_MIGRATE_OP_STREAM;
+
+ qmp_blockdev_migrate(device, mode, destination,
+ !!new_image_file, new_image_file, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict)
{
qmp_migrate_cancel(NULL);
diff --git a/hmp.h b/hmp.h
index 18eecbd..f4e802b 100644
--- a/hmp.h
+++ b/hmp.h
@@ -47,6 +47,8 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict);
void hmp_balloon(Monitor *mon, const QDict *qdict);
void hmp_block_resize(Monitor *mon, const QDict *qdict);
void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict);
+void hmp_blkdev_reopen(Monitor *mon, const QDict *qdict);
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict);
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
diff --git a/qapi-schema.json b/qapi-schema.json
index d02ee86..c86796a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1136,6 +1136,69 @@
'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }
##
+#
+# Assigns a new image file to a device.
+#
+#
+# command will fail.
+#
+#
+# Returns: nothing on success
+#
+# Since 1.1
+##
+
+{ 'command': 'blockdev-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }

Same consideration on the name.

Also I think we should immediately mark the command as deprecated (I
think there is precedence for it, though I can't remember which command
it was). This is not an interface we'll want to keep long term.

Kevin

Federico Simoncelli

2012-02-24 12:12:27 UTC

----- Original Message -----

Sent: Friday, February 24, 2012 1:03:08 PM
Subject: Re: [PATCH 2/2] Add the blockdev-reopen and blockdev-migrate commands

Post by Federico Simoncelli
---
block/blkmirror.c | 2 +-
blockdev.c | 109
+++++++++++++++++++++++++++++++++++++++++++++++------
hmp-commands.hx | 36 +++++++++++++++++
hmp.c | 30 ++++++++++++++
hmp.h | 2 +
qapi-schema.json | 63 ++++++++++++++++++++++++++++++
6 files changed, 229 insertions(+), 13 deletions(-)
diff --git a/block/blkmirror.c b/block/blkmirror.c
index 39927c8..49e3381 100644
--- a/block/blkmirror.c
+++ b/block/blkmirror.c
@@ -81,7 +81,7 @@ static int blkmirror_open(BlockDriverState *bs,
const char *filename, int flags)
bdrv_delete(m->bs[0]);
return -ENOMEM;
}
- ret = bdrv_open(m->bs[1], filename, flags, NULL);
+ ret = bdrv_open(m->bs[1], filename, flags | BDRV_O_NO_BACKING, NULL);
if (ret < 0) {
bdrv_delete(m->bs[0]);
return ret;

Was this hunk meant to be in patch 1?

Not necessarily, I thought a lot about it. I didn't want to modify Marcelo's
patch too much. This flag is actually mandatory only to make blockdev-migrate
work with a destination that doesn't have a backing file yet, so I thought it
was more appropriate to squash it here.

--
Federico

Paolo Bonzini

2012-02-24 13:11:31 UTC

Post by Kevin Wolf

Post by Federico Simoncelli
+ if (!(i < sizeof(filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {

Markus suggested that using comma for the separator is better even
though it requires escaping. It would allow to parse the option string
with QemuOpts.

Isn't that a bit overengineering for an internal interface?

Post by Kevin Wolf

Post by Federico Simoncelli
+ if (!escape) {
+ filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ filename[i++] = new_image_file[j];
+ }

Looks like a string helper for qemu-option.c (it contains the parser, so
keeping the escaping nearby would make sense).

Post by Federico Simoncelli
+
+ if (i + strlen(destination) + 2 > sizeof(filename)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "destination", "shorter filename");
+ return;
+ }
+
+ filename[i++] = ':';
+ pstrcpy(filename + i, sizeof(filename) - i - 2, destination);
+
+ change_blockdev_image(device, filename, "blkmirror", false, errp);
+ } else if (mode == BLOCK_MIGRATE_OP_STREAM) {
+ error_set(errp, QERR_NOT_SUPPORTED);

Why even define it then?

Because it's the default for the HMP. Until we have live merging,
mirror mode cannot be consider anything more than a hack.

Post by Kevin Wolf

Post by Federico Simoncelli
##
+#
+# Assigns a new image file to a device.
+#
+#
+# command will fail.
+#
+#
+# Returns: nothing on success
+#
+# Since 1.1
+##
+
+{ 'command': 'blockdev-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }

Same consideration on the name.
Also I think we should immediately mark the command as deprecated (I
think there is precedence for it, though I can't remember which command
it was). This is not an interface we'll want to keep long term.

What about blockdev-snapshot-sync/snapshot_blkdev?

Paolo

Luiz Capitulino

2012-02-24 17:04:25 UTC

On Fri, 24 Feb 2012 13:03:08 +0100

Post by Kevin Wolf

Post by Federico Simoncelli
static void eject_device(BlockDriverState *bs, int force, Error **errp)
{
if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..ccb1f62 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -886,6 +886,42 @@ Snapshot device, using snapshot file as target if provided
ETEXI
{
+ .name = "blkdev_reopen",

NACK on the name. Let's reserve blkdev_*/blockdev_* for the proper API
(that we'll release with the qemu version that comes with Hurd 1.0).

lol.

Post by Kevin Wolf

Post by Federico Simoncelli
##
+#
+# Assigns a new image file to a device.
+#
+#
+# command will fail.
+#
+#
+# Returns: nothing on success
+#
+# Since 1.1
+##
+
+{ 'command': 'blockdev-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }

Same consideration on the name.
Also I think we should immediately mark the command as deprecated (I
think there is precedence for it, though I can't remember which command
it was). This is not an interface we'll want to keep long term.

We can only mark an interface as deprecated when there's a substitute for it.

Markus Armbruster

2012-02-27 14:57:23 UTC

[...]

Post by Kevin Wolf

Post by Federico Simoncelli
diff --git a/blockdev.c b/blockdev.c
index 2c132a3..1df2542 100644
--- a/blockdev.c
+++ b/blockdev.c

[...]

Post by Kevin Wolf

Post by Federico Simoncelli
+void qmp_blockdev_migrate(const char *device, BlockMigrateOp mode,
+ const char *destination, bool has_new_image_file,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriverState *bs;
+ BlockDriver *drv;
+ int i, j, escape;
+ char filename[2048];
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (mode == BLOCK_MIGRATE_OP_MIRROR) {

Move into a separate function?

Post by Federico Simoncelli
+ drv = bdrv_find_format("blkmirror");
+ if (!drv) {
+ error_set(errp, QERR_INVALID_BLOCK_FORMAT, "blkmirror");
+ return;
+ }
+
+ if (!has_new_image_file) {
+ new_image_file = bs->filename;
+ }
+
+ pstrcpy(filename, sizeof(filename), "blkmirror:");
+ i = strlen("blkmirror:");
+
+ escape = 0;
+ for (j = 0; j < strlen(new_image_file); j++) {
+ if (!(i < sizeof(filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {

Markus suggested that using comma for the separator is better even
though it requires escaping. It would allow to parse the option string
with QemuOpts.

Yes, I did. While I'm no particular friend of QemuOpts, inventing more
ad hoc syntaxes is even worse.

On the other hand, many block drivers already do, often a colon syntax
similar to yours. But not all. I guess consistency is a lost cause
there, as is generality (support for arbitrary filenames). Maybe it's
best to concede defeat, do another ad hoc syntax now, and hope we can
replace the whole -drive mess by something saner eventually.

Post by Kevin Wolf

Post by Federico Simoncelli
+ if (!escape) {
+ filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ filename[i++] = new_image_file[j];
+ }

Looks like a string helper for qemu-option.c (it contains the parser, so
keeping the escaping nearby would make sense).

Post by Federico Simoncelli
+
+ if (i + strlen(destination) + 2 > sizeof(filename)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "destination", "shorter filename");
+ return;
+ }
+
+ filename[i++] = ':';
+ pstrcpy(filename + i, sizeof(filename) - i - 2, destination);
+
+ change_blockdev_image(device, filename, "blkmirror", false, errp);
+ } else if (mode == BLOCK_MIGRATE_OP_STREAM) {
+ error_set(errp, QERR_NOT_SUPPORTED);

Why even define it then?

Post by Federico Simoncelli
+ }
+}
+
static void eject_device(BlockDriverState *bs, int force, Error **errp)
{
if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..ccb1f62 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -886,6 +886,42 @@ Snapshot device, using snapshot file as target if provided
ETEXI
{
+ .name = "blkdev_reopen",

NACK on the name. Let's reserve blkdev_*/blockdev_* for the proper API

Yes, please.

Post by Kevin Wolf
(that we'll release with the qemu version that comes with Hurd 1.0).

/me hides.

Post by Kevin Wolf
drive_* would align well with the existing drive_add/del commands.

Agree.

[...]

Post by Kevin Wolf

Post by Federico Simoncelli
diff --git a/qapi-schema.json b/qapi-schema.json
index d02ee86..c86796a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1136,6 +1136,69 @@
'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }
##
+#
+# Assigns a new image file to a device.
+#
+#
+# command will fail.
+#
+#
+# Returns: nothing on success
+#
+# Since 1.1
+##
+
+{ 'command': 'blockdev-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }

Same consideration on the name.
Also I think we should immediately mark the command as deprecated (I
think there is precedence for it, though I can't remember which command
it was). This is not an interface we'll want to keep long term.

"Deprecated" isn't quite right without a replacement. We could mark it
"experimental" :)

Federico Simoncelli

2012-02-24 16:49:03 UTC

From: Marcelo Tosatti <***@redhat.com>

Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <***@redhat.com>
Signed-off-by: Federico Simoncelli <***@redhat.com>
---
Makefile.objs | 2 +-
block/blkmirror.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++
docs/blkmirror.txt | 16 ++++
3 files changed, 264 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt

diff --git a/Makefile.objs b/Makefile.objs
index 67ee3df..6020308 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,7 +34,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..49e3381
--- /dev/null
+++ b/block/blkmirror.c
@@ -0,0 +1,247 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+ BlockDriverAIOCB *aiocb;
+ int finished;
+ DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+ BlockDriverAIOCB common;
+ int count;
+
+ BlockDriverCompletionFunc *cb;
+ SingleAIOCB aios[2];
+ int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:path/to/image1:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret, escape, i, n;
+ char *filename2;
+
+ /* Parse the blkmirror: prefix */
+ if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
+ return -EINVAL;
+ }
+ filename += strlen("blkmirror:");
+
+ /* Parse the raw image filename */
+ filename2 = g_malloc(strlen(filename)+1);
+ escape = 0;
+ for (i = n = 0; i < strlen(filename); i++) {
+ if (!escape && filename[i] == ':') {
+ break;
+ }
+ if (!escape && filename[i] == '\\') {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ filename2[n++] = filename[i];
+ }
+ }
+ filename2[n] = '\0';
+
+ m->bs[0] = bdrv_new("");
+ if (m->bs[0] == NULL) {
+ g_free(filename2);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[0], filename2, flags, NULL);
+ g_free(filename2);
+ if (ret < 0) {
+ return ret;
+ }
+ filename += i + 1;
+
+ m->bs[1] = bdrv_new("");
+ if (m->bs[1] == NULL) {
+ bdrv_delete(m->bs[0]);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[1], filename, flags | BDRV_O_NO_BACKING, NULL);
+ if (ret < 0) {
+ bdrv_delete(m->bs[0]);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+}
+
+static coroutine_fn int blkmirror_co_flush(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_flush(m->bs[0]);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_flush(m->bs[1]);
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+
+ return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+ int i;
+
+ for (i = 0 ; i < 2; i++) {
+ if (!acb->aios[i].finished) {
+ bdrv_aio_cancel(acb->aios[i].aiocb);
+ }
+ }
+ qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+ .aiocb_size = sizeof(DupAIOCB),
+ .cancel = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+ SingleAIOCB *scb = opaque;
+ DupAIOCB *dcb = scb->parent;
+
+ scb->finished = 1;
+ dcb->count--;
+ assert(dcb->count >= 0);
+ if (ret < 0) {
+ dcb->ret = ret;
+ }
+ if (dcb->count == 0) {
+ dcb->common.cb(dcb->common.opaque, dcb->ret);
+ qemu_aio_release(dcb);
+ }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ DupAIOCB *dcb;
+ int i;
+
+ dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+ if (!dcb) {
+ return NULL;
+ }
+ dcb->count = 2;
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].parent = dcb;
+ dcb->aios[i].finished = 0;
+ }
+ dcb->ret = 0;
+
+ return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+ nb_sectors, &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ }
+
+ return &dcb->common;
+}
+
+static coroutine_fn int blkmirror_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_discard(m->bs[0], sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+ .format_name = "blkmirror",
+ .protocol_name = "blkmirror",
+ .instance_size = sizeof(BdrvMirrorState),
+
+ .bdrv_getlength = blkmirror_getlength,
+
+ .bdrv_file_open = blkmirror_open,
+ .bdrv_close = blkmirror_close,
+ .bdrv_co_flush_to_disk = blkmirror_co_flush,
+ .bdrv_co_discard = blkmirror_co_discard,
+
+ .bdrv_aio_readv = blkmirror_aio_readv,
+ .bdrv_aio_writev = blkmirror_aio_writev,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+ bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..cf73f3f
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,16 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+Backslashes themselves also can be escaped as '\\'.
+
+

--
1.7.1

Eric Blake

2012-02-24 17:02:36 UTC

Post by Federico Simoncelli
Mirrored writes are used by live block copy.
---
+++ b/block/blkmirror.c
@@ -0,0 +1,247 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.

It's 2012 now.

Post by Federico Simoncelli
+++ b/docs/blkmirror.txt
@@ -0,0 +1,16 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+Backslashes themselves also can be escaped as '\\'.

Is the escaping of : and \ only necessary for image1.img, or for both
image1 and image2? I need to know if the parser is consistent for both
arguments, but this wording makes it sound like it is only for the first
argument.

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Federico Simoncelli

2012-02-24 17:15:25 UTC

----- Original Message -----

Sent: Friday, February 24, 2012 6:02:36 PM
Subject: Re: [Qemu-devel] [PATCH 1/2 v2] Add blkmirror block driver

Post by Federico Simoncelli
+++ b/docs/blkmirror.txt
@@ -0,0 +1,16 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+Backslashes themselves also can be escaped as '\\'.

Is the escaping of : and \ only necessary for image1.img, or for both
image1 and image2? I need to know if the parser is consistent for both
arguments, but this wording makes it sound like it is only for the first
argument.

Yes it is only for the first argument.

--
Federico

Paolo Bonzini

2012-02-24 18:49:42 UTC

Post by Eric Blake
Is the escaping of : and \ only necessary for image1.img, or for both
image1 and image2? I need to know if the parser is consistent for both
arguments, but this wording makes it sound like it is only for the first
argument.

libvirt is completely insulated from this.

Paolo

Luiz Capitulino

2012-02-24 18:17:22 UTC

On Fri, 24 Feb 2012 16:49:03 +0000

Post by Federico Simoncelli
Mirrored writes are used by live block copy.
---
Makefile.objs | 2 +-
block/blkmirror.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++
docs/blkmirror.txt | 16 ++++
3 files changed, 264 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt
diff --git a/Makefile.objs b/Makefile.objs
index 67ee3df..6020308 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,7 +34,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..49e3381
--- /dev/null
+++ b/block/blkmirror.c
@@ -0,0 +1,247 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+ BlockDriverAIOCB *aiocb;
+ int finished;
+ DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+ BlockDriverAIOCB common;
+ int count;
+
+ BlockDriverCompletionFunc *cb;
+ SingleAIOCB aios[2];
+ int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:path/to/image1:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret, escape, i, n;
+ char *filename2;
+
+ /* Parse the blkmirror: prefix */
+ if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
+ return -EINVAL;
+ }
+ filename += strlen("blkmirror:");

Minor: you can use strstart() here.

Post by Federico Simoncelli
+
+ /* Parse the raw image filename */
+ filename2 = g_malloc(strlen(filename)+1);
+ escape = 0;
+ for (i = n = 0; i < strlen(filename); i++) {
+ if (!escape && filename[i] == ':') {
+ break;
+ }
+ if (!escape && filename[i] == '\\') {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ filename2[n++] = filename[i];
+ }
+ }
+ filename2[n] = '\0';

You're escaping only the first image name string, is that intentional?

Post by Federico Simoncelli
+
+ m->bs[0] = bdrv_new("");
+ if (m->bs[0] == NULL) {
+ g_free(filename2);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[0], filename2, flags, NULL);
+ g_free(filename2);
+ if (ret < 0) {

Leaking m->bs[0]?

Post by Federico Simoncelli
+ return ret;
+ }
+ filename += i + 1;
+
+ m->bs[1] = bdrv_new("");
+ if (m->bs[1] == NULL) {
+ bdrv_delete(m->bs[0]);
+ return -ENOMEM;
+ }
+ ret = bdrv_open(m->bs[1], filename, flags | BDRV_O_NO_BACKING, NULL);
+ if (ret < 0) {
+ bdrv_delete(m->bs[0]);

What about m->bs[1]?

Post by Federico Simoncelli
+ return ret;
+ }
+
+ return 0;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+}
+
+static coroutine_fn int blkmirror_co_flush(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_flush(m->bs[0]);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_flush(m->bs[1]);
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+
+ return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+ int i;
+
+ for (i = 0 ; i < 2; i++) {
+ if (!acb->aios[i].finished) {
+ bdrv_aio_cancel(acb->aios[i].aiocb);
+ }
+ }
+ qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+ .aiocb_size = sizeof(DupAIOCB),
+ .cancel = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+ SingleAIOCB *scb = opaque;
+ DupAIOCB *dcb = scb->parent;
+
+ scb->finished = 1;
+ dcb->count--;
+ assert(dcb->count >= 0);
+ if (ret < 0) {
+ dcb->ret = ret;
+ }
+ if (dcb->count == 0) {
+ dcb->common.cb(dcb->common.opaque, dcb->ret);
+ qemu_aio_release(dcb);
+ }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ DupAIOCB *dcb;
+ int i;
+
+ dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+ if (!dcb) {
+ return NULL;
+ }
+ dcb->count = 2;
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].parent = dcb;
+ dcb->aios[i].finished = 0;
+ }
+ dcb->ret = 0;
+
+ return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+ nb_sectors, &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ }
+
+ return &dcb->common;
+}
+
+static coroutine_fn int blkmirror_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_discard(m->bs[0], sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+ .format_name = "blkmirror",
+ .protocol_name = "blkmirror",
+ .instance_size = sizeof(BdrvMirrorState),
+
+ .bdrv_getlength = blkmirror_getlength,
+
+ .bdrv_file_open = blkmirror_open,
+ .bdrv_close = blkmirror_close,
+ .bdrv_co_flush_to_disk = blkmirror_co_flush,
+ .bdrv_co_discard = blkmirror_co_discard,
+
+ .bdrv_aio_readv = blkmirror_aio_readv,
+ .bdrv_aio_writev = blkmirror_aio_writev,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+ bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..cf73f3f
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,16 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+Backslashes themselves also can be escaped as '\\'.
+
+

Federico Simoncelli

2012-02-27 09:17:12 UTC

----- Original Message -----

Sent: Friday, February 24, 2012 7:17:22 PM
Subject: Re: [PATCH 1/2 v2] Add blkmirror block driver
On Fri, 24 Feb 2012 16:49:03 +0000

Post by Federico Simoncelli
+ /* Parse the raw image filename */
+ filename2 = g_malloc(strlen(filename)+1);
+ escape = 0;
+ for (i = n = 0; i < strlen(filename); i++) {
+ if (!escape && filename[i] == ':') {
+ break;
+ }
+ if (!escape && filename[i] == '\\') {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ filename2[n++] = filename[i];
+ }
+ }
+ filename2[n] = '\0';

You're escaping only the first image name string, is that
intentional?

Post by Federico Simoncelli
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..cf73f3f
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,16 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+'\' (backslash) can be used to escape colon processing
+as a separator, in the first image filename.
+Backslashes themselves also can be escaped as '\\'.
+
+

Anyway this format is encapsulated by blockdev-migrate.

--
Federico

Federico Simoncelli

2012-02-24 16:49:04 UTC

Signed-off-by: Federico Simoncelli <***@redhat.com>
---
blockdev.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++------
hmp-commands.hx | 38 +++++++++++++++++++
hmp.c | 24 ++++++++++++
hmp.h | 2 +
qapi-schema.json | 54 +++++++++++++++++++++++++++
5 files changed, 213 insertions(+), 12 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 2c132a3..f51bd6f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -646,9 +646,8 @@ void do_commit(Monitor *mon, const QDict *qdict)
}
}

-void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
- bool has_format, const char *format,
- Error **errp)
+static void change_blockdev_image(const char *device, const char *new_image_file,
+ const char *format, bool create, Error **errp)
{
BlockDriverState *bs;
BlockDriver *drv, *old_drv, *proto_drv;
@@ -671,7 +670,7 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
old_drv = bs->drv;
flags = bs->open_flags;

- if (!has_format) {
+ if (!format) {
format = "qcow2";
}

@@ -681,24 +680,26 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
return;
}

- proto_drv = bdrv_find_protocol(snapshot_file);
+ proto_drv = bdrv_find_protocol(new_image_file);
if (!proto_drv) {
error_set(errp, QERR_INVALID_BLOCK_FORMAT, format);
return;
}

- ret = bdrv_img_create(snapshot_file, format, bs->filename,
- bs->drv->format_name, NULL, -1, flags);
- if (ret) {
- error_set(errp, QERR_UNDEFINED_ERROR);
- return;
+ if (create) {
+ ret = bdrv_img_create(new_image_file, format, bs->filename,
+ bs->drv->format_name, NULL, -1, flags);
+ if (ret) {
+ error_set(errp, QERR_UNDEFINED_ERROR);
+ return;
+ }
}

bdrv_drain_all();
bdrv_flush(bs);

bdrv_close(bs);
- ret = bdrv_open(bs, snapshot_file, flags, drv);
+ ret = bdrv_open(bs, new_image_file, flags, drv);
/*
* If reopening the image file we just created fails, fall back
* and try to re-open the original image. If that fails too, we
@@ -709,11 +710,93 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
if (ret != 0) {
error_set(errp, QERR_OPEN_FILE_FAILED, old_filename);
} else {
- error_set(errp, QERR_OPEN_FILE_FAILED, snapshot_file);
+ error_set(errp, QERR_OPEN_FILE_FAILED, new_image_file);
}
}
}

+void qmp_drive_reopen(const char *device, const char *new_image_file,
+ bool has_format, const char *format, Error **errp)
+{
+ change_blockdev_image(device, new_image_file,
+ has_format ? format : NULL, false, errp);
+}
+
+void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
+ bool has_format, const char *format,
+ Error **errp)
+{
+ change_blockdev_image(device, snapshot_file,
+ has_format ? format : NULL, true, errp);
+}
+
+static void qmp_blockdev_mirror(const char *device, const char *destination,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;
+
+ drv = bdrv_find_format("blkmirror");
+ if (!drv) {
+ error_set(errp, QERR_INVALID_BLOCK_FORMAT, "blkmirror");
+ return;
+ }
+
+ escape = 0;
+ for (i = 0, j = 0; j < strlen(new_image_file); j++) {
+ loop:
+ if (!(i < sizeof(new_filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {
+ if (!escape) {
+ new_filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ new_filename[i++] = new_image_file[j];
+ }
+
+ filename = g_strdup_printf("blkmirror:%s:%s", new_filename, destination);
+ change_blockdev_image(device, filename, "blkmirror", false, errp);
+ g_free(filename);
+}
+
+void qmp_blockdev_migrate(const char *device, bool incremental,
+ const char *destination, bool has_new_image_file,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriverState *bs;
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (incremental) {
+ if (!has_new_image_file) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "incremental", "a new image file");
+ } else {
+ qmp_blockdev_mirror(device, destination, new_image_file, errp);
+ }
+ } else {
+ error_set(errp, QERR_NOT_SUPPORTED);
+ }
+}
+
static void eject_device(BlockDriverState *bs, int force, Error **errp)
{
if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..4aacf2a 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -886,6 +886,44 @@ Snapshot device, using snapshot file as target if provided
ETEXI

{
+ .name = "drive_reopen",
+ .args_type = "device:B,new-image-file:s?,format:s?",
+ .params = "device [new-image-file] [format]",
+ .help = "Assigns a new image file to a device.\n\t\t\t"
+ "The image will be opened using the format\n\t\t\t"
+ "specified or 'qcow2' by default.\n\t\t\t"
+ "This command is deprecated.",
+ .mhandler.cmd = hmp_drive_reopen,
+ },
+
+STEXI
+@item drive_reopen
+@findex drive_reopen
+Assigns a new image file to a device.
+ETEXI
+
+ {
+ .name = "blkdev_migrate",
+ .args_type = "incremental:-i,device:B,destination:s,new-image-file:s?",
+ .params = "device mode destination [new-image-file]",
+ .help = "Migrates the device to a new destination.\n\t\t\t"
+ "If the incremental (-i) option is used then\n\t\t\t"
+ "only the new writes are sent to the destination.\n\t\t\t"
+ "This method is particularly useful if used in\n\t\t\t"
+ "conjunction with new-image-file (a new image for\n\t\t\t"
+ "the device) allowing the current image to be\n\t\t\t"
+ "transferred to the destination by an external\n\t\t\t"
+ "manager.",
+ .mhandler.cmd = hmp_blkdev_migrate,
+ },
+
+STEXI
+@item blkdev_migrate
+@findex blkdev_migrate
+Migrates the device to a new destination.
+ETEXI
+
+ {
.name = "drive_add",
.args_type = "pci_addr:s,opts:s",
.params = "[[<domain>:]<bus>:]<slot>\n"
diff --git a/hmp.c b/hmp.c
index 8ff8c94..282d3e5 100644
--- a/hmp.c
+++ b/hmp.c
@@ -701,6 +701,30 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
hmp_handle_error(mon, &errp);
}

+void hmp_drive_reopen(Monitor *mon, const QDict *qdict)
+{
+ const char *device = qdict_get_str(qdict, "device");
+ const char *filename = qdict_get_str(qdict, "new-image-file");
+ const char *format = qdict_get_try_str(qdict, "format");
+ Error *errp = NULL;
+
+ qmp_drive_reopen(device, filename, !!format, format, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict)
+{
+ bool incremental = qdict_get_try_bool(qdict, "incremental", 0);
+ const char *device = qdict_get_str(qdict, "device");
+ const char *destination = qdict_get_str(qdict, "destination");
+ const char *new_image_file = qdict_get_try_str(qdict, "new-image-file");
+ Error *errp = NULL;
+
+ qmp_blockdev_migrate(device, incremental, destination,
+ !!new_image_file, new_image_file, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict)
{
qmp_migrate_cancel(NULL);
diff --git a/hmp.h b/hmp.h
index 18eecbd..3c66257 100644
--- a/hmp.h
+++ b/hmp.h
@@ -47,6 +47,8 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict);
void hmp_balloon(Monitor *mon, const QDict *qdict);
void hmp_block_resize(Monitor *mon, const QDict *qdict);
void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict);
+void hmp_drive_reopen(Monitor *mon, const QDict *qdict);
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict);
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
diff --git a/qapi-schema.json b/qapi-schema.json
index d02ee86..df1248c 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1136,6 +1136,60 @@
'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }

##
+# @drive-reopen
+#
+# Assigns a new image file to a device.
+#
+# @device: the name of the device for which we are chainging the image file.
+#
+# @new-image-file: the target of the new image. If the file doesn't exists the
+# command will fail.
+#
+# @format: #optional the format of the new image, default is 'qcow2'.
+#
+# Returns: nothing on success
+# If @device is not a valid block device, DeviceNotFound
+# If @new-image-file can't be opened, OpenFileFailed
+# If @format is invalid, InvalidBlockFormat
+#
+# Notes: This command is deprecated.
+#
+# Since 1.1
+##
+
+{ 'command': 'drive-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }
+
+##
+# @blockdev-migrate
+#
+# Migrates the device to a new destination.
+#
+# @device: the name of the block device to migrate.
+#
+# @incremental: if true only the new writes are sent to the destination.
+# This method is particularly useful if used in conjunction
+# with new-image-file allowing the current image to be
+# transferred to the destination by an external manager.
+#
+# @destination: the destination of the block migration.
+#
+# @new-image-file: #optional an existing image file that will replace
+# the current one in the device.
+#
+# Returns: nothing on success
+# If @device is not a valid block device, DeviceNotFound
+# If @mode is not a valid migration mode, InvalidParameterValue
+# If @destination can't be opened, OpenFileFailed
+# If @new-image-file can't be opened, OpenFileFailed
+#
+# Since 1.1
+##
+{ 'command': 'blockdev-migrate',
+ 'data': { 'device': 'str', 'incremental' : 'bool',
+ 'destination': 'str', '*new-image-file': 'str' } }
+
+##
# @human-monitor-command:
#
# Execute a command on the human monitor and return the output.

--
1.7.1

Eric Blake

2012-02-24 17:46:26 UTC

Pretty sparse on the commit message.

Post by Federico Simoncelli
+++ b/hmp-commands.hx
@@ -886,6 +886,44 @@ Snapshot device, using snapshot file as target if provided
ETEXI
{
+ .name = "drive_reopen",
+ .args_type = "device:B,new-image-file:s?,format:s?",
+ .params = "device [new-image-file] [format]",
+ .help = "Assigns a new image file to a device.\n\t\t\t"
+ "The image will be opened using the format\n\t\t\t"
+ "specified or 'qcow2' by default.\n\t\t\t"
+ "This command is deprecated.",
+ .mhandler.cmd = hmp_drive_reopen,

Libvirt will just use the QMP version, so I'm not reviewing the HMP
version very closely.

Post by Federico Simoncelli
+++ b/qapi-schema.json
@@ -1136,6 +1136,60 @@
'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }
##
+#
+# Assigns a new image file to a device.
+#

s/chainging/changing/

Post by Federico Simoncelli
+#
+# command will fail.

I think you need to be more explicit that @new-image-file MUST have
identical contents as the current image file, for this to be useful, and
that qemu does not validate whether the new image met those conditions.
Possible ways to achieve this:

1. Freeze all guest I/O, so that you know the current image file is
stable, copy the contents to new-image-file, drive-reopen, then unfreeze
guest I/O

2. Create an empty qcow2 file with backing image of the current image
file. drive-reopen will then see the new image file as containing the
same contents as the previous image file.

3. Create the new image file via mirroring (blockdev-migrate with
incremental and new-image-file), and reopen to the mirror

Post by Federico Simoncelli
+#
+#
+# Returns: nothing on success
+#
+# Notes: This command is deprecated.

Generally, a deprecation note should list the preferred replacement; but
we don't have a replacement, so I don't see this note adding any
information.

Post by Federico Simoncelli
+#
+# Since 1.1
+##
+
+{ 'command': 'drive-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }
+
+##
+#
+# Migrates the device to a new destination.
+#
+#
+# This method is particularly useful if used in conjunction
+# with new-image-file allowing the current image to be
+# transferred to the destination by an external manager.
+#

Where do you list what format the destination is? Shouldn't this have
an optional format, defaulting to qcow2? Does qemu create the
destination file, or must it already be existing?

Post by Federico Simoncelli
+#
+# the current one in the device.

Where do you list what format the new-image-file is? Shouldn't this
have an optional format, defaulting to qcow2? Does qemu create the
new-image-file, or can one already be existing?

I know that this patch is only implementing the case where incremental
is true and new-image-file is provided; but I'm not quite sure what
semantics are intended if incremental is false. Is that still a case
where this sets up mirroring (writes go to two images) but additionally
the contents from the current image are (asynchronously) streamed into
the destination? I guess I'm not sure what the intended semantics of
the modes that we aren't implementing, or why we don't just go with a
simpler command that only exposes the semantics that we are implementing.

Post by Federico Simoncelli
+#
+# Returns: nothing on success
+#
+# Since 1.1
+##
+{ 'command': 'blockdev-migrate',
+ 'data': { 'device': 'str', 'incremental' : 'bool',
+ 'destination': 'str', '*new-image-file': 'str' } }
+
+##
#
# Execute a command on the human monitor and return the output.

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Paolo Bonzini

2012-02-24 18:57:52 UTC

Post by Eric Blake
identical contents as the current image file, for this to be useful, and
that qemu does not validate whether the new image met those conditions.

Not necessarily, you could always do this on a paused machine.

Post by Eric Blake
Where do you list what format the destination is? Shouldn't this have
an optional format, defaulting to qcow2? Does qemu create the
destination file, or must it already be existing?

I think no default, raw makes as much or more sense in the
non-incremental case. Anyway either autodetect, or no default.

Post by Eric Blake

Post by Federico Simoncelli
+#
+# the current one in the device.

Where do you list what format the new-image-file is? Shouldn't this
have an optional format, defaulting to qcow2? Does qemu create the
new-image-file, or can one already be existing?

qemu does not create the file now, but in the future we may add a flag
to create a snapshot. I think no default is better here too, or autodetect.

Post by Eric Blake
I know that this patch is only implementing the case where incremental
is true and new-image-file is provided; but I'm not quite sure what
semantics are intended if incremental is false. Is that still a case
where this sets up mirroring (writes go to two images) but additionally
the contents from the current image are (asynchronously) streamed into
the destination?

Yes. The image should already be there also in this case, and
new-image-file will usually be omitted.

Paolo

Eric Blake

2012-02-24 19:37:24 UTC

Post by Paolo Bonzini

Post by Eric Blake
identical contents as the current image file, for this to be useful, and
that qemu does not validate whether the new image met those conditions.

Not necessarily, you could always do this on a paused machine.

You lost me; I think you snipped too much. Was it about this statement
in my original?

Post by Paolo Bonzini

Post by Eric Blake
1. Freeze all guest I/O, so that you know the current image file is
stable,

To which I have a counter-question: Is pausing a machine guaranteed to
flush all I/O out to the image prior to returning control to the
controlling app, or do we have to rely on the qemu-ga agent command to
actually freeze I/O within the guest?

Overall, I was arguing that from a usability perspective, if
new-image-file does not have the same contents (from the guest's
perspective) as the current image, then when you resume the machine, you
will be injecting arbitrary disk changes into the guest's disk, probably
with disastrous results. I'm not arguing that new-image-file has to be
identical to the current image (in fact, converting between raw or qcow2
via drive-reopen seems like great use cases for this new command). So
documenting this distinction (guest data must be consistent across the
reopen), as part of the contract of the monitor command, would be useful.

Post by Paolo Bonzini

Post by Eric Blake
Where do you list what format the destination is? Shouldn't this have
an optional format, defaulting to qcow2? Does qemu create the
destination file, or must it already be existing?

I think no default, raw makes as much or more sense in the
non-incremental case. Anyway either autodetect, or no default.

So if there is no default, then how do I specify the format? Libvirt
_cannot_ rely on autodetect, ever since we had to solve a CVE where
relying on autodetect would allow the guest to cause libvirt to mislabel
files on the host.

Post by Paolo Bonzini

Post by Eric Blake
I know that this patch is only implementing the case where incremental
is true and new-image-file is provided; but I'm not quite sure what
semantics are intended if incremental is false. Is that still a case
where this sets up mirroring (writes go to two images) but additionally
the contents from the current image are (asynchronously) streamed into
the destination?

Yes. The image should already be there also in this case, and
new-image-file will usually be omitted.

If the image to be opened must already exist, then explicitly state that
in the command documentation.

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Luiz Capitulino

2012-02-24 19:01:43 UTC

On Fri, 24 Feb 2012 16:49:04 +0000
How does this relate to migrate -b? Should it be deprecated?

Btw, would be nice to have a 0/2 intro email describing the feature and changelog
info.

Post by Federico Simoncelli
---
blockdev.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++------
hmp-commands.hx | 38 +++++++++++++++++++
hmp.c | 24 ++++++++++++
hmp.h | 2 +
qapi-schema.json | 54 +++++++++++++++++++++++++++
5 files changed, 213 insertions(+), 12 deletions(-)
diff --git a/blockdev.c b/blockdev.c
index 2c132a3..f51bd6f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -646,9 +646,8 @@ void do_commit(Monitor *mon, const QDict *qdict)
}
}
-void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
- bool has_format, const char *format,
- Error **errp)
+static void change_blockdev_image(const char *device, const char *new_image_file,
+ const char *format, bool create, Error **errp)
{
BlockDriverState *bs;
BlockDriver *drv, *old_drv, *proto_drv;
@@ -671,7 +670,7 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
old_drv = bs->drv;
flags = bs->open_flags;
- if (!has_format) {
+ if (!format) {
format = "qcow2";
}
@@ -681,24 +680,26 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
return;
}
- proto_drv = bdrv_find_protocol(snapshot_file);
+ proto_drv = bdrv_find_protocol(new_image_file);
if (!proto_drv) {
error_set(errp, QERR_INVALID_BLOCK_FORMAT, format);
return;
}
- ret = bdrv_img_create(snapshot_file, format, bs->filename,
- bs->drv->format_name, NULL, -1, flags);
- if (ret) {
- error_set(errp, QERR_UNDEFINED_ERROR);
- return;
+ if (create) {
+ ret = bdrv_img_create(new_image_file, format, bs->filename,
+ bs->drv->format_name, NULL, -1, flags);
+ if (ret) {
+ error_set(errp, QERR_UNDEFINED_ERROR);
+ return;
+ }
}
bdrv_drain_all();
bdrv_flush(bs);
bdrv_close(bs);
- ret = bdrv_open(bs, snapshot_file, flags, drv);
+ ret = bdrv_open(bs, new_image_file, flags, drv);
/*
* If reopening the image file we just created fails, fall back
* and try to re-open the original image. If that fails too, we
@@ -709,11 +710,93 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
if (ret != 0) {
error_set(errp, QERR_OPEN_FILE_FAILED, old_filename);
} else {
- error_set(errp, QERR_OPEN_FILE_FAILED, snapshot_file);
+ error_set(errp, QERR_OPEN_FILE_FAILED, new_image_file);
}
}
}
+void qmp_drive_reopen(const char *device, const char *new_image_file,
+ bool has_format, const char *format, Error **errp)
+{
+ change_blockdev_image(device, new_image_file,
+ has_format ? format : NULL, false, errp);
+}
+
+void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
+ bool has_format, const char *format,
+ Error **errp)
+{
+ change_blockdev_image(device, snapshot_file,
+ has_format ? format : NULL, true, errp);
+}
+
+static void qmp_blockdev_mirror(const char *device, const char *destination,
+ const char *new_image_file, Error **errp)
+{

Minor: this is a not a qmp function, please drop the qmp_ prefix then.

Post by Federico Simoncelli
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;

I'd use PATH_MAX for new_filename's size.

Post by Federico Simoncelli
+
+ drv = bdrv_find_format("blkmirror");
+ if (!drv) {
+ error_set(errp, QERR_INVALID_BLOCK_FORMAT, "blkmirror");
+ return;
+ }
+
+ escape = 0;
+ for (i = 0, j = 0; j < strlen(new_image_file); j++) {
+ if (!(i < sizeof(new_filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {
+ if (!escape) {
+ new_filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ new_filename[i++] = new_image_file[j];
+ }

IMO, you could require the filename arguments to be escaped already.

Post by Federico Simoncelli
+
+ filename = g_strdup_printf("blkmirror:%s:%s", new_filename, destination);
+ change_blockdev_image(device, filename, "blkmirror", false, errp);
+ g_free(filename);
+}
+
+void qmp_blockdev_migrate(const char *device, bool incremental,
+ const char *destination, bool has_new_image_file,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriverState *bs;
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (incremental) {
+ if (!has_new_image_file) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "incremental", "a new image file");
+ } else {
+ qmp_blockdev_mirror(device, destination, new_image_file, errp);
+ }
+ } else {
+ error_set(errp, QERR_NOT_SUPPORTED);
+ }

The command documentation says that 'incremental' and 'new_image_file' are
optionals, but the code makes them required. Why?

Post by Federico Simoncelli
+}
+
static void eject_device(BlockDriverState *bs, int force, Error **errp)
{
if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 573b823..4aacf2a 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -886,6 +886,44 @@ Snapshot device, using snapshot file as target if provided
ETEXI
{
+ .name = "drive_reopen",
+ .args_type = "device:B,new-image-file:s?,format:s?",
+ .params = "device [new-image-file] [format]",
+ .help = "Assigns a new image file to a device.\n\t\t\t"
+ "The image will be opened using the format\n\t\t\t"
+ "specified or 'qcow2' by default.\n\t\t\t"
+ "This command is deprecated.",
+ .mhandler.cmd = hmp_drive_reopen,
+ },
+
+STEXI
+Assigns a new image file to a device.
+ETEXI
+
+ {
+ .name = "blkdev_migrate",
+ .args_type = "incremental:-i,device:B,destination:s,new-image-file:s?",
+ .params = "device mode destination [new-image-file]",
+ .help = "Migrates the device to a new destination.\n\t\t\t"
+ "If the incremental (-i) option is used then\n\t\t\t"
+ "only the new writes are sent to the destination.\n\t\t\t"
+ "This method is particularly useful if used in\n\t\t\t"
+ "conjunction with new-image-file (a new image for\n\t\t\t"
+ "the device) allowing the current image to be\n\t\t\t"
+ "transferred to the destination by an external\n\t\t\t"
+ "manager.",
+ .mhandler.cmd = hmp_blkdev_migrate,
+ },
+
+STEXI
+Migrates the device to a new destination.
+ETEXI
+
+ {
.name = "drive_add",
.args_type = "pci_addr:s,opts:s",
.params = "[[<domain>:]<bus>:]<slot>\n"
diff --git a/hmp.c b/hmp.c
index 8ff8c94..282d3e5 100644
--- a/hmp.c
+++ b/hmp.c
@@ -701,6 +701,30 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
hmp_handle_error(mon, &errp);
}
+void hmp_drive_reopen(Monitor *mon, const QDict *qdict)
+{
+ const char *device = qdict_get_str(qdict, "device");
+ const char *filename = qdict_get_str(qdict, "new-image-file");
+ const char *format = qdict_get_try_str(qdict, "format");
+ Error *errp = NULL;
+
+ qmp_drive_reopen(device, filename, !!format, format, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict)
+{
+ bool incremental = qdict_get_try_bool(qdict, "incremental", 0);
+ const char *device = qdict_get_str(qdict, "device");
+ const char *destination = qdict_get_str(qdict, "destination");
+ const char *new_image_file = qdict_get_try_str(qdict, "new-image-file");
+ Error *errp = NULL;
+
+ qmp_blockdev_migrate(device, incremental, destination,
+ !!new_image_file, new_image_file, &errp);
+ hmp_handle_error(mon, &errp);
+}
+
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict)
{
qmp_migrate_cancel(NULL);
diff --git a/hmp.h b/hmp.h
index 18eecbd..3c66257 100644
--- a/hmp.h
+++ b/hmp.h
@@ -47,6 +47,8 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict);
void hmp_balloon(Monitor *mon, const QDict *qdict);
void hmp_block_resize(Monitor *mon, const QDict *qdict);
void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict);
+void hmp_drive_reopen(Monitor *mon, const QDict *qdict);
+void hmp_blkdev_migrate(Monitor *mon, const QDict *qdict);
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
diff --git a/qapi-schema.json b/qapi-schema.json
index d02ee86..df1248c 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1136,6 +1136,60 @@
'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str' } }
##
+#
+# Assigns a new image file to a device.
+#
+#
+# command will fail.
+#
+#
+# Returns: nothing on success
+#
+# Notes: This command is deprecated.

As I said in the other thread, we need a replacement to deprecate something.

Post by Federico Simoncelli
+#
+# Since 1.1
+##
+
+{ 'command': 'drive-reopen',
+ 'data': { 'device': 'str', 'new-image-file': 'str', '*format': 'str' } }
+
+##
+#
+# Migrates the device to a new destination.
+#
+#
+# This method is particularly useful if used in conjunction
+# with new-image-file allowing the current image to be
+# transferred to the destination by an external manager.
+#
+#
+# the current one in the device.
+#
+# Returns: nothing on success
+#
+# Since 1.1
+##
+{ 'command': 'blockdev-migrate',
+ 'data': { 'device': 'str', 'incremental' : 'bool',
+ 'destination': 'str', '*new-image-file': 'str' } }
+
+##
#
# Execute a command on the human monitor and return the output.

Eric Blake

2012-02-24 19:40:17 UTC

Post by Luiz Capitulino

Post by Federico Simoncelli
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;

I'd use PATH_MAX for new_filename's size.

PATH_MAX need not be defined (and on Hurd, it intentionally is not
defined); or might be so huge as to be useless. But 2048 is smaller
than typical PATH_MAX of 4096, so you may still want to consider a
compromise.

Post by Luiz Capitulino

Post by Federico Simoncelli
+
+ new_filename[i++] = new_image_file[j];
+ }

IMO, you could require the filename arguments to be escaped already.

And if you do that, then the escaping should be consistent for both
arguments. Libvirt is okay with pre-escaping file names before passing
them to qemu.

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Luiz Capitulino

2012-02-24 20:26:55 UTC

On Fri, 24 Feb 2012 12:40:17 -0700

Post by Eric Blake

Post by Luiz Capitulino

Post by Federico Simoncelli
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;

I'd use PATH_MAX for new_filename's size.

PATH_MAX need not be defined (and on Hurd, it intentionally is not
defined); or might be so huge as to be useless.

Aren't those extreme cases? PATH_MAX is a standard define and is used in
QEMU in several places. If it's not good here, it shouldn't be good anywhere.

Eric Blake

2012-02-24 22:46:02 UTC

Post by Luiz Capitulino
On Fri, 24 Feb 2012 12:40:17 -0700

Post by Eric Blake

Post by Luiz Capitulino

Post by Federico Simoncelli
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;

I'd use PATH_MAX for new_filename's size.

PATH_MAX need not be defined (and on Hurd, it intentionally is not
defined); or might be so huge as to be useless.

Aren't those extreme cases? PATH_MAX is a standard define and is used in
QEMU in several places. If it's not good here, it shouldn't be good anywhere.

PATH_MAX is specifically declared in POSIX to be defined if there is a
limit, or undefined if there is no limit. There is no limit in GNU
Hurd, so PATH_MAX is undefined there, and you will get a compile error
(then again, no one has ported qemu to Hurd).

Here's what gnulib has recommended:

https://lists.gnu.org/archive/html/bug-gnulib/2011-06/msg00328.html

Post by Luiz Capitulino
A package like coreutils can also do
#ifndef PATH_MAX
# define PATH_MAX 8192
#endif
in its system.h.
Looking at both uses of PATH_MAX in coreutils (src/pwd.c:88 and
src/remove.c:186) the value of PATH_MAX is capped by 8192 or 16384 anyway.
So, on systems like GNU/Hurd, where filenames can have arbitrary size, you
are calling pathconf for no real purpose.
To me, this confirms that a generic pathmax.h (like the one in gnulib)
should only define PATH_MAX when it makes sense - like POSIX says -,
and that the handling of the GNU/Hurd case should be done on a case-by-case
- Either a package-wide handling, or a per-file handling.
- Either a fallback value of 8192, or a fallback value of
pathconf ("/", _PC_PATH_MAX), or just a #ifdef test.

Other mails in that thread are also an interesting read.

In short, use of PATH_MAX should only ever be used to optimize routines
to the common case; in which case, you can pick your own cap for
PATH_MAX if the implementation did not provide one or reduce the
implementation's 8k PATH_MAX down to something like 2048 that you can
safely fit on the stack for the common case before malloc'ing for the
larger strings. But using it as a bounds to a statically-sized object
is a recipe for artificially limiting software; if you are okay with
introducing that artificial limit, then go for it; but if you want to be
truly portable, it is best to never use PATH_MAX as an array bounds, and
to write fallback code paths to handle the cases where user input
exceeds PATH_MAX but can still be handled without error by the system
you are running on.

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Paolo Bonzini

2012-02-24 20:32:30 UTC

Post by Luiz Capitulino
IMO, you could require the filename arguments to be escaped already.

The monitor command was introduced exactly to avoid having to worry
about details such as escaping. JSON is already supposed to take care
of those.

That said, I think Eric is right that we need to specify the formats, so
we'll need another iteration, and we probably need to rename the command
to drive-migrate too.

Paolo

Luiz Capitulino

2012-02-24 20:36:46 UTC

On Fri, 24 Feb 2012 21:32:30 +0100

Post by Paolo Bonzini

Post by Luiz Capitulino
IMO, you could require the filename arguments to be escaped already.

The monitor command was introduced exactly to avoid having to worry
about details such as escaping. JSON is already supposed to take care
of those.

Then the escaping should be done by the parser, not by the command
implementation.

Post by Paolo Bonzini
That said, I think Eric is right that we need to specify the formats, so
we'll need another iteration, and we probably need to rename the command
to drive-migrate too.
Paolo

Paolo Bonzini

2012-02-24 21:05:29 UTC

Post by Luiz Capitulino

Post by Paolo Bonzini

Post by Luiz Capitulino
IMO, you could require the filename arguments to be escaped already.

The monitor command was introduced exactly to avoid having to worry
about details such as escaping. JSON is already supposed to take care
of those.

Then the escaping should be done by the parser, not by the command
implementation.

This command is passing its arguments to the block layer, which needs
its own escaping. It's a wart of the block layer and not supposed to
percolate outside QEMU.

Paolo

Eric Blake

2012-02-24 22:30:28 UTC

Post by Luiz Capitulino
On Fri, 24 Feb 2012 16:49:04 +0000
How does this relate to migrate -b? Should it be deprecated?
Btw, would be nice to have a 0/2 intro email describing the feature and changelog
info.

Another question - how does this interact with Jeff Cody's work to add
blockdev-group-snapshot-sync to add atomic group snapshots? Do you ever
envision the need to migrate and/or reopen multiple disks at once, in
which case, we would want to be setting things up to work on a group in
the same way that Jeff's patches for live snapshot are headed?

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Paolo Bonzini

2012-02-25 06:47:11 UTC

Post by Eric Blake

Post by Luiz Capitulino
On Fri, 24 Feb 2012 16:49:04 +0000
How does this relate to migrate -b? Should it be deprecated?
Btw, would be nice to have a 0/2 intro email describing the feature and changelog
info.

Another question - how does this interact with Jeff Cody's work to add
blockdev-group-snapshot-sync to add atomic group snapshots? Do you ever
envision the need to migrate and/or reopen multiple disks at once, in
which case, we would want to be setting things up to work on a group in
the same way that Jeff's patches for live snapshot are headed?

I was thinking that perhaps we needed something like
blockdev-group-start, blockdev-group-commit, blockdev-group-abort
instead of Jeff's group snapshot, but I was afraid of proposing it. :)

Paolo

Federico Simoncelli

2012-02-27 11:29:39 UTC

----- Original Message -----

Sent: Friday, February 24, 2012 8:01:43 PM
Subject: Re: [PATCH 2/2 v2] Add the blockdev-reopen and blockdev-migrate commands
On Fri, 24 Feb 2012 16:49:04 +0000
Btw, would be nice to have a 0/2 intro email describing the feature and changelog
info.

Yes the 0/2 (actually 0/3) was sent at the beginning of the thread so you might
have missed it because you were added later on but I'm sure you can still find
it in the archives.

Post by Federico Simoncelli
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;

I'd use PATH_MAX for new_filename's size.

Maybe PATH_MAX * 2 + 1? (To handle the case where all the characters should be
escaped).

Post by Federico Simoncelli
+ escape = 0;
+ for (i = 0, j = 0; j < strlen(new_image_file); j++) {
+ if (!(i < sizeof(new_filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {
+ if (!escape) {
+ new_filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ new_filename[i++] = new_image_file[j];
+ }

IMO, you could require the filename arguments to be escaped already.

Can you be more explicit, who should escape it?
The only thing that comes to my mind right now is requiring the input of
blockdev-migrate already escaped but that would expose an internal format.
(I'd not recommend it).

Post by Federico Simoncelli
+void qmp_blockdev_migrate(const char *device, bool incremental,
+ const char *destination, bool
has_new_image_file,
+ const char *new_image_file, Error
**errp)
+{
+ BlockDriverState *bs;
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (incremental) {
+ if (!has_new_image_file) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "incremental", "a new image file");
+ } else {
+ qmp_blockdev_mirror(device, destination,
new_image_file, errp);
+ }
+ } else {
+ error_set(errp, QERR_NOT_SUPPORTED);
+ }

The command documentation says that 'incremental' and
'new_image_file' are
optionals, but the code makes them required. Why?

If I didn't make any mistake in the code I'm just enforcing that when
you specify "incremental" you also need a new image.
There are still other valid cases where they are optional.

--
Federico

Luiz Capitulino

2012-02-27 12:12:15 UTC

On Mon, 27 Feb 2012 06:29:39 -0500 (EST)

Post by Federico Simoncelli
----- Original Message -----

Sent: Friday, February 24, 2012 8:01:43 PM
Subject: Re: [PATCH 2/2 v2] Add the blockdev-reopen and blockdev-migrate commands
On Fri, 24 Feb 2012 16:49:04 +0000
Btw, would be nice to have a 0/2 intro email describing the feature and changelog
info.

Yes the 0/2 (actually 0/3) was sent at the beginning of the thread so you might
have missed it because you were added later on but I'm sure you can still find
it in the archives.

I only found v1 iirc, but this is not important right now, as you're going to
post v3 right? And that's going to have the intro :)

Post by Federico Simoncelli

Post by Federico Simoncelli
+ BlockDriver *drv;
+ int i, j, escape;
+ char new_filename[2048], *filename;

I'd use PATH_MAX for new_filename's size.

Maybe PATH_MAX * 2 + 1? (To handle the case where all the characters should be
escaped).

Well, I was discussing this with Eric and he thinks that a value of 4096
should be fine. I personally prefer using PATH_MAX because I don't believe
I'm better at choosing a random value for this vs. using what the system
provides me.

Feel free to choose what you think is the best for this case.

Post by Federico Simoncelli

Post by Federico Simoncelli
+ escape = 0;
+ for (i = 0, j = 0; j < strlen(new_image_file); j++) {
+ if (!(i < sizeof(new_filename) - 2)) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "new-image-file", "shorter filename");
+ return;
+ }
+
+ if (new_image_file[j] == ':' || new_image_file[j] == '\\') {
+ if (!escape) {
+ new_filename[i++] = '\\', escape = 1;
+ goto loop;
+ } else {
+ escape = 0;
+ }
+ }
+
+ new_filename[i++] = new_image_file[j];
+ }

IMO, you could require the filename arguments to be escaped already.

Can you be more explicit, who should escape it?

Paolo thinks this should be done by the block layer, fine with me.

Post by Federico Simoncelli
The only thing that comes to my mind right now is requiring the input of
blockdev-migrate already escaped but that would expose an internal format.
(I'd not recommend it).

Post by Federico Simoncelli
+void qmp_blockdev_migrate(const char *device, bool incremental,
+ const char *destination, bool
has_new_image_file,
+ const char *new_image_file, Error **errp)
+{
+ BlockDriverState *bs;
+
+ bs = bdrv_find(device);
+ if (!bs) {
+ error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+ return;
+ }
+ if (bdrv_in_use(bs)) {
+ error_set(errp, QERR_DEVICE_IN_USE, device);
+ return;
+ }
+
+ if (incremental) {
+ if (!has_new_image_file) {
+ error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+ "incremental", "a new image file");
+ } else {
+ qmp_blockdev_mirror(device, destination,
new_image_file, errp);
+ }
+ } else {
+ error_set(errp, QERR_NOT_SUPPORTED);
+ }

The command documentation says that 'incremental' and
'new_image_file' are
optionals, but the code makes them required. Why?

If I didn't make any mistake in the code I'm just enforcing that when
you specify "incremental" you also need a new image.
There are still other valid cases where they are optional.

Which operation will be performed if 'incremental' is not passed? If
it's passed, which operation will be performed if 'new_image_file' is not?

Paolo Bonzini

2012-02-27 12:49:17 UTC

Post by Luiz Capitulino

Post by Federico Simoncelli
If I didn't make any mistake in the code I'm just enforcing that when
you specify "incremental" you also need a new image.
There are still other valid cases where they are optional.

Which operation will be performed if 'incremental' is not passed? If
it's passed, which operation will be performed if 'new_image_file' is not?

There are four cases:

1) incremental=false, new_image_file not passed:
right now fail; in the future:
- create an image on dest with no backing file;
- writes will be mirrored to current_image_file and dest
- start streaming from current_image_file to dest

2) incremental=false, new_image_file passed:
right now fail; in the future:
- create an image on dest with no backing file;
- live-snapshot based on current_image_file to new_image_file;
- writes will be mirrored to new_image_file and dest
- start streaming from current_image_file to dest

3) incremental=true, new_image_file not passed:
- error

4) incremental=true, new_image_file passed:
- no images will be created
- writes will be mirrored to new_image_file and dest

Paolo

Luiz Capitulino

2012-02-27 13:06:45 UTC

On Mon, 27 Feb 2012 13:49:17 +0100

Post by Paolo Bonzini

Post by Luiz Capitulino

Post by Federico Simoncelli
If I didn't make any mistake in the code I'm just enforcing that when
you specify "incremental" you also need a new image.
There are still other valid cases where they are optional.

Which operation will be performed if 'incremental' is not passed? If
it's passed, which operation will be performed if 'new_image_file' is not?

- create an image on dest with no backing file;
- writes will be mirrored to current_image_file and dest
- start streaming from current_image_file to dest
- create an image on dest with no backing file;
- live-snapshot based on current_image_file to new_image_file;
- writes will be mirrored to new_image_file and dest
- start streaming from current_image_file to dest
- error
- no images will be created
- writes will be mirrored to new_image_file and dest

IMHO, this is asking for two commands, where cases 1 & 2 is one of them
and cases 3 & 4 is the other one. Note how 'incremental' goes away and
'new_image_file' really becomes an optional.

Paolo Bonzini

2012-02-27 14:39:33 UTC

Post by Luiz Capitulino
IMHO, this is asking for two commands, where cases 1 & 2 is one of them
and cases 3 & 4 is the other one. Note how 'incremental' goes away and
'new_image_file' really becomes an optional.

I really would have no idea on naming except perhaps "drive_migrate" and
"funny_drive_migrate_for_ovirt". But actually I must admit that it's a
rat's nest.

First, there's no reason why live-snapshotting to new_image_file
shouldn't be handled within QEMU. That would make the above table much
more orthogonal. However, oVirt likes to create its own snapshots.

Perhaps we do need to rethink this together with group snapshots.

There are four kinds of operations that we do on block devices:
(a) create an image. This is part of what blockdev-snapshot does.
(b) switch a block device to a new image. drive-reopen does this.
(c) add mirroring to a new destination.
(d) activate streaming from a base image

drive-migrate does (b) and (c), will do (a) and (d) when we add
pre-copy, and should do (a) right now if Federico wasn't an oVirt
developer. :)

Thinking more about it, the commands we _do_ need are:
- start a transaction
- switch to a new image
- add mirroring to a new destination
- commit a transaction
- rollback a transaction
- query transaction errors

Creating an image can be done outside a transaction for now because we
only support live external snapshots. Streaming can also be started
outside a transaction, because it doesn't need to be started atomically.

If we have the above elementary commands, blockdev-snapshot-sync becomes
sugar for this:

(create image)
blockdev-start-transaction if no active transaction
drive-reopen
blockdev-commit-transaction if no active transaction

Jeff's group snapshot can be realized as this:

blockdev-begin-transaction
blockdev-snapshot-sync
...
blockdev-snapshot-sync
blockdev-commit-transaction

Post by Luiz Capitulino

Post by Paolo Bonzini
- create an image on dest with no backing file;
- writes will be mirrored to current_image_file and dest
- start streaming from current_image_file to dest

This is a new command "drive-mirror device dest", which does:

(create image for dest)
blockdev-begin-transaction if no active transaction
drive-mirror device dest
blockdev-commit-transaction if no active transaction

The command does this:

- mirror writes to current_image_file and dest
- start streaming from current_image_file to dest

The second part can be suppressed with a boolean argument.

Post by Luiz Capitulino

Post by Paolo Bonzini
- create an image on dest with no backing file;
- live-snapshot based on current_image_file to new_image_file;
- writes will be mirrored to new_image_file and dest
- start streaming from current_image_file to dest

Atomicity is not needed here, so the user can simply issue:

blockdev-snapshot-sync device new-image-file
drive-mirror device dest

Post by Luiz Capitulino

Post by Paolo Bonzini
- no images will be created
- writes will be mirrored to new_image_file and dest

No need to provide this from within QEMU, because libvirt/oVirt can do
the dance using elementary operations:

blockdev-begin-transaction
drive-reopen device new-image-file
drive-mirror streaming=false device dest
blockdev-commit-transaction

No strange optional arguments, no proliferation of commands, etc. The
only downside is that if someone tries to do (4) without transactions
(or without stopping the VM) they'll get corruption because atomicity is
required.

Paolo

Anthony Liguori

2012-02-27 14:46:57 UTC

Post by Paolo Bonzini

IMHO, this is asking for two commands, where cases 1& 2 is one of them
and cases 3& 4 is the other one. Note how 'incremental' goes away and
'new_image_file' really becomes an optional.

I really would have no idea on naming except perhaps "drive_migrate" and
"funny_drive_migrate_for_ovirt". But actually I must admit that it's a
rat's nest.
First, there's no reason why live-snapshotting to new_image_file
shouldn't be handled within QEMU. That would make the above table much
more orthogonal. However, oVirt likes to create its own snapshots.
Perhaps we do need to rethink this together with group snapshots.
(a) create an image. This is part of what blockdev-snapshot does.
(b) switch a block device to a new image. drive-reopen does this.
(c) add mirroring to a new destination.
(d) activate streaming from a base image
drive-migrate does (b) and (c), will do (a) and (d) when we add
pre-copy, and should do (a) right now if Federico wasn't an oVirt
developer. :)
- start a transaction
- switch to a new image
- add mirroring to a new destination
- commit a transaction
- rollback a transaction
- query transaction errors

I think a better way to think of this is as a batch submission. It would be
relatively easy to model in QMP too (just have a batch-command that has a list
of commands as it's argument).

The difference between batch submission and a transaction is atomic rollback.
But I don't think atomic rollback is really needed here.

Regards,

Anthony Liguori

Paolo Bonzini

2012-02-27 14:54:41 UTC

Post by Anthony Liguori
I think a better way to think of this is as a batch submission. It
would be relatively easy to model in QMP too (just have a batch-command
that has a list of commands as it's argument).
The difference between batch submission and a transaction is atomic
rollback. But I don't think atomic rollback is really needed here.

A transaction enforces atomicity at the block layer level. It's
different from batch commands in two ways:

* bdrv_drain_all/bdrv_flush needs to be called at the beginning of the
commit. This may not be the case with batch commands.

* with batch commands, atomicity happens by chance because VCPUs cannot
send I/O while the monitor is holding the global mutex.

Paolo

Anthony Liguori

2012-02-27 14:59:18 UTC

Post by Paolo Bonzini

Post by Anthony Liguori
I think a better way to think of this is as a batch submission. It
would be relatively easy to model in QMP too (just have a batch-command
that has a list of commands as it's argument).
The difference between batch submission and a transaction is atomic
rollback. But I don't think atomic rollback is really needed here.

A transaction enforces atomicity at the block layer level. It's
* bdrv_drain_all/bdrv_flush needs to be called at the beginning of the
commit. This may not be the case with batch commands.
* with batch commands, atomicity happens by chance because VCPUs cannot
send I/O while the monitor is holding the global mutex.

If the commands are designed correctly, then this works well. The problem is
that the current commands are not designed well. For instance, multi-snapshot
could look like:

block-freeze ide0-hd0
block-freeze ide1-hd1
block-reopen ide0-hd0 my-new-file0.qcow2
block-reopen ide1-hd1 my-new-file1.qcow2
block-unfreeze ide1-hd1
block-unfreeze ide1-hd0

This would work regardless of whether the commands were implemented
asynchronously within QEMU too.

Regards,

Anthony Liguori

Post by Paolo Bonzini
Paolo

Paolo Bonzini

2012-02-27 15:03:48 UTC

The problem is that the current commands are not designed well. For
block-freeze ide0-hd0
block-freeze ide1-hd1
block-reopen ide0-hd0 my-new-file0.qcow2
block-reopen ide1-hd1 my-new-file1.qcow2
block-unfreeze ide1-hd1
block-unfreeze ide1-hd0
This would work regardless of whether the commands were implemented
asynchronously within QEMU too.

This looks good, too. Positive: maps well to fsfreeze/thaw with help
from the guest agent. Negative: you have to specify the devices three
times. Overall, I think I like it.

However, you need to add freeze/unfreeze capabilities to the block
layer. Not hard, but one more thing to do.

Paolo

Anthony Liguori

2012-02-27 15:06:15 UTC

Post by Paolo Bonzini

The problem is that the current commands are not designed well. For
block-freeze ide0-hd0
block-freeze ide1-hd1
block-reopen ide0-hd0 my-new-file0.qcow2
block-reopen ide1-hd1 my-new-file1.qcow2
block-unfreeze ide1-hd1
block-unfreeze ide1-hd0
This would work regardless of whether the commands were implemented
asynchronously within QEMU too.

This looks good, too. Positive: maps well to fsfreeze/thaw with help
from the guest agent. Negative: you have to specify the devices three
times. Overall, I think I like it.
However, you need to add freeze/unfreeze capabilities to the block
layer. Not hard, but one more thing to do.

Right. But it also generalizes to other QMP operations which is potentially
interesting.

And providing mechanisms like this gives more flexibility to management tools to
implement interesting features without constantly chancing new QMP commands.

Regards,

Anthony Liguori

Post by Paolo Bonzini
Paolo

Kevin Wolf

2012-02-27 15:17:07 UTC

Post by Anthony Liguori

Post by Paolo Bonzini

Post by Anthony Liguori
I think a better way to think of this is as a batch submission. It
would be relatively easy to model in QMP too (just have a batch-command
that has a list of commands as it's argument).
The difference between batch submission and a transaction is atomic
rollback. But I don't think atomic rollback is really needed here.

A transaction enforces atomicity at the block layer level. It's
* bdrv_drain_all/bdrv_flush needs to be called at the beginning of the
commit. This may not be the case with batch commands.
* with batch commands, atomicity happens by chance because VCPUs cannot
send I/O while the monitor is holding the global mutex.

If the commands are designed correctly, then this works well. The problem is
that the current commands are not designed well. For instance, multi-snapshot
block-freeze ide0-hd0
block-freeze ide1-hd1
block-reopen ide0-hd0 my-new-file0.qcow2
block-reopen ide1-hd1 my-new-file1.qcow2
block-unfreeze ide1-hd1
block-unfreeze ide1-hd0
This would work regardless of whether the commands were implemented
asynchronously within QEMU too.

What if block-reopen ide1-hd1 fails? In case of failure, you want both
drives to point to their old image, not the first one to the new image
and the second one to the old image.

Kevin

Anthony Liguori

2012-02-27 15:24:52 UTC

Post by Kevin Wolf

Post by Anthony Liguori

Post by Paolo Bonzini

Post by Anthony Liguori
I think a better way to think of this is as a batch submission. It
would be relatively easy to model in QMP too (just have a batch-command
that has a list of commands as it's argument).
The difference between batch submission and a transaction is atomic
rollback. But I don't think atomic rollback is really needed here.

A transaction enforces atomicity at the block layer level. It's
* bdrv_drain_all/bdrv_flush needs to be called at the beginning of the
commit. This may not be the case with batch commands.
* with batch commands, atomicity happens by chance because VCPUs cannot
send I/O while the monitor is holding the global mutex.

If the commands are designed correctly, then this works well. The problem is
that the current commands are not designed well. For instance, multi-snapshot
block-freeze ide0-hd0
block-freeze ide1-hd1
block-reopen ide0-hd0 my-new-file0.qcow2
block-reopen ide1-hd1 my-new-file1.qcow2
block-unfreeze ide1-hd1
block-unfreeze ide1-hd0
This would work regardless of whether the commands were implemented
asynchronously within QEMU too.

What if block-reopen ide1-hd1 fails? In case of failure, you want both
drives to point to their old image, not the first one to the new image
and the second one to the old image.

Then you get an error with the block devices still frozen. You can execute
another command to reopen back to the old image to roll back the transaction.

Pushing the rollback logic to the client does make the client interface a bit
more complicated and adds latency to the error path but it's much easier than
building a complex transaction infrastructure.

And there are examples of this in the wild too. LDAP uses a similar mechanism.

Regards,

Anthony Liguori

Post by Kevin Wolf
Kevin

Paolo Bonzini

2012-02-27 16:51:47 UTC

Post by Anthony Liguori
Then you get an error with the block devices still frozen. You can
execute another command to reopen back to the old image to roll back the
transaction.
Pushing the rollback logic to the client does make the client interface
a bit more complicated and adds latency to the error path but it's much
easier than building a complex transaction infrastructure.
And there are examples of this in the wild too. LDAP uses a similar mechanism.

Actually, have you seen Jeff's atomic snapshot patch? It really
implements all that is needed to do transactions, and gets 100% ok
error-recovery unlike the existing blockdev_snapshot_sync. It really
looks like we can do better than client-side error management, and there
is not that much complexity at all.

Jeff could rework his patches to work with transaction begin/commit
commands, and Federico can then add drive-reopen and drive-migrate on top.

Paolo

Anthony Liguori

2012-02-27 16:58:33 UTC

Post by Paolo Bonzini

Post by Anthony Liguori
Then you get an error with the block devices still frozen. You can
execute another command to reopen back to the old image to roll back the
transaction.
Pushing the rollback logic to the client does make the client interface
a bit more complicated and adds latency to the error path but it's much
easier than building a complex transaction infrastructure.
And there are examples of this in the wild too. LDAP uses a similar mechanism.

Actually, have you seen Jeff's atomic snapshot patch? It really
implements all that is needed to do transactions, and gets 100% ok
error-recovery unlike the existing blockdev_snapshot_sync. It really
looks like we can do better than client-side error management, and there
is not that much complexity at all.
Jeff could rework his patches to work with transaction begin/commit
commands, and Federico can then add drive-reopen and drive-migrate on top.

Yes, maybe I lack imagination but I fail to see how it generalizes easily/nicely.

From what I can tell, all of the rollback logic is very specific to the
commands being used, right?

Regards,

Anthony Liguori

Post by Paolo Bonzini
Paolo

Paolo Bonzini

2012-02-27 17:06:58 UTC

Post by Anthony Liguori

Post by Paolo Bonzini
Jeff could rework his patches to work with transaction begin/commit
commands, and Federico can then add drive-reopen and drive-migrate on top.

Yes, maybe I lack imagination but I fail to see how it generalizes easily/nicely.
From what I can tell, all of the rollback logic is very specific to the
commands being used, right?

The rollback logic is just "close the new devices".

The commit logic is specific to the commands being used, but reopen
should be easier than snapshot (and basically the same except for
backing_hd handling).

Migrate is really syntactic sugar around reopen, so no surprises there.

Paolo

Federico Simoncelli

2012-02-27 16:33:00 UTC

----- Original Message -----

Sent: Monday, February 27, 2012 3:39:33 PM
Subject: drive transactions (was Re: [PATCH 2/2 v2] Add the blockdev-reopen and blockdev-migrate commands)

Post by Paolo Bonzini
- no images will be created
- writes will be mirrored to new_image_file and dest

No need to provide this from within QEMU, because libvirt/oVirt can do
blockdev-begin-transaction
drive-reopen device new-image-file
drive-mirror streaming=false device dest
blockdev-commit-transaction
No strange optional arguments, no proliferation of commands, etc.
The
only downside is that if someone tries to do (4) without transactions
(or without stopping the VM) they'll get corruption because atomicity is
required.

I'm all for the modularity of the commands (I suggested it since the beginning),
but all this infrastructure goes way beyond what I'd need for oVirt now.

When I submitted my patches we knew that my work wasn't the definitive solution
(eg: the future implementation of -blockdev). So I'd suggest to try and settle
with something that is good enough and that is not preventing a future improvement.

What about having a (temporary) flag in drive-mirror to accept also a new-image-file
until we will have the optimal solution?

--
Federico

Paolo Bonzini

2012-02-27 16:41:35 UTC

Post by Federico Simoncelli

Post by Paolo Bonzini
blockdev-begin-transaction
drive-reopen device new-image-file
drive-mirror streaming=false device dest
blockdev-commit-transaction
No strange optional arguments, no proliferation of commands, etc.
The
only downside is that if someone tries to do (4) without transactions
(or without stopping the VM) they'll get corruption because atomicity is
required.

I'm all for the modularity of the commands (I suggested it since the beginning),
but all this infrastructure goes way beyond what I'd need for oVirt now.
When I submitted my patches we knew that my work wasn't the definitive solution
(eg: the future implementation of -blockdev). So I'd suggest to try and settle
with something that is good enough and that is not preventing a future improvement.
What about having a (temporary) flag in drive-mirror to accept also a new-image-file
until we will have the optimal solution?

What if libvirt could simply try blockdev-freeze and, if it's not there,
try passing a new-image-file argument. Add the new-image-file
downstream if you have time constraints, and leave it out upstream. Too
ugly?

Paolo

Anthony Liguori

2012-02-27 16:42:31 UTC

Post by Federico Simoncelli
I'm all for the modularity of the commands (I suggested it since the beginning),
but all this infrastructure goes way beyond what I'd need for oVirt now.
When I submitted my patches we knew that my work wasn't the definitive solution
(eg: the future implementation of -blockdev). So I'd suggest to try and settle
with something that is good enough and that is not preventing a future improvement.
What about having a (temporary) flag in drive-mirror to accept also a new-image-file
until we will have the optimal solution?

Unless there are extenuating circumstances (like the absence of core
infrastructure in QEMU), then we should not add commands that we know are not
the right command.

We have to support our commands forever.

Regards,

Anthony Liguori

Federico Simoncelli

2012-02-27 16:50:43 UTC

----- Original Message -----

Sent: Monday, February 27, 2012 5:42:31 PM
Subject: Re: [Qemu-devel] drive transactions (was Re: [PATCH 2/2 v2] Add the blockdev-reopen and blockdev-migrate
commands)

Post by Federico Simoncelli
I'm all for the modularity of the commands (I suggested it since the beginning),
but all this infrastructure goes way beyond what I'd need for oVirt now.
When I submitted my patches we knew that my work wasn't the
definitive solution
(eg: the future implementation of -blockdev). So I'd suggest to try and settle
with something that is good enough and that is not preventing a future improvement.
What about having a (temporary) flag in drive-mirror to accept also a new-image-file
until we will have the optimal solution?

Unless there are extenuating circumstances (like the absence of core
infrastructure in QEMU), then we should not add commands that we know are not
the right command.

So are you in favor or against my suggestion? It looks like this is exactly
the case where the core infrastructure (transactions) is missing.

--
Federico

Anthony Liguori

2012-02-27 16:53:27 UTC

Post by Federico Simoncelli

Post by Anthony Liguori

Post by Federico Simoncelli
I'm all for the modularity of the commands (I suggested it since the beginning),
but all this infrastructure goes way beyond what I'd need for oVirt now.
When I submitted my patches we knew that my work wasn't the
definitive solution
(eg: the future implementation of -blockdev). So I'd suggest to try and settle
with something that is good enough and that is not preventing a future improvement.
What about having a (temporary) flag in drive-mirror to accept also a new-image-file
until we will have the optimal solution?

Unless there are extenuating circumstances (like the absence of core
infrastructure in QEMU), then we should not add commands that we know are not
the right command.

So are you in favor or against my suggestion?

Against. I think we have everything we need.

Post by Federico Simoncelli
It looks like this is exactly
the case where the core infrastructure (transactions) is missing.

Batch requests are incredibly easy to add. I'm stuck in meetings for the next
couple days but I'm sure Luiz throw it together in no time at all.

Regards,

Anthony Liguori

Paolo Bonzini

2012-02-27 16:54:25 UTC

Post by Anthony Liguori

Post by Federico Simoncelli
It looks like this is exactly
the case where the core infrastructure (transactions) is missing.

Batch requests are incredibly easy to add. I'm stuck in meetings for
the next couple days but I'm sure Luiz throw it together in no time at all.

Batch requests are unnecessary. They should be a convenience, not a
correctness feature.

Paolo

Anthony Liguori

2012-02-27 16:59:37 UTC

Post by Paolo Bonzini

Post by Anthony Liguori

Post by Federico Simoncelli
It looks like this is exactly
the case where the core infrastructure (transactions) is missing.

Batch requests are incredibly easy to add. I'm stuck in meetings for
the next couple days but I'm sure Luiz throw it together in no time at all.

Batch requests are unnecessary. They should be a convenience, not a
correctness feature.

I think this discussion has gotten a bit unwieldy as I'm having trouble keeping
up with the various proposals. Can we move to something more formal and written
on the wiki?

Can we start with a clear description of what the various requirements are?

Regards,

Anthony Liguroi

Post by Paolo Bonzini
Paolo

Luiz Capitulino

2012-02-27 17:37:45 UTC

On Mon, 27 Feb 2012 10:42:31 -0600

Post by Anthony Liguori

Post by Federico Simoncelli
I'm all for the modularity of the commands (I suggested it since the beginning),
but all this infrastructure goes way beyond what I'd need for oVirt now.
When I submitted my patches we knew that my work wasn't the definitive solution
(eg: the future implementation of -blockdev). So I'd suggest to try and settle
with something that is good enough and that is not preventing a future improvement.
What about having a (temporary) flag in drive-mirror to accept also a new-image-file
until we will have the optimal solution?

Unless there are extenuating circumstances (like the absence of core
infrastructure in QEMU), then we should not add commands that we know are not
the right command.
We have to support our commands forever.

Agreed. Worst case we have vendor extension support that downstream can use to
add its own commands.

Stefan Hajnoczi

2012-02-28 15:47:48 UTC

On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and dst/hd0snap1 as
image files. (Where "<-" stands for "has backing file")
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
... <- [dst/hd0snap1] <= VM1(write-only)
$ qemu-img create -f qcow2 \
-b /tmp/src/hd0base.qcow2 /tmp/src/hd0snap1.qcow2 20G
Formatting '/tmp/src/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536
$ qemu-img create -f qcow2 \
-b /tmp/dst/hd0base.qcow2 /tmp/dst/hd0snap1.qcow2 20G
Formatting '/tmp/dst/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536
(qemu) snapshot_blkdev -n ide0-hd0 \
blkmirror:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2 blkmirror
Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination dst/hd0base.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base] <- [dst/hd0snap1] <= VM1(write-only)

At this stage we have dst/hd0snap1 opened with BDRV_O_NO_BACKING. If
it has no backing file and the guest issues a write request that is
smaller than a cluster in the image file, the untouched areas of that
cluster will be populated with zeroes.

Once dst/hd0snap1 is reopened with dst/hd0base in place there will be
zeros in clusters where the guest wrote only a few sectors. We will
not "see" the backing file data in those clusters.

Have you hit this problem or did I miss something?

Stefan

Federico Simoncelli

2012-02-28 17:15:44 UTC

----- Original Message -----

Sent: Tuesday, February 28, 2012 4:47:48 PM
Subject: Re: [Qemu-devel] Live Block Migration using Mirroring
On Wed, Feb 22, 2012 at 5:13 PM, Federico Simoncelli

Post by Federico Simoncelli
Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and
dst/hd0snap1 as
image files. (Where "<-" stands for "has backing file")
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
... <- [dst/hd0snap1] <= VM1(write-only)
$ qemu-img create -f qcow2 \
-b /tmp/src/hd0base.qcow2 /tmp/src/hd0snap1.qcow2 20G
Formatting '/tmp/src/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off
cluster_size=65536
$ qemu-img create -f qcow2 \
-b /tmp/dst/hd0base.qcow2 /tmp/dst/hd0snap1.qcow2 20G
Formatting '/tmp/dst/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off
cluster_size=65536
(qemu) snapshot_blkdev -n ide0-hd0 \
blkmirror:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2
blkmirror
Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination
dst/hd0base.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base] <- [dst/hd0snap1] <= VM1(write-only)

At this stage we have dst/hd0snap1 opened with BDRV_O_NO_BACKING. If
it has no backing file and the guest issues a write request that is
smaller than a cluster in the image file, the untouched areas of that
cluster will be populated with zeroes.
Once dst/hd0snap1 is reopened with dst/hd0base in place there will be
zeros in clusters where the guest wrote only a few sectors. We will
not "see" the backing file data in those clusters.
Have you hit this problem or did I miss something?

Thank you for getting this. Being able to have a bogus backing file was
a bonus but it's not really required for the mirrored live block migration.
We can add the support for switching the backing file in the drive-reopen
part.
I'll remove the BDRV_O_NO_BACKING flag from the blkmirror patch.

--
Federico

Paolo Bonzini

2012-02-28 17:36:57 UTC

Post by Federico Simoncelli
Thank you for getting this. Being able to have a bogus backing file was
a bonus but it's not really required for the mirrored live block migration.
We can add the support for switching the backing file in the drive-reopen
part.

Wait, it's not really required for oVirt because it creates the snapshot
outside QEMU. What about everyone else?

Paolo

Federico Simoncelli

2012-02-28 17:46:55 UTC

----- Original Message -----

Sent: Tuesday, February 28, 2012 6:36:57 PM
Subject: Re: [Qemu-devel] Live Block Migration using Mirroring

Post by Federico Simoncelli
Thank you for getting this. Being able to have a bogus backing file was
a bonus but it's not really required for the mirrored live block migration.
We can add the support for switching the backing file in the
drive-reopen
part.

Wait, it's not really required for oVirt because it creates the snapshot
outside QEMU. What about everyone else?

They'll have (as oVirt) two mirrored snapshot pointing at the same
base. The only difference is that the image is created internally, but
that's not hard.

--
Federico

Paolo Bonzini

2012-02-28 18:02:40 UTC

Post by Federico Simoncelli

Post by Paolo Bonzini

Post by Federico Simoncelli
Thank you for getting this. Being able to have a bogus backing file
was a bonus but it's not really required for the mirrored live block
migration. We can add the support for switching the backing file in the
drive-reopen part.

Wait, it's not really required for oVirt because it creates the
snapshot outside QEMU. What about everyone else?

They'll have (as oVirt) two mirrored snapshot pointing at the same
base. The only difference is that the image is created internally, but
that's not hard.

Can you detail how you are switching the backing file in the
drive-reopen? Either the BlockDriverState opened by blkmirror, or the
one opened at the end, will have to use the "wrong" backing_file. How
do you arrange for that?

Paolo

Federico Simoncelli

2012-02-28 18:21:40 UTC

----- Original Message -----

Sent: Tuesday, February 28, 2012 7:02:40 PM
Subject: Re: [Qemu-devel] Live Block Migration using Mirroring

Post by Federico Simoncelli

Post by Paolo Bonzini

Post by Federico Simoncelli
Thank you for getting this. Being able to have a bogus backing file
was a bonus but it's not really required for the mirrored live block
migration. We can add the support for switching the backing file in the
drive-reopen part.

Wait, it's not really required for oVirt because it creates the
snapshot outside QEMU. What about everyone else?

They'll have (as oVirt) two mirrored snapshot pointing at the same
base. The only difference is that the image is created internally, but
that's not hard.

Can you detail how you are switching the backing file in the
drive-reopen? Either the BlockDriverState opened by blkmirror, or the
one opened at the end, will have to use the "wrong" backing_file.
How
do you arrange for that?

Step 1 - Initital Scenario
==========================
VM1 is running on the src/hd0base.

[src/hd0base] <= VM1(read-write)

Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and dst/hd0snap1 as image
files (both having src/hd0base as backing file).

[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
^-- [dst/hd0snap1] <= VM1(read-write)

Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination (dst/hd0base).

[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base]^-- [dst/hd0snap1] <= VM1(read-write)

Step 5 - Final Switch to Destination
====================================
VM1 is now able to switch to the destination for both read and write operations
fixing the backing file path in dst/hd0snap1.

[src/hd0base] <- [src/hd0snap1]
[dst/hd0base] <- [dst/hd0snap1] <= VM1(read-write)

--
Federico

Paolo Bonzini

2012-02-28 17:26:04 UTC

Post by Stefan Hajnoczi
At this stage we have dst/hd0snap1 opened with BDRV_O_NO_BACKING. If
it has no backing file and the guest issues a write request that is
smaller than a cluster in the image file, the untouched areas of that
cluster will be populated with zeroes.
Once dst/hd0snap1 is reopened with dst/hd0base in place there will be
zeros in clusters where the guest wrote only a few sectors. We will
not "see" the backing file data in those clusters.
Have you hit this problem or did I miss something?

I'm afraid not.

Federico, perhaps you have to rewrite blkmirror to reuse copy-on-read
mechanism. You can implement is_allocated and make the base image a
real backing_file even though you can write to it. Perhaps some hack
with BDRV_O_NO_BACKING, or perhaps it just works with Jeff's open-on-top
behavior.

Looks like it would also make streaming of the base image Just Work, at
least to a non-raw destination.

Remaining problems:

1) differentiating writes from copy-on-read and writes from the guest.
This is needed to avoid spurious writes to the backing image each time
you're doing a copy-on-read.

2) triggering a copy-on-read before writing partial clusters.

Stefan, what do you think?

Paolo

Federico Simoncelli

2012-02-29 12:28:21 UTC

Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <***@redhat.com>
Signed-off-by: Federico Simoncelli <***@redhat.com>
---
Makefile.objs | 2 +-
block/blkmirror.c | 255 ++++++++++++++++++++++++++++++++++++++++++++++++++++
cutils.c | 30 ++++++
docs/blkmirror.txt | 15 +++
qemu-common.h | 1 +
5 files changed, 302 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt

diff --git a/Makefile.objs b/Makefile.objs
index 808de6a..2302c96 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,7 +34,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..c98b162
--- /dev/null
+++ b/block/blkmirror.c
@@ -0,0 +1,255 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+ BlockDriverAIOCB *aiocb;
+ int finished;
+ DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+ BlockDriverAIOCB common;
+ int count;
+
+ BlockDriverCompletionFunc *cb;
+ SingleAIOCB aios[2];
+ int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:fmt1:path/to/image1:fmt2:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ int ret = 0, i;
+ char *tmpbuf, *tok[4], *next;
+ BlockDriver *drv1, *drv2;
+ BdrvMirrorState *m = bs->opaque;
+
+ m->bs[0] = bdrv_new("");
+ if (m->bs[0] == NULL) {
+ return -ENOMEM;
+ }
+
+ m->bs[1] = bdrv_new("");
+ if (m->bs[1] == NULL) {
+ bdrv_delete(m->bs[0]);
+ return -ENOMEM;
+ }
+
+ tmpbuf = g_malloc(strlen(filename) + 1);
+ pstrcpy(tmpbuf, strlen(filename) + 1, filename);
+
+ /* Parse the blkmirror: prefix */
+ if (!strstart(tmpbuf, "blkmirror:", (const char **) &next)) {
+ next = tmpbuf;
+ }
+
+ for (i = 0; i < 4; i++) {
+ if (!next) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tok[i] = pstrtok_r(NULL, ":", '\\', &next);
+ }
+
+ drv1 = bdrv_find_whitelisted_format(tok[0]);
+ drv2 = bdrv_find_whitelisted_format(tok[2]);
+
+ if (!drv1 || !drv2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = bdrv_open(m->bs[0], tok[1], flags, drv1);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = bdrv_open(m->bs[0], tok[3], flags, drv2);
+ if (ret < 0) {
+ goto out;
+ }
+
+ out:
+ g_free(tmpbuf);
+
+ if (ret < 0) {
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+}
+
+static coroutine_fn int blkmirror_co_flush(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_flush(m->bs[0]);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_flush(m->bs[1]);
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+
+ return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+ int i;
+
+ for (i = 0 ; i < 2; i++) {
+ if (!acb->aios[i].finished) {
+ bdrv_aio_cancel(acb->aios[i].aiocb);
+ }
+ }
+ qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+ .aiocb_size = sizeof(DupAIOCB),
+ .cancel = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+ SingleAIOCB *scb = opaque;
+ DupAIOCB *dcb = scb->parent;
+
+ scb->finished = 1;
+ dcb->count--;
+ assert(dcb->count >= 0);
+ if (ret < 0) {
+ dcb->ret = ret;
+ }
+ if (dcb->count == 0) {
+ dcb->common.cb(dcb->common.opaque, dcb->ret);
+ qemu_aio_release(dcb);
+ }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ DupAIOCB *dcb;
+ int i;
+
+ dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+ if (!dcb) {
+ return NULL;
+ }
+ dcb->count = 2;
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].parent = dcb;
+ dcb->aios[i].finished = 0;
+ }
+ dcb->ret = 0;
+
+ return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+ nb_sectors, &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ }
+
+ return &dcb->common;
+}
+
+static coroutine_fn int blkmirror_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_discard(m->bs[0], sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+ .format_name = "blkmirror",
+ .protocol_name = "blkmirror",
+ .instance_size = sizeof(BdrvMirrorState),
+
+ .bdrv_getlength = blkmirror_getlength,
+
+ .bdrv_file_open = blkmirror_open,
+ .bdrv_close = blkmirror_close,
+ .bdrv_co_flush_to_disk = blkmirror_co_flush,
+ .bdrv_co_discard = blkmirror_co_discard,
+
+ .bdrv_aio_readv = blkmirror_aio_readv,
+ .bdrv_aio_writev = blkmirror_aio_writev,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+ bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
diff --git a/cutils.c b/cutils.c
index af308cd..7bb298d 100644
--- a/cutils.c
+++ b/cutils.c
@@ -54,6 +54,36 @@ char *pstrcat(char *buf, int buf_size, const char *s)
return buf;
}

+/* strtok_r with escaping support */
+char *pstrtok_r(char *str, const char *delim, char esc, char **p)
+{
+ int n = 0, escape = 0;
+
+ if (str == NULL) {
+ str = *p;
+ }
+
+ for (*p = str; **p != '\0'; (*p)++) {
+ if (!escape && strchr(delim, **p)) {
+ str[n] = '\0', (*p)++;
+ return str;
+ }
+
+ if (!escape && **p == esc) {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ str[n++] = **p;
+ }
+ }
+
+ *p = NULL;
+ return str;
+}
+
int strstart(const char *str, const char *val, const char **ptr)
{
const char *p, *q;
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..c9967eb
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,15 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:fmt1:/image1.img:fmt2:/image2.img
+
+Backslash '\' can be used to escape colon processing as
+a separator. Backslashes themselves also can be escaped
+as '\\'.
+
diff --git a/qemu-common.h b/qemu-common.h
index c5e9cad..2c8c664 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -122,6 +122,7 @@ int qemu_timedate_diff(struct tm *tm);
/* cutils.c */
void pstrcpy(char *buf, int buf_size, const char *str);
char *pstrcat(char *buf, int buf_size, const char *s);
+char *pstrtok_r(char *str, const char *delim, char esc, char **p);
int strstart(const char *str, const char *val, const char **ptr);
int stristart(const char *str, const char *val, const char **ptr);
int qemu_strnlen(const char *s, int max_len);

--
1.7.1

Federico Simoncelli

2012-02-29 13:02:09 UTC

----- Original Message -----

Sent: Wednesday, February 29, 2012 1:28:21 PM
Subject: [PATCHv3] Add blkmirror block driver
Mirrored writes are used by live block copy.
---
Makefile.objs | 2 +-
block/blkmirror.c | 255
++++++++++++++++++++++++++++++++++++++++++++++++++++
cutils.c | 30 ++++++
docs/blkmirror.txt | 15 +++
qemu-common.h | 1 +
5 files changed, 302 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..c98b162
--- /dev/null
+++ b/block/blkmirror.c

[...]

+ if (!drv1 || !drv2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = bdrv_open(m->bs[0], tok[1], flags, drv1);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = bdrv_open(m->bs[0], tok[3], flags, drv2);
+ if (ret < 0) {
+ goto out;
+ }

There's a small mistake here, the second one is m->bs[1].

--
Federico

Federico Simoncelli

2012-02-29 17:01:14 UTC

Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <***@redhat.com>
Signed-off-by: Federico Simoncelli <***@redhat.com>
Signed-off-by: Paolo Bonzini <***@redhat.com>
---
Makefile.objs | 2 +-
block/blkmirror.c | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++++
cutils.c | 56 +++++++++++
docs/blkmirror.txt | 15 +++
qemu-common.h | 2 +
5 files changed, 352 insertions(+), 1 deletions(-)
create mode 100644 block/blkmirror.c
create mode 100644 docs/blkmirror.txt

diff --git a/Makefile.objs b/Makefile.objs
index 808de6a..2302c96 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,7 +34,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/blkmirror.c b/block/blkmirror.c
new file mode 100644
index 0000000..d894ca8
--- /dev/null
+++ b/block/blkmirror.c
@@ -0,0 +1,278 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+ BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+ BlockDriverAIOCB *aiocb;
+ int finished;
+ DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+ BlockDriverAIOCB common;
+ int count;
+
+ BlockDriverCompletionFunc *cb;
+ SingleAIOCB aios[2];
+ int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:fmt1:path/to/image1:fmt2:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+ int ret = 0, i;
+ char *tmpbuf, *tok[4], *next;
+ BlockDriver *drv1, *drv2;
+ BdrvMirrorState *m = bs->opaque;
+ BlockDriverState *bk;
+
+ m->bs[0] = bdrv_new("");
+ if (m->bs[0] == NULL) {
+ return -ENOMEM;
+ }
+
+ m->bs[1] = bdrv_new("");
+ if (m->bs[1] == NULL) {
+ bdrv_delete(m->bs[0]);
+ return -ENOMEM;
+ }
+
+ tmpbuf = g_malloc(strlen(filename) + 1);
+ pstrcpy(tmpbuf, strlen(filename) + 1, filename);
+
+ /* Parse the blkmirror: prefix */
+ if (!strstart(tmpbuf, "blkmirror:", (const char **) &next)) {
+ next = tmpbuf;
+ }
+
+ for (i = 0; i < 4; i++) {
+ if (!next) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tok[i] = estrtok_r(NULL, ":", '\\', &next);
+ }
+
+ drv1 = bdrv_find_whitelisted_format(tok[0]);
+ drv2 = bdrv_find_whitelisted_format(tok[2]);
+ if (!drv1 || !drv2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = bdrv_open(m->bs[0], tok[1], flags, drv1);
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* If we crash, we cannot assume that the destination is a
+ * valid mirror and we have to start over. So speed up things
+ * by effectively operating on the destination in cache=unsafe
+ * mode.
+ */
+ ret = bdrv_open(m->bs[1], tok[3], flags | BDRV_O_NO_BACKING
+ | BDRV_O_NO_FLUSH | BDRV_O_CACHE_WB, drv2);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (m->bs[0]->backing_hd) {
+ bk = m->bs[0]->backing_hd;
+
+ m->bs[1]->backing_hd = bdrv_new("");
+ if (!m->bs[1]->backing_hd) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* opening the same backing file of the source */
+ ret = bdrv_open(m->bs[1]->backing_hd,
+ bk->filename, bk->open_flags, bk->drv);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ out:
+ g_free(tmpbuf);
+
+ if (ret < 0) {
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ bdrv_delete(m->bs[i]);
+ m->bs[i] = NULL;
+ }
+}
+
+static coroutine_fn int blkmirror_co_flush(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_flush(m->bs[0]);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_flush(m->bs[1]);
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+ BdrvMirrorState *m = bs->opaque;
+
+ return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+ int i;
+
+ for (i = 0 ; i < 2; i++) {
+ if (!acb->aios[i].finished) {
+ bdrv_aio_cancel(acb->aios[i].aiocb);
+ }
+ }
+ qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+ .aiocb_size = sizeof(DupAIOCB),
+ .cancel = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+ SingleAIOCB *scb = opaque;
+ DupAIOCB *dcb = scb->parent;
+
+ scb->finished = 1;
+ dcb->count--;
+ assert(dcb->count >= 0);
+ if (ret < 0) {
+ dcb->ret = ret;
+ }
+ if (dcb->count == 0) {
+ dcb->common.cb(dcb->common.opaque, dcb->ret);
+ qemu_aio_release(dcb);
+ }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ DupAIOCB *dcb;
+ int i;
+
+ dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+ if (!dcb) {
+ return NULL;
+ }
+ dcb->count = 2;
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].parent = dcb;
+ dcb->aios[i].finished = 0;
+ }
+ dcb->ret = 0;
+
+ return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BdrvMirrorState *m = bs->opaque;
+ DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+ nb_sectors, &blkmirror_aio_cb,
+ &dcb->aios[i]);
+ }
+
+ return &dcb->common;
+}
+
+static coroutine_fn int blkmirror_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvMirrorState *m = bs->opaque;
+ int ret;
+
+ ret = bdrv_co_discard(m->bs[0], sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_co_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+ .format_name = "blkmirror",
+ .protocol_name = "blkmirror",
+ .instance_size = sizeof(BdrvMirrorState),
+
+ .bdrv_getlength = blkmirror_getlength,
+
+ .bdrv_file_open = blkmirror_open,
+ .bdrv_close = blkmirror_close,
+ .bdrv_co_flush_to_disk = blkmirror_co_flush,
+ .bdrv_co_discard = blkmirror_co_discard,
+
+ .bdrv_aio_readv = blkmirror_aio_readv,
+ .bdrv_aio_writev = blkmirror_aio_writev,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+ bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
diff --git a/cutils.c b/cutils.c
index af308cd..ae8ddfb 100644
--- a/cutils.c
+++ b/cutils.c
@@ -54,6 +54,62 @@ char *pstrcat(char *buf, int buf_size, const char *s)
return buf;
}

+/* strtok_r with escaping support */
+char *estrtok_r(char *str, const char *delim, char esc, char **p)
+{
+ int n = 0, escape = 0;
+
+ if (str == NULL) {
+ str = *p;
+ }
+
+ for (*p = str; **p != '\0'; (*p)++) {
+ if (!escape && strchr(delim, **p)) {
+ str[n] = '\0', (*p)++;
+ return str;
+ }
+
+ if (!escape && **p == esc) {
+ escape = 1;
+ } else {
+ escape = 0;
+ }
+
+ if (!escape) {
+ str[n++] = **p;
+ }
+ }
+
+ str[n] = '\0', *p = NULL;
+ return str;
+}
+
+/* strdup with escaping support */
+char *estrdup(const char *str, const char *delim, char esc)
+{
+ int i, j;
+ const char *p;
+ char *ret;
+
+ for (p = str, j = 0, i = 0; *p != '\0'; p++, i++) {
+ if (strchr(delim, *p) || *p == esc) {
+ j++;
+ }
+ }
+
+ ret = g_malloc(i + (j * 2) + 1);
+
+ for (p = str, i = 0; *p != '\0'; p++, i++) {
+ if (strchr(delim, *p) || *p == esc) {
+ ret[i++] = esc;
+ }
+ ret[i] = *p;
+ }
+
+ ret[i] = '\0';
+ return ret;
+}
+
int strstart(const char *str, const char *val, const char **ptr)
{
const char *p, *q;
diff --git a/docs/blkmirror.txt b/docs/blkmirror.txt
new file mode 100644
index 0000000..c9967eb
--- /dev/null
+++ b/docs/blkmirror.txt
@@ -0,0 +1,15 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:fmt1:/image1.img:fmt2:/image2.img
+
+Backslash '\' can be used to escape colon processing as
+a separator. Backslashes themselves also can be escaped
+as '\\'.
+
diff --git a/qemu-common.h b/qemu-common.h
index c5e9cad..af9621f 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -122,6 +122,8 @@ int qemu_timedate_diff(struct tm *tm);
/* cutils.c */
void pstrcpy(char *buf, int buf_size, const char *str);
char *pstrcat(char *buf, int buf_size, const char *s);
+char *estrtok_r(char *str, const char *delim, char esc, char **p);
+char *estrdup(const char *str, const char *delim, char esc);
int strstart(const char *str, const char *val, const char **ptr);
int stristart(const char *str, const char *val, const char **ptr);
int qemu_strnlen(const char *s, int max_len);

--
1.7.1

Marcelo Tosatti

2012-03-05 16:59:02 UTC

Post by Federico Simoncelli
Hi,
recently I've been working on live block migration combining the live
snapshots and the blkmirror patch sent by Marcelo Tosatti few months ago.
http://www.ovirt.org/wiki/Features/Design/StorageLiveMigration
The design assumes that the qemu process can reach both the source and
destination storages and no real VM migration between hosts is involved.
The principal problem that it tries to solve is moving a VM to a new
reachable storage (more space, faster) without temporarily disrupting its
services.
The following set of patches are implementing the required changes in
QEMU.

What is the motivation here? What is the limitation with image streaming
that this tries to solve?

Post by Federico Simoncelli
Here it is a quick example of the use case (for consistency with the
Preparation
===========
$ mkdir /tmp/{src/dst}
$ qemu-img create -f qcow2 /tmp/src/hd0base.qcow2 20G
Formatting '/tmp/src/hd0base.qcow2', fmt=qcow2 size=21474836480
encryption=off cluster_size=65536
Step 1 - Initital Scenario
==========================
VM1 is running on the src/hd0base. (Where "<=" stands for "uses")
[src/hd0base] <= VM1(read-write)
$ qemu-system-x86_64 -hda /tmp/src/hd0base.qcow2 -monitor stdio
QEMU 1.0.50 monitor - type 'help' for more information
(qemu)
Step 3 - Mirrored Live Snapshot
===============================
A mirrored live snapshot is issued using src/hd0snap1 and dst/hd0snap1 as
image files. (Where "<-" stands for "has backing file")
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
... <- [dst/hd0snap1] <= VM1(write-only)
$ qemu-img create -f qcow2 \
-b /tmp/src/hd0base.qcow2 /tmp/src/hd0snap1.qcow2 20G
Formatting '/tmp/src/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536
$ qemu-img create -f qcow2 \
-b /tmp/dst/hd0base.qcow2 /tmp/dst/hd0snap1.qcow2 20G
Formatting '/tmp/dst/hd0snap1.qcow2', fmt=qcow2 size=21474836480
backing_file='/tmp/src/hd0base.qcow2' encryption=off cluster_size=65536
(qemu) snapshot_blkdev -n ide0-hd0 \
blkmirror:/tmp/src/hd0snap1.qcow2:/tmp/dst/hd0snap1.qcow2 blkmirror
Step 4 - Backing File Copy
==========================
An external manager copies src/hd0base to the destination dst/hd0base.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
[dst/hd0base] <- [dst/hd0snap1] <= VM1(write-only)
$ cp -a /tmp/src/hd0base.qcow2 /tmp/dst/hd0base.qcow2
Step 5 - Final Switch to Destination
====================================
VM1 is now able to switch to the destination for both read and write
operations.
[src/hd0base] <- [src/hd0snap1] <= VM1(read-write)
(qemu) snapshot_blkdev -n ide0-hd0 /tmp/dst/hd0snap1.qcow2
--
Federico

Eric Blake

2012-03-05 17:20:36 UTC

Post by Marcelo Tosatti

Post by Federico Simoncelli
Hi,
recently I've been working on live block migration combining the live
snapshots and the blkmirror patch sent by Marcelo Tosatti few months ago.
http://www.ovirt.org/wiki/Features/Design/StorageLiveMigration
The design assumes that the qemu process can reach both the source and
destination storages and no real VM migration between hosts is involved.
The principal problem that it tries to solve is moving a VM to a new
reachable storage (more space, faster) without temporarily disrupting its
services.
The following set of patches are implementing the required changes in
QEMU.

What is the motivation here? What is the limitation with image streaming
that this tries to solve?

My understanding is that this solves the scenario of a storage failure
during the migration. The original post-copy approach has the flaw that
you are setting up a situation where qemu is operating on a qcow2 file
on one storage domain that is backed by a file on another storage
domain. After you start the migration process, but before it completes,
any failure in the migration is fatal to the domain: if the destination
storage domain fails, then you have lost all the delta changes made
since the migration started. And after the migration has completed, you
still have the problem that qemu is crossing storage domains - if the
source storage domain fails, then qemu's access to the backing file
renders the destination qcow2 worthless, so you cannot shut down the
source storage domain without also restarting the guest.

But a mirrored solution does not have these drawbacks - at all points
through the migration phase, you are guaranteed that _all_ data is
accessible from a single storage domain. If the destination storage
fails, you still have the source storage intact, and can restart the
migration process. Then, when the migration is complete, you tell qemu
to atomically switch storage domains, at which point the entire storage
is accessed from the destination domain, and you can safely shut down
the source storage domain while the guest continues to run..

--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Marcelo Tosatti

2012-03-05 17:44:31 UTC

Post by Eric Blake

Post by Marcelo Tosatti

Post by Federico Simoncelli
Hi,
recently I've been working on live block migration combining the live
snapshots and the blkmirror patch sent by Marcelo Tosatti few months ago.
http://www.ovirt.org/wiki/Features/Design/StorageLiveMigration
The design assumes that the qemu process can reach both the source and
destination storages and no real VM migration between hosts is involved.
The principal problem that it tries to solve is moving a VM to a new
reachable storage (more space, faster) without temporarily disrupting its
services.
The following set of patches are implementing the required changes in
QEMU.

What is the motivation here? What is the limitation with image streaming
that this tries to solve?

My understanding is that this solves the scenario of a storage failure
during the migration. The original post-copy approach has the flaw that
you are setting up a situation where qemu is operating on a qcow2 file
on one storage domain that is backed by a file on another storage
domain. After you start the migration process, but before it completes,
any failure in the migration is fatal to the domain: if the destination
storage domain fails, then you have lost all the delta changes made
since the migration started. And after the migration has completed, you
still have the problem that qemu is crossing storage domains - if the
source storage domain fails, then qemu's access to the backing file
renders the destination qcow2 worthless, so you cannot shut down the
source storage domain without also restarting the guest.
But a mirrored solution does not have these drawbacks - at all points
through the migration phase, you are guaranteed that _all_ data is
accessible from a single storage domain. If the destination storage
fails, you still have the source storage intact, and can restart the
migration process. Then, when the migration is complete, you tell qemu
to atomically switch storage domains, at which point the entire storage
is accessed from the destination domain, and you can safely shut down
the source storage domain while the guest continues to run..

OK, can't it be fixed by image streaming on top of a blkmirror device?
This would avoid a duplicate interface (such as no need to snapshot_blkdev
to change to final copy).

That is, start image streaming to a blkmirror device so that updates to
the new snapshot are replicated across target and destination domains.

Obviously then usage of blkmirror is only necessary when moving across
image domains.

Paolo Bonzini

2012-03-05 18:05:03 UTC

Post by Marcelo Tosatti
OK, can't it be fixed by image streaming on top of a blkmirror device?
This would avoid a duplicate interface (such as no need to snapshot_blkdev
to change to final copy).
That is, start image streaming to a blkmirror device so that updates to
the new snapshot are replicated across target and destination domains.

This works too, but if you don't have a base image, streaming will
"complete" both the source and destination images with zero clusters.
It's just a limitation of the current implementation, of course.

Paolo

94 Replies
1 View
Permalink to this page
Disable enhanced parsing

Thread Navigation

Federico Simoncelli 2012-02-22 17:13:32 UTC

Federico Simoncelli 2012-02-22 17:13:34 UTC

Paolo Bonzini 2012-02-23 07:18:28 UTC

Federico Simoncelli 2012-02-23 09:44:25 UTC

Paolo Bonzini 2012-02-23 09:45:51 UTC

Federico Simoncelli 2012-02-22 17:13:35 UTC

Paolo Bonzini 2012-02-23 07:19:41 UTC

Paolo Bonzini 2012-02-23 07:38:55 UTC

Federico Simoncelli 2012-02-23 09:39:04 UTC

Paolo Bonzini 2012-02-23 09:48:59 UTC

Federico Simoncelli 2012-02-23 10:19:08 UTC

Paolo Bonzini 2012-02-23 11:30:42 UTC

Federico Simoncelli 2012-02-22 17:13:33 UTC

Stefan Hajnoczi 2012-02-23 16:14:09 UTC

Stefan Hajnoczi 2012-02-23 16:18:41 UTC

Federico Simoncelli 2012-02-23 16:20:24 UTC

Stefan Hajnoczi 2012-02-23 16:28:23 UTC

Federico Simoncelli 2012-02-23 16:51:37 UTC

Federico Simoncelli 2012-02-23 16:18:54 UTC

Stefan Hajnoczi 2012-02-27 09:23:38 UTC

Paolo Bonzini 2012-02-27 11:37:34 UTC

Stefan Hajnoczi 2012-02-27 11:42:35 UTC

Paolo Bonzini 2012-02-27 11:48:41 UTC

Stefan Hajnoczi 2012-02-27 13:09:00 UTC

Paolo Bonzini 2012-02-27 13:47:03 UTC

Stefan Hajnoczi 2012-02-27 14:49:11 UTC

Stefan Hajnoczi 2012-02-27 14:59:55 UTC

Paolo Bonzini 2012-02-27 15:08:20 UTC

Stefan Hajnoczi 2012-02-23 15:47:38 UTC

Federico Simoncelli 2012-02-23 16:10:52 UTC

Stefan Hajnoczi 2012-02-23 16:35:23 UTC

Federico Simoncelli 2012-02-23 17:06:34 UTC

Federico Simoncelli 2012-02-24 11:37:38 UTC

Federico Simoncelli 2012-02-24 11:37:39 UTC

Kevin Wolf 2012-02-24 12:03:08 UTC

Federico Simoncelli 2012-02-24 12:12:27 UTC

Paolo Bonzini 2012-02-24 13:11:31 UTC

Luiz Capitulino 2012-02-24 17:04:25 UTC

Markus Armbruster 2012-02-27 14:57:23 UTC

Federico Simoncelli 2012-02-24 16:49:03 UTC

Eric Blake 2012-02-24 17:02:36 UTC

Federico Simoncelli 2012-02-24 17:15:25 UTC

Paolo Bonzini 2012-02-24 18:49:42 UTC

Luiz Capitulino 2012-02-24 18:17:22 UTC

Federico Simoncelli 2012-02-27 09:17:12 UTC

Federico Simoncelli 2012-02-24 16:49:04 UTC

Eric Blake 2012-02-24 17:46:26 UTC

Paolo Bonzini 2012-02-24 18:57:52 UTC

Eric Blake 2012-02-24 19:37:24 UTC

Luiz Capitulino 2012-02-24 19:01:43 UTC

Eric Blake 2012-02-24 19:40:17 UTC

Luiz Capitulino 2012-02-24 20:26:55 UTC

Eric Blake 2012-02-24 22:46:02 UTC

Paolo Bonzini 2012-02-24 20:32:30 UTC

Luiz Capitulino 2012-02-24 20:36:46 UTC

Paolo Bonzini 2012-02-24 21:05:29 UTC

Eric Blake 2012-02-24 22:30:28 UTC

Paolo Bonzini 2012-02-25 06:47:11 UTC

Federico Simoncelli 2012-02-27 11:29:39 UTC

Luiz Capitulino 2012-02-27 12:12:15 UTC

Paolo Bonzini 2012-02-27 12:49:17 UTC

Luiz Capitulino 2012-02-27 13:06:45 UTC

Paolo Bonzini 2012-02-27 14:39:33 UTC

Anthony Liguori 2012-02-27 14:46:57 UTC

Paolo Bonzini 2012-02-27 14:54:41 UTC

Anthony Liguori 2012-02-27 14:59:18 UTC

Paolo Bonzini 2012-02-27 15:03:48 UTC

Anthony Liguori 2012-02-27 15:06:15 UTC

Kevin Wolf 2012-02-27 15:17:07 UTC

Anthony Liguori 2012-02-27 15:24:52 UTC

Paolo Bonzini 2012-02-27 16:51:47 UTC

Anthony Liguori 2012-02-27 16:58:33 UTC

Paolo Bonzini 2012-02-27 17:06:58 UTC

Federico Simoncelli 2012-02-27 16:33:00 UTC

Paolo Bonzini 2012-02-27 16:41:35 UTC

Anthony Liguori 2012-02-27 16:42:31 UTC

Federico Simoncelli 2012-02-27 16:50:43 UTC

Anthony Liguori 2012-02-27 16:53:27 UTC

Paolo Bonzini 2012-02-27 16:54:25 UTC

Anthony Liguori 2012-02-27 16:59:37 UTC

Luiz Capitulino 2012-02-27 17:37:45 UTC

Stefan Hajnoczi 2012-02-28 15:47:48 UTC

Federico Simoncelli 2012-02-28 17:15:44 UTC

Paolo Bonzini 2012-02-28 17:36:57 UTC

Federico Simoncelli 2012-02-28 17:46:55 UTC

Paolo Bonzini 2012-02-28 18:02:40 UTC

Federico Simoncelli 2012-02-28 18:21:40 UTC

Paolo Bonzini 2012-02-28 17:26:04 UTC

Federico Simoncelli 2012-02-29 12:28:21 UTC

Federico Simoncelli 2012-02-29 13:02:09 UTC

Federico Simoncelli 2012-02-29 17:01:14 UTC

Marcelo Tosatti 2012-03-05 16:59:02 UTC

Eric Blake 2012-03-05 17:20:36 UTC

Marcelo Tosatti 2012-03-05 17:44:31 UTC

Paolo Bonzini 2012-03-05 18:05:03 UTC

about - legalese

Loading...