Qcow2: External snapshots

classic Classic list List threaded Threaded
12 messages Options
Reply | Threaded
Open this post in threaded view
|

Qcow2: External snapshots

Ori Bernstein
I've added support to vmd for external snapshots. That is,
snapshots that are derived from a base image. Data lookups
start in the derived image, and if the derived image does not
contain some data, the search proceeds to the base image.
Multiple derived images may exist off of a single base image.

A limitation of this format is that modifying the base image
will corrupt the derived image.

This change also adds support for creating derived disk
images to vmctl.  To use it:

        vmctl create derived.img -s 16G -b base.img -f qcow2

The main implementation change is that we now probe base
images before sending the disk FDs to the VM, which means that
we can actually open the images.

The base image paths may be relative. If they are relative,
they are interpreted relative to the location of the derived
image, and not relative to the directory where vmd happens to
be running.

For review, a bit of scrutiny could be directed to the
messaging.  It relies on imsg being in-order, which seems to
be the case, but isn't documented in the manpage -- If I can't
rely on that, the protocol needs to be tweaked.

After this change, we send imsgs to the same disk index
repeatedly, and each message adds another base to the stack of
images. So, for example, if I have 2 images that look
like this:

        disk0 -> base0 -> base1
        disk1

Then we send the following messages:

        VMDOP_START_VM_DISK (i=0, fd=open(disk0))
        VMDOP_START_VM_DISK (i=0, fd=open(base0))
        VMDOP_START_VM_DISK (i=0, fd=open(base1))

        VMDOP_START_VM_DISK (i=1, fd=open(disk1))

This also opens the door to ephemeral snapshots, which vmd can
implicitly create when it starts a vm, and remove
automatically on exit.

Testing has been the usual -- OpenBSD installs, a bit of catting,
and some random 'dd'. Heavier use and testing would be appreciated.

Diff below.


diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
index c2a5f42d5f6..1f8673e0e26 100644
--- regress/usr.sbin/vmd/diskfmt/Makefile
+++ regress/usr.sbin/vmd/diskfmt/Makefile
@@ -11,7 +11,7 @@
 VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
 
 PROG=vioscribble
-SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+SRCS=vioscribble.c vioqcow2.c vioraw.c
 CFLAGS+=-I$(VMD_DIR) -pthread
 LDFLAGS+=-pthread
 
@@ -26,3 +26,6 @@ scribble-images:
 .PHONY: ${REGRESS_TARGETS} scribble-images
 
 .include <bsd.regress.mk>
+
+vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+ cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
index 14d720db652..1da8efedac7 100644
--- regress/usr.sbin/vmd/diskfmt/vioscribble.c
+++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
@@ -122,16 +122,18 @@ main(int argc, char **argv)
  verbose = !!getenv("VERBOSE");
  qcfd = open("scribble.qc2", O_RDWR);
  rawfd = open("scribble.raw", O_RDWR);
- if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
+ if (qcfd == -1)
  err(1, "unable to open qcow");
- if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
+ if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
+ err(1, "unable to init qcow");
+ if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
  err(1, "unable to open raw");
 
  srandom_deterministic(123);
 
  /* scribble to both disks */
  printf("scribbling...\n");
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < 1024*16; i++) {
  off = (random() % DISKSZ);
  len = random() % sizeof buf + 1;
  fill(off, buf, sizeof buf);
diff --git sys/arch/amd64/include/vmmvar.h sys/arch/amd64/include/vmmvar.h
index 812e660f5f2..98cc498bf84 100644
--- sys/arch/amd64/include/vmmvar.h
+++ sys/arch/amd64/include/vmmvar.h
@@ -24,6 +24,7 @@
 #define VMM_HV_SIGNATURE "OpenBSDVMM58"
 
 #define VMM_MAX_MEM_RANGES 16
+#define VMM_MAX_BASE_PER_DISK 4
 #define VMM_MAX_DISKS_PER_VM 4
 #define VMM_MAX_PATH_DISK 128
 #define VMM_MAX_PATH_CDROM 128
diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
index 0f06a9ed1d2..d75a51b83c5 100644
--- usr.sbin/vmctl/main.c
+++ usr.sbin/vmctl/main.c
@@ -64,7 +64,7 @@ int ctl_receive(struct parse_result *, int, char *[]);
 struct ctl_command ctl_commands[] = {
  { "console", CMD_CONSOLE, ctl_console, "id" },
  { "create", CMD_CREATE, ctl_create,
- "\"path\" -s size [-f fmt]", 1 },
+ "\"path\" -s size [-f fmt] [-b base]", 1 },
  { "load", CMD_LOAD, ctl_load, "\"path\"" },
  { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
  { "reload", CMD_RELOAD, ctl_reload, "" },
@@ -504,11 +504,12 @@ int
 ctl_create(struct parse_result *res, int argc, char *argv[])
 {
  int ch, ret;
- const char *paths[2], *format;
+ const char *paths[2], *format, *base;
 
  if (argc < 2)
  ctl_usage(res->ctl);
 
+ base = NULL;
  paths[0] = argv[1];
  paths[1] = NULL;
  format = "raw";
@@ -521,7 +522,7 @@ ctl_create(struct parse_result *res, int argc, char *argv[])
  argc--;
  argv++;
 
- while ((ch = getopt(argc, argv, "s:f:")) != -1) {
+ while ((ch = getopt(argc, argv, "s:f:b:")) != -1) {
  switch (ch) {
  case 's':
  if (parse_size(res, optarg, 0) != 0)
@@ -530,6 +531,9 @@ ctl_create(struct parse_result *res, int argc, char *argv[])
  case 'f':
  format = optarg;
  break;
+ case 'b':
+ base = optarg;
+ break;
  default:
  ctl_usage(res->ctl);
  /* NOTREACHED */
@@ -537,13 +541,16 @@ ctl_create(struct parse_result *res, int argc, char *argv[])
  }
 
  if (res->size == 0) {
- fprintf(stderr, "missing size argument\n");
+ fprintf(stderr, "could not create %s: missing size argument\n",
+    paths[0]);
  ctl_usage(res->ctl);
  }
- if (strcmp(format, "raw") == 0)
+ if (strcmp(format, "raw") == 0) {
+ if (base)
+ errx(1, "raw images do not accept base argument");
  ret = create_raw_imagefile(paths[0], res->size);
- else if (strcmp(format, "qcow2") == 0)
- ret = create_qc2_imagefile(paths[0], res->size);
+ } else if (strcmp(format, "qcow2") == 0)
+ ret = create_qc2_imagefile(paths[0], base, res->size);
  else
  errx(1, "unknown image format %s", format);
  if (ret != 0) {
diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
index 4941cde3b07..c9d297d2e90 100644
--- usr.sbin/vmctl/vmctl.8
+++ usr.sbin/vmctl/vmctl.8
@@ -50,7 +50,7 @@ Using
 .Xr cu 1
 connect to the console of the VM with the specified
 .Ar id .
-.It Cm create Ar path Fl s Ar size Op Fl f Ar format
+.It Cm create Ar path Fl s Ar size Op Fl f Ar format Fl b Ar base
 Creates a VM disk image file with the specified
 .Ar path
 and
@@ -65,6 +65,10 @@ or
 defaulting to
 .Ar raw
 if left unspecified.
+For qcow2, a
+.Ar base
+image may be specified. The base image is not modified. The
+derived image contains only the changes written by the VM.
 .It Cm load Ar filename
 Load additional configuration from the specified file.
 .It Cm log brief
diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
index 355bd4d0900..6101bb310dd 100644
--- usr.sbin/vmctl/vmctl.c
+++ usr.sbin/vmctl/vmctl.c
@@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
 #define ALIGN(sz, align) \
  ((sz + align - 1) & ~(align - 1))
 int
-create_qc2_imagefile(const char *imgfile_path, long imgsize)
+create_qc2_imagefile(const char *imgfile_path,
+    const char *base_path, long imgsize)
 {
  struct qcheader {
  char magic[4];
@@ -871,6 +872,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  uint32_t headersz;
  } __packed hdr;
  int fd, ret;
+ ssize_t base_len;
  uint64_t l1sz, refsz, disksz, initsz, clustersz;
  uint64_t l1off, refoff, v, i;
  uint16_t refs;
@@ -888,11 +890,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  refsz = 1;
 
  initsz = ALIGN(refoff + refsz*clustersz, clustersz);
+ base_len = base_path ? strlen(base_path) : 0;
 
  memcpy(hdr.magic, "QFI\xfb", 4);
  hdr.version = htobe32(3);
- hdr.backingoff = htobe64(0);
- hdr.backingsz = htobe32(0);
+ hdr.backingoff = htobe64(base_path ? sizeof hdr : 0);
+ hdr.backingsz = htobe32(base_len);
  hdr.clustershift = htobe32(16);
  hdr.disksz = htobe64(disksz);
  hdr.cryptmethod = htobe32(0);
@@ -918,6 +921,10 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
  goto error;
 
+ /* Add the base image */
+ if (base_path && write(fd, base_path, base_len) != base_len)
+ goto error;
+
  /* Extend to desired size, and add one refcount cluster */
  if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
  goto error;
diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
index 27e094c26de..da96e926e36 100644
--- usr.sbin/vmctl/vmctl.h
+++ usr.sbin/vmctl/vmctl.h
@@ -87,7 +87,7 @@ __dead void
 
 /* vmctl.c */
 int create_raw_imagefile(const char *, long);
-int create_qc2_imagefile(const char *, long);
+int create_qc2_imagefile(const char *, const char*, long);
 int vm_start(uint32_t, const char *, int, int, char **, int,
     char **, int *, char *, char *, char *);
 int vm_start_complete(struct imsg *, int *, int);
diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
index 550b73c1a39..ba70e41f930 100644
--- usr.sbin/vmd/config.c
+++ usr.sbin/vmd/config.c
@@ -35,6 +35,7 @@
 #include <util.h>
 #include <errno.h>
 #include <imsg.h>
+#include <libgen.h>
 
 #include "proc.h"
 #include "vmd.h"
@@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
 int
 config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 {
+ int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
  struct vmd_if *vif;
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
- unsigned int i;
+ unsigned int i, j;
  int fd = -1, vmboot = 0;
- int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
+ int kernfd = -1;
+ int *tapfds;
  int cdromfd = -1;
  int saved_errno = 0;
+ int n = 0, flags;
  char ifname[IF_NAMESIZE], *s;
  char path[PATH_MAX];
+ char base[PATH_MAX];
+ char expanded[PATH_MAX];
  unsigned int unit;
 
  errno = 0;
@@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  }
  }
 
- diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
- if (diskfds == NULL) {
- log_warn("%s: can't allocate disk fds", __func__);
- goto fail;
- }
- for (i = 0; i < vcp->vcp_ndisks; i++)
- diskfds[i] = -1;
+ for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+ diskfds[i][j] = -1;
 
  tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
  if (tapfds == NULL) {
@@ -289,22 +291,55 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 
  /* Open disk images for child */
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
-                /* Stat disk[i] to ensure it is a regular file */
- if ((diskfds[i] = open(vcp->vcp_disks[i],
-    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
- log_warn("%s: can't open disk %s", __func__,
-    vcp->vcp_disks[i]);
- errno = VMD_DISK_MISSING;
- goto fail;
- }
+ if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX)
+ log_warnx("%s, disk path too long", __func__);
+ memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases);
+ flags = O_RDWR|O_EXLOCK|O_NONBLOCK;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ /* Stat disk[i] to ensure it is a regular file */
+ if ((diskfds[i][j] = open(path, flags)) == -1) {
+ log_warn("%s: can't open disk %s", __func__,
+    vcp->vcp_disks[i]);
+ errno = VMD_DISK_MISSING;
+ goto fail;
+ }
+
+ if (vm_checkaccess(diskfds[i][j],
+    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
+    uid, R_OK|W_OK) == -1) {
+ log_warnx("vm \"%s\" no read/write "
+    "access to disk %s", vcp->vcp_name,
+    vcp->vcp_disks[i]);
+ errno = EPERM;
+ goto fail;
+ }
+
+ /*
+ * Clear the read-write flag for base images.
+ * All writes should go to the top image.
+ */
+ flags = O_RDONLY|O_EXLOCK|O_NONBLOCK;
+ n = virtio_get_base(diskfds[i][j], base, sizeof base,
+    vmc->vmc_disktypes[i]);
+ if (n == -1)
+ log_warnx("vm \"%s\" unable to read"
+    "base for disk %s", vcp->vcp_name,
+    vcp->vcp_disks[i]);
+ if (n == 0)
+ break;
+ /*
+ * Relative paths should be interpreted relative
+ * to the disk image, rather than relative to the
+ * directory vmd happens to be running in, since
+ * this is the only userful interpretation.
+ */
+ if (base[0] != '/') {
+ s = dirname(path);
+ snprintf(expanded, sizeof expanded,
+    "%s/%s", s, base);
+ realpath(expanded, path);
+ }
 
- if (vm_checkaccess(diskfds[i],
-    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
-    uid, R_OK|W_OK) == -1) {
- log_warnx("vm \"%s\" no read/write access to disk %s",
-    vcp->vcp_name, vcp->vcp_disks[i]);
- errno = EPERM;
- goto fail;
  }
  }
 
@@ -402,9 +437,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
     NULL, 0);
 
  for (i = 0; i < vcp->vcp_ndisks; i++) {
- proc_compose_imsg(ps, PROC_VMM, -1,
-    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
-    &i, sizeof(i));
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ if (diskfds[i][j] == -1)
+ break;
+ proc_compose_imsg(ps, PROC_VMM, -1,
+    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
+    diskfds[i][j], &i, sizeof(i));
+ }
  }
  for (i = 0; i < vcp->vcp_nnics; i++) {
  proc_compose_imsg(ps, PROC_VMM, -1,
@@ -416,7 +455,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  proc_compose_imsg(ps, PROC_VMM, -1,
     IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
 
- free(diskfds);
  free(tapfds);
 
  vm->vm_running = 1;
@@ -430,11 +468,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  close(kernfd);
  if (cdromfd != -1)
  close(cdromfd);
- if (diskfds != NULL) {
- for (i = 0; i < vcp->vcp_ndisks; i++)
- close(diskfds[i]);
- free(diskfds);
- }
+ for (i = 0; i < vcp->vcp_ndisks; i++)
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+ if (diskfds[i][j] != -1)
+ close(diskfds[i][j]);
  if (tapfds != NULL) {
  for (i = 0; i < vcp->vcp_nnics; i++)
  close(tapfds[i]);
@@ -489,7 +526,7 @@ int
 config_getdisk(struct privsep *ps, struct imsg *imsg)
 {
  struct vmd_vm *vm;
- unsigned int n;
+ unsigned int n, idx;
 
  errno = 0;
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
@@ -500,14 +537,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
  IMSG_SIZE_CHECK(imsg, &n);
  memcpy(&n, imsg->data, sizeof(n));
 
- if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
-    vm->vm_disks[n] != -1 || imsg->fd == -1) {
+ if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
  log_warnx("invalid disk id");
  errno = EINVAL;
  return (-1);
  }
- vm->vm_disks[n] = imsg->fd;
-
+ idx = vm->vm_params.vmc_diskbases[n]++;
+ if (idx >= VMM_MAX_BASE_PER_DISK) {
+ log_warnx("too many bases for disk");
+ errno = EINVAL;
+ return (-1);
+ }
+ vm->vm_disks[n][idx] = imsg->fd;
  return (0);
 }
 
diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
index ada0bafc72d..e4ebe020813 100644
--- usr.sbin/vmd/vioqcow2.c
+++ usr.sbin/vmd/vioqcow2.c
@@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
 static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static int inc_refs(struct qcdisk *, off_t, int);
-static int qc2_openpath(struct qcdisk *, char *, int);
-static int qc2_open(struct qcdisk *, int);
+static int qc2_open(struct qcdisk *, int *, size_t);
 static ssize_t qc2_pread(void *, char *, size_t, off_t);
 static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
 static void qc2_close(void *, int);
@@ -118,14 +117,14 @@ static void qc2_close(void *, int);
  * May open snapshot base images.
  */
 int
-virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  struct qcdisk *diskp;
 
  diskp = malloc(sizeof(struct qcdisk));
  if (diskp == NULL)
  return -1;
- if (qc2_open(diskp, fd) == -1) {
+ if (qc2_open(diskp, fd, nfd) == -1) {
  log_warnx("%s: could not open qcow2 disk", __func__);
  free(diskp);
  return -1;
@@ -138,19 +137,41 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
  return 0;
 }
 
-static int
-qc2_openpath(struct qcdisk *disk, char *path, int flags)
+ssize_t
+virtio_qcow2_get_base(int fd, char *path, size_t npath)
 {
- int fd;
+ struct qcheader header;
+ uint64_t backingoff;
+ uint32_t backingsz;
 
- fd = open(path, flags);
- if (fd < 0)
+ if (pread(fd, &header, sizeof header, 0) != sizeof header) {
+ log_warn("%s: short read on header", __func__);
  return -1;
- return qc2_open(disk, fd);
+ }
+ if (strncmp(header.magic, "QFI\xfb", 4) != 0) {
+ log_warn("%s: invalid magic numbers", __func__);
+ return -1;
+ }
+ backingoff = be64toh(header.backingoff);
+ backingsz = be32toh(header.backingsz);
+ if (backingsz != 0) {
+ if (backingsz >= npath - 1) {
+ log_warn("%s: snapshot path too long", __func__);
+ return -1;
+ }
+ if (pread(fd, path, npath, backingoff) != backingsz) {
+ log_warn("%s: could not read snapshot base name",
+    __func__);
+ return -1;
+ }
+ path[backingsz] = 0;
+ log_warn("path: %s\n", path);
+ }
+ return backingsz;
 }
 
 static int
-qc2_open(struct qcdisk *disk, int fd)
+qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
 {
  char basepath[PATH_MAX];
  struct stat st;
@@ -158,9 +179,10 @@ qc2_open(struct qcdisk *disk, int fd)
  uint64_t backingoff;
  uint32_t backingsz;
  size_t i;
- int version;
+ int version, fd;
 
  pthread_rwlock_init(&disk->lock, NULL);
+ fd = fds[0];
  disk->fd = fd;
  disk->base = NULL;
  disk->l1 = NULL;
@@ -222,13 +244,6 @@ qc2_open(struct qcdisk *disk, int fd)
  backingoff = be64toh(header.backingoff);
  backingsz = be32toh(header.backingsz);
  if (backingsz != 0) {
- /*
- * FIXME: we need to figure out a way of opening these things,
- * otherwise we just crash with a pledge violation.
- */
- log_warn("%s: unsupported external snapshot images", __func__);
- goto error;
-
  if (backingsz >= sizeof basepath - 1) {
  log_warn("%s: snapshot path too long", __func__);
  goto error;
@@ -239,11 +254,16 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
  basepath[backingsz] = 0;
+ if (nfd <= 1) {
+ log_warnx("%s: missing base image %s", __func__, basepath);
+ goto error;
+ }
+
 
  disk->base = calloc(1, sizeof(struct qcdisk));
  if (!disk->base)
  goto error;
- if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
+ if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
  log_warn("%s: could not open %s", basepath, __func__);
  goto error;
  }
diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
index e02ab67c5dc..ff4bbb3095e 100644
--- usr.sbin/vmd/vioraw.c
+++ usr.sbin/vmd/vioraw.c
@@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
  * returning -1 for error, 0 for success.
  */
 int
-virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  off_t sz;
  int *fdp;
 
- sz = lseek(fd, 0, SEEK_END);
+ if (nfd != 1)
+ return -1;
+ sz = lseek(fd[0], 0, SEEK_END);
  if (sz == -1)
  return -1;
 
  fdp = malloc(sizeof(int));
  if (!fdp)
  return -1;
- *fdp = fd;
+ *fdp = fd[0];
  file->p = fdp;
  file->pread = raw_pread;
  file->pwrite = raw_pwrite;
diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index 93490344560..1c66f33216a 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
  return (0);
 }
 
+int
+virtio_get_base(int fd, char *path, size_t npath ,int type)
+{
+ switch (type) {
+ case VMDF_RAW: return 0;
+ case VMDF_QCOW2: return virtio_qcow2_get_base(fd, path, npath);
+ }
+ log_warnx("%s: invalid disk format", __func__);
+ return -1;
+}
+
+/*
+ * Initializes a struct virtio_backing using the list of fds.
+ */
 static int
-virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
+virtio_init_disk(struct virtio_backing *file, off_t *sz,
+    int *fd, size_t nfd, int type)
 {
  /*
  * probe disk types in order of preference, first one to work wins.
  * TODO: provide a way of specifying the type and options.
  */
  switch (type) {
- case VMDF_RAW: return virtio_init_raw(file, sz, fd);
- case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
+ case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
+ case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
  }
  log_warnx("%s: invalid disk format", __func__);
  return -1;
 }
 
 void
-virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_init(struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
@@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioblk[i].vm_id = vcp->vcp_id;
  vioblk[i].irq = pci_get_dev_irq(id);
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1) {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return;
@@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioscsi->vq[i].last_avail = 0;
  }
  if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
-    child_cdrom, VMDF_RAW) == -1) {
+    &child_cdrom, 1, VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format",
     __func__);
  return;
@@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
 }
 
 int
-vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
+vioblk_restore(int fd, struct vmop_create_params *vmc,
+    int child_disks[][VMM_MAX_BASE_PER_DISK])
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  uint8_t i;
@@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
  return (-1);
  }
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1)  {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return (-1);
@@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
  return (-1);
  }
 
- if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
+ if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
     VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format", __func__);
  return (-1);
@@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
 }
 
 int
-virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
index 46006916b6a..91f0e323204 100644
--- usr.sbin/vmd/virtio.h
+++ usr.sbin/vmd/virtio.h
@@ -257,10 +257,11 @@ struct ioinfo {
 };
 
 /* virtio.c */
-void virtio_init(struct vmd_vm *, int, int *, int *);
+void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK], int *);
 void virtio_shutdown(struct vmd_vm *);
 int virtio_dump(int);
-int virtio_restore(int, struct vmd_vm *, int, int *, int *);
+int virtio_restore(int, struct vmd_vm *, int,
+    int[][VMM_MAX_BASE_PER_DISK], int *);
 uint32_t vring_size(uint32_t);
 
 int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
@@ -270,12 +271,14 @@ void viornd_update_qs(void);
 void viornd_update_qa(void);
 int viornd_notifyq(void);
 
-int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
-int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
+ssize_t virtio_qcow2_get_base(int, char *, size_t);
+int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
+int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
 
 int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vioblk_dump(int);
-int vioblk_restore(int, struct vmop_create_params *, int *);
+int vioblk_restore(int, struct vmop_create_params *,
+    int[][VMM_MAX_BASE_PER_DISK]);
 void vioblk_update_qs(struct vioblk_dev *);
 void vioblk_update_qa(struct vioblk_dev *);
 int vioblk_notifyq(struct vioblk_dev *);
diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
index ef4494d918b..7c9aaf27012 100644
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,8 +65,8 @@
 
 io_fn_t ioports_map[MAX_PORTS];
 
-int run_vm(int, int *, int *, struct vmop_create_params *,
-    struct vcpu_reg_state *);
+int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *,
+    struct vmop_create_params *, struct vcpu_reg_state *);
 void vm_dispatch_vmm(int, short, void *);
 void *event_thread(void *);
 void *vcpu_run_loop(void *);
@@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
 void create_memory_map(struct vm_create_params *);
 int alloc_guest_mem(struct vm_create_params *);
 int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
-void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
+void init_emulated_hw(struct vmop_create_params *, int,
+    int[][VMM_MAX_BASE_PER_DISK], int *);
+void restore_emulated_hw(struct vm_create_params *, int, int *,
+    int[][VMM_MAX_BASE_PER_DISK],int);
 void vcpu_exit_inout(struct vm_run_params *);
 uint8_t vcpu_exit_pci(struct vm_run_params *);
 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
@@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd)
 
  /* Find and open kernel image */
  if ((fp = vmboot_open(vm->vm_kernel,
-    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
+    vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
  fatalx("failed to open kernel - exiting");
 
  /* Load kernel image */
@@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp)
  */
 void
 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
-    int *child_disks, int *child_taps)
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  int i;
@@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
  */
 void
 restore_emulated_hw(struct vm_create_params *vcp, int fd,
-    int *child_taps, int *child_disks, int child_cdrom)
+    int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int child_cdrom)
 {
  /* struct vm_create_params *vcp = &vmc->vmc_params; */
  int i;
@@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
  *  !0 : the VM exited abnormally or failed to start
  */
 int
-run_vm(int child_cdrom, int *child_disks, int *child_taps,
-    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
+run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK],
+    int *child_taps, struct vmop_create_params *vmc,
+    struct vcpu_reg_state *vrs)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  struct vm_rwregs_params vregsp;
diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
index 44ceeb64a7e..f23d8d6b25a 100644
--- usr.sbin/vmd/vmboot.c
+++ usr.sbin/vmd/vmboot.c
@@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
 
  switch (vmboot->vbp_type) {
  case VMDF_RAW:
- if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) {
  log_debug("%s: could not open raw disk", __func__);
  goto fail;
  }
  break;
  case VMDF_QCOW2:
- if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) {
  log_debug("%s: could not open qcow2 disk", __func__);
  goto fail;
  }
diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
index 890e9288dd1..133abe12722 100644
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -1080,7 +1080,7 @@ void
 vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
 {
  struct privsep *ps = &env->vmd_ps;
- unsigned int i;
+ unsigned int i, j;
 
  if (vm == NULL)
  return;
@@ -1100,9 +1100,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
  close(vm->vm_iev.ibuf.fd);
  }
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
- if (vm->vm_disks[i] != -1) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1) {
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  }
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
@@ -1159,7 +1161,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  struct vmop_owner *vmo = NULL;
  struct vmd_user *usr = NULL;
  uint32_t rng;
- unsigned int i;
+ unsigned int i, j;
  struct vmd_switch *sw;
  char *s;
 
@@ -1250,7 +1252,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  vm->vm_user = usr;
 
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+ vm->vm_disks[i][j] = -1;
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
  vm->vm_ifs[i].vif_fd = -1;
  for (i = 0; i < vcp->vcp_nnics; i++) {
diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
index 4cf0295d946..b803d44340c 100644
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -166,6 +166,7 @@ struct vmop_create_params {
 #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
 
  unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
+ unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
 #define VMDF_RAW 0x01
 #define VMDF_QCOW2 0x02
 
@@ -238,7 +239,7 @@ struct vmd_vm {
  uint32_t vm_vmid;
  int vm_kernel;
  int vm_cdrom;
- int vm_disks[VMM_MAX_DISKS_PER_VM];
+ int vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
  struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
  char *vm_ttyname;
  int vm_tty;
@@ -412,4 +413,7 @@ int parse_config(const char *);
 int cmdline_symset(char *);
 int host(const char *, struct address *);
 
+/* virtio.c */
+int virtio_get_base(int, char *, size_t, int);
+
 #endif /* VMD_H */
diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
index 7757856323f..3176fd85713 100644
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  struct vmd_vm *vm;
  int ret = EINVAL;
  int fds[2];
- size_t i;
+ size_t i, j;
 
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
  log_warnx("%s: can't find vm", __func__);
@@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  close(fds[1]);
 
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1)
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  for (i = 0 ; i < vcp->vcp_nnics; i++) {
  close(vm->vm_ifs[i].vif_fd);

--
    Ori Bernstein

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Ori Bernstein
On Sun, 30 Sep 2018 12:27:00 -0700, Ori Bernstein <[hidden email]> wrote:

> I've added support to vmd for external snapshots. That is,
> snapshots that are derived from a base image. Data lookups
> start in the derived image, and if the derived image does not
> contain some data, the search proceeds ot the base image.
> Multiple derived images may exist off of a single base image.

And, a few fixes (thanks to Anton Lindqvist for testing and pointing
out issues). Please apply on top of the previous diff.

diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
index 21aeb050371..ff9f79b87b2 100644
--- usr.sbin/vmd/config.c
+++ usr.sbin/vmd/config.c
@@ -314,26 +314,30 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  goto fail;
  }
 
- /*
- * Clear the read-write flag for base images.
+ /*
+ * Clear the read-write flag for base images.
  * All writes should go to the top image.
  */
  flags = O_RDONLY|O_EXLOCK|O_NONBLOCK;
  n = virtio_get_base(diskfds[i][j], base, sizeof base,
     vmc->vmc_disktypes[i]);
- if (n == -1)
- log_warnx("vm \"%s\" unable to read"
-    "base for disk %s", vcp->vcp_name,
-    vcp->vcp_disks[i]);
  if (n == 0)
  break;
- /*
+ if (n == -1) {
+ log_warnx("vm \"%s\" unable to read "
+    "base for disk %s", vcp->vcp_name,
+    vcp->vcp_disks[i]);
+ goto fail;
+ }
+ /*
  * Relative paths should be interpreted relative
  * to the disk image, rather than relative to the
  * directory vmd happens to be running in, since
  * this is the only userful interpretation.
  */
- if (base[0] != '/') {
+ if (base[0] == '/') {
+ realpath(base, path);
+ } else {
  s = dirname(path);
  snprintf(expanded, sizeof expanded,
     "%s/%s", s, base);

--
    Ori Bernstein

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Reyk Floeter-2
In reply to this post by Ori Bernstein
Hi Ori,

On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
> I've added support to vmd for external snapshots. That is,
> snapshots that are derived from a base image. Data lookups
> start in the derived image, and if the derived image does not
> contain some data, the search proceeds ot the base image.
> Multiple derived images may exist off of a single base image.
>

Nice work!  This will be quite useful, thanks.

I think I broke your diff as my last commit to derive the raw/qcow2
format introduced some conflicts.  I had posted it on hackers@ and
forgot that you aren't on the internal list yet - sorry for that.

> A limitation of this format is that modifying the base image
> will corrupt the derived image.
>
> This change also adds support for creating disk derived disk
> images to vmctl.  To use it:
>
> vmctl create derived.img -s 16G -b base.img -f qcow2
>

I removed -f fmt to be more consistent and the new syntax will be

        vmctl create qcow2:derived.img -s 16G -b base.img

or

        vmctl create derived.qcow2 -s 16G -b base.img

but we should be able to derive it from the base as well (there's no
base in raw images), so the following should work as well:

        vmctl create derived.img -s 16G -b base.img

> The main implementation change is that we now probe base
> images before sending the disk FDs to the VM, which means that
> we can actually open the images.
>
> The base image paths may be relative. If they are relative,
> they are interpreted relative to the location of the derived
> image, and not relative to the directory where vmd happens to
> be running.
>

OK, that needs some care + review.

> For review, a bit of scrutiny could be directed to the
> messaging.  It relies on imsg being in-order, which seems to
> be the case, but isn't documented in the manpage -- If I can't
> rely on that, the protocol needs to be tweaked.
>

imsgs are guaranteed to be in order as long as you don't mux them with
other messages from the same sender in an async way.

> After this change, we send imsgs to the same disk index
> repeatedly, and each message adds another base to the stack of
> images. So, for example, if I have 2images image that look
> like this:
>
> disk0 -> base0 -> base1
> disk1
>
> Then we send the following messages:
>
> VMDOP_START_VM_DISK (i=0, fd=open(disk0))
> VMDOP_START_VM_DISK (i=0, fd=open(base0))
> VMDOP_START_VM_DISK (i=0, fd=open(base1))
>
> VMDOP_START_VM_DISK (i=1, fd=open(disk1))
>

Makes sense.

> This also opens the door to ephemeral snapshots, which vmd can
> implicitly create when it starts a vm, and removes
> automatically on exit.
>

Please be extremely careful with the design here.  Unlike qemu, a vmd
VM is not able to create new files itself and it should never be able
to do it.  So when we create snapshots, we need to find a way that the
parent prepares the file, sends the fd, and asks the VM process to use
it.

> Testing has been the usual -- OpenBSD installs, a bit of catting,
> and some random 'dd'. Heavier use and testing would be appreciated.
>

I will test the updated diff that includes the second fix and the merge ;)

Initial comments inline below.

Reyk

>
>
> diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
> index c2a5f42d5f6..1f8673e0e26 100644
> --- regress/usr.sbin/vmd/diskfmt/Makefile
> +++ regress/usr.sbin/vmd/diskfmt/Makefile
> @@ -11,7 +11,7 @@
>  VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
>  
>  PROG=vioscribble
> -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> +SRCS=vioscribble.c vioqcow2.c vioraw.c
>  CFLAGS+=-I$(VMD_DIR) -pthread
>  LDFLAGS+=-pthread
>  
> @@ -26,3 +26,6 @@ scribble-images:
>  .PHONY: ${REGRESS_TARGETS} scribble-images
>  
>  .include <bsd.regress.mk>
> +
> +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
> diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
> index 14d720db652..1da8efedac7 100644
> --- regress/usr.sbin/vmd/diskfmt/vioscribble.c
> +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
> @@ -122,16 +122,18 @@ main(int argc, char **argv)
>   verbose = !!getenv("VERBOSE");
>   qcfd = open("scribble.qc2", O_RDWR);
>   rawfd = open("scribble.raw", O_RDWR);
> - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
> + if (qcfd == -1)
>   err(1, "unable to open qcow");
> - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
> + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
> + err(1, "unable to init qcow");
> + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
>   err(1, "unable to open raw");
>  
>   srandom_deterministic(123);
>  
>   /* scribble to both disks */
>   printf("scribbling...\n");
> - for (i = 0; i < 16; i++) {
> + for (i = 0; i < 1024*16; i++) {
>   off = (random() % DISKSZ);
>   len = random() % sizeof buf + 1;
>   fill(off, buf, sizeof buf);
> diff --git sys/arch/amd64/include/vmmvar.h sys/arch/amd64/include/vmmvar.h
> index 812e660f5f2..98cc498bf84 100644
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -24,6 +24,7 @@
>  #define VMM_HV_SIGNATURE "OpenBSDVMM58"
>  
>  #define VMM_MAX_MEM_RANGES 16
> +#define VMM_MAX_BASE_PER_DISK 4

Please don't add this to vmmvar.h.  This has nothing to do with the
kernel and should only be in vmd.h (e.g. as VM_MAX_BASE_PER_DISK).

We have some structs and defines that are shared between vmd(8) and
vmm(4), and some of them should probably be in userspace only but are
in vmmvar.h for historical reasons.  But it is extremely painful to
maintain when doing changes to the daemon.

>  #define VMM_MAX_DISKS_PER_VM 4
>  #define VMM_MAX_PATH_DISK 128
>  #define VMM_MAX_PATH_CDROM 128
> diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
> index 0f06a9ed1d2..d75a51b83c5 100644
> --- usr.sbin/vmctl/main.c
> +++ usr.sbin/vmctl/main.c

The following bits need changes as I removed -f fmt ...

> @@ -64,7 +64,7 @@ int ctl_receive(struct parse_result *, int, char *[]);
>  struct ctl_command ctl_commands[] = {
>   { "console", CMD_CONSOLE, ctl_console, "id" },
>   { "create", CMD_CREATE, ctl_create,
> - "\"path\" -s size [-f fmt]", 1 },
> + "\"path\" -s size [-f fmt] [-b base]", 1 },
>   { "load", CMD_LOAD, ctl_load, "\"path\"" },
>   { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
>   { "reload", CMD_RELOAD, ctl_reload, "" },
> @@ -504,11 +504,12 @@ int
>  ctl_create(struct parse_result *res, int argc, char *argv[])
>  {
>   int ch, ret;
> - const char *paths[2], *format;
> + const char *paths[2], *format, *base;
>  
>   if (argc < 2)
>   ctl_usage(res->ctl);
>  
> + base = NULL;
>   paths[0] = argv[1];
>   paths[1] = NULL;
>   format = "raw";
> @@ -521,7 +522,7 @@ ctl_create(struct parse_result *res, int argc, char *argv[])
>   argc--;
>   argv++;
>  
> - while ((ch = getopt(argc, argv, "s:f:")) != -1) {
> + while ((ch = getopt(argc, argv, "s:f:b:")) != -1) {
>   switch (ch) {
>   case 's':
>   if (parse_size(res, optarg, 0) != 0)
> @@ -530,6 +531,9 @@ ctl_create(struct parse_result *res, int argc, char *argv[])
>   case 'f':
>   format = optarg;
>   break;
> + case 'b':
> + base = optarg;
> + break;
>   default:
>   ctl_usage(res->ctl);
>   /* NOTREACHED */
> @@ -537,13 +541,16 @@ ctl_create(struct parse_result *res, int argc, char *argv[])
>   }
>  
>   if (res->size == 0) {
> - fprintf(stderr, "missing size argument\n");
> + fprintf(stderr, "could not create %s: missing size argument\n",
> +    paths[0]);
>   ctl_usage(res->ctl);
>   }
> - if (strcmp(format, "raw") == 0)
> + if (strcmp(format, "raw") == 0) {
> + if (base)
> + errx(1, "raw images do not accept base argument");
>   ret = create_raw_imagefile(paths[0], res->size);
> - else if (strcmp(format, "qcow2") == 0)
> - ret = create_qc2_imagefile(paths[0], res->size);
> + } else if (strcmp(format, "qcow2") == 0)
> + ret = create_qc2_imagefile(paths[0], base, res->size);
>   else
>   errx(1, "unknown image format %s", format);
>   if (ret != 0) {
> diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
> index 4941cde3b07..c9d297d2e90 100644
> --- usr.sbin/vmctl/vmctl.8
> +++ usr.sbin/vmctl/vmctl.8
> @@ -50,7 +50,7 @@ Using
>  .Xr cu 1
>  connect to the console of the VM with the specified
>  .Ar id .
> -.It Cm create Ar path Fl s Ar size Op Fl f Ar format
> +.It Cm create Ar path Fl s Ar size Op Fl f Ar format Fl b Ar base
>  Creates a VM disk image file with the specified
>  .Ar path
>  and
> @@ -65,6 +65,10 @@ or
>  defaulting to
>  .Ar raw
>  if left unspecified.
> +For qcow2, a
> +.Ar base
> +image may be specified. The base image is not modified. The
> +derived image contains only the changes written by the VM.
>  .It Cm load Ar filename
>  Load additional configuration from the specified file.
>  .It Cm log brief
> diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
> index 355bd4d0900..6101bb310dd 100644
> --- usr.sbin/vmctl/vmctl.c
> +++ usr.sbin/vmctl/vmctl.c
> @@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
>  #define ALIGN(sz, align) \
>   ((sz + align - 1) & ~(align - 1))
>  int
> -create_qc2_imagefile(const char *imgfile_path, long imgsize)
> +create_qc2_imagefile(const char *imgfile_path,
> +    const char *base_path, long imgsize)
>  {
>   struct qcheader {
>   char magic[4];
> @@ -871,6 +872,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   uint32_t headersz;
>   } __packed hdr;
>   int fd, ret;
> + ssize_t base_len;
>   uint64_t l1sz, refsz, disksz, initsz, clustersz;
>   uint64_t l1off, refoff, v, i;
>   uint16_t refs;
> @@ -888,11 +890,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   refsz = 1;
>  
>   initsz = ALIGN(refoff + refsz*clustersz, clustersz);
> + base_len = base_path ? strlen(base_path) : 0;
>  
>   memcpy(hdr.magic, "QFI\xfb", 4);

The magic bytes are now defined as VM_MAGIC_QCOW and can be used as:

        memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW))

>   hdr.version = htobe32(3);
> - hdr.backingoff = htobe64(0);
> - hdr.backingsz = htobe32(0);
> + hdr.backingoff = htobe64(base_path ? sizeof hdr : 0);

KNF is sizeof(hdr), with parentheses.  There are a few such sizeof without
parentheses below, but I'm not commenting on all of them.

> + hdr.backingsz = htobe32(base_len);
>   hdr.clustershift = htobe32(16);
>   hdr.disksz = htobe64(disksz);
>   hdr.cryptmethod = htobe32(0);
> @@ -918,6 +921,10 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
>   goto error;
>  
> + /* Add the base image */
> + if (base_path && write(fd, base_path, base_len) != base_len)
> + goto error;
> +
>   /* Extend to desired size, and add one refcount cluster */
>   if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
>   goto error;
> diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
> index 27e094c26de..da96e926e36 100644
> --- usr.sbin/vmctl/vmctl.h
> +++ usr.sbin/vmctl/vmctl.h
> @@ -87,7 +87,7 @@ __dead void
>  
>  /* vmctl.c */
>  int create_raw_imagefile(const char *, long);
> -int create_qc2_imagefile(const char *, long);
> +int create_qc2_imagefile(const char *, const char*, long);
>  int vm_start(uint32_t, const char *, int, int, char **, int,
>      char **, int *, char *, char *, char *);
>  int vm_start_complete(struct imsg *, int *, int);
> diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
> index 550b73c1a39..ba70e41f930 100644
> --- usr.sbin/vmd/config.c
> +++ usr.sbin/vmd/config.c
> @@ -35,6 +35,7 @@
>  #include <util.h>
>  #include <errno.h>
>  #include <imsg.h>
> +#include <libgen.h>
>  
>  #include "proc.h"
>  #include "vmd.h"
> @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
>  int
>  config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  {
> + int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
>   struct vmd_if *vif;
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> - unsigned int i;
> + unsigned int i, j;
>   int fd = -1, vmboot = 0;
> - int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
> + int kernfd = -1;
> + int *tapfds;
>   int cdromfd = -1;
>   int saved_errno = 0;
> + int n = 0, flags;
>   char ifname[IF_NAMESIZE], *s;
>   char path[PATH_MAX];
> + char base[PATH_MAX];
> + char expanded[PATH_MAX];
>   unsigned int unit;
>  
>   errno = 0;
> @@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   }
>   }
>  
> - diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
> - if (diskfds == NULL) {
> - log_warn("%s: can't allocate disk fds", __func__);
> - goto fail;
> - }
> - for (i = 0; i < vcp->vcp_ndisks; i++)
> - diskfds[i] = -1;
> + for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
> + diskfds[i][j] = -1;
>  
>   tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
>   if (tapfds == NULL) {
> @@ -289,22 +291,55 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  
>   /* Open disk images for child */
>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
> -                /* Stat disk[i] to ensure it is a regular file */
> - if ((diskfds[i] = open(vcp->vcp_disks[i],
> -    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
> - log_warn("%s: can't open disk %s", __func__,
> -    vcp->vcp_disks[i]);
> - errno = VMD_DISK_MISSING;
> - goto fail;
> - }
> + if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX)
> + log_warnx("%s, disk path too long", __func__);
> + memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases);
> + flags = O_RDWR|O_EXLOCK|O_NONBLOCK;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + /* Stat disk[i] to ensure it is a regular file */
> + if ((diskfds[i][j] = open(path, flags)) == -1) {
> + log_warn("%s: can't open disk %s", __func__,
> +    vcp->vcp_disks[i]);
> + errno = VMD_DISK_MISSING;
> + goto fail;
> + }
> +
> + if (vm_checkaccess(diskfds[i][j],
> +    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
> +    uid, R_OK|W_OK) == -1) {

See below...

> + log_warnx("vm \"%s\" no read/write "
> +    "access to disk %s", vcp->vcp_name,
> +    vcp->vcp_disks[i]);
> + errno = EPERM;
> + goto fail;
> + }
> +
> + /*
> + * Clear the read-write flag for base images.
> + * All writes should go to the top image.
> + */
> + flags = O_RDONLY|O_EXLOCK|O_NONBLOCK;

...this should also clear the W_OK flag when checking disk access.  A
base doesn't have to be writable for the user.

It is good that you don't attempt to open it writable so the VM
will never be able to change a base image!  I actually plan to use
bases with VM templates to allow users running VMs using pre-defined
and system-wide base images.

> + n = virtio_get_base(diskfds[i][j], base, sizeof base,
> +    vmc->vmc_disktypes[i]);
> + if (n == -1)
> + log_warnx("vm \"%s\" unable to read"
> +    "base for disk %s", vcp->vcp_name,
> +    vcp->vcp_disks[i]);
> + if (n == 0)
> + break;
> + /*
> + * Relative paths should be interpreted relative
> + * to the disk image, rather than relative to the
> + * directory vmd happens to be running in, since
> + * this is the only userful interpretation.
> + */

OK, I see that this was fixed in your 2nd diff but please send the
complete thing for diff updates in the future as it is hard to comment
on incremental diffs.

> + if (base[0] != '/') {
> + s = dirname(path);
> + snprintf(expanded, sizeof expanded,
> +    "%s/%s", s, base);
> + realpath(expanded, path);
> + }
>  
> - if (vm_checkaccess(diskfds[i],
> -    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
> -    uid, R_OK|W_OK) == -1) {
> - log_warnx("vm \"%s\" no read/write access to disk %s",
> -    vcp->vcp_name, vcp->vcp_disks[i]);
> - errno = EPERM;
> - goto fail;
>   }
>   }
>  
> @@ -402,9 +437,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>      NULL, 0);
>  
>   for (i = 0; i < vcp->vcp_ndisks; i++) {
> - proc_compose_imsg(ps, PROC_VMM, -1,
> -    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
> -    &i, sizeof(i));
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + if (diskfds[i][j] == -1)
> + break;
> + proc_compose_imsg(ps, PROC_VMM, -1,
> +    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
> +    diskfds[i][j], &i, sizeof(i));
> + }
>   }
>   for (i = 0; i < vcp->vcp_nnics; i++) {
>   proc_compose_imsg(ps, PROC_VMM, -1,
> @@ -416,7 +455,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   proc_compose_imsg(ps, PROC_VMM, -1,
>      IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
>  
> - free(diskfds);
>   free(tapfds);
>  
>   vm->vm_running = 1;
> @@ -430,11 +468,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   close(kernfd);
>   if (cdromfd != -1)
>   close(cdromfd);
> - if (diskfds != NULL) {
> - for (i = 0; i < vcp->vcp_ndisks; i++)
> - close(diskfds[i]);
> - free(diskfds);
> - }
> + for (i = 0; i < vcp->vcp_ndisks; i++)
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
> + if (diskfds[i][j] != -1)
> + close(diskfds[i][j]);
>   if (tapfds != NULL) {
>   for (i = 0; i < vcp->vcp_nnics; i++)
>   close(tapfds[i]);
> @@ -489,7 +526,7 @@ int
>  config_getdisk(struct privsep *ps, struct imsg *imsg)
>  {
>   struct vmd_vm *vm;
> - unsigned int n;
> + unsigned int n, idx;
>  
>   errno = 0;
>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
> @@ -500,14 +537,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
>   IMSG_SIZE_CHECK(imsg, &n);
>   memcpy(&n, imsg->data, sizeof(n));
>  
> - if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
> -    vm->vm_disks[n] != -1 || imsg->fd == -1) {
> + if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {

For the bases you are removing the check if a disk has already been sent...

>   log_warnx("invalid disk id");
>   errno = EINVAL;
>   return (-1);
>   }
> - vm->vm_disks[n] = imsg->fd;
> -
> + idx = vm->vm_params.vmc_diskbases[n]++;
> + if (idx >= VMM_MAX_BASE_PER_DISK) {
> + log_warnx("too many bases for disk");
> + errno = EINVAL;
> + return (-1);
> + }
> + vm->vm_disks[n][idx] = imsg->fd;

...but you should keep the paranoia and check it around here
(fail if vm->vm_disks[n][idx] != -1).

>   return (0);
>  }
>  
> diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
> index ada0bafc72d..e4ebe020813 100644
> --- usr.sbin/vmd/vioqcow2.c
> +++ usr.sbin/vmd/vioqcow2.c
> @@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
>  static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
>  static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
>  static int inc_refs(struct qcdisk *, off_t, int);
> -static int qc2_openpath(struct qcdisk *, char *, int);
> -static int qc2_open(struct qcdisk *, int);
> +static int qc2_open(struct qcdisk *, int *, size_t);
>  static ssize_t qc2_pread(void *, char *, size_t, off_t);
>  static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
>  static void qc2_close(void *, int);
> @@ -118,14 +117,14 @@ static void qc2_close(void *, int);
>   * May open snapshot base images.
>   */
>  int
> -virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
> +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
>  {
>   struct qcdisk *diskp;
>  
>   diskp = malloc(sizeof(struct qcdisk));
>   if (diskp == NULL)
>   return -1;
> - if (qc2_open(diskp, fd) == -1) {
> + if (qc2_open(diskp, fd, nfd) == -1) {
>   log_warnx("%s: could not open qcow2 disk", __func__);
>   free(diskp);
>   return -1;
> @@ -138,19 +137,41 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
>   return 0;
>  }
>  
> -static int
> -qc2_openpath(struct qcdisk *disk, char *path, int flags)
> +ssize_t
> +virtio_qcow2_get_base(int fd, char *path, size_t npath)
>  {
> - int fd;
> + struct qcheader header;
> + uint64_t backingoff;
> + uint32_t backingsz;
>  
> - fd = open(path, flags);
> - if (fd < 0)
> + if (pread(fd, &header, sizeof header, 0) != sizeof header) {
> + log_warn("%s: short read on header", __func__);
>   return -1;
> - return qc2_open(disk, fd);
> + }
> + if (strncmp(header.magic, "QFI\xfb", 4) != 0) {

See my notes on the magic bytes above.

> + log_warn("%s: invalid magic numbers", __func__);
> + return -1;
> + }
> + backingoff = be64toh(header.backingoff);
> + backingsz = be32toh(header.backingsz);
> + if (backingsz != 0) {
> + if (backingsz >= npath - 1) {
> + log_warn("%s: snapshot path too long", __func__);
> + return -1;
> + }
> + if (pread(fd, path, npath, backingoff) != backingsz) {
> + log_warn("%s: could not read snapshot base name",
> +    __func__);
> + return -1;
> + }
> + path[backingsz] = 0;
> + log_warn("path: %s\n", path);
> + }
> + return backingsz;
>  }
>  
>  static int
> -qc2_open(struct qcdisk *disk, int fd)
> +qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
>  {
>   char basepath[PATH_MAX];
>   struct stat st;
> @@ -158,9 +179,10 @@ qc2_open(struct qcdisk *disk, int fd)
>   uint64_t backingoff;
>   uint32_t backingsz;
>   size_t i;
> - int version;
> + int version, fd;
>  
>   pthread_rwlock_init(&disk->lock, NULL);
> + fd = fds[0];
>   disk->fd = fd;
>   disk->base = NULL;
>   disk->l1 = NULL;
> @@ -222,13 +244,6 @@ qc2_open(struct qcdisk *disk, int fd)
>   backingoff = be64toh(header.backingoff);
>   backingsz = be32toh(header.backingsz);
>   if (backingsz != 0) {
> - /*
> - * FIXME: we need to figure out a way of opening these things,
> - * otherwise we just crash with a pledge violation.
> - */
> - log_warn("%s: unsupported external snapshot images", __func__);
> - goto error;
> -
>   if (backingsz >= sizeof basepath - 1) {
>   log_warn("%s: snapshot path too long", __func__);
>   goto error;
> @@ -239,11 +254,16 @@ qc2_open(struct qcdisk *disk, int fd)
>   goto error;
>   }
>   basepath[backingsz] = 0;
> + if (nfd <= 1) {
> + log_warnx("%s: missing base image %s", __func__, basepath);
> + goto error;
> + }
> +
>  
>   disk->base = calloc(1, sizeof(struct qcdisk));
>   if (!disk->base)
>   goto error;
> - if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
> + if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
>   log_warn("%s: could not open %s", basepath, __func__);
>   goto error;
>   }
> diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
> index e02ab67c5dc..ff4bbb3095e 100644
> --- usr.sbin/vmd/vioraw.c
> +++ usr.sbin/vmd/vioraw.c
> @@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
>   * returning -1 for error, 0 for success.
>   */
>  int
> -virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
> +virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
>  {
>   off_t sz;
>   int *fdp;
>  
> - sz = lseek(fd, 0, SEEK_END);
> + if (nfd != 1)
> + return -1;
> + sz = lseek(fd[0], 0, SEEK_END);
>   if (sz == -1)
>   return -1;
>  
>   fdp = malloc(sizeof(int));
>   if (!fdp)
>   return -1;
> - *fdp = fd;
> + *fdp = fd[0];
>   file->p = fdp;
>   file->pread = raw_pread;
>   file->pwrite = raw_pwrite;
> diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
> index 93490344560..1c66f33216a 100644
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
>   return (0);
>  }
>  
> +int
> +virtio_get_base(int fd, char *path, size_t npath ,int type)
> +{
> + switch (type) {
> + case VMDF_RAW: return 0;
> + case VMDF_QCOW2: return virtio_qcow2_get_base(fd, path, npath);

There is no explicit note about this in style(9) (only examples) but I
can't remember that I've seen this style in our C code.  We add a
newline after cases, followed by a 1 tab indent:

        case VMDF_RAW:
                return 0;
        case VMDF_QCOW2:
                return virtio_qcow2_get_base(fd, path, npath);

> + }
> + log_warnx("%s: invalid disk format", __func__);
> + return -1;
> +}
> +
> +/*
> + * Initializes a struct virtio_backing using the list of fds.
> + */
>  static int
> -virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
> +virtio_init_disk(struct virtio_backing *file, off_t *sz,
> +    int *fd, size_t nfd, int type)
>  {
>   /*
>   * probe disk types in order of preference, first one to work wins.
>   * TODO: provide a way of specifying the type and options.
>   */
>   switch (type) {
> - case VMDF_RAW: return virtio_init_raw(file, sz, fd);
> - case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
> + case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
> + case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);

Same newline+tab here.

>   }
>   log_warnx("%s: invalid disk format", __func__);
>   return -1;
>  }
>  
>  void
> -virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
> -    int *child_taps)
> +virtio_init(struct vmd_vm *vm, int child_cdrom,
> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> @@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>   vioblk[i].vm_id = vcp->vcp_id;
>   vioblk[i].irq = pci_get_dev_irq(id);
>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
> -    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
> +    child_disks[i], vmc->vmc_diskbases[i],
> +    vmc->vmc_disktypes[i]) == -1) {
>   log_warnx("%s: unable to determine disk format",
>      __func__);
>   return;
> @@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>   vioscsi->vq[i].last_avail = 0;
>   }
>   if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
> -    child_cdrom, VMDF_RAW) == -1) {
> +    &child_cdrom, 1, VMDF_RAW) == -1) {
>   log_warnx("%s: unable to determine iso format",
>      __func__);
>   return;
> @@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
>  }
>  
>  int
> -vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
> +vioblk_restore(int fd, struct vmop_create_params *vmc,
> +    int child_disks[][VMM_MAX_BASE_PER_DISK])
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   uint8_t i;
> @@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
>   return (-1);
>   }
>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
> -    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
> +    child_disks[i], vmc->vmc_diskbases[i],
> +    vmc->vmc_disktypes[i]) == -1)  {
>   log_warnx("%s: unable to determine disk format",
>      __func__);
>   return (-1);
> @@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
>   return (-1);
>   }
>  
> - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
> + if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
>      VMDF_RAW) == -1) {
>   log_warnx("%s: unable to determine iso format", __func__);
>   return (-1);
> @@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
>  }
>  
>  int
> -virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
> -    int *child_taps)
> +virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
> index 46006916b6a..91f0e323204 100644
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -257,10 +257,11 @@ struct ioinfo {
>  };
>  
>  /* virtio.c */
> -void virtio_init(struct vmd_vm *, int, int *, int *);
> +void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK], int *);
>  void virtio_shutdown(struct vmd_vm *);
>  int virtio_dump(int);
> -int virtio_restore(int, struct vmd_vm *, int, int *, int *);
> +int virtio_restore(int, struct vmd_vm *, int,
> +    int[][VMM_MAX_BASE_PER_DISK], int *);
>  uint32_t vring_size(uint32_t);
>  
>  int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
> @@ -270,12 +271,14 @@ void viornd_update_qs(void);
>  void viornd_update_qa(void);
>  int viornd_notifyq(void);
>  
> -int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
> -int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
> +ssize_t virtio_qcow2_get_base(int, char *, size_t);
> +int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
> +int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
>  
>  int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
>  int vioblk_dump(int);
> -int vioblk_restore(int, struct vmop_create_params *, int *);
> +int vioblk_restore(int, struct vmop_create_params *,
> +    int[][VMM_MAX_BASE_PER_DISK]);
>  void vioblk_update_qs(struct vioblk_dev *);
>  void vioblk_update_qa(struct vioblk_dev *);
>  int vioblk_notifyq(struct vioblk_dev *);
> diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
> index ef4494d918b..7c9aaf27012 100644
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -65,8 +65,8 @@
>  
>  io_fn_t ioports_map[MAX_PORTS];
>  
> -int run_vm(int, int *, int *, struct vmop_create_params *,
> -    struct vcpu_reg_state *);
> +int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *,
> +    struct vmop_create_params *, struct vcpu_reg_state *);
>  void vm_dispatch_vmm(int, short, void *);
>  void *event_thread(void *);
>  void *vcpu_run_loop(void *);
> @@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
>  void create_memory_map(struct vm_create_params *);
>  int alloc_guest_mem(struct vm_create_params *);
>  int vmm_create_vm(struct vm_create_params *);
> -void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
> -void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
> +void init_emulated_hw(struct vmop_create_params *, int,
> +    int[][VMM_MAX_BASE_PER_DISK], int *);
> +void restore_emulated_hw(struct vm_create_params *, int, int *,
> +    int[][VMM_MAX_BASE_PER_DISK],int);
>  void vcpu_exit_inout(struct vm_run_params *);
>  uint8_t vcpu_exit_pci(struct vm_run_params *);
>  int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
> @@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd)
>  
>   /* Find and open kernel image */
>   if ((fp = vmboot_open(vm->vm_kernel,
> -    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
> +    vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == NULL)

That doesn't look right - does this open the kernel from the base
image?  What if a user updates the kernel in the working image?

>   fatalx("failed to open kernel - exiting");
>  
>   /* Load kernel image */
> @@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp)
>   */
>  void
>  init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
> -    int *child_disks, int *child_taps)
> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   int i;
> @@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
>   */
>  void
>  restore_emulated_hw(struct vm_create_params *vcp, int fd,
> -    int *child_taps, int *child_disks, int child_cdrom)
> +    int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int child_cdrom)
>  {
>   /* struct vm_create_params *vcp = &vmc->vmc_params; */
>   int i;
> @@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
>   *  !0 : the VM exited abnormally or failed to start
>   */
>  int
> -run_vm(int child_cdrom, int *child_disks, int *child_taps,
> -    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
> +run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK],
> +    int *child_taps, struct vmop_create_params *vmc,
> +    struct vcpu_reg_state *vrs)
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   struct vm_rwregs_params vregsp;
> diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
> index 44ceeb64a7e..f23d8d6b25a 100644
> --- usr.sbin/vmd/vmboot.c
> +++ usr.sbin/vmd/vmboot.c
> @@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
>  
>   switch (vmboot->vbp_type) {
>   case VMDF_RAW:
> - if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
> + if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) {
>   log_debug("%s: could not open raw disk", __func__);
>   goto fail;
>   }
>   break;
>   case VMDF_QCOW2:
> - if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
> + if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) {

Same problem here.

>   log_debug("%s: could not open qcow2 disk", __func__);
>   goto fail;
>   }
> diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
> index 890e9288dd1..133abe12722 100644
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -1080,7 +1080,7 @@ void
>  vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>  {
>   struct privsep *ps = &env->vmd_ps;
> - unsigned int i;
> + unsigned int i, j;
>  
>   if (vm == NULL)
>   return;
> @@ -1100,9 +1100,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>   close(vm->vm_iev.ibuf.fd);
>   }
>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
> - if (vm->vm_disks[i] != -1) {
> - close(vm->vm_disks[i]);
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + if (vm->vm_disks[i][j] != -1) {
> + close(vm->vm_disks[i][j]);
> + vm->vm_disks[i][j] = -1;
> + }
>   }
>   }
>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
> @@ -1159,7 +1161,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
>   struct vmop_owner *vmo = NULL;
>   struct vmd_user *usr = NULL;
>   uint32_t rng;
> - unsigned int i;
> + unsigned int i, j;
>   struct vmd_switch *sw;
>   char *s;
>  
> @@ -1250,7 +1252,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
>   vm->vm_user = usr;
>  
>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
> + vm->vm_disks[i][j] = -1;
>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
>   vm->vm_ifs[i].vif_fd = -1;
>   for (i = 0; i < vcp->vcp_nnics; i++) {
> diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
> index 4cf0295d946..b803d44340c 100644
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -166,6 +166,7 @@ struct vmop_create_params {
>  #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
>  
>   unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
> + unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
>  #define VMDF_RAW 0x01
>  #define VMDF_QCOW2 0x02

- These defines should go up after vmc_disktypes.
- vmc_disktypes should be uint8_t unless you want to align it somehow.

>  
> @@ -238,7 +239,7 @@ struct vmd_vm {
>   uint32_t vm_vmid;
>   int vm_kernel;
>   int vm_cdrom;
> - int vm_disks[VMM_MAX_DISKS_PER_VM];
> + int vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
>   struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
>   char *vm_ttyname;
>   int vm_tty;
> @@ -412,4 +413,7 @@ int parse_config(const char *);
>  int cmdline_symset(char *);
>  int host(const char *, struct address *);
>  
> +/* virtio.c */
> +int virtio_get_base(int, char *, size_t, int);
> +
>  #endif /* VMD_H */
> diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
> index 7757856323f..3176fd85713 100644
> --- usr.sbin/vmd/vmm.c
> +++ usr.sbin/vmd/vmm.c
> @@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
>   struct vmd_vm *vm;
>   int ret = EINVAL;
>   int fds[2];
> - size_t i;
> + size_t i, j;
>  
>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
>   log_warnx("%s: can't find vm", __func__);
> @@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
>   close(fds[1]);
>  
>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
> - close(vm->vm_disks[i]);
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + if (vm->vm_disks[i][j] != -1)
> + close(vm->vm_disks[i][j]);
> + vm->vm_disks[i][j] = -1;
> + }
>   }
>   for (i = 0 ; i < vcp->vcp_nnics; i++) {
>   close(vm->vm_ifs[i].vif_fd);
>
> --
>     Ori Bernstein
>

--

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Ori Bernstein
On Mon, 1 Oct 2018 12:55:12 +0200
Reyk Floeter <[hidden email]> wrote:

> Hi Ori,
>
> On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
> > I've added support to vmd for external snapshots. That is,
> > snapshots that are derived from a base image. Data lookups
> > start in the derived image, and if the derived image does not
> > contain some data, the search proceeds to the base image.
> > Multiple derived images may exist off of a single base image.
> >
>
> Nice work!  This will be quite useful, thanks.
>
> I think I broke your diff as my last commit to derive the raw/qcow2
> format introduced some conflicts.  I had posted it on hackers@ and
> forgot that you aren't on the internal list yet - sorry for that.
 
No worries -- I'll rebase it after work.

> but we should be able to derive it from the base as well (there's no
> base in raw images), so the following should work as well:
>
> vmctl create derived.img -s 16G -b base.img

Will fold that in to this change. I also want the creation to
automatically figure out the size here.
 

> > The main implementation change is that we now probe base
> > images before sending the disk FDs to the VM, which means that
> > we can actually open the images.
> >
> > The base image paths may be relative. If they are relative,
> > they are interpreted relative to the location of the derived
> > image, and not relative to the directory where vmd happens to
> > be running.
> >
>
> OK, that needs some care + review.
 
Indeed -- I hadn't gotten it quite right.

> Please be extremely careful with the design here.  Unlike qemu, a vmd
> VM is not able to create new files itself and it should never be able
> to do it.  So when we create snapshots, we need to find a way that the
> parent prepares the file, sends the fd, and asks the VM process to use
> it.

I was not planning on allowing snapshots of running images -- merely
starting instances where all changes were transient. My plan was to make
vmctl create a temporary disk image, and point vmd to that.

--
Ori Bernstein <[hidden email]>

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Alexander Hall
In reply to this post by Reyk Floeter-2
Uh-oh... Don't mention hackers@ on tech@... (FWIW) :-)

/Alexander

On October 1, 2018 12:55:12 PM GMT+02:00, Reyk Floeter <[hidden email]> wrote:

>Hi Ori,
>
>On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
>> I've added support to vmd for external snapshots. That is,
>> snapshots that are derived from a base image. Data lookups
>> start in the derived image, and if the derived image does not
>> contain some data, the search proceeds to the base image.
>> Multiple derived images may exist off of a single base image.
>>
>
>Nice work!  This will be quite useful, thanks.
>
>I think I broke your diff as my last commit to derive the raw/qcow2
>format introduced some conflicts.  I had posted it on hackers@ and
>forgot that you aren't on the internal list yet - sorry for that.
>
>> A limitation of this format is that modifying the base image
>> will corrupt the derived image.
>>
>> This change also adds support for creating derived disk
>> images to vmctl.  To use it:
>>
>> vmctl create derived.img -s 16G -b base.img -f qcow2
>>
>
>I removed -f fmt to be more consistent and the new syntax will be
>
> vmctl create qcow2:derived.img -s 16G -b base.img
>
>or
>
> vmctl create derived.qcow2 -s 16G -b base.img
>
>but we should be able to derive it from the base as well (there's no
>base in raw images), so the following should work as well:
>
> vmctl create derived.img -s 16G -b base.img
>
>> The main implementation change is that we now probe base
>> images before sending the disk FDs to the VM, which means that
>> we can actually open the images.
>>
>> The base image paths may be relative. If they are relative,
>> they are interpreted relative to the location of the derived
>> image, and not relative to the directory where vmd happens to
>> be running.
>>
>
>OK, that needs some care + review.
>
>> For review, a bit of scrutiny could be directed to the
>> messaging.  It relies on imsg being in-order, which seems to
>> be the case, but isn't documented in the manpage -- If I can't
>> rely on that, the protocol needs to be tweaked.
>>
>
>imsgs are guaranteed to be in order as long as you don't mux them with
>other messages from the same sender in an async way.
>
>> After this change, we send imsgs to the same disk index
>> repeatedly, and each message adds another base to the stack of
>> images. So, for example, if I have two images that look
>> like this:
>>
>> disk0 -> base0 -> base1
>> disk1
>>
>> Then we send the following messages:
>>
>> VMDOP_START_VM_DISK (i=0, fd=open(disk0))
>> VMDOP_START_VM_DISK (i=0, fd=open(base0))
>> VMDOP_START_VM_DISK (i=0, fd=open(base1))
>>
>> VMDOP_START_VM_DISK (i=1, fd=open(disk1))
>>
>
>Makes sense.
>
>> This also opens the door to ephemeral snapshots, which vmd can
>> implicitly create when it starts a vm, and removes
>> automatically on exit.
>>
>
>Please be extremely careful with the design here.  Unlike qemu, a vmd
>VM is not able to create new files itself and it should never be able
>to do it.  So when we create snapshots, we need to find a way that the
>parent prepares the file, sends the fd, and asks the VM process to use
>it.
>
>> Testing has been the usual -- OpenBSD installs, a bit of catting,
>> and some random 'dd'. Heavier use and testing would be appreciated.
>>
>
>I will test the updated diff that includes the second fix and the merge
>;)
>
>Initial comments inline below.
>
>Reyk
>
>>
>>
>> diff --git regress/usr.sbin/vmd/diskfmt/Makefile
>regress/usr.sbin/vmd/diskfmt/Makefile
>> index c2a5f42d5f6..1f8673e0e26 100644
>> --- regress/usr.sbin/vmd/diskfmt/Makefile
>> +++ regress/usr.sbin/vmd/diskfmt/Makefile
>> @@ -11,7 +11,7 @@
>>  VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
>>  
>>  PROG=vioscribble
>> -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
>> +SRCS=vioscribble.c vioqcow2.c vioraw.c
>>  CFLAGS+=-I$(VMD_DIR) -pthread
>>  LDFLAGS+=-pthread
>>  
>> @@ -26,3 +26,6 @@ scribble-images:
>>  .PHONY: ${REGRESS_TARGETS} scribble-images
>>  
>>  .include <bsd.regress.mk>
>> +
>> +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
>> + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
>> diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c
>regress/usr.sbin/vmd/diskfmt/vioscribble.c
>> index 14d720db652..1da8efedac7 100644
>> --- regress/usr.sbin/vmd/diskfmt/vioscribble.c
>> +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
>> @@ -122,16 +122,18 @@ main(int argc, char **argv)
>>   verbose = !!getenv("VERBOSE");
>>   qcfd = open("scribble.qc2", O_RDWR);
>>   rawfd = open("scribble.raw", O_RDWR);
>> - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
>> + if (qcfd == -1)
>>   err(1, "unable to open qcow");
>> - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
>> + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
>> + err(1, "unable to init qcow");
>> + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) ==
>-1)
>>   err(1, "unable to open raw");
>>  
>>   srandom_deterministic(123);
>>  
>>   /* scribble to both disks */
>>   printf("scribbling...\n");
>> - for (i = 0; i < 16; i++) {
>> + for (i = 0; i < 1024*16; i++) {
>>   off = (random() % DISKSZ);
>>   len = random() % sizeof buf + 1;
>>   fill(off, buf, sizeof buf);
>> diff --git sys/arch/amd64/include/vmmvar.h
>sys/arch/amd64/include/vmmvar.h
>> index 812e660f5f2..98cc498bf84 100644
>> --- sys/arch/amd64/include/vmmvar.h
>> +++ sys/arch/amd64/include/vmmvar.h
>> @@ -24,6 +24,7 @@
>>  #define VMM_HV_SIGNATURE "OpenBSDVMM58"
>>  
>>  #define VMM_MAX_MEM_RANGES 16
>> +#define VMM_MAX_BASE_PER_DISK 4
>
>Please don't add this to vmmvar.h.  This has nothing to do with the
>kernel and should only be in vmd.h (e.g. as VM_MAX_BASE_PER_DISK).
>
>We have some structs and defines that are shared between vmd(8) and
>vmm(4), and some of them should probably be in userspace only but are
>in vmmvar.h for historical reasons.  But it is extremely painful to
>maintain when doing changes to the daemon.
>
>>  #define VMM_MAX_DISKS_PER_VM 4
>>  #define VMM_MAX_PATH_DISK 128
>>  #define VMM_MAX_PATH_CDROM 128
>> diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
>> index 0f06a9ed1d2..d75a51b83c5 100644
>> --- usr.sbin/vmctl/main.c
>> +++ usr.sbin/vmctl/main.c
>
>The following bits need changes as I removed -f fmt ...
>
>> @@ -64,7 +64,7 @@ int ctl_receive(struct parse_result *, int, char
>*[]);
>>  struct ctl_command ctl_commands[] = {
>>   { "console", CMD_CONSOLE, ctl_console, "id" },
>>   { "create", CMD_CREATE, ctl_create,
>> - "\"path\" -s size [-f fmt]", 1 },
>> + "\"path\" -s size [-f fmt] [-b base]", 1 },
>>   { "load", CMD_LOAD, ctl_load, "\"path\"" },
>>   { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
>>   { "reload", CMD_RELOAD, ctl_reload, "" },
>> @@ -504,11 +504,12 @@ int
>>  ctl_create(struct parse_result *res, int argc, char *argv[])
>>  {
>>   int ch, ret;
>> - const char *paths[2], *format;
>> + const char *paths[2], *format, *base;
>>  
>>   if (argc < 2)
>>   ctl_usage(res->ctl);
>>  
>> + base = NULL;
>>   paths[0] = argv[1];
>>   paths[1] = NULL;
>>   format = "raw";
>> @@ -521,7 +522,7 @@ ctl_create(struct parse_result *res, int argc,
>char *argv[])
>>   argc--;
>>   argv++;
>>  
>> - while ((ch = getopt(argc, argv, "s:f:")) != -1) {
>> + while ((ch = getopt(argc, argv, "s:f:b:")) != -1) {
>>   switch (ch) {
>>   case 's':
>>   if (parse_size(res, optarg, 0) != 0)
>> @@ -530,6 +531,9 @@ ctl_create(struct parse_result *res, int argc,
>char *argv[])
>>   case 'f':
>>   format = optarg;
>>   break;
>> + case 'b':
>> + base = optarg;
>> + break;
>>   default:
>>   ctl_usage(res->ctl);
>>   /* NOTREACHED */
>> @@ -537,13 +541,16 @@ ctl_create(struct parse_result *res, int argc,
>char *argv[])
>>   }
>>  
>>   if (res->size == 0) {
>> - fprintf(stderr, "missing size argument\n");
>> + fprintf(stderr, "could not create %s: missing size argument\n",
>> +    paths[0]);
>>   ctl_usage(res->ctl);
>>   }
>> - if (strcmp(format, "raw") == 0)
>> + if (strcmp(format, "raw") == 0) {
>> + if (base)
>> + errx(1, "raw images do not accept base argument");
>>   ret = create_raw_imagefile(paths[0], res->size);
>> - else if (strcmp(format, "qcow2") == 0)
>> - ret = create_qc2_imagefile(paths[0], res->size);
>> + } else if (strcmp(format, "qcow2") == 0)
>> + ret = create_qc2_imagefile(paths[0], base, res->size);
>>   else
>>   errx(1, "unknown image format %s", format);
>>   if (ret != 0) {
>> diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
>> index 4941cde3b07..c9d297d2e90 100644
>> --- usr.sbin/vmctl/vmctl.8
>> +++ usr.sbin/vmctl/vmctl.8
>> @@ -50,7 +50,7 @@ Using
>>  .Xr cu 1
>>  connect to the console of the VM with the specified
>>  .Ar id .
>> -.It Cm create Ar path Fl s Ar size Op Fl f Ar format
>> +.It Cm create Ar path Fl s Ar size Op Fl f Ar format Fl b Ar base
>>  Creates a VM disk image file with the specified
>>  .Ar path
>>  and
>> @@ -65,6 +65,10 @@ or
>>  defaulting to
>>  .Ar raw
>>  if left unspecified.
>> +For qcow2, a
>> +.Ar base
>> +image may be specified. The base image is not modified. The
>> +derived image contains only the changes written by the VM.
>>  .It Cm load Ar filename
>>  Load additional configuration from the specified file.
>>  .It Cm log brief
>> diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
>> index 355bd4d0900..6101bb310dd 100644
>> --- usr.sbin/vmctl/vmctl.c
>> +++ usr.sbin/vmctl/vmctl.c
>> @@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path,
>long imgsize)
>>  #define ALIGN(sz, align) \
>>   ((sz + align - 1) & ~(align - 1))
>>  int
>> -create_qc2_imagefile(const char *imgfile_path, long imgsize)
>> +create_qc2_imagefile(const char *imgfile_path,
>> +    const char *base_path, long imgsize)
>>  {
>>   struct qcheader {
>>   char magic[4];
>> @@ -871,6 +872,7 @@ create_qc2_imagefile(const char *imgfile_path,
>long imgsize)
>>   uint32_t headersz;
>>   } __packed hdr;
>>   int fd, ret;
>> + ssize_t base_len;
>>   uint64_t l1sz, refsz, disksz, initsz, clustersz;
>>   uint64_t l1off, refoff, v, i;
>>   uint16_t refs;
>> @@ -888,11 +890,12 @@ create_qc2_imagefile(const char *imgfile_path,
>long imgsize)
>>   refsz = 1;
>>  
>>   initsz = ALIGN(refoff + refsz*clustersz, clustersz);
>> + base_len = base_path ? strlen(base_path) : 0;
>>  
>>   memcpy(hdr.magic, "QFI\xfb", 4);
>
>The magic bytes are now defined as VM_MAGIC_QCOW and can be used as:
>
> memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW))
>
>>   hdr.version = htobe32(3);
>> - hdr.backingoff = htobe64(0);
>> - hdr.backingsz = htobe32(0);
>> + hdr.backingoff = htobe64(base_path ? sizeof hdr : 0);
>
>knf is sizeof(hdr), with braces.  There are a few such sizeof without
>braces below, but I'm not commenting on all of them.
>
>> + hdr.backingsz = htobe32(base_len);
>>   hdr.clustershift = htobe32(16);
>>   hdr.disksz = htobe64(disksz);
>>   hdr.cryptmethod = htobe32(0);
>> @@ -918,6 +921,10 @@ create_qc2_imagefile(const char *imgfile_path,
>long imgsize)
>>   if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
>>   goto error;
>>  
>> + /* Add the base image */
>> + if (base_path && write(fd, base_path, base_len) != base_len)
>> + goto error;
>> +
>>   /* Extend to desired size, and add one refcount cluster */
>>   if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
>>   goto error;
>> diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
>> index 27e094c26de..da96e926e36 100644
>> --- usr.sbin/vmctl/vmctl.h
>> +++ usr.sbin/vmctl/vmctl.h
>> @@ -87,7 +87,7 @@ __dead void
>>  
>>  /* vmctl.c */
>>  int create_raw_imagefile(const char *, long);
>> -int create_qc2_imagefile(const char *, long);
>> +int create_qc2_imagefile(const char *, const char*, long);
>>  int vm_start(uint32_t, const char *, int, int, char **, int,
>>      char **, int *, char *, char *, char *);
>>  int vm_start_complete(struct imsg *, int *, int);
>> diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
>> index 550b73c1a39..ba70e41f930 100644
>> --- usr.sbin/vmd/config.c
>> +++ usr.sbin/vmd/config.c
>> @@ -35,6 +35,7 @@
>>  #include <util.h>
>>  #include <errno.h>
>>  #include <imsg.h>
>> +#include <libgen.h>
>>  
>>  #include "proc.h"
>>  #include "vmd.h"
>> @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg
>*imsg)
>>  int
>>  config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid,
>uid_t uid)
>>  {
>> + int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
>>   struct vmd_if *vif;
>>   struct vmop_create_params *vmc = &vm->vm_params;
>>   struct vm_create_params *vcp = &vmc->vmc_params;
>> - unsigned int i;
>> + unsigned int i, j;
>>   int fd = -1, vmboot = 0;
>> - int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
>> + int kernfd = -1;
>> + int *tapfds;
>>   int cdromfd = -1;
>>   int saved_errno = 0;
>> + int n = 0, flags;
>>   char ifname[IF_NAMESIZE], *s;
>>   char path[PATH_MAX];
>> + char base[PATH_MAX];
>> + char expanded[PATH_MAX];
>>   unsigned int unit;
>>  
>>   errno = 0;
>> @@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm
>*vm, uint32_t peerid, uid_t uid)
>>   }
>>   }
>>  
>> - diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
>> - if (diskfds == NULL) {
>> - log_warn("%s: can't allocate disk fds", __func__);
>> - goto fail;
>> - }
>> - for (i = 0; i < vcp->vcp_ndisks; i++)
>> - diskfds[i] = -1;
>> + for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
>> + diskfds[i][j] = -1;
>>  
>>   tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
>>   if (tapfds == NULL) {
>> @@ -289,22 +291,55 @@ config_setvm(struct privsep *ps, struct vmd_vm
>*vm, uint32_t peerid, uid_t uid)
>>  
>>   /* Open disk images for child */
>>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
>> -                /* Stat disk[i] to ensure it is a regular file */
>> - if ((diskfds[i] = open(vcp->vcp_disks[i],
>> -    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
>> - log_warn("%s: can't open disk %s", __func__,
>> -    vcp->vcp_disks[i]);
>> - errno = VMD_DISK_MISSING;
>> - goto fail;
>> - }
>> + if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX)
>> + log_warnx("%s, disk path too long", __func__);
>> + memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases);
>> + flags = O_RDWR|O_EXLOCK|O_NONBLOCK;
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
>> + /* Stat disk[i] to ensure it is a regular file */
>> + if ((diskfds[i][j] = open(path, flags)) == -1) {
>> + log_warn("%s: can't open disk %s", __func__,
>> +    vcp->vcp_disks[i]);
>> + errno = VMD_DISK_MISSING;
>> + goto fail;
>> + }
>> +
>> + if (vm_checkaccess(diskfds[i][j],
>> +    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
>> +    uid, R_OK|W_OK) == -1) {
>
>See below...
>
>> + log_warnx("vm \"%s\" no read/write "
>> +    "access to disk %s", vcp->vcp_name,
>> +    vcp->vcp_disks[i]);
>> + errno = EPERM;
>> + goto fail;
>> + }
>> +
>> + /*
>> + * Clear the read-write flag for base images.
>> + * All writes should go to the top image.
>> + */
>> + flags = O_RDONLY|O_EXLOCK|O_NONBLOCK;
>
>...this should also clear the W_OK flag when checking disk access.  A
>base doesn't have to be writable for the user.
>
>It is good that you don't attempt to open it writable so the VM
>will never be able to change a base image!  I actually plan to use
>bases with VM templates to allow users running VMs using pre-defined
>and system-wide base images.
>
>> + n = virtio_get_base(diskfds[i][j], base, sizeof base,
>> +    vmc->vmc_disktypes[i]);
>> + if (n == -1)
>> + log_warnx("vm \"%s\" unable to read"
>> +    "base for disk %s", vcp->vcp_name,
>> +    vcp->vcp_disks[i]);
>> + if (n == 0)
>> + break;
>> + /*
>> + * Relative paths should be interpreted relative
>> + * to the disk image, rather than relative to the
>> + * directory vmd happens to be running in, since
>> + * this is the only userful interpretation.
>> + */
>
>OK, I see that this was fixed in your 2nd diff but please send the
>complete thing for diff updates in the future as it is hard to comment
>incremental diffs.
>
>> + if (base[0] != '/') {
>> + s = dirname(path);
>> + snprintf(expanded, sizeof expanded,
>> +    "%s/%s", s, base);
>> + realpath(expanded, path);
>> + }
>>  
>> - if (vm_checkaccess(diskfds[i],
>> -    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
>> -    uid, R_OK|W_OK) == -1) {
>> - log_warnx("vm \"%s\" no read/write access to disk %s",
>> -    vcp->vcp_name, vcp->vcp_disks[i]);
>> - errno = EPERM;
>> - goto fail;
>>   }
>>   }
>>  
>> @@ -402,9 +437,13 @@ config_setvm(struct privsep *ps, struct vmd_vm
>*vm, uint32_t peerid, uid_t uid)
>>      NULL, 0);
>>  
>>   for (i = 0; i < vcp->vcp_ndisks; i++) {
>> - proc_compose_imsg(ps, PROC_VMM, -1,
>> -    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
>> -    &i, sizeof(i));
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
>> + if (diskfds[i][j] == -1)
>> + break;
>> + proc_compose_imsg(ps, PROC_VMM, -1,
>> +    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
>> +    diskfds[i][j], &i, sizeof(i));
>> + }
>>   }
>>   for (i = 0; i < vcp->vcp_nnics; i++) {
>>   proc_compose_imsg(ps, PROC_VMM, -1,
>> @@ -416,7 +455,6 @@ config_setvm(struct privsep *ps, struct vmd_vm
>*vm, uint32_t peerid, uid_t uid)
>>   proc_compose_imsg(ps, PROC_VMM, -1,
>>      IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
>>  
>> - free(diskfds);
>>   free(tapfds);
>>  
>>   vm->vm_running = 1;
>> @@ -430,11 +468,10 @@ config_setvm(struct privsep *ps, struct vmd_vm
>*vm, uint32_t peerid, uid_t uid)
>>   close(kernfd);
>>   if (cdromfd != -1)
>>   close(cdromfd);
>> - if (diskfds != NULL) {
>> - for (i = 0; i < vcp->vcp_ndisks; i++)
>> - close(diskfds[i]);
>> - free(diskfds);
>> - }
>> + for (i = 0; i < vcp->vcp_ndisks; i++)
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
>> + if (diskfds[i][j] != -1)
>> + close(diskfds[i][j]);
>>   if (tapfds != NULL) {
>>   for (i = 0; i < vcp->vcp_nnics; i++)
>>   close(tapfds[i]);
>> @@ -489,7 +526,7 @@ int
>>  config_getdisk(struct privsep *ps, struct imsg *imsg)
>>  {
>>   struct vmd_vm *vm;
>> - unsigned int n;
>> + unsigned int n, idx;
>>  
>>   errno = 0;
>>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
>> @@ -500,14 +537,18 @@ config_getdisk(struct privsep *ps, struct imsg
>*imsg)
>>   IMSG_SIZE_CHECK(imsg, &n);
>>   memcpy(&n, imsg->data, sizeof(n));
>>  
>> - if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
>> -    vm->vm_disks[n] != -1 || imsg->fd == -1) {
>> + if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
>
>For the bases you are removing the check if a disk has already been
>sent...
>
>>   log_warnx("invalid disk id");
>>   errno = EINVAL;
>>   return (-1);
>>   }
>> - vm->vm_disks[n] = imsg->fd;
>> -
>> + idx = vm->vm_params.vmc_diskbases[n]++;
>> + if (idx >= VMM_MAX_BASE_PER_DISK) {
>> + log_warnx("too many bases for disk");
>> + errno = EINVAL;
>> + return (-1);
>> + }
>> + vm->vm_disks[n][idx] = imsg->fd;
>
>...but you should keep the paranoia and check it around here
>(fail if vm->vm_disks[n][idx] != -1).
>
>>   return (0);
>>  }
>>  
>> diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
>> index ada0bafc72d..e4ebe020813 100644
>> --- usr.sbin/vmd/vioqcow2.c
>> +++ usr.sbin/vmd/vioqcow2.c
>> @@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int
>*);
>>  static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t,
>off_t);
>>  static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t,
>off_t);
>>  static int inc_refs(struct qcdisk *, off_t, int);
>> -static int qc2_openpath(struct qcdisk *, char *, int);
>> -static int qc2_open(struct qcdisk *, int);
>> +static int qc2_open(struct qcdisk *, int *, size_t);
>>  static ssize_t qc2_pread(void *, char *, size_t, off_t);
>>  static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
>>  static void qc2_close(void *, int);
>> @@ -118,14 +117,14 @@ static void qc2_close(void *, int);
>>   * May open snapshot base images.
>>   */
>>  int
>> -virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
>> +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd,
>size_t nfd)
>>  {
>>   struct qcdisk *diskp;
>>  
>>   diskp = malloc(sizeof(struct qcdisk));
>>   if (diskp == NULL)
>>   return -1;
>> - if (qc2_open(diskp, fd) == -1) {
>> + if (qc2_open(diskp, fd, nfd) == -1) {
>>   log_warnx("%s: could not open qcow2 disk", __func__);
>>   free(diskp);
>>   return -1;
>> @@ -138,19 +137,41 @@ virtio_init_qcow2(struct virtio_backing *file,
>off_t *szp, int fd)
>>   return 0;
>>  }
>>  
>> -static int
>> -qc2_openpath(struct qcdisk *disk, char *path, int flags)
>> +ssize_t
>> +virtio_qcow2_get_base(int fd, char *path, size_t npath)
>>  {
>> - int fd;
>> + struct qcheader header;
>> + uint64_t backingoff;
>> + uint32_t backingsz;
>>  
>> - fd = open(path, flags);
>> - if (fd < 0)
>> + if (pread(fd, &header, sizeof header, 0) != sizeof header) {
>> + log_warn("%s: short read on header", __func__);
>>   return -1;
>> - return qc2_open(disk, fd);
>> + }
>> + if (strncmp(header.magic, "QFI\xfb", 4) != 0) {
>
>See my notes on the magic bytes above.
>
>> + log_warn("%s: invalid magic numbers", __func__);
>> + return -1;
>> + }
>> + backingoff = be64toh(header.backingoff);
>> + backingsz = be32toh(header.backingsz);
>> + if (backingsz != 0) {
>> + if (backingsz >= npath - 1) {
>> + log_warn("%s: snapshot path too long", __func__);
>> + return -1;
>> + }
>> + if (pread(fd, path, npath, backingoff) != backingsz) {
>> + log_warn("%s: could not read snapshot base name",
>> +    __func__);
>> + return -1;
>> + }
>> + path[backingsz] = 0;
>> + log_warn("path: %s\n", path);
>> + }
>> + return backingsz;
>>  }
>>  
>>  static int
>> -qc2_open(struct qcdisk *disk, int fd)
>> +qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
>>  {
>>   char basepath[PATH_MAX];
>>   struct stat st;
>> @@ -158,9 +179,10 @@ qc2_open(struct qcdisk *disk, int fd)
>>   uint64_t backingoff;
>>   uint32_t backingsz;
>>   size_t i;
>> - int version;
>> + int version, fd;
>>  
>>   pthread_rwlock_init(&disk->lock, NULL);
>> + fd = fds[0];
>>   disk->fd = fd;
>>   disk->base = NULL;
>>   disk->l1 = NULL;
>> @@ -222,13 +244,6 @@ qc2_open(struct qcdisk *disk, int fd)
>>   backingoff = be64toh(header.backingoff);
>>   backingsz = be32toh(header.backingsz);
>>   if (backingsz != 0) {
>> - /*
>> - * FIXME: we need to figure out a way of opening these things,
>> - * otherwise we just crash with a pledge violation.
>> - */
>> - log_warn("%s: unsupported external snapshot images", __func__);
>> - goto error;
>> -
>>   if (backingsz >= sizeof basepath - 1) {
>>   log_warn("%s: snapshot path too long", __func__);
>>   goto error;
>> @@ -239,11 +254,16 @@ qc2_open(struct qcdisk *disk, int fd)
>>   goto error;
>>   }
>>   basepath[backingsz] = 0;
>> + if (nfd <= 1) {
>> + log_warnx("%s: missing base image %s", __func__, basepath);
>> + goto error;
>> + }
>> +
>>  
>>   disk->base = calloc(1, sizeof(struct qcdisk));
>>   if (!disk->base)
>>   goto error;
>> - if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
>> + if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
>>   log_warn("%s: could not open %s", basepath, __func__);
>>   goto error;
>>   }
>> diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
>> index e02ab67c5dc..ff4bbb3095e 100644
>> --- usr.sbin/vmd/vioraw.c
>> +++ usr.sbin/vmd/vioraw.c
>> @@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
>>   * returning -1 for error, 0 for success.
>>   */
>>  int
>> -virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
>> +virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd,
>size_t nfd)
>>  {
>>   off_t sz;
>>   int *fdp;
>>  
>> - sz = lseek(fd, 0, SEEK_END);
>> + if (nfd != 1)
>> + return -1;
>> + sz = lseek(fd[0], 0, SEEK_END);
>>   if (sz == -1)
>>   return -1;
>>  
>>   fdp = malloc(sizeof(int));
>>   if (!fdp)
>>   return -1;
>> - *fdp = fd;
>> + *fdp = fd[0];
>>   file->p = fdp;
>>   file->pread = raw_pread;
>>   file->pwrite = raw_pwrite;
>> diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
>> index 93490344560..1c66f33216a 100644
>> --- usr.sbin/vmd/virtio.c
>> +++ usr.sbin/vmd/virtio.c
>> @@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t
>*data, uint8_t *intr,
>>   return (0);
>>  }
>>  
>> +int
>> +virtio_get_base(int fd, char *path, size_t npath ,int type)
>> +{
>> + switch (type) {
>> + case VMDF_RAW: return 0;
>> + case VMDF_QCOW2: return virtio_qcow2_get_base(fd, path, npath);
>
>There is no explicit note about this in style(9) (only examples) but I
>can't remember that I've seen this style in our C code.  We add a
>newline after cases, followed by a 1 tab indent:
>
> case VMDF_RAW:
> return 0;
> case VMDF_QCOW2:
> return virtio_qcow2_get_base(fd, path, npath);
>
>> + }
>> + log_warnx("%s: invalid disk format", __func__);
>> + return -1;
>> +}
>> +
>> +/*
>> + * Initializes a struct virtio_backing using the list of fds.
>> + */
>>  static int
>> -virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int
>type)
>> +virtio_init_disk(struct virtio_backing *file, off_t *sz,
>> +    int *fd, size_t nfd, int type)
>>  {
>>   /*
>>   * probe disk types in order of preference, first one to work wins.
>>   * TODO: provide a way of specifying the type and options.
>>   */
>>   switch (type) {
>> - case VMDF_RAW: return virtio_init_raw(file, sz, fd);
>> - case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
>> + case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
>> + case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
>
>Same newline+tab here.
>
>>   }
>>   log_warnx("%s: invalid disk format", __func__);
>>   return -1;
>>  }
>>  
>>  void
>> -virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>> -    int *child_taps)
>> +virtio_init(struct vmd_vm *vm, int child_cdrom,
>> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>>  {
>>   struct vmop_create_params *vmc = &vm->vm_params;
>>   struct vm_create_params *vcp = &vmc->vmc_params;
>> @@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
>int *child_disks,
>>   vioblk[i].vm_id = vcp->vcp_id;
>>   vioblk[i].irq = pci_get_dev_irq(id);
>>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
>> -    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
>> +    child_disks[i], vmc->vmc_diskbases[i],
>> +    vmc->vmc_disktypes[i]) == -1) {
>>   log_warnx("%s: unable to determine disk format",
>>      __func__);
>>   return;
>> @@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
>int *child_disks,
>>   vioscsi->vq[i].last_avail = 0;
>>   }
>>   if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
>> -    child_cdrom, VMDF_RAW) == -1) {
>> +    &child_cdrom, 1, VMDF_RAW) == -1) {
>>   log_warnx("%s: unable to determine iso format",
>>      __func__);
>>   return;
>> @@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int
>*child_taps)
>>  }
>>  
>>  int
>> -vioblk_restore(int fd, struct vmop_create_params *vmc, int
>*child_disks)
>> +vioblk_restore(int fd, struct vmop_create_params *vmc,
>> +    int child_disks[][VMM_MAX_BASE_PER_DISK])
>>  {
>>   struct vm_create_params *vcp = &vmc->vmc_params;
>>   uint8_t i;
>> @@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct
>vmop_create_params *vmc, int *child_disks)
>>   return (-1);
>>   }
>>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
>> -    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
>> +    child_disks[i], vmc->vmc_diskbases[i],
>> +    vmc->vmc_disktypes[i]) == -1)  {
>>   log_warnx("%s: unable to determine disk format",
>>      __func__);
>>   return (-1);
>> @@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params
>*vcp, int child_cdrom)
>>   return (-1);
>>   }
>>  
>> - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
>> + if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
>>      VMDF_RAW) == -1) {
>>   log_warnx("%s: unable to determine iso format", __func__);
>>   return (-1);
>> @@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params
>*vcp, int child_cdrom)
>>  }
>>  
>>  int
>> -virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int
>*child_disks,
>> -    int *child_taps)
>> +virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
>> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>>  {
>>   struct vmop_create_params *vmc = &vm->vm_params;
>>   struct vm_create_params *vcp = &vmc->vmc_params;
>> diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
>> index 46006916b6a..91f0e323204 100644
>> --- usr.sbin/vmd/virtio.h
>> +++ usr.sbin/vmd/virtio.h
>> @@ -257,10 +257,11 @@ struct ioinfo {
>>  };
>>  
>>  /* virtio.c */
>> -void virtio_init(struct vmd_vm *, int, int *, int *);
>> +void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK],
>int *);
>>  void virtio_shutdown(struct vmd_vm *);
>>  int virtio_dump(int);
>> -int virtio_restore(int, struct vmd_vm *, int, int *, int *);
>> +int virtio_restore(int, struct vmd_vm *, int,
>> +    int[][VMM_MAX_BASE_PER_DISK], int *);
>>  uint32_t vring_size(uint32_t);
>>  
>>  int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *,
>uint8_t);
>> @@ -270,12 +271,14 @@ void viornd_update_qs(void);
>>  void viornd_update_qa(void);
>>  int viornd_notifyq(void);
>>  
>> -int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
>> -int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int
>fd);
>> +ssize_t virtio_qcow2_get_base(int, char *, size_t);
>> +int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
>> +int virtio_init_qcow2(struct virtio_backing *, off_t *, int*,
>size_t);
>>  
>>  int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *,
>uint8_t);
>>  int vioblk_dump(int);
>> -int vioblk_restore(int, struct vmop_create_params *, int *);
>> +int vioblk_restore(int, struct vmop_create_params *,
>> +    int[][VMM_MAX_BASE_PER_DISK]);
>>  void vioblk_update_qs(struct vioblk_dev *);
>>  void vioblk_update_qa(struct vioblk_dev *);
>>  int vioblk_notifyq(struct vioblk_dev *);
>> diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
>> index ef4494d918b..7c9aaf27012 100644
>> --- usr.sbin/vmd/vm.c
>> +++ usr.sbin/vmd/vm.c
>> @@ -65,8 +65,8 @@
>>  
>>  io_fn_t ioports_map[MAX_PORTS];
>>  
>> -int run_vm(int, int *, int *, struct vmop_create_params *,
>> -    struct vcpu_reg_state *);
>> +int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *,
>> +    struct vmop_create_params *, struct vcpu_reg_state *);
>>  void vm_dispatch_vmm(int, short, void *);
>>  void *event_thread(void *);
>>  void *vcpu_run_loop(void *);
>> @@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct
>vcpu_reg_state *);
>>  void create_memory_map(struct vm_create_params *);
>>  int alloc_guest_mem(struct vm_create_params *);
>>  int vmm_create_vm(struct vm_create_params *);
>> -void init_emulated_hw(struct vmop_create_params *, int, int *, int
>*);
>> -void restore_emulated_hw(struct vm_create_params *, int, int *, int
>*,int);
>> +void init_emulated_hw(struct vmop_create_params *, int,
>> +    int[][VMM_MAX_BASE_PER_DISK], int *);
>> +void restore_emulated_hw(struct vm_create_params *, int, int *,
>> +    int[][VMM_MAX_BASE_PER_DISK],int);
>>  void vcpu_exit_inout(struct vm_run_params *);
>>  uint8_t vcpu_exit_pci(struct vm_run_params *);
>>  int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
>> @@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd)
>>  
>>   /* Find and open kernel image */
>>   if ((fp = vmboot_open(vm->vm_kernel,
>> -    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
>> +    vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
>
>That doesn't look right - does this open the kernel from the base
>image?  What if a user updates the kernel in the working image?
>
>>   fatalx("failed to open kernel - exiting");
>>  
>>   /* Load kernel image */
>> @@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp)
>>   */
>>  void
>>  init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
>> -    int *child_disks, int *child_taps)
>> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>>  {
>>   struct vm_create_params *vcp = &vmc->vmc_params;
>>   int i;
>> @@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc,
>int child_cdrom,
>>   */
>>  void
>>  restore_emulated_hw(struct vm_create_params *vcp, int fd,
>> -    int *child_taps, int *child_disks, int child_cdrom)
>> +    int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int
>child_cdrom)
>>  {
>>   /* struct vm_create_params *vcp = &vmc->vmc_params; */
>>   int i;
>> @@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params
>*vcp, int fd,
>>   *  !0 : the VM exited abnormally or failed to start
>>   */
>>  int
>> -run_vm(int child_cdrom, int *child_disks, int *child_taps,
>> -    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
>> +run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK],
>> +    int *child_taps, struct vmop_create_params *vmc,
>> +    struct vcpu_reg_state *vrs)
>>  {
>>   struct vm_create_params *vcp = &vmc->vmc_params;
>>   struct vm_rwregs_params vregsp;
>> diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
>> index 44ceeb64a7e..f23d8d6b25a 100644
>> --- usr.sbin/vmd/vmboot.c
>> +++ usr.sbin/vmd/vmboot.c
>> @@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd,
>unsigned int disk_type,
>>  
>>   switch (vmboot->vbp_type) {
>>   case VMDF_RAW:
>> - if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
>> + if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) {
>>   log_debug("%s: could not open raw disk", __func__);
>>   goto fail;
>>   }
>>   break;
>>   case VMDF_QCOW2:
>> - if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
>> + if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) {
>
>Same problem here.
>
>>   log_debug("%s: could not open qcow2 disk", __func__);
>>   goto fail;
>>   }
>> diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
>> index 890e9288dd1..133abe12722 100644
>> --- usr.sbin/vmd/vmd.c
>> +++ usr.sbin/vmd/vmd.c
>> @@ -1080,7 +1080,7 @@ void
>>  vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>>  {
>>   struct privsep *ps = &env->vmd_ps;
>> - unsigned int i;
>> + unsigned int i, j;
>>  
>>   if (vm == NULL)
>>   return;
>> @@ -1100,9 +1100,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const
>char *caller)
>>   close(vm->vm_iev.ibuf.fd);
>>   }
>>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
>> - if (vm->vm_disks[i] != -1) {
>> - close(vm->vm_disks[i]);
>> - vm->vm_disks[i] = -1;
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
>> + if (vm->vm_disks[i][j] != -1) {
>> + close(vm->vm_disks[i][j]);
>> + vm->vm_disks[i][j] = -1;
>> + }
>>   }
>>   }
>>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
>> @@ -1159,7 +1161,7 @@ vm_register(struct privsep *ps, struct
>vmop_create_params *vmc,
>>   struct vmop_owner *vmo = NULL;
>>   struct vmd_user *usr = NULL;
>>   uint32_t rng;
>> - unsigned int i;
>> + unsigned int i, j;
>>   struct vmd_switch *sw;
>>   char *s;
>>  
>> @@ -1250,7 +1252,8 @@ vm_register(struct privsep *ps, struct
>vmop_create_params *vmc,
>>   vm->vm_user = usr;
>>  
>>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
>> - vm->vm_disks[i] = -1;
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
>> + vm->vm_disks[i][j] = -1;
>>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
>>   vm->vm_ifs[i].vif_fd = -1;
>>   for (i = 0; i < vcp->vcp_nnics; i++) {
>> diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
>> index 4cf0295d946..b803d44340c 100644
>> --- usr.sbin/vmd/vmd.h
>> +++ usr.sbin/vmd/vmd.h
>> @@ -166,6 +166,7 @@ struct vmop_create_params {
>>  #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
>>  
>>   unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
>> + unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
>>  #define VMDF_RAW 0x01
>>  #define VMDF_QCOW2 0x02
>
>- These defines should go up after vmc_disktypes.
>- vmc_disktypes should be uint8_t unless you want to align it somehow.
>
>>  
>> @@ -238,7 +239,7 @@ struct vmd_vm {
>>   uint32_t vm_vmid;
>>   int vm_kernel;
>>   int vm_cdrom;
>> - int vm_disks[VMM_MAX_DISKS_PER_VM];
>> + int vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
>>   struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
>>   char *vm_ttyname;
>>   int vm_tty;
>> @@ -412,4 +413,7 @@ int parse_config(const char *);
>>  int cmdline_symset(char *);
>>  int host(const char *, struct address *);
>>  
>> +/* virtio.c */
>> +int virtio_get_base(int, char *, size_t, int);
>> +
>>  #endif /* VMD_H */
>> diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
>> index 7757856323f..3176fd85713 100644
>> --- usr.sbin/vmd/vmm.c
>> +++ usr.sbin/vmd/vmm.c
>> @@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id,
>pid_t *pid)
>>   struct vmd_vm *vm;
>>   int ret = EINVAL;
>>   int fds[2];
>> - size_t i;
>> + size_t i, j;
>>  
>>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
>>   log_warnx("%s: can't find vm", __func__);
>> @@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id,
>pid_t *pid)
>>   close(fds[1]);
>>  
>>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
>> - close(vm->vm_disks[i]);
>> - vm->vm_disks[i] = -1;
>> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
>> + if (vm->vm_disks[i][j] != -1)
>> + close(vm->vm_disks[i][j]);
>> + vm->vm_disks[i][j] = -1;
>> + }
>>   }
>>   for (i = 0 ; i < vcp->vcp_nnics; i++) {
>>   close(vm->vm_ifs[i].vif_fd);
>>
>> --
>>     Ori Bernstein
>>

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Ori Bernstein
In reply to this post by Ori Bernstein
On Mon, 1 Oct 2018 11:24:01 -0700, Ori Bernstein <[hidden email]> wrote:

> On Mon, 1 Oct 2018 12:55:12 +0200
> Reyk Floeter <[hidden email]> wrote:
>
> > Hi Ori,
> >
> > On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
> > > I've added support to vmd for external snapshots. That is,
> > > snapshots that are derived from a base image. Data lookups
> > > start in the derived image, and if the derived image does not
> > > contain some data, the search proceeds to the base image.
> > > Multiple derived images may exist off of a single base image.
> > >
> >
> > Nice work!  This will be quite useful, thanks.
> >
> > I think I broke your diff as my last commit to derive the raw/qcow2
> > format introduced some conflicts.  I had posted it on hackers@ and
> > forgot that you aren't on the internal list yet - sorry for that.

Updated version. Changes from the last diff:

- Merge in syntax changes.
- Don't over-read when getting the base images.
- Fix relative paths in base images.
- Allow multiple derived images to use a single base image, and allow a user
  with only read permissions to base their images on top of it.
- Probe the base image size, use/validate it when creating disk images.
- Fix style a bit (long lines, changing from sizeof foo to sizeof(foo)).
- Move a define out of vmmvar.h
- And update the manpage with these changes.
- Improve error checking around creating/resolving base disk paths.


diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
index c2a5f42d5f6..1f8673e0e26 100644
--- regress/usr.sbin/vmd/diskfmt/Makefile
+++ regress/usr.sbin/vmd/diskfmt/Makefile
@@ -11,7 +11,7 @@
 VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
 
 PROG=vioscribble
-SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+SRCS=vioscribble.c vioqcow2.c vioraw.c
 CFLAGS+=-I$(VMD_DIR) -pthread
 LDFLAGS+=-pthread
 
@@ -26,3 +26,6 @@ scribble-images:
 .PHONY: ${REGRESS_TARGETS} scribble-images
 
 .include <bsd.regress.mk>
+
+vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+ cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
index 14d720db652..1da8efedac7 100644
--- regress/usr.sbin/vmd/diskfmt/vioscribble.c
+++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
@@ -122,16 +122,18 @@ main(int argc, char **argv)
  verbose = !!getenv("VERBOSE");
  qcfd = open("scribble.qc2", O_RDWR);
  rawfd = open("scribble.raw", O_RDWR);
- if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
+ if (qcfd == -1)
  err(1, "unable to open qcow");
- if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
+ if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
+ err(1, "unable to init qcow");
+ if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
  err(1, "unable to open raw");
 
  srandom_deterministic(123);
 
  /* scribble to both disks */
  printf("scribbling...\n");
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < 1024*16; i++) {
  off = (random() % DISKSZ);
  len = random() % sizeof buf + 1;
  fill(off, buf, sizeof buf);
diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
index 8748ecfdedc..4637256452b 100644
--- usr.sbin/vmctl/main.c
+++ usr.sbin/vmctl/main.c
@@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char *[]);
 
 struct ctl_command ctl_commands[] = {
  { "console", CMD_CONSOLE, ctl_console, "id" },
- { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 },
+ { "create", CMD_CREATE, ctl_create,
+ "\"path\" [-s size] [-b base]", 1 },
  { "load", CMD_LOAD, ctl_load, "\"path\"" },
  { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
  { "reload", CMD_RELOAD, ctl_reload, "" },
@@ -538,47 +539,54 @@ int
 ctl_create(struct parse_result *res, int argc, char *argv[])
 {
  int ch, ret, type;
- const char *paths[2], *disk, *format;
+ const char *disk, *format, *base;
 
  if (argc < 2)
  ctl_usage(res->ctl);
 
+ base = NULL;
  type = parse_disktype(argv[1], &disk);
 
- paths[0] = disk;
- paths[1] = NULL;
-
- if (unveil(paths[0], "rwc") == -1)
+ if (unveil(disk, "rwc") == -1)
  err(1, "unveil");
 
- if (pledge("stdio rpath wpath cpath", NULL) == -1)
- err(1, "pledge");
  argc--;
  argv++;
 
- while ((ch = getopt(argc, argv, "s:")) != -1) {
+ while ((ch = getopt(argc, argv, "s:b:")) != -1) {
  switch (ch) {
  case 's':
  if (parse_size(res, optarg, 0) != 0)
  errx(1, "invalid size: %s", optarg);
  break;
+ case 'b':
+ base = optarg;
+ if (unveil(base, "r") == -1)
+ err(1, "unveil");
+ break;
  default:
  ctl_usage(res->ctl);
  /* NOTREACHED */
  }
  }
 
- if (res->size == 0) {
- fprintf(stderr, "missing size argument\n");
+ if (pledge("stdio rpath wpath cpath", NULL) == -1)
+ err(1, "pledge");
+
+ if (base && type != VMDF_QCOW2)
+ errx(1, "base images require qcow2 disk format");
+ if (res->size == 0 && !base) {
+ fprintf(stderr, "could not create %s: missing size argument\n",
+    disk);
  ctl_usage(res->ctl);
  }
 
  if (type == VMDF_QCOW2) {
  format = "qcow2";
- ret = create_qc2_imagefile(paths[0], res->size);
+ ret = create_qc2_imagefile(disk, base, res->size);
  } else {
  format = "raw";
- ret = create_raw_imagefile(paths[0], res->size);
+ ret = create_raw_imagefile(disk, res->size);
  }
 
  if (ret != 0) {
diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
index f7890ac99f8..7a02452789c 100644
--- usr.sbin/vmctl/vmctl.8
+++ usr.sbin/vmctl/vmctl.8
@@ -50,7 +50,7 @@ Using
 .Xr cu 1
 connect to the console of the VM with the specified
 .Ar id .
-.It Cm create Ar path Fl s Ar size
+.It Cm create Ar path Fl s Op Ar size Op  Fl b Ar base
 Creates a VM disk image file with the specified
 .Ar path
 and
@@ -65,7 +65,14 @@ or
 in order to specify the disk format.
 If left unspecified, the format defaults to
 .Pa raw
-if it cannot be derived automatically.
+if it cannot be derived automatically.  For qcow2, a
+.Ar base
+image may be specified. The base image is not modified. The derived image
+contains only the changes written by the VM. When creating a derived image,
+the
+.Ar size
+may be omitted, and probed from the base image. If it is provided, it must
+match the base image size.
 .It Cm load Ar filename
 Load additional configuration from the specified file.
 .It Cm log brief
diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
index b09e1115ff7..81fbdba9a28 100644
--- usr.sbin/vmctl/vmctl.c
+++ usr.sbin/vmctl/vmctl.c
@@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
 #define ALIGN(sz, align) \
  ((sz + align - 1) & ~(align - 1))
 int
-create_qc2_imagefile(const char *imgfile_path, long imgsize)
+create_qc2_imagefile(const char *imgfile_path,
+    const char *base_path, long imgsize)
 {
  struct qcheader {
  char magic[4];
@@ -869,15 +870,30 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  uint64_t autoclearfeatures;
  uint32_t reforder;
  uint32_t headersz;
- } __packed hdr;
+ } __packed hdr, basehdr;
  int fd, ret;
+ ssize_t base_len;
  uint64_t l1sz, refsz, disksz, initsz, clustersz;
  uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
  uint16_t refs;
 
  disksz = 1024*1024*imgsize;
+
+ if (base_path) {
+ fd = open(base_path, O_RDONLY);
+ if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
+ err(1, "failure to read base image header");
+ close(fd);
+ if (!disksz)
+ disksz = betoh64(basehdr.disksz);
+ else if (disksz != betoh64(basehdr.disksz))
+ errx(1, "base size does not match requested size");
+ }
+ if (!base_path && !disksz)
+ errx(1, "missing disk size");
+
  clustersz = (1<<16);
- l1off = ALIGN(sizeof hdr, clustersz);
+ l1off = ALIGN(sizeof(hdr), clustersz);
 
  l1entrysz = clustersz * clustersz / 8;
  l1sz = (disksz + l1entrysz - 1) / l1entrysz;
@@ -887,11 +903,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  refsz = (disksz + refentrysz - 1) / refentrysz;
 
  initsz = ALIGN(refoff + refsz*clustersz, clustersz);
+ base_len = base_path ? strlen(base_path) : 0;
 
- memcpy(hdr.magic, "QFI\xfb", 4);
+ memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
  hdr.version = htobe32(3);
- hdr.backingoff = htobe64(0);
- hdr.backingsz = htobe32(0);
+ hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
+ hdr.backingsz = htobe32(base_len);
  hdr.clustershift = htobe32(16);
  hdr.disksz = htobe64(disksz);
  hdr.cryptmethod = htobe32(0);
@@ -905,7 +922,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  hdr.compatfeatures = htobe64(0);
  hdr.autoclearfeatures = htobe64(0);
  hdr.reforder = htobe32(4);
- hdr.headersz = htobe32(sizeof hdr);
+ hdr.headersz = htobe32(sizeof(hdr));
 
  /* Refuse to overwrite an existing image */
  fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
@@ -914,7 +931,11 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  return (errno);
 
  /* Write out the header */
- if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
+ if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ goto error;
+
+ /* Add the base image */
+ if (base_path && write(fd, base_path, base_len) != base_len)
  goto error;
 
  /* Extend to desired size, and add one refcount cluster */
diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
index 006411d9785..cdf50cad755 100644
--- usr.sbin/vmctl/vmctl.h
+++ usr.sbin/vmctl/vmctl.h
@@ -87,7 +87,7 @@ __dead void
 
 /* vmctl.c */
 int create_raw_imagefile(const char *, long);
-int create_qc2_imagefile(const char *, long);
+int create_qc2_imagefile(const char *, const char*, long);
 int vm_start(uint32_t, const char *, int, int, char **, int,
     char **, int *, char *, char *, char *);
 int vm_start_complete(struct imsg *, int *, int);
diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
index 550b73c1a39..8350d0fc4fc 100644
--- usr.sbin/vmd/config.c
+++ usr.sbin/vmd/config.c
@@ -35,6 +35,7 @@
 #include <util.h>
 #include <errno.h>
 #include <imsg.h>
+#include <libgen.h>
 
 #include "proc.h"
 #include "vmd.h"
@@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
 int
 config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 {
+ int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
  struct vmd_if *vif;
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
- unsigned int i;
+ unsigned int i, j;
  int fd = -1, vmboot = 0;
- int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
+ int kernfd = -1;
+ int *tapfds;
  int cdromfd = -1;
  int saved_errno = 0;
+ int n = 0, aflags, oflags;
  char ifname[IF_NAMESIZE], *s;
  char path[PATH_MAX];
+ char base[PATH_MAX];
+ char expanded[PATH_MAX];
  unsigned int unit;
 
  errno = 0;
@@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  }
  }
 
- diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
- if (diskfds == NULL) {
- log_warn("%s: can't allocate disk fds", __func__);
- goto fail;
- }
- for (i = 0; i < vcp->vcp_ndisks; i++)
- diskfds[i] = -1;
+ for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+ diskfds[i][j] = -1;
 
  tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
  if (tapfds == NULL) {
@@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 
  /* Open disk images for child */
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
-                /* Stat disk[i] to ensure it is a regular file */
- if ((diskfds[i] = open(vcp->vcp_disks[i],
-    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
- log_warn("%s: can't open disk %s", __func__,
-    vcp->vcp_disks[i]);
- errno = VMD_DISK_MISSING;
- goto fail;
- }
+ if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX)
+ log_warnx("%s, disk path too long", __func__);
+ memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases);
+ oflags = O_RDWR|O_EXLOCK|O_NONBLOCK;
+ aflags = R_OK|W_OK;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ /* Stat disk[i] to ensure it is a regular file */
+ if ((diskfds[i][j] = open(path, oflags)) == -1) {
+ log_warn("%s: can't open disk %s", __func__,
+    vcp->vcp_disks[i]);
+ errno = VMD_DISK_MISSING;
+ goto fail;
+ }
 
- if (vm_checkaccess(diskfds[i],
-    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
-    uid, R_OK|W_OK) == -1) {
- log_warnx("vm \"%s\" no read/write access to disk %s",
-    vcp->vcp_name, vcp->vcp_disks[i]);
- errno = EPERM;
- goto fail;
+ if (vm_checkaccess(diskfds[i][j],
+    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
+    uid, R_OK|W_OK) == -1) {
+ log_warnx("vm \"%s\" unable to access "
+    "disk %s", vcp->vcp_name,
+    vcp->vcp_disks[i]);
+ errno = EPERM;
+ goto fail;
+ }
+
+ /*
+ * Clear the write and exclusive flags for base images.
+ * All writes should go to the top image, allowing them
+ * to be shared.
+ */
+ oflags = O_RDONLY|O_NONBLOCK;
+ aflags = R_OK;
+ n = virtio_get_base(diskfds[i][j], base, sizeof base,
+    vmc->vmc_disktypes[i]);
+ if (n == 0)
+ break;
+ if (n == -1) {
+ log_warnx("vm \"%s\" unable to read "
+    "base for disk %s", vcp->vcp_name,
+    vcp->vcp_disks[i]);
+ goto fail;
+ }
+ /*
+ * Relative paths should be interpreted relative
+ * to the disk image, rather than relative to the
+ * directory vmd happens to be running in, since
+ * this is the only useful interpretation.
+ */
+ if (base[0] == '/') {
+ if (realpath(base, path) == NULL) {
+ log_warn("unable to resolve %s", base);
+ goto fail;
+ }
+ } else {
+ s = dirname(path);
+ if (snprintf(expanded, sizeof(expanded),
+    "%s/%s", s, base) >= (int)sizeof(expanded)) {
+ log_warn("path too long: %s/%s",
+    s, base);
+ goto fail;
+ }
+ if (realpath(expanded, path) == NULL) {
+ log_warn("unable to resolve %s", base);
+ goto fail;
+ }
+ }
  }
  }
 
@@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
     NULL, 0);
 
  for (i = 0; i < vcp->vcp_ndisks; i++) {
- proc_compose_imsg(ps, PROC_VMM, -1,
-    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
-    &i, sizeof(i));
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ if (diskfds[i][j] == -1)
+ break;
+ proc_compose_imsg(ps, PROC_VMM, -1,
+    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
+    diskfds[i][j], &i, sizeof(i));
+ }
  }
  for (i = 0; i < vcp->vcp_nnics; i++) {
  proc_compose_imsg(ps, PROC_VMM, -1,
@@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  proc_compose_imsg(ps, PROC_VMM, -1,
     IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
 
- free(diskfds);
  free(tapfds);
 
  vm->vm_running = 1;
@@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  close(kernfd);
  if (cdromfd != -1)
  close(cdromfd);
- if (diskfds != NULL) {
- for (i = 0; i < vcp->vcp_ndisks; i++)
- close(diskfds[i]);
- free(diskfds);
- }
+ for (i = 0; i < vcp->vcp_ndisks; i++)
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+ if (diskfds[i][j] != -1)
+ close(diskfds[i][j]);
  if (tapfds != NULL) {
  for (i = 0; i < vcp->vcp_nnics; i++)
  close(tapfds[i]);
@@ -489,7 +542,7 @@ int
 config_getdisk(struct privsep *ps, struct imsg *imsg)
 {
  struct vmd_vm *vm;
- unsigned int n;
+ unsigned int n, idx;
 
  errno = 0;
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
@@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
  IMSG_SIZE_CHECK(imsg, &n);
  memcpy(&n, imsg->data, sizeof(n));
 
- if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
-    vm->vm_disks[n] != -1 || imsg->fd == -1) {
+ if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
  log_warnx("invalid disk id");
  errno = EINVAL;
  return (-1);
  }
- vm->vm_disks[n] = imsg->fd;
-
+ idx = vm->vm_params.vmc_diskbases[n]++;
+ if (idx >= VMM_MAX_BASE_PER_DISK) {
+ log_warnx("too many bases for disk");
+ errno = EINVAL;
+ return (-1);
+ }
+ vm->vm_disks[n][idx] = imsg->fd;
  return (0);
 }
 
diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
index c3211d186fa..81dca129896 100644
--- usr.sbin/vmd/vioqcow2.c
+++ usr.sbin/vmd/vioqcow2.c
@@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
 static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static int inc_refs(struct qcdisk *, off_t, int);
-static int qc2_openpath(struct qcdisk *, char *, int);
-static int qc2_open(struct qcdisk *, int);
+static int qc2_open(struct qcdisk *, int *, size_t);
 static ssize_t qc2_pread(void *, char *, size_t, off_t);
 static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
 static void qc2_close(void *, int);
@@ -118,14 +117,14 @@ static void qc2_close(void *, int);
  * May open snapshot base images.
  */
 int
-virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  struct qcdisk *diskp;
 
  diskp = malloc(sizeof(struct qcdisk));
  if (diskp == NULL)
  return -1;
- if (qc2_open(diskp, fd) == -1) {
+ if (qc2_open(diskp, fd, nfd) == -1) {
  log_warnx("%s: could not open qcow2 disk", __func__);
  return -1;
  }
@@ -137,19 +136,41 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
  return 0;
 }
 
-static int
-qc2_openpath(struct qcdisk *disk, char *path, int flags)
+ssize_t
+virtio_qcow2_get_base(int fd, char *path, size_t npath)
 {
- int fd;
+ struct qcheader header;
+ uint64_t backingoff;
+ uint32_t backingsz;
 
- fd = open(path, flags);
- if (fd < 0)
+ if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
+ log_warnx("%s: short read on header", __func__);
+ return -1;
+ }
+ if (strncmp(header.magic, "QFI\xfb", 4) != 0) {
+ log_warn("%s: invalid magic numbers", __func__);
  return -1;
- return qc2_open(disk, fd);
+ }
+ backingoff = be64toh(header.backingoff);
+ backingsz = be32toh(header.backingsz);
+ if (backingsz != 0) {
+ if (backingsz >= npath - 1) {
+ log_warn("%s: snapshot path too long", __func__);
+ return -1;
+ }
+ if (pread(fd, path, backingsz, backingoff) != backingsz) {
+ log_warnx("%s: could not read snapshot base name",
+    __func__);
+ return -1;
+ }
+ path[backingsz] = '\0';
+ log_warnx("path: %s\n", path);
+ }
+ return backingsz;
 }
 
 static int
-qc2_open(struct qcdisk *disk, int fd)
+qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
 {
  char basepath[PATH_MAX];
  struct stat st;
@@ -157,14 +178,15 @@ qc2_open(struct qcdisk *disk, int fd)
  uint64_t backingoff;
  uint32_t backingsz;
  size_t i;
- int version;
+ int version, fd;
 
  pthread_rwlock_init(&disk->lock, NULL);
+ fd = fds[0];
  disk->fd = fd;
  disk->base = NULL;
  disk->l1 = NULL;
 
- if (pread(fd, &header, sizeof header, 0) != sizeof header) {
+ if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
  log_warn("%s: short read on header", __func__);
  goto error;
  }
@@ -203,7 +225,7 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
 
- disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
+ disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
  if (!disk->l1)
  goto error;
  if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off)
@@ -222,14 +244,7 @@ qc2_open(struct qcdisk *disk, int fd)
  backingoff = be64toh(header.backingoff);
  backingsz = be32toh(header.backingsz);
  if (backingsz != 0) {
- /*
- * FIXME: we need to figure out a way of opening these things,
- * otherwise we just crash with a pledge violation.
- */
- log_warn("%s: unsupported external snapshot images", __func__);
- goto error;
-
- if (backingsz >= sizeof basepath - 1) {
+ if (backingsz >= sizeof(basepath) - 1) {
  log_warn("%s: snapshot path too long", __func__);
  goto error;
  }
@@ -239,11 +254,17 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
  basepath[backingsz] = 0;
+ if (nfd <= 1) {
+ log_warnx("%s: missing base image %s", __func__,
+    basepath);
+ goto error;
+ }
+
 
  disk->base = calloc(1, sizeof(struct qcdisk));
  if (!disk->base)
  goto error;
- if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
+ if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
  log_warn("%s: could not open %s", basepath, __func__);
  goto error;
  }
@@ -521,12 +542,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
  cluster = disk->end;
  disk->end += disk->clustersz;
  buf = htobe64(cluster | QCOW2_INPLACE);
- if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
+ if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off*8) != sizeof(buf))
  goto fail;
 
  /* TODO: lazily sync: currently VMD doesn't close things */
  buf = htobe64(disk->l1[l1off]);
- if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
+ if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8*l1off) != 8)
  goto fail;
  if (inc_refs(disk, cluster, 1) == -1)
  goto fail;
@@ -571,7 +592,7 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
  l1idx = (off / disk->clustersz) / nper;
  l2idx = (off / disk->clustersz) % nper;
  l1off = disk->refoff + 8*l1idx;
- if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
+ if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
  return -1;
 
  l2cluster = be64toh(buf);
@@ -583,19 +604,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
  return -1;
  }
  buf = htobe64(l2cluster);
- if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
+ if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) {
  return -1;
  }
  }
 
  refs = 1;
  if (!newcluster) {
- if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
+ if (pread(disk->fd, &refs, sizeof(refs),
+    l2cluster+2*l2idx) != 2)
  return -1;
  refs = be16toh(refs) + 1;
  }
  refs = htobe16(refs);
- if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
+ if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2*l2idx) != 2) {
  log_warn("%s: could not write ref block", __func__);
  return -1;
  }
diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
index e02ab67c5dc..ff4bbb3095e 100644
--- usr.sbin/vmd/vioraw.c
+++ usr.sbin/vmd/vioraw.c
@@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
  * returning -1 for error, 0 for success.
  */
 int
-virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  off_t sz;
  int *fdp;
 
- sz = lseek(fd, 0, SEEK_END);
+ if (nfd != 1)
+ return -1;
+ sz = lseek(fd[0], 0, SEEK_END);
  if (sz == -1)
  return -1;
 
  fdp = malloc(sizeof(int));
  if (!fdp)
  return -1;
- *fdp = fd;
+ *fdp = fd[0];
  file->p = fdp;
  file->pread = raw_pread;
  file->pwrite = raw_pwrite;
diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index 93490344560..1c66f33216a 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
  return (0);
 }
 
+int
+virtio_get_base(int fd, char *path, size_t npath ,int type)
+{
+ switch (type) {
+ case VMDF_RAW: return 0;
+ case VMDF_QCOW2: return virtio_qcow2_get_base(fd, path, npath);
+ }
+ log_warnx("%s: invalid disk format", __func__);
+ return -1;
+}
+
+/*
+ * Initializes a struct virtio_backing using the list of fds.
+ */
 static int
-virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
+virtio_init_disk(struct virtio_backing *file, off_t *sz,
+    int *fd, size_t nfd, int type)
 {
  /*
  * probe disk types in order of preference, first one to work wins.
  * TODO: provide a way of specifying the type and options.
  */
  switch (type) {
- case VMDF_RAW: return virtio_init_raw(file, sz, fd);
- case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
+ case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
+ case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
  }
  log_warnx("%s: invalid disk format", __func__);
  return -1;
 }
 
 void
-virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_init(struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
@@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioblk[i].vm_id = vcp->vcp_id;
  vioblk[i].irq = pci_get_dev_irq(id);
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1) {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return;
@@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioscsi->vq[i].last_avail = 0;
  }
  if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
-    child_cdrom, VMDF_RAW) == -1) {
+    &child_cdrom, 1, VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format",
     __func__);
  return;
@@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
 }
 
 int
-vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
+vioblk_restore(int fd, struct vmop_create_params *vmc,
+    int child_disks[][VMM_MAX_BASE_PER_DISK])
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  uint8_t i;
@@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
  return (-1);
  }
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1)  {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return (-1);
@@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
  return (-1);
  }
 
- if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
+ if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
     VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format", __func__);
  return (-1);
@@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
 }
 
 int
-virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
index 46006916b6a..91f0e323204 100644
--- usr.sbin/vmd/virtio.h
+++ usr.sbin/vmd/virtio.h
@@ -257,10 +257,11 @@ struct ioinfo {
 };
 
 /* virtio.c */
-void virtio_init(struct vmd_vm *, int, int *, int *);
+void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK], int *);
 void virtio_shutdown(struct vmd_vm *);
 int virtio_dump(int);
-int virtio_restore(int, struct vmd_vm *, int, int *, int *);
+int virtio_restore(int, struct vmd_vm *, int,
+    int[][VMM_MAX_BASE_PER_DISK], int *);
 uint32_t vring_size(uint32_t);
 
 int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
@@ -270,12 +271,14 @@ void viornd_update_qs(void);
 void viornd_update_qa(void);
 int viornd_notifyq(void);
 
-int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
-int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
+ssize_t virtio_qcow2_get_base(int, char *, size_t);
+int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
+int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
 
 int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vioblk_dump(int);
-int vioblk_restore(int, struct vmop_create_params *, int *);
+int vioblk_restore(int, struct vmop_create_params *,
+    int[][VMM_MAX_BASE_PER_DISK]);
 void vioblk_update_qs(struct vioblk_dev *);
 void vioblk_update_qa(struct vioblk_dev *);
 int vioblk_notifyq(struct vioblk_dev *);
diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
index ef4494d918b..7c9aaf27012 100644
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,8 +65,8 @@
 
 io_fn_t ioports_map[MAX_PORTS];
 
-int run_vm(int, int *, int *, struct vmop_create_params *,
-    struct vcpu_reg_state *);
+int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *,
+    struct vmop_create_params *, struct vcpu_reg_state *);
 void vm_dispatch_vmm(int, short, void *);
 void *event_thread(void *);
 void *vcpu_run_loop(void *);
@@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
 void create_memory_map(struct vm_create_params *);
 int alloc_guest_mem(struct vm_create_params *);
 int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
-void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
+void init_emulated_hw(struct vmop_create_params *, int,
+    int[][VMM_MAX_BASE_PER_DISK], int *);
+void restore_emulated_hw(struct vm_create_params *, int, int *,
+    int[][VMM_MAX_BASE_PER_DISK],int);
 void vcpu_exit_inout(struct vm_run_params *);
 uint8_t vcpu_exit_pci(struct vm_run_params *);
 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
@@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd)
 
  /* Find and open kernel image */
  if ((fp = vmboot_open(vm->vm_kernel,
-    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
+    vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
  fatalx("failed to open kernel - exiting");
 
  /* Load kernel image */
@@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp)
  */
 void
 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
-    int *child_disks, int *child_taps)
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  int i;
@@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
  */
 void
 restore_emulated_hw(struct vm_create_params *vcp, int fd,
-    int *child_taps, int *child_disks, int child_cdrom)
+    int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int child_cdrom)
 {
  /* struct vm_create_params *vcp = &vmc->vmc_params; */
  int i;
@@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
  *  !0 : the VM exited abnormally or failed to start
  */
 int
-run_vm(int child_cdrom, int *child_disks, int *child_taps,
-    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
+run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK],
+    int *child_taps, struct vmop_create_params *vmc,
+    struct vcpu_reg_state *vrs)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  struct vm_rwregs_params vregsp;
diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
index 44ceeb64a7e..f23d8d6b25a 100644
--- usr.sbin/vmd/vmboot.c
+++ usr.sbin/vmd/vmboot.c
@@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
 
  switch (vmboot->vbp_type) {
  case VMDF_RAW:
- if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) {
  log_debug("%s: could not open raw disk", __func__);
  goto fail;
  }
  break;
  case VMDF_QCOW2:
- if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) {
  log_debug("%s: could not open qcow2 disk", __func__);
  goto fail;
  }
diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
index 1571be21bc5..8377812c316 100644
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -1097,7 +1097,7 @@ void
 vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
 {
  struct privsep *ps = &env->vmd_ps;
- unsigned int i;
+ unsigned int i, j;
 
  if (vm == NULL)
  return;
@@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
  close(vm->vm_iev.ibuf.fd);
  }
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
- if (vm->vm_disks[i] != -1) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1) {
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  }
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
@@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  struct vmop_owner *vmo = NULL;
  struct vmd_user *usr = NULL;
  uint32_t rng;
- unsigned int i;
+ unsigned int i, j;
  struct vmd_switch *sw;
  char *s;
 
@@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  vm->vm_user = usr;
 
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+ vm->vm_disks[i][j] = -1;
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
  vm->vm_ifs[i].vif_fd = -1;
  for (i = 0; i < vcp->vcp_nnics; i++) {
diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
index b348d12c757..0683629b3b1 100644
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -48,6 +48,7 @@
 #define VM_DEFAULT_DEVICE "hd0a"
 #define VM_BOOT_CONF "/etc/boot.conf"
 #define VM_NAME_MAX 64
+#define VM_MAX_BASE_PER_DISK 4
 #define VM_TTYNAME_MAX 16
 #define MAX_TAP 256
 #define NR_BACKLOG 5
@@ -169,6 +170,7 @@ struct vmop_create_params {
 #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
 
  unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
+ unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
 #define VMDF_RAW 0x01
 #define VMDF_QCOW2 0x02
 
@@ -241,7 +243,7 @@ struct vmd_vm {
  uint32_t vm_vmid;
  int vm_kernel;
  int vm_cdrom;
- int vm_disks[VMM_MAX_DISKS_PER_VM];
+ int vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
  struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
  char *vm_ttyname;
  int vm_tty;
@@ -415,4 +417,7 @@ int parse_config(const char *);
 int cmdline_symset(char *);
 int host(const char *, struct address *);
 
+/* virtio.c */
+int virtio_get_base(int, char *, size_t, int);
+
 #endif /* VMD_H */
diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
index 7757856323f..3176fd85713 100644
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  struct vmd_vm *vm;
  int ret = EINVAL;
  int fds[2];
- size_t i;
+ size_t i, j;
 
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
  log_warnx("%s: can't find vm", __func__);
@@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  close(fds[1]);
 
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1)
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  for (i = 0 ; i < vcp->vcp_nnics; i++) {
  close(vm->vm_ifs[i].vif_fd);

--
    Ori Bernstein

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Jason McIntyre-2
On Tue, Oct 02, 2018 at 11:13:35PM -0700, Ori Bernstein wrote:

>
> Updated version. Changes from the last diff:
>
> - Merge in syntax changes.
> - Don't over-read when getting the base images.
> - Fix relative paths in base images.
> - Allow multiple derived images to use a single base image, and allow a user
>   with only read permissions to base their images on top of it.
> - Probe the base image size, use/validate it when creating disk images.
> - Fix style a bit (long lines, changing from sizeof foo to sizeof(foo)).
> - Move a define out of vmmvar.h
> - And update the manpage with these changes.
> - Improve error checking around creating/resolving base disk paths.
>

morning.

you should start new sentences on new lines - it forces a double spacing
between sentences that all man pages have.

if you run your proposed changes to man pages through "mandoc -Tlint",
it will pick up on silly things like that.

note there is also a double space in "Op  Fl b"

jmc

>
> diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
> index f7890ac99f8..7a02452789c 100644
> --- usr.sbin/vmctl/vmctl.8
> +++ usr.sbin/vmctl/vmctl.8
> @@ -50,7 +50,7 @@ Using
>  .Xr cu 1
>  connect to the console of the VM with the specified
>  .Ar id .
> -.It Cm create Ar path Fl s Ar size
> +.It Cm create Ar path Fl s Op Ar size Op  Fl b Ar base
>  Creates a VM disk image file with the specified
>  .Ar path
>  and
> @@ -65,7 +65,14 @@ or
>  in order to specify the disk format.
>  If left unspecified, the format defaults to
>  .Pa raw
> -if it cannot be derived automatically.
> +if it cannot be derived automatically.  For qcow2, a
> +.Ar base
> +image may be specified. The base image is not modified. The derived image
> +contains only the changes written by the VM. When creating a derived image,
> +the
> +.Ar size
> +may be omitted, and probed from the base image. If it is provided, it must
> +match the base image size.
>  .It Cm load Ar filename
>  Load additional configuration from the specified file.
>  .It Cm log brief

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Reyk Floeter-2
In reply to this post by Ori Bernstein
On Tue, Oct 02, 2018 at 11:13:35PM -0700, Ori Bernstein wrote:

> On Mon, 1 Oct 2018 11:24:01 -0700, Ori Bernstein <[hidden email]> wrote:
>
> > On Mon, 1 Oct 2018 12:55:12 +0200
> > Reyk Floeter <[hidden email]> wrote:
> >
> > > Hi Ori,
> > >
> > > On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
> > > > I've added support to vmd for external snapshots. That is,
> > > > snapshots that are derived from a base image. Data lookups
> > > > start in the derived image, and if the derived image does not
> > > > contain some data, the search proceeds to the base image.
> > > > Multiple derived images may exist off of a single base image.
> > > >
> > >
> > > Nice work!  This will be quite useful, thanks.
> > >
> > > I think I broke your diff as my last commit to derive the raw/qcow2
> > > format introduced some conflicts.  I had posted it on hackers@ and
> > > forgot that you aren't on the internal list yet - sorry for that.
>
> Updated version. Changes from the last diff:
>
> - Merge in syntax changes.
> - Don't over-read when getting the base images.
> - Fix relative paths in base images.
> - Allow multiple derived images to use a single base image, and allow a user
>   with only read permissions to base their images on top of it.
> - Probe the base image size, use/validate it when creating disk images.
> - Fix style a bit (long lines, changing from sizeof foo to sizeof(foo)).
> - Move a define out of vmmvar.h

You're still using VMM_MAX_BASE_PER_DISK instead of
VM_MAX_BASE_PER_DISK in the code.  I patched the diff to make it
compile without the vmmvar.h change here.

> - And update the manpage with these changes.
> - Improve error checking around creating/resolving base disk paths.
>

Light testing works except for an issue with read-only base images; the
required fix is in the comments below.

Other than that, it is really cool to run many VMs from a single base
image.  In my tests, I installed OpenBSD once and started a few VMs
using the installed disk as a base.

More comments below.

Reyk

>
> diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
> index c2a5f42d5f6..1f8673e0e26 100644
> --- regress/usr.sbin/vmd/diskfmt/Makefile
> +++ regress/usr.sbin/vmd/diskfmt/Makefile
> @@ -11,7 +11,7 @@
>  VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
>  
>  PROG=vioscribble
> -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> +SRCS=vioscribble.c vioqcow2.c vioraw.c
>  CFLAGS+=-I$(VMD_DIR) -pthread
>  LDFLAGS+=-pthread
>  
> @@ -26,3 +26,6 @@ scribble-images:
>  .PHONY: ${REGRESS_TARGETS} scribble-images
>  
>  .include <bsd.regress.mk>
> +
> +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
> diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
> index 14d720db652..1da8efedac7 100644
> --- regress/usr.sbin/vmd/diskfmt/vioscribble.c
> +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
> @@ -122,16 +122,18 @@ main(int argc, char **argv)
>   verbose = !!getenv("VERBOSE");
>   qcfd = open("scribble.qc2", O_RDWR);
>   rawfd = open("scribble.raw", O_RDWR);
> - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
> + if (qcfd == -1)
>   err(1, "unable to open qcow");
> - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
> + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
> + err(1, "unable to init qcow");
> + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
>   err(1, "unable to open raw");
>  
>   srandom_deterministic(123);
>  
>   /* scribble to both disks */
>   printf("scribbling...\n");
> - for (i = 0; i < 16; i++) {
> + for (i = 0; i < 1024*16; i++) {
>   off = (random() % DISKSZ);
>   len = random() % sizeof buf + 1;
>   fill(off, buf, sizeof buf);
> diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
> index 8748ecfdedc..4637256452b 100644
> --- usr.sbin/vmctl/main.c
> +++ usr.sbin/vmctl/main.c
> @@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char *[]);
>  
>  struct ctl_command ctl_commands[] = {
>   { "console", CMD_CONSOLE, ctl_console, "id" },
> - { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 },
> + { "create", CMD_CREATE, ctl_create,
> + "\"path\" [-s size] [-b base]", 1 },
>   { "load", CMD_LOAD, ctl_load, "\"path\"" },
>   { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
>   { "reload", CMD_RELOAD, ctl_reload, "" },
> @@ -538,47 +539,54 @@ int
>  ctl_create(struct parse_result *res, int argc, char *argv[])
>  {
>   int ch, ret, type;
> - const char *paths[2], *disk, *format;
> + const char *disk, *format, *base;
>  
>   if (argc < 2)
>   ctl_usage(res->ctl);
>  
> + base = NULL;
>   type = parse_disktype(argv[1], &disk);
>  
> - paths[0] = disk;
> - paths[1] = NULL;
> -
> - if (unveil(paths[0], "rwc") == -1)
> + if (unveil(disk, "rwc") == -1)
>   err(1, "unveil");
>  
> - if (pledge("stdio rpath wpath cpath", NULL) == -1)
> - err(1, "pledge");

Instead of doing the initial disk parsing without pledge, it is also
possible to pledge early with the "unveil" promise and to lock unveil
manually later with unveil(NULL, NULL).  It wouldn't need any other
pledge promises so it is not making it worse but better.  You can also
add another pledge without the *path and unveil promises after opening
the files.

>   argc--;
>   argv++;
>  
> - while ((ch = getopt(argc, argv, "s:")) != -1) {
> + while ((ch = getopt(argc, argv, "s:b:")) != -1) {
>   switch (ch) {
>   case 's':
>   if (parse_size(res, optarg, 0) != 0)
>   errx(1, "invalid size: %s", optarg);
>   break;
> + case 'b':
> + base = optarg;
> + if (unveil(base, "r") == -1)
> + err(1, "unveil");
> + break;
>   default:
>   ctl_usage(res->ctl);
>   /* NOTREACHED */
>   }
>   }
>  
> - if (res->size == 0) {
> - fprintf(stderr, "missing size argument\n");
> + if (pledge("stdio rpath wpath cpath", NULL) == -1)
> + err(1, "pledge");
> +
> + if (base && type != VMDF_QCOW2)
> + errx(1, "base images require qcow2 disk format");
> + if (res->size == 0 && !base) {
> + fprintf(stderr, "could not create %s: missing size argument\n",
> +    disk);
>   ctl_usage(res->ctl);
>   }
>  
>   if (type == VMDF_QCOW2) {
>   format = "qcow2";
> - ret = create_qc2_imagefile(paths[0], res->size);
> + ret = create_qc2_imagefile(disk, base, res->size);
>   } else {
>   format = "raw";
> - ret = create_raw_imagefile(paths[0], res->size);
> + ret = create_raw_imagefile(disk, res->size);
>   }
>  
>   if (ret != 0) {
> diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
> index f7890ac99f8..7a02452789c 100644
> --- usr.sbin/vmctl/vmctl.8
> +++ usr.sbin/vmctl/vmctl.8
> @@ -50,7 +50,7 @@ Using
>  .Xr cu 1
>  connect to the console of the VM with the specified
>  .Ar id .
> -.It Cm create Ar path Fl s Ar size
> +.It Cm create Ar path Fl s Op Ar size Op  Fl b Ar base
>  Creates a VM disk image file with the specified
>  .Ar path
>  and
> @@ -65,7 +65,14 @@ or
>  in order to specify the disk format.
>  If left unspecified, the format defaults to
>  .Pa raw
> -if it cannot be derived automatically.
> +if it cannot be derived automatically.  For qcow2, a
> +.Ar base
> +image may be specified. The base image is not modified. The derived image
> +contains only the changes written by the VM. When creating a derived image,
> +the
> +.Ar size
> +may be omitted, and probed from the base image. If it is provided, it must
> +match the base image size.
>  .It Cm load Ar filename
>  Load additional configuration from the specified file.
>  .It Cm log brief

... I'm with jmc@ on the manpage here.

> diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
> index b09e1115ff7..81fbdba9a28 100644
> --- usr.sbin/vmctl/vmctl.c
> +++ usr.sbin/vmctl/vmctl.c
> @@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
>  #define ALIGN(sz, align) \
>   ((sz + align - 1) & ~(align - 1))
>  int
> -create_qc2_imagefile(const char *imgfile_path, long imgsize)
> +create_qc2_imagefile(const char *imgfile_path,
> +    const char *base_path, long imgsize)
>  {
>   struct qcheader {
>   char magic[4];
> @@ -869,15 +870,30 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   uint64_t autoclearfeatures;
>   uint32_t reforder;
>   uint32_t headersz;
> - } __packed hdr;
> + } __packed hdr, basehdr;
>   int fd, ret;
> + ssize_t base_len;
>   uint64_t l1sz, refsz, disksz, initsz, clustersz;
>   uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
>   uint16_t refs;
>  
>   disksz = 1024*1024*imgsize;

This is not part of the diff, but it catches my eye.

According to style(9):
"Unary operators don't require spaces; binary operators do."

So it would be slightly more readable and better knf:

        disksz = 1024 * 1024 * imgsize;

> +
> + if (base_path) {
> + fd = open(base_path, O_RDONLY);
> + if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
> + err(1, "failure to read base image header");

I think you should compare the VM_MAGIC_QCOW here as well, at least
the magic bytes without the version.  vmd will do it later but it is
good to prevent users from trying to use a non-qcow2 base early in the
process.

> + close(fd);
> + if (!disksz)
> + disksz = betoh64(basehdr.disksz);
> + else if (disksz != betoh64(basehdr.disksz))
> + errx(1, "base size does not match requested size");
> + }
> + if (!base_path && !disksz)
> + errx(1, "missing disk size");
> +
>   clustersz = (1<<16);
> - l1off = ALIGN(sizeof hdr, clustersz);
> + l1off = ALIGN(sizeof(hdr), clustersz);
>  
>   l1entrysz = clustersz * clustersz / 8;
>   l1sz = (disksz + l1entrysz - 1) / l1entrysz;
> @@ -887,11 +903,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   refsz = (disksz + refentrysz - 1) / refentrysz;
>  
>   initsz = ALIGN(refoff + refsz*clustersz, clustersz);
> + base_len = base_path ? strlen(base_path) : 0;
>  
> - memcpy(hdr.magic, "QFI\xfb", 4);
> + memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
>   hdr.version = htobe32(3);
> - hdr.backingoff = htobe64(0);
> - hdr.backingsz = htobe32(0);
> + hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
> + hdr.backingsz = htobe32(base_len);
>   hdr.clustershift = htobe32(16);
>   hdr.disksz = htobe64(disksz);
>   hdr.cryptmethod = htobe32(0);
> @@ -905,7 +922,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   hdr.compatfeatures = htobe64(0);
>   hdr.autoclearfeatures = htobe64(0);
>   hdr.reforder = htobe32(4);
> - hdr.headersz = htobe32(sizeof hdr);
> + hdr.headersz = htobe32(sizeof(hdr));
>  
>   /* Refuse to overwrite an existing image */
>   fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
> @@ -914,7 +931,11 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   return (errno);
>  
>   /* Write out the header */
> - if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
> + if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
> + goto error;
> +
> + /* Add the base image */
> + if (base_path && write(fd, base_path, base_len) != base_len)
>   goto error;
>  
>   /* Extend to desired size, and add one refcount cluster */
> diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
> index 006411d9785..cdf50cad755 100644
> --- usr.sbin/vmctl/vmctl.h
> +++ usr.sbin/vmctl/vmctl.h
> @@ -87,7 +87,7 @@ __dead void
>  
>  /* vmctl.c */
>  int create_raw_imagefile(const char *, long);
> -int create_qc2_imagefile(const char *, long);
> +int create_qc2_imagefile(const char *, const char*, long);

Add a space: const char *

>  int vm_start(uint32_t, const char *, int, int, char **, int,
>      char **, int *, char *, char *, char *);
>  int vm_start_complete(struct imsg *, int *, int);
> diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
> index 550b73c1a39..8350d0fc4fc 100644
> --- usr.sbin/vmd/config.c
> +++ usr.sbin/vmd/config.c
> @@ -35,6 +35,7 @@
>  #include <util.h>
>  #include <errno.h>
>  #include <imsg.h>
> +#include <libgen.h>
>  
>  #include "proc.h"
>  #include "vmd.h"
> @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
>  int
>  config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  {
> + int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
>   struct vmd_if *vif;
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> - unsigned int i;
> + unsigned int i, j;
>   int fd = -1, vmboot = 0;
> - int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
> + int kernfd = -1;
> + int *tapfds;
>   int cdromfd = -1;
>   int saved_errno = 0;
> + int n = 0, aflags, oflags;
>   char ifname[IF_NAMESIZE], *s;
>   char path[PATH_MAX];
> + char base[PATH_MAX];
> + char expanded[PATH_MAX];
>   unsigned int unit;
>  
>   errno = 0;
> @@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   }
>   }
>  
> - diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
> - if (diskfds == NULL) {
> - log_warn("%s: can't allocate disk fds", __func__);
> - goto fail;
> - }
> - for (i = 0; i < vcp->vcp_ndisks; i++)
> - diskfds[i] = -1;
> + for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
> + diskfds[i][j] = -1;
>  
>   tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
>   if (tapfds == NULL) {
> @@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  
>   /* Open disk images for child */
>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
> -                /* Stat disk[i] to ensure it is a regular file */
> - if ((diskfds[i] = open(vcp->vcp_disks[i],
> -    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
> - log_warn("%s: can't open disk %s", __func__,
> -    vcp->vcp_disks[i]);
> - errno = VMD_DISK_MISSING;
> - goto fail;
> - }
> + if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX)

The correct idiom is:

                if (strlcpy(path, vcp->vcp_disks[i],
                    sizeof(path)) >= sizeof(path))

- >=
- sizeof(path) instead of PATH_MAX to be less error-prone

> + log_warnx("%s, disk path too long", __func__);
> + memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases);

sizeof with braces ;)

> + oflags = O_RDWR|O_EXLOCK|O_NONBLOCK;
> + aflags = R_OK|W_OK;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + /* Stat disk[i] to ensure it is a regular file */
> + if ((diskfds[i][j] = open(path, oflags)) == -1) {
> + log_warn("%s: can't open disk %s", __func__,
> +    vcp->vcp_disks[i]);
> + errno = VMD_DISK_MISSING;
> + goto fail;
> + }
>  
> - if (vm_checkaccess(diskfds[i],
> -    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
> -    uid, R_OK|W_OK) == -1) {
> - log_warnx("vm \"%s\" no read/write access to disk %s",
> -    vcp->vcp_name, vcp->vcp_disks[i]);
> - errno = EPERM;
> - goto fail;
> + if (vm_checkaccess(diskfds[i][j],
> +    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
> +    uid, R_OK|W_OK) == -1) {

This doesn't use aflags, so access to read-only bases still fails. It must be:

                            uid, aflags) == -1) {

> + log_warnx("vm \"%s\" unable to access "
> +    "disk %s", vcp->vcp_name,
> +    vcp->vcp_disks[i]);

And here you should print path instead of vcp->vcp_disks[i]: If access
to the base image fails, it would print the disk name instead of the
base image.

> + errno = EPERM;
> + goto fail;
> + }
> +
> + /*
> + * Clear the write and exclusive flags for base images.
> + * All writes should go to the top image, allowing them
> + * to be shared.
> + */
> + oflags = O_RDONLY|O_NONBLOCK;
> + aflags = R_OK;
> + n = virtio_get_base(diskfds[i][j], base, sizeof base,
> +    vmc->vmc_disktypes[i]);
> + if (n == 0)
> + break;
> + if (n == -1) {
> + log_warnx("vm \"%s\" unable to read "
> +    "base for disk %s", vcp->vcp_name,
> +    vcp->vcp_disks[i]);
> + goto fail;
> + }
> + /*
> + * Relative paths should be interpreted relative
> + * to the disk image, rather than relative to the
> + * directory vmd happens to be running in, since
> + * this is the only userful interpretation.
> + */
> + if (base[0] == '/') {
> + if (realpath(base, path) == NULL) {
> + log_warn("unable to resolve %s", base);
> + goto fail;
> + }
> + } else {
> + s = dirname(path);
> + if (snprintf(expanded, sizeof(expanded),
> +    "%s/%s", s, base) >= (int)sizeof(expanded)) {
> + log_warn("path too long: %s/%s",
> +    s, base);
> + goto fail;
> + }
> + if (realpath(expanded, path) == NULL) {
> + log_warn("unable to resolve %s", base);
> + goto fail;
> + }
> + }
>   }
>   }
>  
> @@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>      NULL, 0);
>  
>   for (i = 0; i < vcp->vcp_ndisks; i++) {
> - proc_compose_imsg(ps, PROC_VMM, -1,
> -    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
> -    &i, sizeof(i));
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + if (diskfds[i][j] == -1)
> + break;
> + proc_compose_imsg(ps, PROC_VMM, -1,
> +    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
> +    diskfds[i][j], &i, sizeof(i));
> + }
>   }
>   for (i = 0; i < vcp->vcp_nnics; i++) {
>   proc_compose_imsg(ps, PROC_VMM, -1,
> @@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   proc_compose_imsg(ps, PROC_VMM, -1,
>      IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
>  
> - free(diskfds);
>   free(tapfds);
>  
>   vm->vm_running = 1;
> @@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   close(kernfd);
>   if (cdromfd != -1)
>   close(cdromfd);
> - if (diskfds != NULL) {
> - for (i = 0; i < vcp->vcp_ndisks; i++)
> - close(diskfds[i]);
> - free(diskfds);
> - }
> + for (i = 0; i < vcp->vcp_ndisks; i++)
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
> + if (diskfds[i][j] != -1)
> + close(diskfds[i][j]);
>   if (tapfds != NULL) {
>   for (i = 0; i < vcp->vcp_nnics; i++)
>   close(tapfds[i]);
> @@ -489,7 +542,7 @@ int
>  config_getdisk(struct privsep *ps, struct imsg *imsg)
>  {
>   struct vmd_vm *vm;
> - unsigned int n;
> + unsigned int n, idx;
>  
>   errno = 0;
>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
> @@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
>   IMSG_SIZE_CHECK(imsg, &n);
>   memcpy(&n, imsg->data, sizeof(n));
>  
> - if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
> -    vm->vm_disks[n] != -1 || imsg->fd == -1) {
> + if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
>   log_warnx("invalid disk id");
>   errno = EINVAL;
>   return (-1);
>   }
> - vm->vm_disks[n] = imsg->fd;
> -
> + idx = vm->vm_params.vmc_diskbases[n]++;
> + if (idx >= VMM_MAX_BASE_PER_DISK) {
> + log_warnx("too many bases for disk");
> + errno = EINVAL;
> + return (-1);
> + }
> + vm->vm_disks[n][idx] = imsg->fd;
>   return (0);
>  }
>  
> diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
> index c3211d186fa..81dca129896 100644
> --- usr.sbin/vmd/vioqcow2.c
> +++ usr.sbin/vmd/vioqcow2.c
> @@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
>  static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
>  static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
>  static int inc_refs(struct qcdisk *, off_t, int);
> -static int qc2_openpath(struct qcdisk *, char *, int);
> -static int qc2_open(struct qcdisk *, int);
> +static int qc2_open(struct qcdisk *, int *, size_t);
>  static ssize_t qc2_pread(void *, char *, size_t, off_t);
>  static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
>  static void qc2_close(void *, int);
> @@ -118,14 +117,14 @@ static void qc2_close(void *, int);
>   * May open snapshot base images.
>   */
>  int
> -virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
> +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
>  {
>   struct qcdisk *diskp;
>  
>   diskp = malloc(sizeof(struct qcdisk));
>   if (diskp == NULL)
>   return -1;
> - if (qc2_open(diskp, fd) == -1) {
> + if (qc2_open(diskp, fd, nfd) == -1) {
>   log_warnx("%s: could not open qcow2 disk", __func__);
>   return -1;
>   }
> @@ -137,19 +136,41 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
>   return 0;
>  }
>  
> -static int
> -qc2_openpath(struct qcdisk *disk, char *path, int flags)
> +ssize_t
> +virtio_qcow2_get_base(int fd, char *path, size_t npath)
>  {
> - int fd;
> + struct qcheader header;
> + uint64_t backingoff;
> + uint32_t backingsz;
>  
> - fd = open(path, flags);
> - if (fd < 0)
> + if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
> + log_warnx("%s: short read on header", __func__);
> + return -1;
> + }
> + if (strncmp(header.magic, "QFI\xfb", 4) != 0) {

It is more for consistency, but we have a #define:

        strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW))

> + log_warn("%s: invalid magic numbers", __func__);
>   return -1;
> - return qc2_open(disk, fd);
> + }
> + backingoff = be64toh(header.backingoff);
> + backingsz = be32toh(header.backingsz);
> + if (backingsz != 0) {
> + if (backingsz >= npath - 1) {
> + log_warn("%s: snapshot path too long", __func__);
> + return -1;
> + }
> + if (pread(fd, path, backingsz, backingoff) != backingsz) {
> + log_warnx("%s: could not read snapshot base name",
> +    __func__);
> + return -1;
> + }
> + path[backingsz] = '\0';
> + log_warnx("path: %s\n", path);

That looks like a left-over debug message.  And it shouldn't be warn.
And shouldn't have a \n.

> + }
> + return backingsz;
>  }
>  
>  static int
> -qc2_open(struct qcdisk *disk, int fd)
> +qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
>  {
>   char basepath[PATH_MAX];
>   struct stat st;
> @@ -157,14 +178,15 @@ qc2_open(struct qcdisk *disk, int fd)
>   uint64_t backingoff;
>   uint32_t backingsz;
>   size_t i;
> - int version;
> + int version, fd;
>  
>   pthread_rwlock_init(&disk->lock, NULL);
> + fd = fds[0];
>   disk->fd = fd;
>   disk->base = NULL;
>   disk->l1 = NULL;
>  
> - if (pread(fd, &header, sizeof header, 0) != sizeof header) {
> + if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
>   log_warn("%s: short read on header", __func__);
>   goto error;
>   }
> @@ -203,7 +225,7 @@ qc2_open(struct qcdisk *disk, int fd)
>   goto error;
>   }
>  
> - disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
> + disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
>   if (!disk->l1)
>   goto error;
>   if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off)
> @@ -222,14 +244,7 @@ qc2_open(struct qcdisk *disk, int fd)
>   backingoff = be64toh(header.backingoff);
>   backingsz = be32toh(header.backingsz);
>   if (backingsz != 0) {
> - /*
> - * FIXME: we need to figure out a way of opening these things,
> - * otherwise we just crash with a pledge violation.
> - */
> - log_warn("%s: unsupported external snapshot images", __func__);
> - goto error;
> -
> - if (backingsz >= sizeof basepath - 1) {
> + if (backingsz >= sizeof(basepath) - 1) {
>   log_warn("%s: snapshot path too long", __func__);
>   goto error;
>   }
> @@ -239,11 +254,17 @@ qc2_open(struct qcdisk *disk, int fd)
>   goto error;
>   }
>   basepath[backingsz] = 0;
> + if (nfd <= 1) {
> + log_warnx("%s: missing base image %s", __func__,
> +    basepath);
> + goto error;
> + }
> +
>  
>   disk->base = calloc(1, sizeof(struct qcdisk));
>   if (!disk->base)
>   goto error;
> - if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
> + if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
>   log_warn("%s: could not open %s", basepath, __func__);
>   goto error;
>   }
> @@ -521,12 +542,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
>   cluster = disk->end;
>   disk->end += disk->clustersz;
>   buf = htobe64(cluster | QCOW2_INPLACE);
> - if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
> + if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off*8) != sizeof(buf))

While here, you can add spaces, but then you have to wrap the line:

        if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) !=
            sizeof(buf))
>   goto fail;
>  
>   /* TODO: lazily sync: currently VMD doesn't close things */
>   buf = htobe64(disk->l1[l1off]);
> - if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
> + if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8*l1off) != 8)

And this one can remain on the same line :)

        if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)

>   goto fail;
>   if (inc_refs(disk, cluster, 1) == -1)
>   goto fail;
> @@ -571,7 +592,7 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
>   l1idx = (off / disk->clustersz) / nper;
>   l2idx = (off / disk->clustersz) % nper;
>   l1off = disk->refoff + 8*l1idx;
> - if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
> + if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
>   return -1;
>  
>   l2cluster = be64toh(buf);
> @@ -583,19 +604,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
>   return -1;
>   }
>   buf = htobe64(l2cluster);
> - if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
> + if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) {
>   return -1;
>   }
>   }
>  
>   refs = 1;
>   if (!newcluster) {
> - if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
> + if (pread(disk->fd, &refs, sizeof(refs),
> +    l2cluster+2*l2idx) != 2)

That is a good example of why spaces around binary operators are more readable:

                    l2cluster + 2 * l2idx

>   return -1;
>   refs = be16toh(refs) + 1;
>   }
>   refs = htobe16(refs);
> - if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
> + if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2*l2idx) != 2) {
>   log_warn("%s: could not write ref block", __func__);
>   return -1;
>   }
> diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
> index e02ab67c5dc..ff4bbb3095e 100644
> --- usr.sbin/vmd/vioraw.c
> +++ usr.sbin/vmd/vioraw.c
> @@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
>   * returning -1 for error, 0 for success.
>   */
>  int
> -virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
> +virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
>  {
>   off_t sz;
>   int *fdp;
>  
> - sz = lseek(fd, 0, SEEK_END);
> + if (nfd != 1)
> + return -1;
> + sz = lseek(fd[0], 0, SEEK_END);
>   if (sz == -1)
>   return -1;
>  
>   fdp = malloc(sizeof(int));
>   if (!fdp)
>   return -1;
> - *fdp = fd;
> + *fdp = fd[0];
>   file->p = fdp;
>   file->pread = raw_pread;
>   file->pwrite = raw_pwrite;
> diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
> index 93490344560..1c66f33216a 100644
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
>   return (0);
>  }
>  
> +int
> +virtio_get_base(int fd, char *path, size_t npath ,int type)
> +{
> + switch (type) {
> + case VMDF_RAW: return 0;

As mentioned before, unlike shell code or Rust match statements, we
usually put a newline between case and the block, even for single-line
statements.

> + case VMDF_QCOW2: return virtio_qcow2_get_base(fd, path, npath);
> + }
> + log_warnx("%s: invalid disk format", __func__);
> + return -1;
> +}
> +
> +/*
> + * Initializes a struct virtio_backing using the list of fds.
> + */
>  static int
> -virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
> +virtio_init_disk(struct virtio_backing *file, off_t *sz,
> +    int *fd, size_t nfd, int type)
>  {
>   /*
>   * probe disk types in order of preference, first one to work wins.
>   * TODO: provide a way of specifying the type and options.
>   */
>   switch (type) {
> - case VMDF_RAW: return virtio_init_raw(file, sz, fd);
> - case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
> + case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
> + case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
>   }
>   log_warnx("%s: invalid disk format", __func__);
>   return -1;
>  }
>  
>  void
> -virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
> -    int *child_taps)
> +virtio_init(struct vmd_vm *vm, int child_cdrom,
> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> @@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>   vioblk[i].vm_id = vcp->vcp_id;
>   vioblk[i].irq = pci_get_dev_irq(id);
>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
> -    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
> +    child_disks[i], vmc->vmc_diskbases[i],
> +    vmc->vmc_disktypes[i]) == -1) {
>   log_warnx("%s: unable to determine disk format",
>      __func__);
>   return;
> @@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>   vioscsi->vq[i].last_avail = 0;
>   }
>   if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
> -    child_cdrom, VMDF_RAW) == -1) {
> +    &child_cdrom, 1, VMDF_RAW) == -1) {
>   log_warnx("%s: unable to determine iso format",
>      __func__);
>   return;
> @@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
>  }
>  
>  int
> -vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
> +vioblk_restore(int fd, struct vmop_create_params *vmc,
> +    int child_disks[][VMM_MAX_BASE_PER_DISK])
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   uint8_t i;
> @@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
>   return (-1);
>   }
>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
> -    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
> +    child_disks[i], vmc->vmc_diskbases[i],
> +    vmc->vmc_disktypes[i]) == -1)  {
>   log_warnx("%s: unable to determine disk format",
>      __func__);
>   return (-1);
> @@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
>   return (-1);
>   }
>  
> - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
> + if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
>      VMDF_RAW) == -1) {
>   log_warnx("%s: unable to determine iso format", __func__);
>   return (-1);
> @@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
>  }
>  
>  int
> -virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
> -    int *child_taps)
> +virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
> index 46006916b6a..91f0e323204 100644
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -257,10 +257,11 @@ struct ioinfo {
>  };
>  
>  /* virtio.c */
> -void virtio_init(struct vmd_vm *, int, int *, int *);
> +void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK], int *);
>  void virtio_shutdown(struct vmd_vm *);
>  int virtio_dump(int);
> -int virtio_restore(int, struct vmd_vm *, int, int *, int *);
> +int virtio_restore(int, struct vmd_vm *, int,
> +    int[][VMM_MAX_BASE_PER_DISK], int *);
>  uint32_t vring_size(uint32_t);
>  
>  int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
> @@ -270,12 +271,14 @@ void viornd_update_qs(void);
>  void viornd_update_qa(void);
>  int viornd_notifyq(void);
>  
> -int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
> -int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
> +ssize_t virtio_qcow2_get_base(int, char *, size_t);
> +int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
> +int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
>  
>  int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
>  int vioblk_dump(int);
> -int vioblk_restore(int, struct vmop_create_params *, int *);
> +int vioblk_restore(int, struct vmop_create_params *,
> +    int[][VMM_MAX_BASE_PER_DISK]);
>  void vioblk_update_qs(struct vioblk_dev *);
>  void vioblk_update_qa(struct vioblk_dev *);
>  int vioblk_notifyq(struct vioblk_dev *);
> diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
> index ef4494d918b..7c9aaf27012 100644
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -65,8 +65,8 @@
>  
>  io_fn_t ioports_map[MAX_PORTS];
>  
> -int run_vm(int, int *, int *, struct vmop_create_params *,
> -    struct vcpu_reg_state *);
> +int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *,
> +    struct vmop_create_params *, struct vcpu_reg_state *);
>  void vm_dispatch_vmm(int, short, void *);
>  void *event_thread(void *);
>  void *vcpu_run_loop(void *);
> @@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
>  void create_memory_map(struct vm_create_params *);
>  int alloc_guest_mem(struct vm_create_params *);
>  int vmm_create_vm(struct vm_create_params *);
> -void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
> -void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
> +void init_emulated_hw(struct vmop_create_params *, int,
> +    int[][VMM_MAX_BASE_PER_DISK], int *);
> +void restore_emulated_hw(struct vm_create_params *, int, int *,
> +    int[][VMM_MAX_BASE_PER_DISK],int);
>  void vcpu_exit_inout(struct vm_run_params *);
>  uint8_t vcpu_exit_pci(struct vm_run_params *);
>  int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
> @@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd)
>  
>   /* Find and open kernel image */
>   if ((fp = vmboot_open(vm->vm_kernel,
> -    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
> +    vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == NULL)

OK, now I get that the first fd is the actual "working" disk image,
but I still think that vmboot_open needs the other fds for the base
images as well.  But we can probably fix that in another diff after
this one got in.

>   fatalx("failed to open kernel - exiting");
>  
>   /* Load kernel image */
> @@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp)
>   */
>  void
>  init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
> -    int *child_disks, int *child_taps)
> +    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   int i;
> @@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
>   */
>  void
>  restore_emulated_hw(struct vm_create_params *vcp, int fd,
> -    int *child_taps, int *child_disks, int child_cdrom)
> +    int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int child_cdrom)
>  {
>   /* struct vm_create_params *vcp = &vmc->vmc_params; */
>   int i;
> @@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
>   *  !0 : the VM exited abnormally or failed to start
>   */
>  int
> -run_vm(int child_cdrom, int *child_disks, int *child_taps,
> -    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
> +run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK],
> +    int *child_taps, struct vmop_create_params *vmc,
> +    struct vcpu_reg_state *vrs)
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   struct vm_rwregs_params vregsp;
> diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
> index 44ceeb64a7e..f23d8d6b25a 100644
> --- usr.sbin/vmd/vmboot.c
> +++ usr.sbin/vmd/vmboot.c
> @@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
>  
>   switch (vmboot->vbp_type) {
>   case VMDF_RAW:
> - if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
> + if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) {
>   log_debug("%s: could not open raw disk", __func__);
>   goto fail;
>   }
>   break;
>   case VMDF_QCOW2:
> - if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
> + if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) {

Yup, a qcow2 disk with a base image would fail here.

>   log_debug("%s: could not open qcow2 disk", __func__);
>   goto fail;
>   }
> diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
> index 1571be21bc5..8377812c316 100644
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -1097,7 +1097,7 @@ void
>  vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>  {
>   struct privsep *ps = &env->vmd_ps;
> - unsigned int i;
> + unsigned int i, j;
>  
>   if (vm == NULL)
>   return;
> @@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>   close(vm->vm_iev.ibuf.fd);
>   }
>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
> - if (vm->vm_disks[i] != -1) {
> - close(vm->vm_disks[i]);
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {

As mentioned on top, there are some VMM_MAX_BASE_PER_DISK instead of
VM_MAX_BASE_PER_DISK in here.

> + if (vm->vm_disks[i][j] != -1) {
> + close(vm->vm_disks[i][j]);
> + vm->vm_disks[i][j] = -1;
> + }
>   }
>   }
>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
> @@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
>   struct vmop_owner *vmo = NULL;
>   struct vmd_user *usr = NULL;
>   uint32_t rng;
> - unsigned int i;
> + unsigned int i, j;
>   struct vmd_switch *sw;
>   char *s;
>  
> @@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
>   vm->vm_user = usr;
>  
>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
> + vm->vm_disks[i][j] = -1;
>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
>   vm->vm_ifs[i].vif_fd = -1;
>   for (i = 0; i < vcp->vcp_nnics; i++) {
> diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
> index b348d12c757..0683629b3b1 100644
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -48,6 +48,7 @@
>  #define VM_DEFAULT_DEVICE "hd0a"
>  #define VM_BOOT_CONF "/etc/boot.conf"
>  #define VM_NAME_MAX 64
> +#define VM_MAX_BASE_PER_DISK 4
>  #define VM_TTYNAME_MAX 16
>  #define MAX_TAP 256
>  #define NR_BACKLOG 5
> @@ -169,6 +170,7 @@ struct vmop_create_params {
>  #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
>  
>   unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
> + unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
>  #define VMDF_RAW 0x01
>  #define VMDF_QCOW2 0x02
>  
> @@ -241,7 +243,7 @@ struct vmd_vm {
>   uint32_t vm_vmid;
>   int vm_kernel;
>   int vm_cdrom;
> - int vm_disks[VMM_MAX_DISKS_PER_VM];
> + int vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
>   struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
>   char *vm_ttyname;
>   int vm_tty;
> @@ -415,4 +417,7 @@ int parse_config(const char *);
>  int cmdline_symset(char *);
>  int host(const char *, struct address *);
>  
> +/* virtio.c */
> +int virtio_get_base(int, char *, size_t, int);
> +
>  #endif /* VMD_H */
> diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
> index 7757856323f..3176fd85713 100644
> --- usr.sbin/vmd/vmm.c
> +++ usr.sbin/vmd/vmm.c
> @@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
>   struct vmd_vm *vm;
>   int ret = EINVAL;
>   int fds[2];
> - size_t i;
> + size_t i, j;
>  
>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
>   log_warnx("%s: can't find vm", __func__);
> @@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
>   close(fds[1]);
>  
>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
> - close(vm->vm_disks[i]);
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
> + if (vm->vm_disks[i][j] != -1)
> + close(vm->vm_disks[i][j]);
> + vm->vm_disks[i][j] = -1;
> + }
>   }
>   for (i = 0 ; i < vcp->vcp_nnics; i++) {
>   close(vm->vm_ifs[i].vif_fd);
>
> --
>     Ori Bernstein

--

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Ori Bernstein
On Wed, 3 Oct 2018 11:36:53 +0200, Reyk Floeter <[hidden email]> wrote:

> On Tue, Oct 02, 2018 at 11:13:35PM -0700, Ori Bernstein wrote:
> > On Mon, 1 Oct 2018 11:24:01 -0700, Ori Bernstein <[hidden email]> wrote:
> >
> > > On Mon, 1 Oct 2018 12:55:12 +0200
> > > Reyk Floeter <[hidden email]> wrote:
> > >
> > > > Hi Ori,
> > > >
> > > > On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
> > > > > I've added support to vmd for external snapshots. That is,
> > > > > snapshots that are derived from a base image. Data lookups
> > > > > start in the derived image, and if the derived image does not
> > > > > > contain some data, the search proceeds to the base image.
> > > > > Multiple derived images may exist off of a single base image.
> > > > >
> > > >
> > > > Nice work!  This will be quite useful, thanks.
> > > >
> > > > I think I broke your diff as my last commit to derive the raw/qcow2
> > > > format introduced some conflicts.  I had posted it on hackers@ and
> > > > > forgot that you aren't on the internal list yet - sorry for that.
> >
> > Updated version. Changes from the last diff:
> >
> > - Merge in syntax changes.
> > - Don't over-read when getting the base images.
> > - Fix relative paths in base images.
> > - Allow multiple derived images to use a single base image, and allow a user
> > >   with only read permissions to base their images on top of it.
> > > - Probe the base image size, use/validate it when creating disk images.
> > > - Fix style a bit (long lines, changing from sizeof foo to sizeof(foo)).
> > - Move a define out of vmmvar.h
 

> Light testing works except for an issue with read-only base images; the
> required fix is in the comments below.
>
> Other than that, it is really cool to run many VMs from a single base
> image.  In my tests, I installed OpenBSD once and started a few VMs
> using the installed disk as a base.
>
> More comments below.
>
> Reyk
 
Thanks, another update based on Reyk's feedback and fixes.

diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
index c2a5f42d5f6..1f8673e0e26 100644
--- regress/usr.sbin/vmd/diskfmt/Makefile
+++ regress/usr.sbin/vmd/diskfmt/Makefile
@@ -11,7 +11,7 @@
 VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
 
 PROG=vioscribble
-SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+SRCS=vioscribble.c vioqcow2.c vioraw.c
 CFLAGS+=-I$(VMD_DIR) -pthread
 LDFLAGS+=-pthread
 
@@ -26,3 +26,6 @@ scribble-images:
 .PHONY: ${REGRESS_TARGETS} scribble-images
 
 .include <bsd.regress.mk>
+
+vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+ cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
index 14d720db652..1da8efedac7 100644
--- regress/usr.sbin/vmd/diskfmt/vioscribble.c
+++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
@@ -122,16 +122,18 @@ main(int argc, char **argv)
  verbose = !!getenv("VERBOSE");
  qcfd = open("scribble.qc2", O_RDWR);
  rawfd = open("scribble.raw", O_RDWR);
- if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
+ if (qcfd == -1)
  err(1, "unable to open qcow");
- if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
+ if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
+ err(1, "unable to init qcow");
+ if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
  err(1, "unable to open raw");
 
  srandom_deterministic(123);
 
  /* scribble to both disks */
  printf("scribbling...\n");
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < 1024*16; i++) {
  off = (random() % DISKSZ);
  len = random() % sizeof buf + 1;
  fill(off, buf, sizeof buf);
diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
index 8748ecfdedc..a3ab4672370 100644
--- usr.sbin/vmctl/main.c
+++ usr.sbin/vmctl/main.c
@@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char *[]);
 
 struct ctl_command ctl_commands[] = {
  { "console", CMD_CONSOLE, ctl_console, "id" },
- { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 },
+ { "create", CMD_CREATE, ctl_create,
+ "\"path\" [-s size] [-b base]", 1 },
  { "load", CMD_LOAD, ctl_load, "\"path\"" },
  { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
  { "reload", CMD_RELOAD, ctl_reload, "" },
@@ -538,47 +539,55 @@ int
 ctl_create(struct parse_result *res, int argc, char *argv[])
 {
  int ch, ret, type;
- const char *paths[2], *disk, *format;
+ const char *disk, *format, *base;
 
  if (argc < 2)
  ctl_usage(res->ctl);
 
+ base = NULL;
  type = parse_disktype(argv[1], &disk);
 
- paths[0] = disk;
- paths[1] = NULL;
-
- if (unveil(paths[0], "rwc") == -1)
+ if (pledge("stdio rpath wpath cpath unveil", NULL) == -1)
+ err(1, "pledge");
+ if (unveil(disk, "rwc") == -1)
  err(1, "unveil");
 
- if (pledge("stdio rpath wpath cpath", NULL) == -1)
- err(1, "pledge");
  argc--;
  argv++;
 
- while ((ch = getopt(argc, argv, "s:")) != -1) {
+ while ((ch = getopt(argc, argv, "s:b:")) != -1) {
  switch (ch) {
  case 's':
  if (parse_size(res, optarg, 0) != 0)
  errx(1, "invalid size: %s", optarg);
  break;
+ case 'b':
+ base = optarg;
+ if (unveil(base, "r") == -1)
+ err(1, "unveil");
+ break;
  default:
  ctl_usage(res->ctl);
  /* NOTREACHED */
  }
  }
+ if (unveil(NULL, NULL))
+ err(1, "unveil");
 
- if (res->size == 0) {
- fprintf(stderr, "missing size argument\n");
+ if (base && type != VMDF_QCOW2)
+ errx(1, "base images require qcow2 disk format");
+ if (res->size == 0 && !base) {
+ fprintf(stderr, "could not create %s: missing size argument\n",
+    disk);
  ctl_usage(res->ctl);
  }
 
  if (type == VMDF_QCOW2) {
  format = "qcow2";
- ret = create_qc2_imagefile(paths[0], res->size);
+ ret = create_qc2_imagefile(disk, base, res->size);
  } else {
  format = "raw";
- ret = create_raw_imagefile(paths[0], res->size);
+ ret = create_raw_imagefile(disk, res->size);
  }
 
  if (ret != 0) {
diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
index f7890ac99f8..7a02452789c 100644
--- usr.sbin/vmctl/vmctl.8
+++ usr.sbin/vmctl/vmctl.8
@@ -50,7 +50,7 @@ Using
 .Xr cu 1
 connect to the console of the VM with the specified
 .Ar id .
-.It Cm create Ar path Fl s Ar size
+.It Cm create Ar path Fl s Op Ar size Op  Fl b Ar base
 Creates a VM disk image file with the specified
 .Ar path
 and
@@ -65,7 +65,14 @@ or
 in order to specify the disk format.
 If left unspecified, the format defaults to
 .Pa raw
-if it cannot be derived automatically.
+if it cannot be derived automatically.  For qcow2, a
+.Ar base
+image may be specified. The base image is not modified. The derived image
+contains only the changes written by the VM. When creating a derived image,
+the
+.Ar size
+may be omitted, in which case it is probed from the base image. If it is provided, it must
+match the base image size.
 .It Cm load Ar filename
 Load additional configuration from the specified file.
 .It Cm log brief
diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
index b09e1115ff7..12db3f69525 100644
--- usr.sbin/vmctl/vmctl.c
+++ usr.sbin/vmctl/vmctl.c
@@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
 #define ALIGN(sz, align) \
  ((sz + align - 1) & ~(align - 1))
 int
-create_qc2_imagefile(const char *imgfile_path, long imgsize)
+create_qc2_imagefile(const char *imgfile_path,
+    const char *base_path, long imgsize)
 {
  struct qcheader {
  char magic[4];
@@ -869,15 +870,33 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  uint64_t autoclearfeatures;
  uint32_t reforder;
  uint32_t headersz;
- } __packed hdr;
+ } __packed hdr, basehdr;
  int fd, ret;
+ ssize_t base_len;
  uint64_t l1sz, refsz, disksz, initsz, clustersz;
  uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
  uint16_t refs;
 
- disksz = 1024*1024*imgsize;
+ disksz = 1024 * 1024 * imgsize;
+
+ if (base_path) {
+ fd = open(base_path, O_RDONLY);
+ if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
+ err(1, "failure to read base image header");
+ close(fd);
+ if (strncmp(basehdr.magic,
+    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
+ errx(1, "base image is not a qcow2 file");
+ if (!disksz)
+ disksz = betoh64(basehdr.disksz);
+ else if (disksz != betoh64(basehdr.disksz))
+ errx(1, "base size does not match requested size");
+ }
+ if (!base_path && !disksz)
+ errx(1, "missing disk size");
+
  clustersz = (1<<16);
- l1off = ALIGN(sizeof hdr, clustersz);
+ l1off = ALIGN(sizeof(hdr), clustersz);
 
  l1entrysz = clustersz * clustersz / 8;
  l1sz = (disksz + l1entrysz - 1) / l1entrysz;
@@ -887,11 +906,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  refsz = (disksz + refentrysz - 1) / refentrysz;
 
  initsz = ALIGN(refoff + refsz*clustersz, clustersz);
+ base_len = base_path ? strlen(base_path) : 0;
 
- memcpy(hdr.magic, "QFI\xfb", 4);
+ memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
  hdr.version = htobe32(3);
- hdr.backingoff = htobe64(0);
- hdr.backingsz = htobe32(0);
+ hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
+ hdr.backingsz = htobe32(base_len);
  hdr.clustershift = htobe32(16);
  hdr.disksz = htobe64(disksz);
  hdr.cryptmethod = htobe32(0);
@@ -905,7 +925,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  hdr.compatfeatures = htobe64(0);
  hdr.autoclearfeatures = htobe64(0);
  hdr.reforder = htobe32(4);
- hdr.headersz = htobe32(sizeof hdr);
+ hdr.headersz = htobe32(sizeof(hdr));
 
  /* Refuse to overwrite an existing image */
  fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
@@ -914,7 +934,11 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  return (errno);
 
  /* Write out the header */
- if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
+ if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ goto error;
+
+ /* Add the base image */
+ if (base_path && write(fd, base_path, base_len) != base_len)
  goto error;
 
  /* Extend to desired size, and add one refcount cluster */
diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
index 006411d9785..2d0355450ee 100644
--- usr.sbin/vmctl/vmctl.h
+++ usr.sbin/vmctl/vmctl.h
@@ -87,7 +87,7 @@ __dead void
 
 /* vmctl.c */
 int create_raw_imagefile(const char *, long);
-int create_qc2_imagefile(const char *, long);
+int create_qc2_imagefile(const char *, const char *, long);
 int vm_start(uint32_t, const char *, int, int, char **, int,
     char **, int *, char *, char *, char *);
 int vm_start_complete(struct imsg *, int *, int);
diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
index 550b73c1a39..68be738d304 100644
--- usr.sbin/vmd/config.c
+++ usr.sbin/vmd/config.c
@@ -35,6 +35,7 @@
 #include <util.h>
 #include <errno.h>
 #include <imsg.h>
+#include <libgen.h>
 
 #include "proc.h"
 #include "vmd.h"
@@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
 int
 config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 {
+ int diskfds[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
  struct vmd_if *vif;
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
- unsigned int i;
+ unsigned int i, j;
  int fd = -1, vmboot = 0;
- int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
+ int kernfd = -1;
+ int *tapfds;
  int cdromfd = -1;
  int saved_errno = 0;
+ int n = 0, aflags, oflags;
  char ifname[IF_NAMESIZE], *s;
  char path[PATH_MAX];
+ char base[PATH_MAX];
+ char expanded[PATH_MAX];
  unsigned int unit;
 
  errno = 0;
@@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  }
  }
 
- diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
- if (diskfds == NULL) {
- log_warn("%s: can't allocate disk fds", __func__);
- goto fail;
- }
- for (i = 0; i < vcp->vcp_ndisks; i++)
- diskfds[i] = -1;
+ for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ diskfds[i][j] = -1;
 
  tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
  if (tapfds == NULL) {
@@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 
  /* Open disk images for child */
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
-                /* Stat disk[i] to ensure it is a regular file */
- if ((diskfds[i] = open(vcp->vcp_disks[i],
-    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
- log_warn("%s: can't open disk %s", __func__,
-    vcp->vcp_disks[i]);
- errno = VMD_DISK_MISSING;
- goto fail;
- }
+ if (strlcpy(path, vcp->vcp_disks[i], sizeof(path))
+   >= sizeof(path))
+ log_warnx("%s, disk path too long", __func__);
+ memset(vmc->vmc_diskbases, 0, sizeof(vmc->vmc_diskbases));
+ oflags = O_RDWR|O_EXLOCK|O_NONBLOCK;
+ aflags = R_OK|W_OK;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ /* Stat disk[i] to ensure it is a regular file */
+ if ((diskfds[i][j] = open(path, oflags)) == -1) {
+ log_warn("%s: can't open disk %s", __func__,
+    vcp->vcp_disks[i]);
+ errno = VMD_DISK_MISSING;
+ goto fail;
+ }
 
- if (vm_checkaccess(diskfds[i],
-    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
-    uid, R_OK|W_OK) == -1) {
- log_warnx("vm \"%s\" no read/write access to disk %s",
-    vcp->vcp_name, vcp->vcp_disks[i]);
- errno = EPERM;
- goto fail;
+ if (vm_checkaccess(diskfds[i][j],
+    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
+    uid, aflags) == -1) {
+ log_warnx("vm \"%s\" unable to access "
+    "disk %s", vcp->vcp_name, path);
+ errno = EPERM;
+ goto fail;
+ }
+
+ /*
+ * Clear the write and exclusive flags for base images.
+ * All writes should go to the top image, allowing base
+ * images to be shared.
+ */
+ oflags = O_RDONLY|O_NONBLOCK;
+ aflags = R_OK;
+ n = virtio_get_base(diskfds[i][j], base, sizeof base,
+    vmc->vmc_disktypes[i]);
+ if (n == 0)
+ break;
+ if (n == -1) {
+ log_warnx("vm \"%s\" unable to read "
+    "base %s for disk %s", vcp->vcp_name,
+    base, vcp->vcp_disks[i]);
+ goto fail;
+ }
+ /*
+ * Relative paths should be interpreted relative
+ * to the disk image, rather than relative to the
+ * directory vmd happens to be running in, since
+ * this is the only useful interpretation.
+ */
+ if (base[0] == '/') {
+ if (realpath(base, path) == NULL) {
+ log_warn("unable to resolve %s", base);
+ goto fail;
+ }
+ } else {
+ s = dirname(path);
+ if (snprintf(expanded, sizeof(expanded),
+    "%s/%s", s, base) >= (int)sizeof(expanded)) {
+ log_warn("path too long: %s/%s",
+    s, base);
+ goto fail;
+ }
+ if (realpath(expanded, path) == NULL) {
+ log_warn("unable to resolve %s", base);
+ goto fail;
+ }
+ }
  }
  }
 
@@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
     NULL, 0);
 
  for (i = 0; i < vcp->vcp_ndisks; i++) {
- proc_compose_imsg(ps, PROC_VMM, -1,
-    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
-    &i, sizeof(i));
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ if (diskfds[i][j] == -1)
+ break;
+ proc_compose_imsg(ps, PROC_VMM, -1,
+    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
+    diskfds[i][j], &i, sizeof(i));
+ }
  }
  for (i = 0; i < vcp->vcp_nnics; i++) {
  proc_compose_imsg(ps, PROC_VMM, -1,
@@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  proc_compose_imsg(ps, PROC_VMM, -1,
     IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
 
- free(diskfds);
  free(tapfds);
 
  vm->vm_running = 1;
@@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  close(kernfd);
  if (cdromfd != -1)
  close(cdromfd);
- if (diskfds != NULL) {
- for (i = 0; i < vcp->vcp_ndisks; i++)
- close(diskfds[i]);
- free(diskfds);
- }
+ for (i = 0; i < vcp->vcp_ndisks; i++)
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ if (diskfds[i][j] != -1)
+ close(diskfds[i][j]);
  if (tapfds != NULL) {
  for (i = 0; i < vcp->vcp_nnics; i++)
  close(tapfds[i]);
@@ -489,7 +542,7 @@ int
 config_getdisk(struct privsep *ps, struct imsg *imsg)
 {
  struct vmd_vm *vm;
- unsigned int n;
+ unsigned int n, idx;
 
  errno = 0;
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
@@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
  IMSG_SIZE_CHECK(imsg, &n);
  memcpy(&n, imsg->data, sizeof(n));
 
- if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
-    vm->vm_disks[n] != -1 || imsg->fd == -1) {
+ if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
  log_warnx("invalid disk id");
  errno = EINVAL;
  return (-1);
  }
- vm->vm_disks[n] = imsg->fd;
-
+ idx = vm->vm_params.vmc_diskbases[n]++;
+ if (idx >= VM_MAX_BASE_PER_DISK) {
+ log_warnx("too many bases for disk");
+ errno = EINVAL;
+ return (-1);
+ }
+ vm->vm_disks[n][idx] = imsg->fd;
  return (0);
 }
 
diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
index c3211d186fa..25799cc5a3d 100644
--- usr.sbin/vmd/vioqcow2.c
+++ usr.sbin/vmd/vioqcow2.c
@@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
 static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static int inc_refs(struct qcdisk *, off_t, int);
-static int qc2_openpath(struct qcdisk *, char *, int);
-static int qc2_open(struct qcdisk *, int);
+static int qc2_open(struct qcdisk *, int *, size_t);
 static ssize_t qc2_pread(void *, char *, size_t, off_t);
 static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
 static void qc2_close(void *, int);
@@ -118,14 +117,14 @@ static void qc2_close(void *, int);
  * May open snapshot base images.
  */
 int
-virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  struct qcdisk *diskp;
 
  diskp = malloc(sizeof(struct qcdisk));
  if (diskp == NULL)
  return -1;
- if (qc2_open(diskp, fd) == -1) {
+ if (qc2_open(diskp, fd, nfd) == -1) {
  log_warnx("%s: could not open qcow2 disk", __func__);
  return -1;
  }
@@ -137,19 +136,40 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
  return 0;
 }
 
-static int
-qc2_openpath(struct qcdisk *disk, char *path, int flags)
+ssize_t
+virtio_qcow2_get_base(int fd, char *path, size_t npath)
 {
- int fd;
+ struct qcheader header;
+ uint64_t backingoff;
+ uint32_t backingsz;
 
- fd = open(path, flags);
- if (fd < 0)
+ if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
+ log_warnx("%s: short read on header", __func__);
+ return -1;
+ }
+ if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
+ log_warn("%s: invalid magic numbers", __func__);
  return -1;
- return qc2_open(disk, fd);
+ }
+ backingoff = be64toh(header.backingoff);
+ backingsz = be32toh(header.backingsz);
+ if (backingsz != 0) {
+ if (backingsz >= npath - 1) {
+ log_warn("%s: snapshot path too long", __func__);
+ return -1;
+ }
+ if (pread(fd, path, backingsz, backingoff) != backingsz) {
+ log_warnx("%s: could not read snapshot base name",
+    __func__);
+ return -1;
+ }
+ path[backingsz] = '\0';
+ }
+ return backingsz;
 }
 
 static int
-qc2_open(struct qcdisk *disk, int fd)
+qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
 {
  char basepath[PATH_MAX];
  struct stat st;
@@ -157,14 +177,15 @@ qc2_open(struct qcdisk *disk, int fd)
  uint64_t backingoff;
  uint32_t backingsz;
  size_t i;
- int version;
+ int version, fd;
 
  pthread_rwlock_init(&disk->lock, NULL);
+ fd = fds[0];
  disk->fd = fd;
  disk->base = NULL;
  disk->l1 = NULL;
 
- if (pread(fd, &header, sizeof header, 0) != sizeof header) {
+ if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
  log_warn("%s: short read on header", __func__);
  goto error;
  }
@@ -203,11 +224,11 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
 
- disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
+ disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
  if (!disk->l1)
  goto error;
- if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off)
-    != 8*disk->l1sz) {
+ if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
+    != 8 * disk->l1sz) {
  log_warn("%s: unable to read qcow2 L1 table", __func__);
  goto error;
  }
@@ -222,14 +243,7 @@ qc2_open(struct qcdisk *disk, int fd)
  backingoff = be64toh(header.backingoff);
  backingsz = be32toh(header.backingsz);
  if (backingsz != 0) {
- /*
- * FIXME: we need to figure out a way of opening these things,
- * otherwise we just crash with a pledge violation.
- */
- log_warn("%s: unsupported external snapshot images", __func__);
- goto error;
-
- if (backingsz >= sizeof basepath - 1) {
+ if (backingsz >= sizeof(basepath) - 1) {
  log_warn("%s: snapshot path too long", __func__);
  goto error;
  }
@@ -239,11 +253,17 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
  basepath[backingsz] = 0;
+ if (nfd <= 1) {
+ log_warnx("%s: missing base image %s", __func__,
+    basepath);
+ goto error;
+ }
+
 
  disk->base = calloc(1, sizeof(struct qcdisk));
  if (!disk->base)
  goto error;
- if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
+ if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
  log_warn("%s: could not open %s", basepath, __func__);
  goto error;
  }
@@ -428,7 +448,7 @@ xlate(struct qcdisk *disk, off_t off, int *inplace)
  return 0;
  }
  l2off = (off / disk->clustersz) % l2sz;
- pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8);
+ pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
  cluster = be64toh(buf);
  /*
  * cluster may be 0, but all future operations don't affect
@@ -521,12 +541,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
  cluster = disk->end;
  disk->end += disk->clustersz;
  buf = htobe64(cluster | QCOW2_INPLACE);
- if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
+ if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
  goto fail;
 
  /* TODO: lazily sync: currently VMD doesn't close things */
  buf = htobe64(disk->l1[l1off]);
- if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
+ if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
  goto fail;
  if (inc_refs(disk, cluster, 1) == -1)
  goto fail;
@@ -570,8 +590,8 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
  nper = disk->clustersz / 2;
  l1idx = (off / disk->clustersz) / nper;
  l2idx = (off / disk->clustersz) % nper;
- l1off = disk->refoff + 8*l1idx;
- if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
+ l1off = disk->refoff + 8 * l1idx;
+ if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
  return -1;
 
  l2cluster = be64toh(buf);
@@ -583,19 +603,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
  return -1;
  }
  buf = htobe64(l2cluster);
- if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
+ if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) {
  return -1;
  }
  }
 
  refs = 1;
  if (!newcluster) {
- if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
+ if (pread(disk->fd, &refs, sizeof(refs),
+    l2cluster + 2 * l2idx) != 2)
  return -1;
  refs = be16toh(refs) + 1;
  }
  refs = htobe16(refs);
- if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
+ if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) {
  log_warn("%s: could not write ref block", __func__);
  return -1;
  }
diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
index e02ab67c5dc..ff4bbb3095e 100644
--- usr.sbin/vmd/vioraw.c
+++ usr.sbin/vmd/vioraw.c
@@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
  * returning -1 for error, 0 for success.
  */
 int
-virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  off_t sz;
  int *fdp;
 
- sz = lseek(fd, 0, SEEK_END);
+ if (nfd != 1)
+ return -1;
+ sz = lseek(fd[0], 0, SEEK_END);
  if (sz == -1)
  return -1;
 
  fdp = malloc(sizeof(int));
  if (!fdp)
  return -1;
- *fdp = fd;
+ *fdp = fd[0];
  file->p = fdp;
  file->pread = raw_pread;
  file->pwrite = raw_pwrite;
diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index 93490344560..94818b24d9e 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1745,24 +1745,41 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
  return (0);
 }
 
+int
+virtio_get_base(int fd, char *path, size_t npath ,int type)
+{
+ switch (type) {
+ case VMDF_RAW:
+ return 0;
+ case VMDF_QCOW2:
+ return virtio_qcow2_get_base(fd, path, npath);
+ }
+ log_warnx("%s: invalid disk format", __func__);
+ return -1;
+}
+
+/*
+ * Initializes a struct virtio_backing using the list of fds.
+ */
 static int
-virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
+virtio_init_disk(struct virtio_backing *file, off_t *sz,
+    int *fd, size_t nfd, int type)
 {
  /*
  * probe disk types in order of preference, first one to work wins.
  * TODO: provide a way of specifying the type and options.
  */
  switch (type) {
- case VMDF_RAW: return virtio_init_raw(file, sz, fd);
- case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
+ case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
+ case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
  }
  log_warnx("%s: invalid disk format", __func__);
  return -1;
 }
 
 void
-virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_init(struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
@@ -1838,7 +1855,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioblk[i].vm_id = vcp->vcp_id;
  vioblk[i].irq = pci_get_dev_irq(id);
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1) {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return;
@@ -1967,7 +1985,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioscsi->vq[i].last_avail = 0;
  }
  if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
-    child_cdrom, VMDF_RAW) == -1) {
+    &child_cdrom, 1, VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format",
     __func__);
  return;
@@ -2123,7 +2141,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
 }
 
 int
-vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
+vioblk_restore(int fd, struct vmop_create_params *vmc,
+    int child_disks[][VM_MAX_BASE_PER_DISK])
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  uint8_t i;
@@ -2149,7 +2168,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
  return (-1);
  }
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1)  {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return (-1);
@@ -2186,7 +2206,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
  return (-1);
  }
 
- if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
+ if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
     VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format", __func__);
  return (-1);
@@ -2198,8 +2218,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
 }
 
 int
-virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
index 46006916b6a..bb632bb5502 100644
--- usr.sbin/vmd/virtio.h
+++ usr.sbin/vmd/virtio.h
@@ -257,10 +257,11 @@ struct ioinfo {
 };
 
 /* virtio.c */
-void virtio_init(struct vmd_vm *, int, int *, int *);
+void virtio_init(struct vmd_vm *, int, int[][VM_MAX_BASE_PER_DISK], int *);
 void virtio_shutdown(struct vmd_vm *);
 int virtio_dump(int);
-int virtio_restore(int, struct vmd_vm *, int, int *, int *);
+int virtio_restore(int, struct vmd_vm *, int,
+    int[][VM_MAX_BASE_PER_DISK], int *);
 uint32_t vring_size(uint32_t);
 
 int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
@@ -270,12 +271,14 @@ void viornd_update_qs(void);
 void viornd_update_qa(void);
 int viornd_notifyq(void);
 
-int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
-int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
+ssize_t virtio_qcow2_get_base(int, char *, size_t);
+int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
+int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
 
 int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vioblk_dump(int);
-int vioblk_restore(int, struct vmop_create_params *, int *);
+int vioblk_restore(int, struct vmop_create_params *,
+    int[][VM_MAX_BASE_PER_DISK]);
 void vioblk_update_qs(struct vioblk_dev *);
 void vioblk_update_qa(struct vioblk_dev *);
 int vioblk_notifyq(struct vioblk_dev *);
diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
index ef4494d918b..37dfb95bb0d 100644
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,8 +65,8 @@
 
 io_fn_t ioports_map[MAX_PORTS];
 
-int run_vm(int, int *, int *, struct vmop_create_params *,
-    struct vcpu_reg_state *);
+int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
+    struct vmop_create_params *, struct vcpu_reg_state *);
 void vm_dispatch_vmm(int, short, void *);
 void *event_thread(void *);
 void *vcpu_run_loop(void *);
@@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
 void create_memory_map(struct vm_create_params *);
 int alloc_guest_mem(struct vm_create_params *);
 int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
-void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
+void init_emulated_hw(struct vmop_create_params *, int,
+    int[][VM_MAX_BASE_PER_DISK], int *);
+void restore_emulated_hw(struct vm_create_params *, int, int *,
+    int[][VM_MAX_BASE_PER_DISK],int);
 void vcpu_exit_inout(struct vm_run_params *);
 uint8_t vcpu_exit_pci(struct vm_run_params *);
 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
@@ -327,7 +329,8 @@ start_vm(struct vmd_vm *vm, int fd)
 
  /* Find and open kernel image */
  if ((fp = vmboot_open(vm->vm_kernel,
-    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
+    vm->vm_disks[0], vmc->vmc_diskbases[0],
+    vmc->vmc_disktypes[0], &vmboot)) == NULL)
  fatalx("failed to open kernel - exiting");
 
  /* Load kernel image */
@@ -903,7 +906,7 @@ vmm_create_vm(struct vm_create_params *vcp)
  */
 void
 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
-    int *child_disks, int *child_taps)
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  int i;
@@ -968,7 +971,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
  */
 void
 restore_emulated_hw(struct vm_create_params *vcp, int fd,
-    int *child_taps, int *child_disks, int child_cdrom)
+    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
 {
  /* struct vm_create_params *vcp = &vmc->vmc_params; */
  int i;
@@ -1029,8 +1032,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
  *  !0 : the VM exited abnormally or failed to start
  */
 int
-run_vm(int child_cdrom, int *child_disks, int *child_taps,
-    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
+run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
+    int *child_taps, struct vmop_create_params *vmc,
+    struct vcpu_reg_state *vrs)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  struct vm_rwregs_params vregsp;
diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
index 44ceeb64a7e..718c9739fa5 100644
--- usr.sbin/vmd/vmboot.c
+++ usr.sbin/vmd/vmboot.c
@@ -385,7 +385,7 @@ vmboot_loadfile(struct open_file *f, char *file, size_t *size)
 }
 
 FILE *
-vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
+vmboot_open(int kernel_fd, int *disk_fd, int nfd, unsigned int disk_type,
     struct vmboot_params *vmboot)
 {
  char file[PATH_MAX];
@@ -404,7 +404,7 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
  if (kernel_fd != -1)
  return (fdopen(kernel_fd, "r"));
 
- if (disk_fd == -1)
+ if (disk_fd == NULL || nfd < 1)
  return (NULL);
 
  if ((vfp = calloc(1, sizeof(*vfp))) == NULL)
@@ -414,20 +414,19 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
 
  switch (vmboot->vbp_type) {
  case VMDF_RAW:
- if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_raw(vfp, &sz, disk_fd, nfd) == -1) {
  log_debug("%s: could not open raw disk", __func__);
  goto fail;
  }
  break;
  case VMDF_QCOW2:
- if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_qcow2(vfp, &sz, disk_fd, nfd) == -1) {
  log_debug("%s: could not open qcow2 disk", __func__);
  goto fail;
  }
  break;
  }
 
- vmboot->vbp_fd = disk_fd;
  vmboot_file.f_devdata = vmboot;
 
  if ((vmboot->vbp_partoff =
diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
index 1571be21bc5..18a5e0d3d5d 100644
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -1097,7 +1097,7 @@ void
 vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
 {
  struct privsep *ps = &env->vmd_ps;
- unsigned int i;
+ unsigned int i, j;
 
  if (vm == NULL)
  return;
@@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
  close(vm->vm_iev.ibuf.fd);
  }
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
- if (vm->vm_disks[i] != -1) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1) {
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  }
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
@@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  struct vmop_owner *vmo = NULL;
  struct vmd_user *usr = NULL;
  uint32_t rng;
- unsigned int i;
+ unsigned int i, j;
  struct vmd_switch *sw;
  char *s;
 
@@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  vm->vm_user = usr;
 
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ vm->vm_disks[i][j] = -1;
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
  vm->vm_ifs[i].vif_fd = -1;
  for (i = 0; i < vcp->vcp_nnics; i++) {
diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
index b348d12c757..b7c012854e8 100644
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -48,6 +48,7 @@
 #define VM_DEFAULT_DEVICE "hd0a"
 #define VM_BOOT_CONF "/etc/boot.conf"
 #define VM_NAME_MAX 64
+#define VM_MAX_BASE_PER_DISK 4
 #define VM_TTYNAME_MAX 16
 #define MAX_TAP 256
 #define NR_BACKLOG 5
@@ -169,6 +170,7 @@ struct vmop_create_params {
 #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
 
  unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
+ unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
 #define VMDF_RAW 0x01
 #define VMDF_QCOW2 0x02
 
@@ -202,7 +204,6 @@ struct vm_dump_header {
 } __packed;
 
 struct vmboot_params {
- int vbp_fd;
  off_t vbp_partoff;
  char vbp_device[PATH_MAX];
  char vbp_image[PATH_MAX];
@@ -241,7 +242,7 @@ struct vmd_vm {
  uint32_t vm_vmid;
  int vm_kernel;
  int vm_cdrom;
- int vm_disks[VMM_MAX_DISKS_PER_VM];
+ int vm_disks[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
  struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
  char *vm_ttyname;
  int vm_tty;
@@ -407,7 +408,7 @@ int config_getif(struct privsep *, struct imsg *);
 int config_getcdrom(struct privsep *, struct imsg *);
 
 /* vmboot.c */
-FILE *vmboot_open(int, int, unsigned int, struct vmboot_params *);
+FILE *vmboot_open(int, int *, int, unsigned int, struct vmboot_params *);
 void vmboot_close(FILE *, struct vmboot_params *);
 
 /* parse.y */
@@ -415,4 +416,7 @@ int parse_config(const char *);
 int cmdline_symset(char *);
 int host(const char *, struct address *);
 
+/* virtio.c */
+int virtio_get_base(int, char *, size_t, int);
+
 #endif /* VMD_H */
diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
index 7757856323f..47e2b2324be 100644
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  struct vmd_vm *vm;
  int ret = EINVAL;
  int fds[2];
- size_t i;
+ size_t i, j;
 
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
  log_warnx("%s: can't find vm", __func__);
@@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  close(fds[1]);
 
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1)
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  for (i = 0 ; i < vcp->vcp_nnics; i++) {
  close(vm->vm_ifs[i].vif_fd);

--
    Ori Bernstein

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Reyk Floeter-2
On Wed, Oct 03, 2018 at 11:41:41PM -0700, Ori Bernstein wrote:
> Thanks, another update based on Reyk's feedback and fixes.
>

You missed one thing: jmc@'s manpage comments.

For everything else:  Looks good!  Tests work fine.  OK reyk@

Reyk

> diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
> index c2a5f42d5f6..1f8673e0e26 100644
> --- regress/usr.sbin/vmd/diskfmt/Makefile
> +++ regress/usr.sbin/vmd/diskfmt/Makefile
> @@ -11,7 +11,7 @@
>  VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
>  
>  PROG=vioscribble
> -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> +SRCS=vioscribble.c vioqcow2.c vioraw.c
>  CFLAGS+=-I$(VMD_DIR) -pthread
>  LDFLAGS+=-pthread
>  
> @@ -26,3 +26,6 @@ scribble-images:
>  .PHONY: ${REGRESS_TARGETS} scribble-images
>  
>  .include <bsd.regress.mk>
> +
> +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
> diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
> index 14d720db652..1da8efedac7 100644
> --- regress/usr.sbin/vmd/diskfmt/vioscribble.c
> +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
> @@ -122,16 +122,18 @@ main(int argc, char **argv)
>   verbose = !!getenv("VERBOSE");
>   qcfd = open("scribble.qc2", O_RDWR);
>   rawfd = open("scribble.raw", O_RDWR);
> - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
> + if (qcfd == -1)
>   err(1, "unable to open qcow");
> - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
> + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
> + err(1, "unable to init qcow");
> + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
>   err(1, "unable to open raw");
>  
>   srandom_deterministic(123);
>  
>   /* scribble to both disks */
>   printf("scribbling...\n");
> - for (i = 0; i < 16; i++) {
> + for (i = 0; i < 1024*16; i++) {
>   off = (random() % DISKSZ);
>   len = random() % sizeof buf + 1;
>   fill(off, buf, sizeof buf);
> diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
> index 8748ecfdedc..a3ab4672370 100644
> --- usr.sbin/vmctl/main.c
> +++ usr.sbin/vmctl/main.c
> @@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char *[]);
>  
>  struct ctl_command ctl_commands[] = {
>   { "console", CMD_CONSOLE, ctl_console, "id" },
> - { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 },
> + { "create", CMD_CREATE, ctl_create,
> + "\"path\" [-s size] [-b base]", 1 },
>   { "load", CMD_LOAD, ctl_load, "\"path\"" },
>   { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
>   { "reload", CMD_RELOAD, ctl_reload, "" },
> @@ -538,47 +539,55 @@ int
>  ctl_create(struct parse_result *res, int argc, char *argv[])
>  {
>   int ch, ret, type;
> - const char *paths[2], *disk, *format;
> + const char *disk, *format, *base;
>  
>   if (argc < 2)
>   ctl_usage(res->ctl);
>  
> + base = NULL;
>   type = parse_disktype(argv[1], &disk);
>  
> - paths[0] = disk;
> - paths[1] = NULL;
> -
> - if (unveil(paths[0], "rwc") == -1)
> + if (pledge("stdio rpath wpath cpath unveil", NULL) == -1)
> + err(1, "pledge");
> + if (unveil(disk, "rwc") == -1)
>   err(1, "unveil");
>  
> - if (pledge("stdio rpath wpath cpath", NULL) == -1)
> - err(1, "pledge");
>   argc--;
>   argv++;
>  
> - while ((ch = getopt(argc, argv, "s:")) != -1) {
> + while ((ch = getopt(argc, argv, "s:b:")) != -1) {
>   switch (ch) {
>   case 's':
>   if (parse_size(res, optarg, 0) != 0)
>   errx(1, "invalid size: %s", optarg);
>   break;
> + case 'b':
> + base = optarg;
> + if (unveil(base, "r") == -1)
> + err(1, "unveil");
> + break;
>   default:
>   ctl_usage(res->ctl);
>   /* NOTREACHED */
>   }
>   }
> + if (unveil(NULL, NULL))
> + err(1, "unveil");
>  
> - if (res->size == 0) {
> - fprintf(stderr, "missing size argument\n");
> + if (base && type != VMDF_QCOW2)
> + errx(1, "base images require qcow2 disk format");
> + if (res->size == 0 && !base) {
> + fprintf(stderr, "could not create %s: missing size argument\n",
> +    disk);
>   ctl_usage(res->ctl);
>   }
>  
>   if (type == VMDF_QCOW2) {
>   format = "qcow2";
> - ret = create_qc2_imagefile(paths[0], res->size);
> + ret = create_qc2_imagefile(disk, base, res->size);
>   } else {
>   format = "raw";
> - ret = create_raw_imagefile(paths[0], res->size);
> + ret = create_raw_imagefile(disk, res->size);
>   }
>  
>   if (ret != 0) {
> diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
> index f7890ac99f8..7a02452789c 100644
> --- usr.sbin/vmctl/vmctl.8
> +++ usr.sbin/vmctl/vmctl.8
> @@ -50,7 +50,7 @@ Using
>  .Xr cu 1
>  connect to the console of the VM with the specified
>  .Ar id .
> -.It Cm create Ar path Fl s Ar size
> +.It Cm create Ar path Oo Fl s Ar size Oc Op Fl b Ar base
>  Creates a VM disk image file with the specified
>  .Ar path
>  and
> @@ -65,7 +65,14 @@ or
>  in order to specify the disk format.
>  If left unspecified, the format defaults to
>  .Pa raw
> -if it cannot be derived automatically.
> +if it cannot be derived automatically.  For qcow2, a
> +.Ar base
> +image may be specified. The base image is not modified. The derived image
> +contains only the changes written by the VM. When creating a derived image,
> +the
> +.Ar size
> +may be omitted, and probed from the base image. If it is provided, it must
> +match the base image size.
>  .It Cm load Ar filename
>  Load additional configuration from the specified file.
>  .It Cm log brief
> diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
> index b09e1115ff7..12db3f69525 100644
> --- usr.sbin/vmctl/vmctl.c
> +++ usr.sbin/vmctl/vmctl.c
> @@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
>  #define ALIGN(sz, align) \
>   ((sz + align - 1) & ~(align - 1))
>  int
> -create_qc2_imagefile(const char *imgfile_path, long imgsize)
> +create_qc2_imagefile(const char *imgfile_path,
> +    const char *base_path, long imgsize)
>  {
>   struct qcheader {
>   char magic[4];
> @@ -869,15 +870,33 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   uint64_t autoclearfeatures;
>   uint32_t reforder;
>   uint32_t headersz;
> - } __packed hdr;
> + } __packed hdr, basehdr;
>   int fd, ret;
> + ssize_t base_len;
>   uint64_t l1sz, refsz, disksz, initsz, clustersz;
>   uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
>   uint16_t refs;
>  
> - disksz = 1024*1024*imgsize;
> + disksz = 1024 * 1024 * imgsize;
> +
> + if (base_path) {
> + fd = open(base_path, O_RDONLY);
> + if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
> + err(1, "failure to read base image header");
> + close(fd);
> + if (strncmp(basehdr.magic,
> +    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
> + errx(1, "base image is not a qcow2 file");
> + if (!disksz)
> + disksz = betoh64(basehdr.disksz);
> + else if (disksz != betoh64(basehdr.disksz))
> + errx(1, "base size does not match requested size");
> + }
> + if (!base_path && !disksz)
> + errx(1, "missing disk size");
> +
>   clustersz = (1<<16);
> - l1off = ALIGN(sizeof hdr, clustersz);
> + l1off = ALIGN(sizeof(hdr), clustersz);
>  
>   l1entrysz = clustersz * clustersz / 8;
>   l1sz = (disksz + l1entrysz - 1) / l1entrysz;
> @@ -887,11 +906,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   refsz = (disksz + refentrysz - 1) / refentrysz;
>  
>   initsz = ALIGN(refoff + refsz*clustersz, clustersz);
> + base_len = base_path ? strlen(base_path) : 0;
>  
> - memcpy(hdr.magic, "QFI\xfb", 4);
> + memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
>   hdr.version = htobe32(3);
> - hdr.backingoff = htobe64(0);
> - hdr.backingsz = htobe32(0);
> + hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
> + hdr.backingsz = htobe32(base_len);
>   hdr.clustershift = htobe32(16);
>   hdr.disksz = htobe64(disksz);
>   hdr.cryptmethod = htobe32(0);
> @@ -905,7 +925,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   hdr.compatfeatures = htobe64(0);
>   hdr.autoclearfeatures = htobe64(0);
>   hdr.reforder = htobe32(4);
> - hdr.headersz = htobe32(sizeof hdr);
> + hdr.headersz = htobe32(sizeof(hdr));
>  
>   /* Refuse to overwrite an existing image */
>   fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
> @@ -914,7 +934,11 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
>   return (errno);
>  
>   /* Write out the header */
> - if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
> + if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
> + goto error;
> +
> + /* Add the base image */
> + if (base_path && write(fd, base_path, base_len) != base_len)
>   goto error;
>  
>   /* Extend to desired size, and add one refcount cluster */
> diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
> index 006411d9785..2d0355450ee 100644
> --- usr.sbin/vmctl/vmctl.h
> +++ usr.sbin/vmctl/vmctl.h
> @@ -87,7 +87,7 @@ __dead void
>  
>  /* vmctl.c */
>  int create_raw_imagefile(const char *, long);
> -int create_qc2_imagefile(const char *, long);
> +int create_qc2_imagefile(const char *, const char *, long);
>  int vm_start(uint32_t, const char *, int, int, char **, int,
>      char **, int *, char *, char *, char *);
>  int vm_start_complete(struct imsg *, int *, int);
> diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
> index 550b73c1a39..68be738d304 100644
> --- usr.sbin/vmd/config.c
> +++ usr.sbin/vmd/config.c
> @@ -35,6 +35,7 @@
>  #include <util.h>
>  #include <errno.h>
>  #include <imsg.h>
> +#include <libgen.h>
>  
>  #include "proc.h"
>  #include "vmd.h"
> @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
>  int
>  config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  {
> + int diskfds[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
>   struct vmd_if *vif;
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> - unsigned int i;
> + unsigned int i, j;
>   int fd = -1, vmboot = 0;
> - int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
> + int kernfd = -1;
> + int *tapfds;
>   int cdromfd = -1;
>   int saved_errno = 0;
> + int n = 0, aflags, oflags;
>   char ifname[IF_NAMESIZE], *s;
>   char path[PATH_MAX];
> + char base[PATH_MAX];
> + char expanded[PATH_MAX];
>   unsigned int unit;
>  
>   errno = 0;
> @@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   }
>   }
>  
> - diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
> - if (diskfds == NULL) {
> - log_warn("%s: can't allocate disk fds", __func__);
> - goto fail;
> - }
> - for (i = 0; i < vcp->vcp_ndisks; i++)
> - diskfds[i] = -1;
> + for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
> + diskfds[i][j] = -1;
>  
>   tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
>   if (tapfds == NULL) {
> @@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  
>   /* Open disk images for child */
>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
> -                /* Stat disk[i] to ensure it is a regular file */
> - if ((diskfds[i] = open(vcp->vcp_disks[i],
> -    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
> - log_warn("%s: can't open disk %s", __func__,
> -    vcp->vcp_disks[i]);
> - errno = VMD_DISK_MISSING;
> - goto fail;
> - }
> + if (strlcpy(path, vcp->vcp_disks[i], sizeof(path))
> +   >= sizeof(path))
> + log_warnx("%s, disk path too long", __func__);
> + memset(vmc->vmc_diskbases, 0, sizeof(vmc->vmc_diskbases));
> + oflags = O_RDWR|O_EXLOCK|O_NONBLOCK;
> + aflags = R_OK|W_OK;
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
> + /* Stat disk[i] to ensure it is a regular file */
> + if ((diskfds[i][j] = open(path, oflags)) == -1) {
> + log_warn("%s: can't open disk %s", __func__,
> +    vcp->vcp_disks[i]);
> + errno = VMD_DISK_MISSING;
> + goto fail;
> + }
>  
> - if (vm_checkaccess(diskfds[i],
> -    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
> -    uid, R_OK|W_OK) == -1) {
> - log_warnx("vm \"%s\" no read/write access to disk %s",
> -    vcp->vcp_name, vcp->vcp_disks[i]);
> - errno = EPERM;
> - goto fail;
> + if (vm_checkaccess(diskfds[i][j],
> +    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
> +    uid, aflags) == -1) {
> + log_warnx("vm \"%s\" unable to access "
> +    "disk %s", vcp->vcp_name, path);
> + errno = EPERM;
> + goto fail;
> + }
> +
> + /*
> + * Clear the write and exclusive flags for base images.
> + * All writes should go to the top image, allowing them
> + * to be shared.
> + */
> + oflags = O_RDONLY|O_NONBLOCK;
> + aflags = R_OK;
> + n = virtio_get_base(diskfds[i][j], base, sizeof base,
> +    vmc->vmc_disktypes[i]);
> + if (n == 0)
> + break;
> + if (n == -1) {
> + log_warnx("vm \"%s\" unable to read "
> +    "base %s for disk %s", vcp->vcp_name,
> +    base, vcp->vcp_disks[i]);
> + goto fail;
> + }
> + /*
> + * Relative paths should be interpreted relative
> + * to the disk image, rather than relative to the
> + * directory vmd happens to be running in, since
> + * this is the only useful interpretation.
> + */
> + if (base[0] == '/') {
> + if (realpath(base, path) == NULL) {
> + log_warn("unable to resolve %s", base);
> + goto fail;
> + }
> + } else {
> + s = dirname(path);
> + if (snprintf(expanded, sizeof(expanded),
> +    "%s/%s", s, base) >= (int)sizeof(expanded)) {
> + log_warn("path too long: %s/%s",
> +    s, base);
> + goto fail;
> + }
> + if (realpath(expanded, path) == NULL) {
> + log_warn("unable to resolve %s", base);
> + goto fail;
> + }
> + }
>   }
>   }
>  
> @@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>      NULL, 0);
>  
>   for (i = 0; i < vcp->vcp_ndisks; i++) {
> - proc_compose_imsg(ps, PROC_VMM, -1,
> -    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
> -    &i, sizeof(i));
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
> + if (diskfds[i][j] == -1)
> + break;
> + proc_compose_imsg(ps, PROC_VMM, -1,
> +    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
> +    diskfds[i][j], &i, sizeof(i));
> + }
>   }
>   for (i = 0; i < vcp->vcp_nnics; i++) {
>   proc_compose_imsg(ps, PROC_VMM, -1,
> @@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   proc_compose_imsg(ps, PROC_VMM, -1,
>      IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
>  
> - free(diskfds);
>   free(tapfds);
>  
>   vm->vm_running = 1;
> @@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>   close(kernfd);
>   if (cdromfd != -1)
>   close(cdromfd);
> - if (diskfds != NULL) {
> - for (i = 0; i < vcp->vcp_ndisks; i++)
> - close(diskfds[i]);
> - free(diskfds);
> - }
> + for (i = 0; i < vcp->vcp_ndisks; i++)
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
> + if (diskfds[i][j] != -1)
> + close(diskfds[i][j]);
>   if (tapfds != NULL) {
>   for (i = 0; i < vcp->vcp_nnics; i++)
>   close(tapfds[i]);
> @@ -489,7 +542,7 @@ int
>  config_getdisk(struct privsep *ps, struct imsg *imsg)
>  {
>   struct vmd_vm *vm;
> - unsigned int n;
> + unsigned int n, idx;
>  
>   errno = 0;
>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
> @@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
>   IMSG_SIZE_CHECK(imsg, &n);
>   memcpy(&n, imsg->data, sizeof(n));
>  
> - if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
> -    vm->vm_disks[n] != -1 || imsg->fd == -1) {
> + if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
>   log_warnx("invalid disk id");
>   errno = EINVAL;
>   return (-1);
>   }
> - vm->vm_disks[n] = imsg->fd;
> -
> + idx = vm->vm_params.vmc_diskbases[n]++;
> + if (idx >= VM_MAX_BASE_PER_DISK) {
> + log_warnx("too many bases for disk");
> + errno = EINVAL;
> + return (-1);
> + }
> + vm->vm_disks[n][idx] = imsg->fd;
>   return (0);
>  }
>  
> diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
> index c3211d186fa..25799cc5a3d 100644
> --- usr.sbin/vmd/vioqcow2.c
> +++ usr.sbin/vmd/vioqcow2.c
> @@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
>  static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
>  static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
>  static int inc_refs(struct qcdisk *, off_t, int);
> -static int qc2_openpath(struct qcdisk *, char *, int);
> -static int qc2_open(struct qcdisk *, int);
> +static int qc2_open(struct qcdisk *, int *, size_t);
>  static ssize_t qc2_pread(void *, char *, size_t, off_t);
>  static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
>  static void qc2_close(void *, int);
> @@ -118,14 +117,14 @@ static void qc2_close(void *, int);
>   * May open snapshot base images.
>   */
>  int
> -virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
> +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
>  {
>   struct qcdisk *diskp;
>  
>   diskp = malloc(sizeof(struct qcdisk));
>   if (diskp == NULL)
>   return -1;
> - if (qc2_open(diskp, fd) == -1) {
> + if (qc2_open(diskp, fd, nfd) == -1) {
>   log_warnx("%s: could not open qcow2 disk", __func__);
>   return -1;
>   }
> @@ -137,19 +136,40 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
>   return 0;
>  }
>  
> -static int
> -qc2_openpath(struct qcdisk *disk, char *path, int flags)
> +ssize_t
> +virtio_qcow2_get_base(int fd, char *path, size_t npath)
>  {
> - int fd;
> + struct qcheader header;
> + uint64_t backingoff;
> + uint32_t backingsz;
>  
> - fd = open(path, flags);
> - if (fd < 0)
> + if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
> + log_warnx("%s: short read on header", __func__);
> + return -1;
> + }
> + if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
> + log_warn("%s: invalid magic numbers", __func__);
>   return -1;
> - return qc2_open(disk, fd);
> + }
> + backingoff = be64toh(header.backingoff);
> + backingsz = be32toh(header.backingsz);
> + if (backingsz != 0) {
> + if (backingsz >= npath - 1) {
> + log_warn("%s: snapshot path too long", __func__);
> + return -1;
> + }
> + if (pread(fd, path, backingsz, backingoff) != backingsz) {
> + log_warnx("%s: could not read snapshot base name",
> +    __func__);
> + return -1;
> + }
> + path[backingsz] = '\0';
> + }
> + return backingsz;
>  }
>  
>  static int
> -qc2_open(struct qcdisk *disk, int fd)
> +qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
>  {
>   char basepath[PATH_MAX];
>   struct stat st;
> @@ -157,14 +177,15 @@ qc2_open(struct qcdisk *disk, int fd)
>   uint64_t backingoff;
>   uint32_t backingsz;
>   size_t i;
> - int version;
> + int version, fd;
>  
>   pthread_rwlock_init(&disk->lock, NULL);
> + fd = fds[0];
>   disk->fd = fd;
>   disk->base = NULL;
>   disk->l1 = NULL;
>  
> - if (pread(fd, &header, sizeof header, 0) != sizeof header) {
> + if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
>   log_warn("%s: short read on header", __func__);
>   goto error;
>   }
> @@ -203,11 +224,11 @@ qc2_open(struct qcdisk *disk, int fd)
>   goto error;
>   }
>  
> - disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
> + disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
>   if (!disk->l1)
>   goto error;
> - if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off)
> -    != 8*disk->l1sz) {
> + if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
> +    != 8 * disk->l1sz) {
>   log_warn("%s: unable to read qcow2 L1 table", __func__);
>   goto error;
>   }
> @@ -222,14 +243,7 @@ qc2_open(struct qcdisk *disk, int fd)
>   backingoff = be64toh(header.backingoff);
>   backingsz = be32toh(header.backingsz);
>   if (backingsz != 0) {
> - /*
> - * FIXME: we need to figure out a way of opening these things,
> - * otherwise we just crash with a pledge violation.
> - */
> - log_warn("%s: unsupported external snapshot images", __func__);
> - goto error;
> -
> - if (backingsz >= sizeof basepath - 1) {
> + if (backingsz >= sizeof(basepath) - 1) {
>   log_warn("%s: snapshot path too long", __func__);
>   goto error;
>   }
> @@ -239,11 +253,17 @@ qc2_open(struct qcdisk *disk, int fd)
>   goto error;
>   }
>   basepath[backingsz] = 0;
> + if (nfd <= 1) {
> + log_warnx("%s: missing base image %s", __func__,
> +    basepath);
> + goto error;
> + }
> +
>  
>   disk->base = calloc(1, sizeof(struct qcdisk));
>   if (!disk->base)
>   goto error;
> - if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
> + if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
>   log_warn("%s: could not open %s", basepath, __func__);
>   goto error;
>   }
> @@ -428,7 +448,7 @@ xlate(struct qcdisk *disk, off_t off, int *inplace)
>   return 0;
>   }
>   l2off = (off / disk->clustersz) % l2sz;
> - pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8);
> + pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
>   cluster = be64toh(buf);
>   /*
>   * cluster may be 0, but all future operations don't affect
> @@ -521,12 +541,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
>   cluster = disk->end;
>   disk->end += disk->clustersz;
>   buf = htobe64(cluster | QCOW2_INPLACE);
> - if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
> + if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
>   goto fail;
>  
>   /* TODO: lazily sync: currently VMD doesn't close things */
>   buf = htobe64(disk->l1[l1off]);
> - if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
> + if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
>   goto fail;
>   if (inc_refs(disk, cluster, 1) == -1)
>   goto fail;
> @@ -570,8 +590,8 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
>   nper = disk->clustersz / 2;
>   l1idx = (off / disk->clustersz) / nper;
>   l2idx = (off / disk->clustersz) % nper;
> - l1off = disk->refoff + 8*l1idx;
> - if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
> + l1off = disk->refoff + 8 * l1idx;
> + if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
>   return -1;
>  
>   l2cluster = be64toh(buf);
> @@ -583,19 +603,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
>   return -1;
>   }
>   buf = htobe64(l2cluster);
> - if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
> + if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) {
>   return -1;
>   }
>   }
>  
>   refs = 1;
>   if (!newcluster) {
> - if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
> + if (pread(disk->fd, &refs, sizeof(refs),
> +    l2cluster + 2 * l2idx) != 2)
>   return -1;
>   refs = be16toh(refs) + 1;
>   }
>   refs = htobe16(refs);
> - if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
> + if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) {
>   log_warn("%s: could not write ref block", __func__);
>   return -1;
>   }
> diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
> index e02ab67c5dc..ff4bbb3095e 100644
> --- usr.sbin/vmd/vioraw.c
> +++ usr.sbin/vmd/vioraw.c
> @@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
>   * returning -1 for error, 0 for success.
>   */
>  int
> -virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
> +virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
>  {
>   off_t sz;
>   int *fdp;
>  
> - sz = lseek(fd, 0, SEEK_END);
> + if (nfd != 1)
> + return -1;
> + sz = lseek(fd[0], 0, SEEK_END);
>   if (sz == -1)
>   return -1;
>  
>   fdp = malloc(sizeof(int));
>   if (!fdp)
>   return -1;
> - *fdp = fd;
> + *fdp = fd[0];
>   file->p = fdp;
>   file->pread = raw_pread;
>   file->pwrite = raw_pwrite;
> diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
> index 93490344560..94818b24d9e 100644
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -1745,24 +1745,41 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
>   return (0);
>  }
>  
> +int
> +virtio_get_base(int fd, char *path, size_t npath ,int type)
> +{
> + switch (type) {
> + case VMDF_RAW:
> + return 0;
> + case VMDF_QCOW2:
> + return virtio_qcow2_get_base(fd, path, npath);
> + }
> + log_warnx("%s: invalid disk format", __func__);
> + return -1;
> +}
> +
> +/*
> + * Initializes a struct virtio_backing using the list of fds.
> + */
>  static int
> -virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
> +virtio_init_disk(struct virtio_backing *file, off_t *sz,
> +    int *fd, size_t nfd, int type)
>  {
>   /*
>   * probe disk types in order of preference, first one to work wins.
>   * TODO: provide a way of specifying the type and options.
>   */
>   switch (type) {
> - case VMDF_RAW: return virtio_init_raw(file, sz, fd);
> - case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
> + case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
> + case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
>   }
>   log_warnx("%s: invalid disk format", __func__);
>   return -1;
>  }
>  
>  void
> -virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
> -    int *child_taps)
> +virtio_init(struct vmd_vm *vm, int child_cdrom,
> +    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> @@ -1838,7 +1855,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>   vioblk[i].vm_id = vcp->vcp_id;
>   vioblk[i].irq = pci_get_dev_irq(id);
>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
> -    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
> +    child_disks[i], vmc->vmc_diskbases[i],
> +    vmc->vmc_disktypes[i]) == -1) {
>   log_warnx("%s: unable to determine disk format",
>      __func__);
>   return;
> @@ -1967,7 +1985,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
>   vioscsi->vq[i].last_avail = 0;
>   }
>   if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
> -    child_cdrom, VMDF_RAW) == -1) {
> +    &child_cdrom, 1, VMDF_RAW) == -1) {
>   log_warnx("%s: unable to determine iso format",
>      __func__);
>   return;
> @@ -2123,7 +2141,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
>  }
>  
>  int
> -vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
> +vioblk_restore(int fd, struct vmop_create_params *vmc,
> +    int child_disks[][VM_MAX_BASE_PER_DISK])
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   uint8_t i;
> @@ -2149,7 +2168,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
>   return (-1);
>   }
>   if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
> -    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
> +    child_disks[i], vmc->vmc_diskbases[i],
> +    vmc->vmc_disktypes[i]) == -1)  {
>   log_warnx("%s: unable to determine disk format",
>      __func__);
>   return (-1);
> @@ -2186,7 +2206,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
>   return (-1);
>   }
>  
> - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
> + if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
>      VMDF_RAW) == -1) {
>   log_warnx("%s: unable to determine iso format", __func__);
>   return (-1);
> @@ -2198,8 +2218,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
>  }
>  
>  int
> -virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
> -    int *child_taps)
> +virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
> +    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
> index 46006916b6a..bb632bb5502 100644
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -257,10 +257,11 @@ struct ioinfo {
>  };
>  
>  /* virtio.c */
> -void virtio_init(struct vmd_vm *, int, int *, int *);
> +void virtio_init(struct vmd_vm *, int, int[][VM_MAX_BASE_PER_DISK], int *);
>  void virtio_shutdown(struct vmd_vm *);
>  int virtio_dump(int);
> -int virtio_restore(int, struct vmd_vm *, int, int *, int *);
> +int virtio_restore(int, struct vmd_vm *, int,
> +    int[][VM_MAX_BASE_PER_DISK], int *);
>  uint32_t vring_size(uint32_t);
>  
>  int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
> @@ -270,12 +271,14 @@ void viornd_update_qs(void);
>  void viornd_update_qa(void);
>  int viornd_notifyq(void);
>  
> -int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
> -int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
> +ssize_t virtio_qcow2_get_base(int, char *, size_t);
> +int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
> +int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
>  
>  int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
>  int vioblk_dump(int);
> -int vioblk_restore(int, struct vmop_create_params *, int *);
> +int vioblk_restore(int, struct vmop_create_params *,
> +    int[][VM_MAX_BASE_PER_DISK]);
>  void vioblk_update_qs(struct vioblk_dev *);
>  void vioblk_update_qa(struct vioblk_dev *);
>  int vioblk_notifyq(struct vioblk_dev *);
> diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
> index ef4494d918b..37dfb95bb0d 100644
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -65,8 +65,8 @@
>  
>  io_fn_t ioports_map[MAX_PORTS];
>  
> -int run_vm(int, int *, int *, struct vmop_create_params *,
> -    struct vcpu_reg_state *);
> +int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
> +    struct vmop_create_params *, struct vcpu_reg_state *);
>  void vm_dispatch_vmm(int, short, void *);
>  void *event_thread(void *);
>  void *vcpu_run_loop(void *);
> @@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
>  void create_memory_map(struct vm_create_params *);
>  int alloc_guest_mem(struct vm_create_params *);
>  int vmm_create_vm(struct vm_create_params *);
> -void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
> -void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
> +void init_emulated_hw(struct vmop_create_params *, int,
> +    int[][VM_MAX_BASE_PER_DISK], int *);
> +void restore_emulated_hw(struct vm_create_params *, int, int *,
> +    int[][VM_MAX_BASE_PER_DISK],int);
>  void vcpu_exit_inout(struct vm_run_params *);
>  uint8_t vcpu_exit_pci(struct vm_run_params *);
>  int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
> @@ -327,7 +329,8 @@ start_vm(struct vmd_vm *vm, int fd)
>  
>   /* Find and open kernel image */
>   if ((fp = vmboot_open(vm->vm_kernel,
> -    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
> +    vm->vm_disks[0], vmc->vmc_diskbases[0],
> +    vmc->vmc_disktypes[0], &vmboot)) == NULL)
>   fatalx("failed to open kernel - exiting");
>  
>   /* Load kernel image */
> @@ -903,7 +906,7 @@ vmm_create_vm(struct vm_create_params *vcp)
>   */
>  void
>  init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
> -    int *child_disks, int *child_taps)
> +    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   int i;
> @@ -968,7 +971,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
>   */
>  void
>  restore_emulated_hw(struct vm_create_params *vcp, int fd,
> -    int *child_taps, int *child_disks, int child_cdrom)
> +    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
>  {
>   /* struct vm_create_params *vcp = &vmc->vmc_params; */
>   int i;
> @@ -1029,8 +1032,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
>   *  !0 : the VM exited abnormally or failed to start
>   */
>  int
> -run_vm(int child_cdrom, int *child_disks, int *child_taps,
> -    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
> +run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
> +    int *child_taps, struct vmop_create_params *vmc,
> +    struct vcpu_reg_state *vrs)
>  {
>   struct vm_create_params *vcp = &vmc->vmc_params;
>   struct vm_rwregs_params vregsp;
> diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
> index 44ceeb64a7e..718c9739fa5 100644
> --- usr.sbin/vmd/vmboot.c
> +++ usr.sbin/vmd/vmboot.c
> @@ -385,7 +385,7 @@ vmboot_loadfile(struct open_file *f, char *file, size_t *size)
>  }
>  
>  FILE *
> -vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
> +vmboot_open(int kernel_fd, int *disk_fd, int nfd, unsigned int disk_type,
>      struct vmboot_params *vmboot)
>  {
>   char file[PATH_MAX];
> @@ -404,7 +404,7 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
>   if (kernel_fd != -1)
>   return (fdopen(kernel_fd, "r"));
>  
> - if (disk_fd == -1)
> + if (disk_fd == NULL || nfd < 1)
>   return (NULL);
>  
>   if ((vfp = calloc(1, sizeof(*vfp))) == NULL)
> @@ -414,20 +414,19 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
>  
>   switch (vmboot->vbp_type) {
>   case VMDF_RAW:
> - if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
> + if (virtio_init_raw(vfp, &sz, disk_fd, nfd) == -1) {
>   log_debug("%s: could not open raw disk", __func__);
>   goto fail;
>   }
>   break;
>   case VMDF_QCOW2:
> - if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
> + if (virtio_init_qcow2(vfp, &sz, disk_fd, nfd) == -1) {
>   log_debug("%s: could not open qcow2 disk", __func__);
>   goto fail;
>   }
>   break;
>   }
>  
> - vmboot->vbp_fd = disk_fd;
>   vmboot_file.f_devdata = vmboot;
>  
>   if ((vmboot->vbp_partoff =
> diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
> index 1571be21bc5..18a5e0d3d5d 100644
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -1097,7 +1097,7 @@ void
>  vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>  {
>   struct privsep *ps = &env->vmd_ps;
> - unsigned int i;
> + unsigned int i, j;
>  
>   if (vm == NULL)
>   return;
> @@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
>   close(vm->vm_iev.ibuf.fd);
>   }
>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
> - if (vm->vm_disks[i] != -1) {
> - close(vm->vm_disks[i]);
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
> + if (vm->vm_disks[i][j] != -1) {
> + close(vm->vm_disks[i][j]);
> + vm->vm_disks[i][j] = -1;
> + }
>   }
>   }
>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
> @@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
>   struct vmop_owner *vmo = NULL;
>   struct vmd_user *usr = NULL;
>   uint32_t rng;
> - unsigned int i;
> + unsigned int i, j;
>   struct vmd_switch *sw;
>   char *s;
>  
> @@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
>   vm->vm_user = usr;
>  
>   for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
> + vm->vm_disks[i][j] = -1;
>   for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
>   vm->vm_ifs[i].vif_fd = -1;
>   for (i = 0; i < vcp->vcp_nnics; i++) {
> diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
> index b348d12c757..b7c012854e8 100644
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -48,6 +48,7 @@
>  #define VM_DEFAULT_DEVICE "hd0a"
>  #define VM_BOOT_CONF "/etc/boot.conf"
>  #define VM_NAME_MAX 64
> +#define VM_MAX_BASE_PER_DISK 4
>  #define VM_TTYNAME_MAX 16
>  #define MAX_TAP 256
>  #define NR_BACKLOG 5
> @@ -169,6 +170,7 @@ struct vmop_create_params {
>  #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
>  
>   unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
> + unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
>  #define VMDF_RAW 0x01
>  #define VMDF_QCOW2 0x02
>  
> @@ -202,7 +204,6 @@ struct vm_dump_header {
>  } __packed;
>  
>  struct vmboot_params {
> - int vbp_fd;
>   off_t vbp_partoff;
>   char vbp_device[PATH_MAX];
>   char vbp_image[PATH_MAX];
> @@ -241,7 +242,7 @@ struct vmd_vm {
>   uint32_t vm_vmid;
>   int vm_kernel;
>   int vm_cdrom;
> - int vm_disks[VMM_MAX_DISKS_PER_VM];
> + int vm_disks[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
>   struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
>   char *vm_ttyname;
>   int vm_tty;
> @@ -407,7 +408,7 @@ int config_getif(struct privsep *, struct imsg *);
>  int config_getcdrom(struct privsep *, struct imsg *);
>  
>  /* vmboot.c */
> -FILE *vmboot_open(int, int, unsigned int, struct vmboot_params *);
> +FILE *vmboot_open(int, int *, int, unsigned int, struct vmboot_params *);
>  void vmboot_close(FILE *, struct vmboot_params *);
>  
>  /* parse.y */
> @@ -415,4 +416,7 @@ int parse_config(const char *);
>  int cmdline_symset(char *);
>  int host(const char *, struct address *);
>  
> +/* virtio.c */
> +int virtio_get_base(int, char *, size_t, int);
> +
>  #endif /* VMD_H */
> diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
> index 7757856323f..47e2b2324be 100644
> --- usr.sbin/vmd/vmm.c
> +++ usr.sbin/vmd/vmm.c
> @@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
>   struct vmd_vm *vm;
>   int ret = EINVAL;
>   int fds[2];
> - size_t i;
> + size_t i, j;
>  
>   if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
>   log_warnx("%s: can't find vm", __func__);
> @@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
>   close(fds[1]);
>  
>   for (i = 0 ; i < vcp->vcp_ndisks; i++) {
> - close(vm->vm_disks[i]);
> - vm->vm_disks[i] = -1;
> + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
> + if (vm->vm_disks[i][j] != -1)
> + close(vm->vm_disks[i][j]);
> + vm->vm_disks[i][j] = -1;
> + }
>   }
>   for (i = 0 ; i < vcp->vcp_nnics; i++) {
>   close(vm->vm_ifs[i].vif_fd);
>
> --
>     Ori Bernstein

--

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Reyk Floeter-2
In reply to this post by Ori Bernstein
On Wed, Oct 03, 2018 at 11:41:41PM -0700, Ori Bernstein wrote:

> diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
> index 550b73c1a39..68be738d304 100644
> --- usr.sbin/vmd/config.c
> +++ usr.sbin/vmd/config.c
> @@ -35,6 +35,7 @@
>  #include <util.h>
>  #include <errno.h>
>  #include <imsg.h>
> +#include <libgen.h>
>  
>  #include "proc.h"
>  #include "vmd.h"
> @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
>  int
>  config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
>  {
> + int diskfds[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
>   struct vmd_if *vif;
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> - unsigned int i;
> + unsigned int i, j;
>   int fd = -1, vmboot = 0;
> - int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
> + int kernfd = -1;
> + int *tapfds;

keep tapfds = NULL or you might cause a segfault in the goto fail case...

[snip]
>   if (tapfds != NULL) {
>   for (i = 0; i < vcp->vcp_nnics; i++)
>   close(tapfds[i]);

...here (same function).

Reyk

Reply | Threaded
Open this post in threaded view
|

Re: Qcow2: External snapshots

Ori Bernstein
In reply to this post by Jason McIntyre-2
On Wed, 3 Oct 2018 07:27:17 +0100, Jason McIntyre <[hidden email]> wrote:

> On Tue, Oct 02, 2018 at 11:13:35PM -0700, Ori Bernstein wrote:
> >
> > Updated version. Changes from the last diff:
> >
> > - Merge in syntax changes.
> > - Don't over-read when getting the base images.
> > - Fix relative paths in base images.
> > - Allow multiple derived images to use a single base image, and allow a user
> >   with only read permissions to base their images on top of it.
> > - Probe the base image size, use/validate it when creating disk images.
> > - Fix style a bit (long lines, changing from sizeof foo to sizeof(foo)).
> > - Move a define out of vmmvar.h
> > - And update the manpage with these changes.
> > - Improve error checking around creating/resolving base disk paths.
> >
>
> morning.
>
> you should start new sentences on new lines - it forces a double spacing
> between sentences that all man pages have.
>
> if you run your proposed changes to man pages through "mandoc -Tlint",
> it will pick up on silly things like that.
>
> note there is also a double space in "Op  Fl b"
>
> jmc
>

Forgot to do this in my last update -- done now, and ran it through mandoc -Tlint.
Also fixed the bug that Reyk caught.

diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile
index c2a5f42d5f6..1f8673e0e26 100644
--- regress/usr.sbin/vmd/diskfmt/Makefile
+++ regress/usr.sbin/vmd/diskfmt/Makefile
@@ -11,7 +11,7 @@
 VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
 
 PROG=vioscribble
-SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+SRCS=vioscribble.c vioqcow2.c vioraw.c
 CFLAGS+=-I$(VMD_DIR) -pthread
 LDFLAGS+=-pthread
 
@@ -26,3 +26,6 @@ scribble-images:
 .PHONY: ${REGRESS_TARGETS} scribble-images
 
 .include <bsd.regress.mk>
+
+vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+ cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c
index 14d720db652..1da8efedac7 100644
--- regress/usr.sbin/vmd/diskfmt/vioscribble.c
+++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
@@ -122,16 +122,18 @@ main(int argc, char **argv)
  verbose = !!getenv("VERBOSE");
  qcfd = open("scribble.qc2", O_RDWR);
  rawfd = open("scribble.raw", O_RDWR);
- if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
+ if (qcfd == -1)
  err(1, "unable to open qcow");
- if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
+ if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
+ err(1, "unable to init qcow");
+ if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
  err(1, "unable to open raw");
 
  srandom_deterministic(123);
 
  /* scribble to both disks */
  printf("scribbling...\n");
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < 1024*16; i++) {
  off = (random() % DISKSZ);
  len = random() % sizeof buf + 1;
  fill(off, buf, sizeof buf);
diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
index 69c5e013f4c..2cfb6848899 100644
--- usr.sbin/vmctl/main.c
+++ usr.sbin/vmctl/main.c
@@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char *[]);
 
 struct ctl_command ctl_commands[] = {
  { "console", CMD_CONSOLE, ctl_console, "id" },
- { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 },
+ { "create", CMD_CREATE, ctl_create,
+ "\"path\" [-s size] [-b base]", 1 },
  { "load", CMD_LOAD, ctl_load, "\"path\"" },
  { "log", CMD_LOG, ctl_log, "[verbose|brief]" },
  { "reload", CMD_RELOAD, ctl_reload, "" },
@@ -539,47 +540,55 @@ int
 ctl_create(struct parse_result *res, int argc, char *argv[])
 {
  int ch, ret, type;
- const char *paths[2], *disk, *format;
+ const char *disk, *format, *base;
 
  if (argc < 2)
  ctl_usage(res->ctl);
 
+ base = NULL;
  type = parse_disktype(argv[1], &disk);
 
- paths[0] = disk;
- paths[1] = NULL;
-
- if (unveil(paths[0], "rwc") == -1)
+ if (pledge("stdio rpath wpath cpath unveil", NULL) == -1)
+ err(1, "pledge");
+ if (unveil(disk, "rwc") == -1)
  err(1, "unveil");
 
- if (pledge("stdio rpath wpath cpath", NULL) == -1)
- err(1, "pledge");
  argc--;
  argv++;
 
- while ((ch = getopt(argc, argv, "s:")) != -1) {
+ while ((ch = getopt(argc, argv, "s:b:")) != -1) {
  switch (ch) {
  case 's':
  if (parse_size(res, optarg, 0) != 0)
  errx(1, "invalid size: %s", optarg);
  break;
+ case 'b':
+ base = optarg;
+ if (unveil(base, "r") == -1)
+ err(1, "unveil");
+ break;
  default:
  ctl_usage(res->ctl);
  /* NOTREACHED */
  }
  }
+ if (unveil(NULL, NULL))
+ err(1, "unveil");
 
- if (res->size == 0) {
- fprintf(stderr, "missing size argument\n");
+ if (base && type != VMDF_QCOW2)
+ errx(1, "base images require qcow2 disk format");
+ if (res->size == 0 && !base) {
+ fprintf(stderr, "could not create %s: missing size argument\n",
+    disk);
  ctl_usage(res->ctl);
  }
 
  if (type == VMDF_QCOW2) {
  format = "qcow2";
- ret = create_qc2_imagefile(paths[0], res->size);
+ ret = create_qc2_imagefile(disk, base, res->size);
  } else {
  format = "raw";
- ret = create_raw_imagefile(paths[0], res->size);
+ ret = create_raw_imagefile(disk, res->size);
  }
 
  if (ret != 0) {
diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
index f7890ac99f8..72738d473b7 100644
--- usr.sbin/vmctl/vmctl.8
+++ usr.sbin/vmctl/vmctl.8
@@ -50,7 +50,7 @@ Using
 .Xr cu 1
 connect to the console of the VM with the specified
 .Ar id .
-.It Cm create Ar path Fl s Ar size
+.It Cm create Ar path Op Fl s Ar size Op Fl b Ar base
 Creates a VM disk image file with the specified
 .Ar path
 and
@@ -59,13 +59,22 @@ rounded to megabytes.
 The disk
 .Ar path
 may be prefixed with a format prefix
-.Pf ( Pa raw:
+.Pf ( Pa raw :
 or
-.Pa qcow2: )
+.Pa qcow2 : )
 in order to specify the disk format.
 If left unspecified, the format defaults to
 .Pa raw
 if it cannot be derived automatically.
+For qcow2, a
+.Ar base
+image may be specified.
+The base image is not modified.
+The derived image contains only the changes written by the VM.
+When creating a derived image, the
+.Ar size
+may be omitted, and probed from the base image.
+If it is provided, it must match the base image size.
 .It Cm load Ar filename
 Load additional configuration from the specified file.
 .It Cm log brief
@@ -127,9 +136,9 @@ Disk image file (may be specified multiple times to add multiple disk images).
 The disk
 .Ar path
 may be prefixed with a format prefix
-.Pf ( Pa raw:
+.Pf ( Pa raw :
 or
-.Pa qcow2: )
+.Pa qcow2 : )
 in order to specify the disk format.
 If left unspecified, the format defaults to
 .Pa raw
diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
index b09e1115ff7..12db3f69525 100644
--- usr.sbin/vmctl/vmctl.c
+++ usr.sbin/vmctl/vmctl.c
@@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
 #define ALIGN(sz, align) \
  ((sz + align - 1) & ~(align - 1))
 int
-create_qc2_imagefile(const char *imgfile_path, long imgsize)
+create_qc2_imagefile(const char *imgfile_path,
+    const char *base_path, long imgsize)
 {
  struct qcheader {
  char magic[4];
@@ -869,15 +870,33 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  uint64_t autoclearfeatures;
  uint32_t reforder;
  uint32_t headersz;
- } __packed hdr;
+ } __packed hdr, basehdr;
  int fd, ret;
+ ssize_t base_len;
  uint64_t l1sz, refsz, disksz, initsz, clustersz;
  uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
  uint16_t refs;
 
- disksz = 1024*1024*imgsize;
+ disksz = 1024 * 1024 * imgsize;
+
+ if (base_path) {
+ fd = open(base_path, O_RDONLY);
+ if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
+ err(1, "failure to read base image header");
+ close(fd);
+ if (strncmp(basehdr.magic,
+    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
+ errx(1, "base image is not a qcow2 file");
+ if (!disksz)
+ disksz = betoh64(basehdr.disksz);
+ else if (disksz != betoh64(basehdr.disksz))
+ errx(1, "base size does not match requested size");
+ }
+ if (!base_path && !disksz)
+ errx(1, "missing disk size");
+
  clustersz = (1<<16);
- l1off = ALIGN(sizeof hdr, clustersz);
+ l1off = ALIGN(sizeof(hdr), clustersz);
 
  l1entrysz = clustersz * clustersz / 8;
  l1sz = (disksz + l1entrysz - 1) / l1entrysz;
@@ -887,11 +906,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  refsz = (disksz + refentrysz - 1) / refentrysz;
 
  initsz = ALIGN(refoff + refsz*clustersz, clustersz);
+ base_len = base_path ? strlen(base_path) : 0;
 
- memcpy(hdr.magic, "QFI\xfb", 4);
+ memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
  hdr.version = htobe32(3);
- hdr.backingoff = htobe64(0);
- hdr.backingsz = htobe32(0);
+ hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
+ hdr.backingsz = htobe32(base_len);
  hdr.clustershift = htobe32(16);
  hdr.disksz = htobe64(disksz);
  hdr.cryptmethod = htobe32(0);
@@ -905,7 +925,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  hdr.compatfeatures = htobe64(0);
  hdr.autoclearfeatures = htobe64(0);
  hdr.reforder = htobe32(4);
- hdr.headersz = htobe32(sizeof hdr);
+ hdr.headersz = htobe32(sizeof(hdr));
 
  /* Refuse to overwrite an existing image */
  fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
@@ -914,7 +934,11 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
  return (errno);
 
  /* Write out the header */
- if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
+ if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ goto error;
+
+ /* Add the base image */
+ if (base_path && write(fd, base_path, base_len) != base_len)
  goto error;
 
  /* Extend to desired size, and add one refcount cluster */
diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
index 006411d9785..2d0355450ee 100644
--- usr.sbin/vmctl/vmctl.h
+++ usr.sbin/vmctl/vmctl.h
@@ -87,7 +87,7 @@ __dead void
 
 /* vmctl.c */
 int create_raw_imagefile(const char *, long);
-int create_qc2_imagefile(const char *, long);
+int create_qc2_imagefile(const char *, const char *, long);
 int vm_start(uint32_t, const char *, int, int, char **, int,
     char **, int *, char *, char *, char *);
 int vm_start_complete(struct imsg *, int *, int);
diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
index 550b73c1a39..af12b790002 100644
--- usr.sbin/vmd/config.c
+++ usr.sbin/vmd/config.c
@@ -35,6 +35,7 @@
 #include <util.h>
 #include <errno.h>
 #include <imsg.h>
+#include <libgen.h>
 
 #include "proc.h"
 #include "vmd.h"
@@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
 int
 config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 {
+ int diskfds[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
  struct vmd_if *vif;
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
- unsigned int i;
+ unsigned int i, j;
  int fd = -1, vmboot = 0;
- int kernfd = -1, *diskfds = NULL, *tapfds = NULL;
+ int kernfd = -1;
+ int *tapfds = NULL;
  int cdromfd = -1;
  int saved_errno = 0;
+ int n = 0, aflags, oflags;
  char ifname[IF_NAMESIZE], *s;
  char path[PATH_MAX];
+ char base[PATH_MAX];
+ char expanded[PATH_MAX];
  unsigned int unit;
 
  errno = 0;
@@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  }
  }
 
- diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
- if (diskfds == NULL) {
- log_warn("%s: can't allocate disk fds", __func__);
- goto fail;
- }
- for (i = 0; i < vcp->vcp_ndisks; i++)
- diskfds[i] = -1;
+ for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ diskfds[i][j] = -1;
 
  tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
  if (tapfds == NULL) {
@@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 
  /* Open disk images for child */
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
-                /* Stat disk[i] to ensure it is a regular file */
- if ((diskfds[i] = open(vcp->vcp_disks[i],
-    O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
- log_warn("%s: can't open disk %s", __func__,
-    vcp->vcp_disks[i]);
- errno = VMD_DISK_MISSING;
- goto fail;
- }
+ if (strlcpy(path, vcp->vcp_disks[i], sizeof(path))
+   >= sizeof(path))
+ log_warnx("%s, disk path too long", __func__);
+ memset(vmc->vmc_diskbases, 0, sizeof(vmc->vmc_diskbases));
+ oflags = O_RDWR|O_EXLOCK|O_NONBLOCK;
+ aflags = R_OK|W_OK;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ /* Stat disk[i] to ensure it is a regular file */
+ if ((diskfds[i][j] = open(path, oflags)) == -1) {
+ log_warn("%s: can't open disk %s", __func__,
+    vcp->vcp_disks[i]);
+ errno = VMD_DISK_MISSING;
+ goto fail;
+ }
 
- if (vm_checkaccess(diskfds[i],
-    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
-    uid, R_OK|W_OK) == -1) {
- log_warnx("vm \"%s\" no read/write access to disk %s",
-    vcp->vcp_name, vcp->vcp_disks[i]);
- errno = EPERM;
- goto fail;
+ if (vm_checkaccess(diskfds[i][j],
+    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
+    uid, aflags) == -1) {
+ log_warnx("vm \"%s\" unable to access "
+    "disk %s", vcp->vcp_name, path);
+ errno = EPERM;
+ goto fail;
+ }
+
+ /*
+ * Clear the write and exclusive flags for base images.
+ * All writes should go to the top image, allowing them
+ * to be shared.
+ */
+ oflags = O_RDONLY|O_NONBLOCK;
+ aflags = R_OK;
+ n = virtio_get_base(diskfds[i][j], base, sizeof base,
+    vmc->vmc_disktypes[i]);
+ if (n == 0)
+ break;
+ if (n == -1) {
+ log_warnx("vm \"%s\" unable to read "
+    "base %s for disk %s", vcp->vcp_name,
+    base, vcp->vcp_disks[i]);
+ goto fail;
+ }
+ /*
+ * Relative paths should be interpreted relative
+ * to the disk image, rather than relative to the
+ * directory vmd happens to be running in, since
+ * this is the only useful interpretation.
+ */
+ if (base[0] == '/') {
+ if (realpath(base, path) == NULL) {
+ log_warn("unable to resolve %s", base);
+ goto fail;
+ }
+ } else {
+ s = dirname(path);
+ if (snprintf(expanded, sizeof(expanded),
+    "%s/%s", s, base) >= (int)sizeof(expanded)) {
+ log_warn("path too long: %s/%s",
+    s, base);
+ goto fail;
+ }
+ if (realpath(expanded, path) == NULL) {
+ log_warn("unable to resolve %s", base);
+ goto fail;
+ }
+ }
  }
  }
 
@@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
     NULL, 0);
 
  for (i = 0; i < vcp->vcp_ndisks; i++) {
- proc_compose_imsg(ps, PROC_VMM, -1,
-    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
-    &i, sizeof(i));
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ if (diskfds[i][j] == -1)
+ break;
+ proc_compose_imsg(ps, PROC_VMM, -1,
+    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
+    diskfds[i][j], &i, sizeof(i));
+ }
  }
  for (i = 0; i < vcp->vcp_nnics; i++) {
  proc_compose_imsg(ps, PROC_VMM, -1,
@@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  proc_compose_imsg(ps, PROC_VMM, -1,
     IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
 
- free(diskfds);
  free(tapfds);
 
  vm->vm_running = 1;
@@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
  close(kernfd);
  if (cdromfd != -1)
  close(cdromfd);
- if (diskfds != NULL) {
- for (i = 0; i < vcp->vcp_ndisks; i++)
- close(diskfds[i]);
- free(diskfds);
- }
+ for (i = 0; i < vcp->vcp_ndisks; i++)
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ if (diskfds[i][j] != -1)
+ close(diskfds[i][j]);
  if (tapfds != NULL) {
  for (i = 0; i < vcp->vcp_nnics; i++)
  close(tapfds[i]);
@@ -489,7 +542,7 @@ int
 config_getdisk(struct privsep *ps, struct imsg *imsg)
 {
  struct vmd_vm *vm;
- unsigned int n;
+ unsigned int n, idx;
 
  errno = 0;
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
@@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
  IMSG_SIZE_CHECK(imsg, &n);
  memcpy(&n, imsg->data, sizeof(n));
 
- if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
-    vm->vm_disks[n] != -1 || imsg->fd == -1) {
+ if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
  log_warnx("invalid disk id");
  errno = EINVAL;
  return (-1);
  }
- vm->vm_disks[n] = imsg->fd;
-
+ idx = vm->vm_params.vmc_diskbases[n]++;
+ if (idx >= VM_MAX_BASE_PER_DISK) {
+ log_warnx("too many bases for disk");
+ errno = EINVAL;
+ return (-1);
+ }
+ vm->vm_disks[n][idx] = imsg->fd;
  return (0);
 }
 
diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
index c3211d186fa..25799cc5a3d 100644
--- usr.sbin/vmd/vioqcow2.c
+++ usr.sbin/vmd/vioqcow2.c
@@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
 static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static int inc_refs(struct qcdisk *, off_t, int);
-static int qc2_openpath(struct qcdisk *, char *, int);
-static int qc2_open(struct qcdisk *, int);
+static int qc2_open(struct qcdisk *, int *, size_t);
 static ssize_t qc2_pread(void *, char *, size_t, off_t);
 static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
 static void qc2_close(void *, int);
@@ -118,14 +117,14 @@ static void qc2_close(void *, int);
  * May open snapshot base images.
  */
 int
-virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  struct qcdisk *diskp;
 
  diskp = malloc(sizeof(struct qcdisk));
  if (diskp == NULL)
  return -1;
- if (qc2_open(diskp, fd) == -1) {
+ if (qc2_open(diskp, fd, nfd) == -1) {
  log_warnx("%s: could not open qcow2 disk", __func__);
  return -1;
  }
@@ -137,19 +136,40 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
  return 0;
 }
 
-static int
-qc2_openpath(struct qcdisk *disk, char *path, int flags)
+ssize_t
+virtio_qcow2_get_base(int fd, char *path, size_t npath)
 {
- int fd;
+ struct qcheader header;
+ uint64_t backingoff;
+ uint32_t backingsz;
 
- fd = open(path, flags);
- if (fd < 0)
+ if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
+ log_warnx("%s: short read on header", __func__);
+ return -1;
+ }
+ if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
+ log_warn("%s: invalid magic numbers", __func__);
  return -1;
- return qc2_open(disk, fd);
+ }
+ backingoff = be64toh(header.backingoff);
+ backingsz = be32toh(header.backingsz);
+ if (backingsz != 0) {
+ if (backingsz >= npath - 1) {
+ log_warn("%s: snapshot path too long", __func__);
+ return -1;
+ }
+ if (pread(fd, path, backingsz, backingoff) != backingsz) {
+ log_warnx("%s: could not read snapshot base name",
+    __func__);
+ return -1;
+ }
+ path[backingsz] = '\0';
+ }
+ return backingsz;
 }
 
 static int
-qc2_open(struct qcdisk *disk, int fd)
+qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
 {
  char basepath[PATH_MAX];
  struct stat st;
@@ -157,14 +177,15 @@ qc2_open(struct qcdisk *disk, int fd)
  uint64_t backingoff;
  uint32_t backingsz;
  size_t i;
- int version;
+ int version, fd;
 
  pthread_rwlock_init(&disk->lock, NULL);
+ fd = fds[0];
  disk->fd = fd;
  disk->base = NULL;
  disk->l1 = NULL;
 
- if (pread(fd, &header, sizeof header, 0) != sizeof header) {
+ if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
  log_warn("%s: short read on header", __func__);
  goto error;
  }
@@ -203,11 +224,11 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
 
- disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
+ disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
  if (!disk->l1)
  goto error;
- if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off)
-    != 8*disk->l1sz) {
+ if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
+    != 8 * disk->l1sz) {
  log_warn("%s: unable to read qcow2 L1 table", __func__);
  goto error;
  }
@@ -222,14 +243,7 @@ qc2_open(struct qcdisk *disk, int fd)
  backingoff = be64toh(header.backingoff);
  backingsz = be32toh(header.backingsz);
  if (backingsz != 0) {
- /*
- * FIXME: we need to figure out a way of opening these things,
- * otherwise we just crash with a pledge violation.
- */
- log_warn("%s: unsupported external snapshot images", __func__);
- goto error;
-
- if (backingsz >= sizeof basepath - 1) {
+ if (backingsz >= sizeof(basepath) - 1) {
  log_warn("%s: snapshot path too long", __func__);
  goto error;
  }
@@ -239,11 +253,17 @@ qc2_open(struct qcdisk *disk, int fd)
  goto error;
  }
  basepath[backingsz] = 0;
+ if (nfd <= 1) {
+ log_warnx("%s: missing base image %s", __func__,
+    basepath);
+ goto error;
+ }
+
 
  disk->base = calloc(1, sizeof(struct qcdisk));
  if (!disk->base)
  goto error;
- if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
+ if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
  log_warn("%s: could not open %s", basepath, __func__);
  goto error;
  }
@@ -428,7 +448,7 @@ xlate(struct qcdisk *disk, off_t off, int *inplace)
  return 0;
  }
  l2off = (off / disk->clustersz) % l2sz;
- pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8);
+ pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
  cluster = be64toh(buf);
  /*
  * cluster may be 0, but all future operations don't affect
@@ -521,12 +541,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
  cluster = disk->end;
  disk->end += disk->clustersz;
  buf = htobe64(cluster | QCOW2_INPLACE);
- if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
+ if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
  goto fail;
 
  /* TODO: lazily sync: currently VMD doesn't close things */
  buf = htobe64(disk->l1[l1off]);
- if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
+ if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
  goto fail;
  if (inc_refs(disk, cluster, 1) == -1)
  goto fail;
@@ -570,8 +590,8 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
  nper = disk->clustersz / 2;
  l1idx = (off / disk->clustersz) / nper;
  l2idx = (off / disk->clustersz) % nper;
- l1off = disk->refoff + 8*l1idx;
- if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
+ l1off = disk->refoff + 8 * l1idx;
+ if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
  return -1;
 
  l2cluster = be64toh(buf);
@@ -583,19 +603,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
  return -1;
  }
  buf = htobe64(l2cluster);
- if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
+ if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) {
  return -1;
  }
  }
 
  refs = 1;
  if (!newcluster) {
- if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
+ if (pread(disk->fd, &refs, sizeof(refs),
+    l2cluster + 2 * l2idx) != 2)
  return -1;
  refs = be16toh(refs) + 1;
  }
  refs = htobe16(refs);
- if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
+ if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) {
  log_warn("%s: could not write ref block", __func__);
  return -1;
  }
diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
index e02ab67c5dc..ff4bbb3095e 100644
--- usr.sbin/vmd/vioraw.c
+++ usr.sbin/vmd/vioraw.c
@@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
  * returning -1 for error, 0 for success.
  */
 int
-virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
  off_t sz;
  int *fdp;
 
- sz = lseek(fd, 0, SEEK_END);
+ if (nfd != 1)
+ return -1;
+ sz = lseek(fd[0], 0, SEEK_END);
  if (sz == -1)
  return -1;
 
  fdp = malloc(sizeof(int));
  if (!fdp)
  return -1;
- *fdp = fd;
+ *fdp = fd[0];
  file->p = fdp;
  file->pread = raw_pread;
  file->pwrite = raw_pwrite;
diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index ac4f2a82b7d..d5fd8f91b1f 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1745,24 +1745,41 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
  return (0);
 }
 
+int
+virtio_get_base(int fd, char *path, size_t npath ,int type)
+{
+ switch (type) {
+ case VMDF_RAW:
+ return 0;
+ case VMDF_QCOW2:
+ return virtio_qcow2_get_base(fd, path, npath);
+ }
+ log_warnx("%s: invalid disk format", __func__);
+ return -1;
+}
+
+/*
+ * Initializes a struct virtio_backing using the list of fds.
+ */
 static int
-virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
+virtio_init_disk(struct virtio_backing *file, off_t *sz,
+    int *fd, size_t nfd, int type)
 {
  /*
  * probe disk types in order of preference, first one to work wins.
  * TODO: provide a way of specifying the type and options.
  */
  switch (type) {
- case VMDF_RAW: return virtio_init_raw(file, sz, fd);
- case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd);
+ case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd);
+ case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd);
  }
  log_warnx("%s: invalid disk format", __func__);
  return -1;
 }
 
 void
-virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_init(struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
@@ -1838,7 +1855,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioblk[i].vm_id = vcp->vcp_id;
  vioblk[i].irq = pci_get_dev_irq(id);
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1) {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1) {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return;
@@ -1967,7 +1985,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
  vioscsi->vq[i].last_avail = 0;
  }
  if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
-    child_cdrom, VMDF_RAW) == -1) {
+    &child_cdrom, 1, VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format",
     __func__);
  return;
@@ -2125,7 +2143,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
 }
 
 int
-vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
+vioblk_restore(int fd, struct vmop_create_params *vmc,
+    int child_disks[][VM_MAX_BASE_PER_DISK])
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  uint8_t i;
@@ -2151,7 +2170,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
  return (-1);
  }
  if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-    child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
+    child_disks[i], vmc->vmc_diskbases[i],
+    vmc->vmc_disktypes[i]) == -1)  {
  log_warnx("%s: unable to determine disk format",
     __func__);
  return (-1);
@@ -2188,7 +2208,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
  return (-1);
  }
 
- if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
+ if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
     VMDF_RAW) == -1) {
  log_warnx("%s: unable to determine iso format", __func__);
  return (-1);
@@ -2200,8 +2220,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
 }
 
 int
-virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vmop_create_params *vmc = &vm->vm_params;
  struct vm_create_params *vcp = &vmc->vmc_params;
diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
index 46006916b6a..bb632bb5502 100644
--- usr.sbin/vmd/virtio.h
+++ usr.sbin/vmd/virtio.h
@@ -257,10 +257,11 @@ struct ioinfo {
 };
 
 /* virtio.c */
-void virtio_init(struct vmd_vm *, int, int *, int *);
+void virtio_init(struct vmd_vm *, int, int[][VM_MAX_BASE_PER_DISK], int *);
 void virtio_shutdown(struct vmd_vm *);
 int virtio_dump(int);
-int virtio_restore(int, struct vmd_vm *, int, int *, int *);
+int virtio_restore(int, struct vmd_vm *, int,
+    int[][VM_MAX_BASE_PER_DISK], int *);
 uint32_t vring_size(uint32_t);
 
 int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
@@ -270,12 +271,14 @@ void viornd_update_qs(void);
 void viornd_update_qa(void);
 int viornd_notifyq(void);
 
-int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
-int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
+ssize_t virtio_qcow2_get_base(int, char *, size_t);
+int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
+int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
 
 int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vioblk_dump(int);
-int vioblk_restore(int, struct vmop_create_params *, int *);
+int vioblk_restore(int, struct vmop_create_params *,
+    int[][VM_MAX_BASE_PER_DISK]);
 void vioblk_update_qs(struct vioblk_dev *);
 void vioblk_update_qa(struct vioblk_dev *);
 int vioblk_notifyq(struct vioblk_dev *);
diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
index ef4494d918b..37dfb95bb0d 100644
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,8 +65,8 @@
 
 io_fn_t ioports_map[MAX_PORTS];
 
-int run_vm(int, int *, int *, struct vmop_create_params *,
-    struct vcpu_reg_state *);
+int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
+    struct vmop_create_params *, struct vcpu_reg_state *);
 void vm_dispatch_vmm(int, short, void *);
 void *event_thread(void *);
 void *vcpu_run_loop(void *);
@@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
 void create_memory_map(struct vm_create_params *);
 int alloc_guest_mem(struct vm_create_params *);
 int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
-void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
+void init_emulated_hw(struct vmop_create_params *, int,
+    int[][VM_MAX_BASE_PER_DISK], int *);
+void restore_emulated_hw(struct vm_create_params *, int, int *,
+    int[][VM_MAX_BASE_PER_DISK],int);
 void vcpu_exit_inout(struct vm_run_params *);
 uint8_t vcpu_exit_pci(struct vm_run_params *);
 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
@@ -327,7 +329,8 @@ start_vm(struct vmd_vm *vm, int fd)
 
  /* Find and open kernel image */
  if ((fp = vmboot_open(vm->vm_kernel,
-    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
+    vm->vm_disks[0], vmc->vmc_diskbases[0],
+    vmc->vmc_disktypes[0], &vmboot)) == NULL)
  fatalx("failed to open kernel - exiting");
 
  /* Load kernel image */
@@ -903,7 +906,7 @@ vmm_create_vm(struct vm_create_params *vcp)
  */
 void
 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
-    int *child_disks, int *child_taps)
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  int i;
@@ -968,7 +971,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
  */
 void
 restore_emulated_hw(struct vm_create_params *vcp, int fd,
-    int *child_taps, int *child_disks, int child_cdrom)
+    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
 {
  /* struct vm_create_params *vcp = &vmc->vmc_params; */
  int i;
@@ -1029,8 +1032,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
  *  !0 : the VM exited abnormally or failed to start
  */
 int
-run_vm(int child_cdrom, int *child_disks, int *child_taps,
-    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
+run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
+    int *child_taps, struct vmop_create_params *vmc,
+    struct vcpu_reg_state *vrs)
 {
  struct vm_create_params *vcp = &vmc->vmc_params;
  struct vm_rwregs_params vregsp;
diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
index 44ceeb64a7e..718c9739fa5 100644
--- usr.sbin/vmd/vmboot.c
+++ usr.sbin/vmd/vmboot.c
@@ -385,7 +385,7 @@ vmboot_loadfile(struct open_file *f, char *file, size_t *size)
 }
 
 FILE *
-vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
+vmboot_open(int kernel_fd, int *disk_fd, int nfd, unsigned int disk_type,
     struct vmboot_params *vmboot)
 {
  char file[PATH_MAX];
@@ -404,7 +404,7 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
  if (kernel_fd != -1)
  return (fdopen(kernel_fd, "r"));
 
- if (disk_fd == -1)
+ if (disk_fd == NULL || nfd < 1)
  return (NULL);
 
  if ((vfp = calloc(1, sizeof(*vfp))) == NULL)
@@ -414,20 +414,19 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type,
 
  switch (vmboot->vbp_type) {
  case VMDF_RAW:
- if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_raw(vfp, &sz, disk_fd, nfd) == -1) {
  log_debug("%s: could not open raw disk", __func__);
  goto fail;
  }
  break;
  case VMDF_QCOW2:
- if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
+ if (virtio_init_qcow2(vfp, &sz, disk_fd, nfd) == -1) {
  log_debug("%s: could not open qcow2 disk", __func__);
  goto fail;
  }
  break;
  }
 
- vmboot->vbp_fd = disk_fd;
  vmboot_file.f_devdata = vmboot;
 
  if ((vmboot->vbp_partoff =
diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
index 1571be21bc5..18a5e0d3d5d 100644
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -1097,7 +1097,7 @@ void
 vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
 {
  struct privsep *ps = &env->vmd_ps;
- unsigned int i;
+ unsigned int i, j;
 
  if (vm == NULL)
  return;
@@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
  close(vm->vm_iev.ibuf.fd);
  }
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
- if (vm->vm_disks[i] != -1) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1) {
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  }
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
@@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  struct vmop_owner *vmo = NULL;
  struct vmd_user *usr = NULL;
  uint32_t rng;
- unsigned int i;
+ unsigned int i, j;
  struct vmd_switch *sw;
  char *s;
 
@@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc,
  vm->vm_user = usr;
 
  for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ vm->vm_disks[i][j] = -1;
  for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
  vm->vm_ifs[i].vif_fd = -1;
  for (i = 0; i < vcp->vcp_nnics; i++) {
diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
index b348d12c757..b7c012854e8 100644
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -48,6 +48,7 @@
 #define VM_DEFAULT_DEVICE "hd0a"
 #define VM_BOOT_CONF "/etc/boot.conf"
 #define VM_NAME_MAX 64
+#define VM_MAX_BASE_PER_DISK 4
 #define VM_TTYNAME_MAX 16
 #define MAX_TAP 256
 #define NR_BACKLOG 5
@@ -169,6 +170,7 @@ struct vmop_create_params {
 #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
 
  unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM];
+ unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM];
 #define VMDF_RAW 0x01
 #define VMDF_QCOW2 0x02
 
@@ -202,7 +204,6 @@ struct vm_dump_header {
 } __packed;
 
 struct vmboot_params {
- int vbp_fd;
  off_t vbp_partoff;
  char vbp_device[PATH_MAX];
  char vbp_image[PATH_MAX];
@@ -241,7 +242,7 @@ struct vmd_vm {
  uint32_t vm_vmid;
  int vm_kernel;
  int vm_cdrom;
- int vm_disks[VMM_MAX_DISKS_PER_VM];
+ int vm_disks[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
  struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM];
  char *vm_ttyname;
  int vm_tty;
@@ -407,7 +408,7 @@ int config_getif(struct privsep *, struct imsg *);
 int config_getcdrom(struct privsep *, struct imsg *);
 
 /* vmboot.c */
-FILE *vmboot_open(int, int, unsigned int, struct vmboot_params *);
+FILE *vmboot_open(int, int *, int, unsigned int, struct vmboot_params *);
 void vmboot_close(FILE *, struct vmboot_params *);
 
 /* parse.y */
@@ -415,4 +416,7 @@ int parse_config(const char *);
 int cmdline_symset(char *);
 int host(const char *, struct address *);
 
+/* virtio.c */
+int virtio_get_base(int, char *, size_t, int);
+
 #endif /* VMD_H */
diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
index 7757856323f..47e2b2324be 100644
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  struct vmd_vm *vm;
  int ret = EINVAL;
  int fds[2];
- size_t i;
+ size_t i, j;
 
  if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
  log_warnx("%s: can't find vm", __func__);
@@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
  close(fds[1]);
 
  for (i = 0 ; i < vcp->vcp_ndisks; i++) {
- close(vm->vm_disks[i]);
- vm->vm_disks[i] = -1;
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
+ if (vm->vm_disks[i][j] != -1)
+ close(vm->vm_disks[i][j]);
+ vm->vm_disks[i][j] = -1;
+ }
  }
  for (i = 0 ; i < vcp->vcp_nnics; i++) {
  close(vm->vm_ifs[i].vif_fd);

--
    Ori Bernstein