Index | Thread | Search

From:
Mike Larkin <mlarkin@nested.page>
Subject:
Re: fix vmmci race conditions & event(3) corruption
To:
Dave Voutila <dv@sisu.io>
Cc:
tech@openbsd.org, mlarkin@openbsd.org
Date:
Tue, 7 Jan 2025 12:49:50 -0800

Download raw body.

Thread
On Fri, Jan 03, 2025 at 10:12:55PM -0500, Dave Voutila wrote:
> As I've followed the vmd(8) rabbit down the hole figuring out how to
> cleanly fix the "reboots instead of shuts down problem"[1],
>
> While I've found a few issues[2], a critical issue I noticed
> the vmmci device in vmd(8) (i.e. *not* the vmmci(4) driver) has two
> critical race conditions in the vm process:
>
> 1. The device thread (the event(3) loop) and the vcpu thread *both*
>    read/write device state (e.g. command and interrupt status register
>    twiddling).
>
> 2. The vcpu thread modifies the device thread's event(3) state, which is
>    a VERY BIG NO-NO as it can corrupt the event loop state.
>
> Historically, other devices suffered similar issues, so the diff below
> adopts the same solutions used with the uart, rtc, & pit:
>
> 1. Use a pthread mutex to protect device state in critical areas.
>
> 2. Use vmd's "device pipe" pattern to allow the vcpu thread to
>    communicate to the device thread that it needs to update event
>    timeouts.
>
> oks?
>

ok mlarkin

> [1] https://marc.info/?l=openbsd-tech&m=173214159319239&w=2
> [2] VMMCI_SYNCRTC commands from the rtc can trample a shutdown request,
>     for example because there's a single command register with no true
>     device state tracking
>
> diffstat refs/heads/master refs/heads/vmmci
>  M  usr.sbin/vmd/virtio.c  |  63+  13-
>  M  usr.sbin/vmd/virtio.h  |   5+   3-
>  M  usr.sbin/vmd/vmd.h     |   2+   0-
>
> 3 files changed, 70 insertions(+), 16 deletions(-)
>
> diff refs/heads/master refs/heads/vmmci
> commit - 4ab10cec8117ac87adc12d73a6e010e2b98c3684
> commit + f31b388d33308debac1fe051d637bd165b05d59d
> blob - 37185b7f96d7e50eda227ba3ddf961072cf74675
> blob + d030c391e6c8a396d331b7003803a1047fe0dc15
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -68,6 +68,7 @@ static int virtio_dev_launch(struct vmd_vm *, struct v
>  static void virtio_dispatch_dev(int, short, void *);
>  static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
>  static int virtio_dev_closefds(struct virtio_dev *);
> +static void vmmci_pipe_dispatch(int, short, void *);
>
>  const char *
>  virtio_reg_name(uint8_t reg)
> @@ -275,17 +276,29 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, u
>  	return (0);
>  }
>
> +/*
> + * vmmci_ctl
> + *
> + * Inject a command into the vmmci device, potentially delivering interrupt.
> + *
> + * Called by the vm process's event(3) loop.
> + */
>  int
>  vmmci_ctl(unsigned int cmd)
>  {
> +	int ret = 0;
>  	struct timeval tv = { 0, 0 };
>
> +	mutex_lock(&vmmci.mutex);
> +
>  	if ((vmmci.cfg.device_status &
> -	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
> -		return (-1);
> +	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) {
> +		ret = -1;
> +		goto unlock;
> +	}
>
>  	if (cmd == vmmci.cmd)
> -		return (0);
> +		goto unlock;
>
>  	switch (cmd) {
>  	case VMMCI_NONE:
> @@ -307,7 +320,7 @@ vmmci_ctl(unsigned int cmd)
>  		vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
>
>  		/* Add ACK timeout */
> -		tv.tv_sec = VMMCI_TIMEOUT;
> +		tv.tv_sec = VMMCI_TIMEOUT_SHORT;
>  		evtimer_add(&vmmci.timeout, &tv);
>  		break;
>  	case VMMCI_SYNCRTC:
> @@ -326,14 +339,22 @@ vmmci_ctl(unsigned int cmd)
>  		fatalx("invalid vmmci command: %d", cmd);
>  	}
>
> -	return (0);
> +unlock:
> +	mutex_unlock(&vmmci.mutex);
> +
> +	return (ret);
>  }
>
> +/*
> + * vmmci_ack
> + *
> + * Process a write to the command register.
> + *
> + * Called by the vcpu thread. Must be called with the mutex held.
> + */
>  void
>  vmmci_ack(unsigned int cmd)
>  {
> -	struct timeval	 tv = { 0, 0 };
> -
>  	switch (cmd) {
>  	case VMMCI_NONE:
>  		break;
> @@ -347,8 +368,7 @@ vmmci_ack(unsigned int cmd)
>  		if (vmmci.cmd == 0) {
>  			log_debug("%s: vm %u requested shutdown", __func__,
>  			    vmmci.vm_id);
> -			tv.tv_sec = VMMCI_TIMEOUT;
> -			evtimer_add(&vmmci.timeout, &tv);
> +			vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT);
>  			return;
>  		}
>  		/* FALLTHROUGH */
> @@ -360,12 +380,10 @@ vmmci_ack(unsigned int cmd)
>  		 * rc.shutdown on the VM), so increase the timeout before
>  		 * killing it forcefully.
>  		 */
> -		if (cmd == vmmci.cmd &&
> -		    evtimer_pending(&vmmci.timeout, NULL)) {
> +		if (cmd == vmmci.cmd) {
>  			log_debug("%s: vm %u acknowledged shutdown request",
>  			    __func__, vmmci.vm_id);
> -			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
> -			evtimer_add(&vmmci.timeout, &tv);
> +			vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG);
>  		}
>  		break;
>  	case VMMCI_SYNCRTC:
> @@ -392,6 +410,7 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_
>  {
>  	*intr = 0xFF;
>
> +	mutex_lock(&vmmci.mutex);
>  	if (dir == 0) {
>  		switch (reg) {
>  		case VIRTIO_CONFIG_DEVICE_FEATURES:
> @@ -466,6 +485,8 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_
>  			break;
>  		}
>  	}
> +	mutex_unlock(&vmmci.mutex);
> +
>  	return (0);
>  }
>
> @@ -482,6 +503,27 @@ virtio_get_base(int fd, char *path, size_t npath, int
>  	return -1;
>  }
>
> +static void
> +vmmci_pipe_dispatch(int fd, short event, void *arg)
> +{
> +	enum pipe_msg_type msg;
> +	struct timeval tv = { 0, 0 };
> +
> +	msg = vm_pipe_recv(&vmmci.dev_pipe);
> +	switch (msg) {
> +	case VMMCI_SET_TIMEOUT_SHORT:
> +		tv.tv_sec = VMMCI_TIMEOUT_SHORT;
> +		evtimer_add(&vmmci.timeout, &tv);
> +		break;
> +	case VMMCI_SET_TIMEOUT_LONG:
> +		tv.tv_sec = VMMCI_TIMEOUT_LONG;
> +		evtimer_add(&vmmci.timeout, &tv);
> +		break;
> +	default:
> +		log_warnx("%s: invalid pipe message type %d", __func__, msg);
> +	}
> +}
> +
>  void
>  virtio_init(struct vmd_vm *vm, int child_cdrom,
>      int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
> @@ -491,6 +533,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
>  	struct virtio_dev *dev;
>  	uint8_t id;
>  	uint8_t i, j;
> +	int ret = 0;
>
>  	/* Virtio entropy device */
>  	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
> @@ -745,8 +788,15 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
>  	vmmci.vm_id = vcp->vcp_id;
>  	vmmci.irq = pci_get_dev_irq(id);
>  	vmmci.pci_id = id;
> +	ret = pthread_mutex_init(&vmmci.mutex, NULL);
> +	if (ret) {
> +		errno = ret;
> +		fatal("could not initialize vmmci mutex");
> +	}
>
>  	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
> +	vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch);
> +	event_add(&vmmci.dev_pipe.read_ev, NULL);
>  }
>
>  /*
> blob - c293743050c128e12e9e0b28cdd8ab262f27ab12
> blob + 6ed619c5519784bad8a5566b007d2158c1c5e4c8
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -53,8 +53,8 @@
>  #define VIONET_MAX_TXLEN	VIONET_HARD_MTU + ETHER_HDR_LEN
>
>  /* VMM Control Interface shutdown timeout (in seconds) */
> -#define VMMCI_TIMEOUT		3
> -#define VMMCI_SHUTDOWN_TIMEOUT	120
> +#define VMMCI_TIMEOUT_SHORT	3
> +#define VMMCI_TIMEOUT_LONG	120
>
>  /* All the devices we support have either 1, 2 or 3 queues */
>  /* viornd - 1 queue
> @@ -321,8 +321,10 @@ struct vmmci_dev {
>  	enum vmmci_cmd cmd;
>  	uint32_t vm_id;
>  	int irq;
> -
>  	uint8_t pci_id;
> +
> +	pthread_mutex_t mutex;
> +	struct vm_dev_pipe dev_pipe;
>  };
>
>  /* XXX to be removed once vioscsi is adapted to vectorized io. */
> blob - b84656fc20c20c9a6cdfff4a1b7b420b2f472bac
> blob + c8f4b5d909049cd52d5139f36169f132759d27b4
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -424,6 +424,8 @@ enum pipe_msg_type {
>  	VIRTIO_THREAD_PAUSE,
>  	VIRTIO_THREAD_STOP,
>  	VIRTIO_THREAD_ACK,
> +	VMMCI_SET_TIMEOUT_SHORT,
> +	VMMCI_SET_TIMEOUT_LONG,
>  };
>
>  static inline struct sockaddr_in *