Download raw body.
fix vmmci race conditions & event(3) corruption
On Fri, Jan 03, 2025 at 10:12:55PM -0500, Dave Voutila wrote:
> As I've followed the vmd(8) rabbit down the hole figuring out how to
> cleanly fix the "reboots instead of shuts down problem"[1],
>
> While I've found a few issues[2], a critical issue I noticed
> the vmmci device in vmd(8) (i.e. *not* the vmmci(4) driver) has two
> critical race conditions in the vm process:
>
> 1. The device thread (the event(3) loop) and the vcpu thread *both*
> read/write device state (e.g. command and interrupt status register
> twiddling).
>
> 2. The vcpu thread modifies the device thread's event(3) state, which is
> a VERY BIG NO-NO as it can corrupt the event loop state.
>
> Historically, other devices suffered similar issues, so the diff below
> adopts the same solutions used with the uart, rtc, & pit:
>
> 1. Use a pthread mutex to protect device state in critical areas.
>
> 2. Use vmd's "device pipe" pattern to allow the vcpu thread to
> communicate to the device thread that it needs to update event
> timeouts.
>
> oks?
>
ok mlarkin
> [1] https://marc.info/?l=openbsd-tech&m=173214159319239&w=2
> [2] VMMCI_SYNCRTC commands from the rtc can trample a shutdown request,
> for example because there's a single command register with no true
> device state tracking
>
> diffstat refs/heads/master refs/heads/vmmci
> M usr.sbin/vmd/virtio.c | 63+ 13-
> M usr.sbin/vmd/virtio.h | 5+ 3-
> M usr.sbin/vmd/vmd.h | 2+ 0-
>
> 3 files changed, 70 insertions(+), 16 deletions(-)
>
> diff refs/heads/master refs/heads/vmmci
> commit - 4ab10cec8117ac87adc12d73a6e010e2b98c3684
> commit + f31b388d33308debac1fe051d637bd165b05d59d
> blob - 37185b7f96d7e50eda227ba3ddf961072cf74675
> blob + d030c391e6c8a396d331b7003803a1047fe0dc15
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -68,6 +68,7 @@ static int virtio_dev_launch(struct vmd_vm *, struct v
> static void virtio_dispatch_dev(int, short, void *);
> static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
> static int virtio_dev_closefds(struct virtio_dev *);
> +static void vmmci_pipe_dispatch(int, short, void *);
>
> const char *
> virtio_reg_name(uint8_t reg)
> @@ -275,17 +276,29 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, u
> return (0);
> }
>
> +/*
> + * vmmci_ctl
> + *
> + * Inject a command into the vmmci device, potentially delivering interrupt.
> + *
> + * Called by the vm process's event(3) loop.
> + */
> int
> vmmci_ctl(unsigned int cmd)
> {
> + int ret = 0;
> struct timeval tv = { 0, 0 };
>
> + mutex_lock(&vmmci.mutex);
> +
> if ((vmmci.cfg.device_status &
> - VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
> - return (-1);
> + VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) {
> + ret = -1;
> + goto unlock;
> + }
>
> if (cmd == vmmci.cmd)
> - return (0);
> + goto unlock;
>
> switch (cmd) {
> case VMMCI_NONE:
> @@ -307,7 +320,7 @@ vmmci_ctl(unsigned int cmd)
> vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
>
> /* Add ACK timeout */
> - tv.tv_sec = VMMCI_TIMEOUT;
> + tv.tv_sec = VMMCI_TIMEOUT_SHORT;
> evtimer_add(&vmmci.timeout, &tv);
> break;
> case VMMCI_SYNCRTC:
> @@ -326,14 +339,22 @@ vmmci_ctl(unsigned int cmd)
> fatalx("invalid vmmci command: %d", cmd);
> }
>
> - return (0);
> +unlock:
> + mutex_unlock(&vmmci.mutex);
> +
> + return (ret);
> }
>
> +/*
> + * vmmci_ack
> + *
> + * Process a write to the command register.
> + *
> + * Called by the vcpu thread. Must be called with the mutex held.
> + */
> void
> vmmci_ack(unsigned int cmd)
> {
> - struct timeval tv = { 0, 0 };
> -
> switch (cmd) {
> case VMMCI_NONE:
> break;
> @@ -347,8 +368,7 @@ vmmci_ack(unsigned int cmd)
> if (vmmci.cmd == 0) {
> log_debug("%s: vm %u requested shutdown", __func__,
> vmmci.vm_id);
> - tv.tv_sec = VMMCI_TIMEOUT;
> - evtimer_add(&vmmci.timeout, &tv);
> + vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT);
> return;
> }
> /* FALLTHROUGH */
> @@ -360,12 +380,10 @@ vmmci_ack(unsigned int cmd)
> * rc.shutdown on the VM), so increase the timeout before
> * killing it forcefully.
> */
> - if (cmd == vmmci.cmd &&
> - evtimer_pending(&vmmci.timeout, NULL)) {
> + if (cmd == vmmci.cmd) {
> log_debug("%s: vm %u acknowledged shutdown request",
> __func__, vmmci.vm_id);
> - tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
> - evtimer_add(&vmmci.timeout, &tv);
> + vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG);
> }
> break;
> case VMMCI_SYNCRTC:
> @@ -392,6 +410,7 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_
> {
> *intr = 0xFF;
>
> + mutex_lock(&vmmci.mutex);
> if (dir == 0) {
> switch (reg) {
> case VIRTIO_CONFIG_DEVICE_FEATURES:
> @@ -466,6 +485,8 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_
> break;
> }
> }
> + mutex_unlock(&vmmci.mutex);
> +
> return (0);
> }
>
> @@ -482,6 +503,27 @@ virtio_get_base(int fd, char *path, size_t npath, int
> return -1;
> }
>
> +static void
> +vmmci_pipe_dispatch(int fd, short event, void *arg)
> +{
> + enum pipe_msg_type msg;
> + struct timeval tv = { 0, 0 };
> +
> + msg = vm_pipe_recv(&vmmci.dev_pipe);
> + switch (msg) {
> + case VMMCI_SET_TIMEOUT_SHORT:
> + tv.tv_sec = VMMCI_TIMEOUT_SHORT;
> + evtimer_add(&vmmci.timeout, &tv);
> + break;
> + case VMMCI_SET_TIMEOUT_LONG:
> + tv.tv_sec = VMMCI_TIMEOUT_LONG;
> + evtimer_add(&vmmci.timeout, &tv);
> + break;
> + default:
> + log_warnx("%s: invalid pipe message type %d", __func__, msg);
> + }
> +}
> +
> void
> virtio_init(struct vmd_vm *vm, int child_cdrom,
> int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
> @@ -491,6 +533,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
> struct virtio_dev *dev;
> uint8_t id;
> uint8_t i, j;
> + int ret = 0;
>
> /* Virtio entropy device */
> if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
> @@ -745,8 +788,15 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
> vmmci.vm_id = vcp->vcp_id;
> vmmci.irq = pci_get_dev_irq(id);
> vmmci.pci_id = id;
> + ret = pthread_mutex_init(&vmmci.mutex, NULL);
> + if (ret) {
> + errno = ret;
> + fatal("could not initialize vmmci mutex");
> + }
>
> evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
> + vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch);
> + event_add(&vmmci.dev_pipe.read_ev, NULL);
> }
>
> /*
> blob - c293743050c128e12e9e0b28cdd8ab262f27ab12
> blob + 6ed619c5519784bad8a5566b007d2158c1c5e4c8
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -53,8 +53,8 @@
> #define VIONET_MAX_TXLEN VIONET_HARD_MTU + ETHER_HDR_LEN
>
> /* VMM Control Interface shutdown timeout (in seconds) */
> -#define VMMCI_TIMEOUT 3
> -#define VMMCI_SHUTDOWN_TIMEOUT 120
> +#define VMMCI_TIMEOUT_SHORT 3
> +#define VMMCI_TIMEOUT_LONG 120
>
> /* All the devices we support have either 1, 2 or 3 queues */
> /* viornd - 1 queue
> @@ -321,8 +321,10 @@ struct vmmci_dev {
> enum vmmci_cmd cmd;
> uint32_t vm_id;
> int irq;
> -
> uint8_t pci_id;
> +
> + pthread_mutex_t mutex;
> + struct vm_dev_pipe dev_pipe;
> };
>
> /* XXX to be removed once vioscsi is adapted to vectorized io. */
> blob - b84656fc20c20c9a6cdfff4a1b7b420b2f472bac
> blob + c8f4b5d909049cd52d5139f36169f132759d27b4
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -424,6 +424,8 @@ enum pipe_msg_type {
> VIRTIO_THREAD_PAUSE,
> VIRTIO_THREAD_STOP,
> VIRTIO_THREAD_ACK,
> + VMMCI_SET_TIMEOUT_SHORT,
> + VMMCI_SET_TIMEOUT_LONG,
> };
>
> static inline struct sockaddr_in *
fix vmmci race conditions & event(3) corruption