From: Dave Voutila Subject: fix vmmci race conditions & event(3) corruption To: tech@openbsd.org Cc: mlarkin@openbsd.org Date: Fri, 03 Jan 2025 22:12:55 -0500 As I've followed the vmd(8) rabbit down the hole figuring out how to cleanly fix the "reboots instead of shuts down problem"[1], While I've found a few issues[2], a critical issue I noticed the vmmci device in vmd(8) (i.e. *not* the vmmci(4) driver) has two critical race conditions in the vm process: 1. The device thread (the event(3) loop) and the vcpu thread *both* read/write device state (e.g. command and interrupt status register twiddling). 2. The vcpu thread modifies the device thread's event(3) state, which is a VERY BIG NO-NO as it can corrupt the event loop state. Historically, other devices suffered similar issues, so the diff below adopts the same solutions used with the uart, rtc, & pit: 1. Use a pthread mutex to protect device state in critical areas. 2. Use vmd's "device pipe" pattern to allow the vcpu thread to communicate to the device thread that it needs to update event timeouts. oks? [1] https://marc.info/?l=openbsd-tech&m=173214159319239&w=2 [2] VMMCI_SYNCRTC commands from the rtc can trample a shutdown request, for example because there's a single command register with no true device state tracking diffstat refs/heads/master refs/heads/vmmci M usr.sbin/vmd/virtio.c | 63+ 13- M usr.sbin/vmd/virtio.h | 5+ 3- M usr.sbin/vmd/vmd.h | 2+ 0- 3 files changed, 70 insertions(+), 16 deletions(-) diff refs/heads/master refs/heads/vmmci commit - 4ab10cec8117ac87adc12d73a6e010e2b98c3684 commit + f31b388d33308debac1fe051d637bd165b05d59d blob - 37185b7f96d7e50eda227ba3ddf961072cf74675 blob + d030c391e6c8a396d331b7003803a1047fe0dc15 --- usr.sbin/vmd/virtio.c +++ usr.sbin/vmd/virtio.c @@ -68,6 +68,7 @@ static int virtio_dev_launch(struct vmd_vm *, struct v static void virtio_dispatch_dev(int, short, void *); static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *); static int virtio_dev_closefds(struct virtio_dev *); +static void vmmci_pipe_dispatch(int, short, void *); const char * virtio_reg_name(uint8_t reg) @@ -275,17 +276,29 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, u return (0); } +/* + * vmmci_ctl + * + * Inject a command into the vmmci device, potentially delivering interrupt. + * + * Called by the vm process's event(3) loop. + */ int vmmci_ctl(unsigned int cmd) { + int ret = 0; struct timeval tv = { 0, 0 }; + mutex_lock(&vmmci.mutex); + if ((vmmci.cfg.device_status & - VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) - return (-1); + VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) { + ret = -1; + goto unlock; + } if (cmd == vmmci.cmd) - return (0); + goto unlock; switch (cmd) { case VMMCI_NONE: @@ -307,7 +320,7 @@ vmmci_ctl(unsigned int cmd) vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq); /* Add ACK timeout */ - tv.tv_sec = VMMCI_TIMEOUT; + tv.tv_sec = VMMCI_TIMEOUT_SHORT; evtimer_add(&vmmci.timeout, &tv); break; case VMMCI_SYNCRTC: @@ -326,14 +339,22 @@ vmmci_ctl(unsigned int cmd) fatalx("invalid vmmci command: %d", cmd); } - return (0); +unlock: + mutex_unlock(&vmmci.mutex); + + return (ret); } +/* + * vmmci_ack + * + * Process a write to the command register. + * + * Called by the vcpu thread. Must be called with the mutex held. + */ void vmmci_ack(unsigned int cmd) { - struct timeval tv = { 0, 0 }; - switch (cmd) { case VMMCI_NONE: break; @@ -347,8 +368,7 @@ vmmci_ack(unsigned int cmd) if (vmmci.cmd == 0) { log_debug("%s: vm %u requested shutdown", __func__, vmmci.vm_id); - tv.tv_sec = VMMCI_TIMEOUT; - evtimer_add(&vmmci.timeout, &tv); + vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT); return; } /* FALLTHROUGH */ @@ -360,12 +380,10 @@ vmmci_ack(unsigned int cmd) * rc.shutdown on the VM), so increase the timeout before * killing it forcefully. */ - if (cmd == vmmci.cmd && - evtimer_pending(&vmmci.timeout, NULL)) { + if (cmd == vmmci.cmd) { log_debug("%s: vm %u acknowledged shutdown request", __func__, vmmci.vm_id); - tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT; - evtimer_add(&vmmci.timeout, &tv); + vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG); } break; case VMMCI_SYNCRTC: @@ -392,6 +410,7 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_ { *intr = 0xFF; + mutex_lock(&vmmci.mutex); if (dir == 0) { switch (reg) { case VIRTIO_CONFIG_DEVICE_FEATURES: @@ -466,6 +485,8 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_ break; } } + mutex_unlock(&vmmci.mutex); + return (0); } @@ -482,6 +503,27 @@ virtio_get_base(int fd, char *path, size_t npath, int return -1; } +static void +vmmci_pipe_dispatch(int fd, short event, void *arg) +{ + enum pipe_msg_type msg; + struct timeval tv = { 0, 0 }; + + msg = vm_pipe_recv(&vmmci.dev_pipe); + switch (msg) { + case VMMCI_SET_TIMEOUT_SHORT: + tv.tv_sec = VMMCI_TIMEOUT_SHORT; + evtimer_add(&vmmci.timeout, &tv); + break; + case VMMCI_SET_TIMEOUT_LONG: + tv.tv_sec = VMMCI_TIMEOUT_LONG; + evtimer_add(&vmmci.timeout, &tv); + break; + default: + log_warnx("%s: invalid pipe message type %d", __func__, msg); + } +} + void virtio_init(struct vmd_vm *vm, int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) @@ -491,6 +533,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, struct virtio_dev *dev; uint8_t id; uint8_t i, j; + int ret = 0; /* Virtio entropy device */ if (pci_add_device(&id, PCI_VENDOR_QUMRANET, @@ -745,8 +788,15 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, vmmci.vm_id = vcp->vcp_id; vmmci.irq = pci_get_dev_irq(id); vmmci.pci_id = id; + ret = pthread_mutex_init(&vmmci.mutex, NULL); + if (ret) { + errno = ret; + fatal("could not initialize vmmci mutex"); + } evtimer_set(&vmmci.timeout, vmmci_timeout, NULL); + vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch); + event_add(&vmmci.dev_pipe.read_ev, NULL); } /* blob - c293743050c128e12e9e0b28cdd8ab262f27ab12 blob + 6ed619c5519784bad8a5566b007d2158c1c5e4c8 --- usr.sbin/vmd/virtio.h +++ usr.sbin/vmd/virtio.h @@ -53,8 +53,8 @@ #define VIONET_MAX_TXLEN VIONET_HARD_MTU + ETHER_HDR_LEN /* VMM Control Interface shutdown timeout (in seconds) */ -#define VMMCI_TIMEOUT 3 -#define VMMCI_SHUTDOWN_TIMEOUT 120 +#define VMMCI_TIMEOUT_SHORT 3 +#define VMMCI_TIMEOUT_LONG 120 /* All the devices we support have either 1, 2 or 3 queues */ /* viornd - 1 queue @@ -321,8 +321,10 @@ struct vmmci_dev { enum vmmci_cmd cmd; uint32_t vm_id; int irq; - uint8_t pci_id; + + pthread_mutex_t mutex; + struct vm_dev_pipe dev_pipe; }; /* XXX to be removed once vioscsi is adapted to vectorized io. */ blob - b84656fc20c20c9a6cdfff4a1b7b420b2f472bac blob + c8f4b5d909049cd52d5139f36169f132759d27b4 --- usr.sbin/vmd/vmd.h +++ usr.sbin/vmd/vmd.h @@ -424,6 +424,8 @@ enum pipe_msg_type { VIRTIO_THREAD_PAUSE, VIRTIO_THREAD_STOP, VIRTIO_THREAD_ACK, + VMMCI_SET_TIMEOUT_SHORT, + VMMCI_SET_TIMEOUT_LONG, }; static inline struct sockaddr_in *