From: Mike Larkin Subject: Re: fix vmmci race conditions & event(3) corruption To: Dave Voutila Cc: tech@openbsd.org, mlarkin@openbsd.org Date: Tue, 7 Jan 2025 12:49:50 -0800 On Fri, Jan 03, 2025 at 10:12:55PM -0500, Dave Voutila wrote: > As I've followed the vmd(8) rabbit down the hole figuring out how to > cleanly fix the "reboots instead of shuts down problem"[1], > > While I've found a few issues[2], a critical issue I noticed > the vmmci device in vmd(8) (i.e. *not* the vmmci(4) driver) has two > critical race conditions in the vm process: > > 1. The device thread (the event(3) loop) and the vcpu thread *both* > read/write device state (e.g. command and interrupt status register > twiddling). > > 2. The vcpu thread modifies the device thread's event(3) state, which is > a VERY BIG NO-NO as it can corrupt the event loop state. > > Historically, other devices suffered similar issues, so the diff below > adopts the same solutions used with the uart, rtc, & pit: > > 1. Use a pthread mutex to protect device state in critical areas. > > 2. Use vmd's "device pipe" pattern to allow the vcpu thread to > communicate to the device thread that it needs to update event > timeouts. > > oks? > ok mlarkin > [1] https://marc.info/?l=openbsd-tech&m=173214159319239&w=2 > [2] VMMCI_SYNCRTC commands from the rtc can trample a shutdown request, > for example because there's a single command register with no true > device state tracking > > diffstat refs/heads/master refs/heads/vmmci > M usr.sbin/vmd/virtio.c | 63+ 13- > M usr.sbin/vmd/virtio.h | 5+ 3- > M usr.sbin/vmd/vmd.h | 2+ 0- > > 3 files changed, 70 insertions(+), 16 deletions(-) > > diff refs/heads/master refs/heads/vmmci > commit - 4ab10cec8117ac87adc12d73a6e010e2b98c3684 > commit + f31b388d33308debac1fe051d637bd165b05d59d > blob - 37185b7f96d7e50eda227ba3ddf961072cf74675 > blob + d030c391e6c8a396d331b7003803a1047fe0dc15 > --- usr.sbin/vmd/virtio.c > +++ usr.sbin/vmd/virtio.c > @@ -68,6 +68,7 @@ static int virtio_dev_launch(struct vmd_vm *, struct v > static void virtio_dispatch_dev(int, short, void *); > static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *); > static int virtio_dev_closefds(struct virtio_dev *); > +static void vmmci_pipe_dispatch(int, short, void *); > > const char * > virtio_reg_name(uint8_t reg) > @@ -275,17 +276,29 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, u > return (0); > } > > +/* > + * vmmci_ctl > + * > + * Inject a command into the vmmci device, potentially delivering interrupt. > + * > + * Called by the vm process's event(3) loop. > + */ > int > vmmci_ctl(unsigned int cmd) > { > + int ret = 0; > struct timeval tv = { 0, 0 }; > > + mutex_lock(&vmmci.mutex); > + > if ((vmmci.cfg.device_status & > - VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) > - return (-1); > + VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) { > + ret = -1; > + goto unlock; > + } > > if (cmd == vmmci.cmd) > - return (0); > + goto unlock; > > switch (cmd) { > case VMMCI_NONE: > @@ -307,7 +320,7 @@ vmmci_ctl(unsigned int cmd) > vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq); > > /* Add ACK timeout */ > - tv.tv_sec = VMMCI_TIMEOUT; > + tv.tv_sec = VMMCI_TIMEOUT_SHORT; > evtimer_add(&vmmci.timeout, &tv); > break; > case VMMCI_SYNCRTC: > @@ -326,14 +339,22 @@ vmmci_ctl(unsigned int cmd) > fatalx("invalid vmmci command: %d", cmd); > } > > - return (0); > +unlock: > + mutex_unlock(&vmmci.mutex); > + > + return (ret); > } > > +/* > + * vmmci_ack > + * > + * Process a write to the command register. > + * > + * Called by the vcpu thread. Must be called with the mutex held. > + */ > void > vmmci_ack(unsigned int cmd) > { > - struct timeval tv = { 0, 0 }; > - > switch (cmd) { > case VMMCI_NONE: > break; > @@ -347,8 +368,7 @@ vmmci_ack(unsigned int cmd) > if (vmmci.cmd == 0) { > log_debug("%s: vm %u requested shutdown", __func__, > vmmci.vm_id); > - tv.tv_sec = VMMCI_TIMEOUT; > - evtimer_add(&vmmci.timeout, &tv); > + vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT); > return; > } > /* FALLTHROUGH */ > @@ -360,12 +380,10 @@ vmmci_ack(unsigned int cmd) > * rc.shutdown on the VM), so increase the timeout before > * killing it forcefully. > */ > - if (cmd == vmmci.cmd && > - evtimer_pending(&vmmci.timeout, NULL)) { > + if (cmd == vmmci.cmd) { > log_debug("%s: vm %u acknowledged shutdown request", > __func__, vmmci.vm_id); > - tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT; > - evtimer_add(&vmmci.timeout, &tv); > + vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG); > } > break; > case VMMCI_SYNCRTC: > @@ -392,6 +410,7 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_ > { > *intr = 0xFF; > > + mutex_lock(&vmmci.mutex); > if (dir == 0) { > switch (reg) { > case VIRTIO_CONFIG_DEVICE_FEATURES: > @@ -466,6 +485,8 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_ > break; > } > } > + mutex_unlock(&vmmci.mutex); > + > return (0); > } > > @@ -482,6 +503,27 @@ virtio_get_base(int fd, char *path, size_t npath, int > return -1; > } > > +static void > +vmmci_pipe_dispatch(int fd, short event, void *arg) > +{ > + enum pipe_msg_type msg; > + struct timeval tv = { 0, 0 }; > + > + msg = vm_pipe_recv(&vmmci.dev_pipe); > + switch (msg) { > + case VMMCI_SET_TIMEOUT_SHORT: > + tv.tv_sec = VMMCI_TIMEOUT_SHORT; > + evtimer_add(&vmmci.timeout, &tv); > + break; > + case VMMCI_SET_TIMEOUT_LONG: > + tv.tv_sec = VMMCI_TIMEOUT_LONG; > + evtimer_add(&vmmci.timeout, &tv); > + break; > + default: > + log_warnx("%s: invalid pipe message type %d", __func__, msg); > + } > +} > + > void > virtio_init(struct vmd_vm *vm, int child_cdrom, > int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) > @@ -491,6 +533,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, > struct virtio_dev *dev; > uint8_t id; > uint8_t i, j; > + int ret = 0; > > /* Virtio entropy device */ > if (pci_add_device(&id, PCI_VENDOR_QUMRANET, > @@ -745,8 +788,15 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, > vmmci.vm_id = vcp->vcp_id; > vmmci.irq = pci_get_dev_irq(id); > vmmci.pci_id = id; > + ret = pthread_mutex_init(&vmmci.mutex, NULL); > + if (ret) { > + errno = ret; > + fatal("could not initialize vmmci mutex"); > + } > > evtimer_set(&vmmci.timeout, vmmci_timeout, NULL); > + vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch); > + event_add(&vmmci.dev_pipe.read_ev, NULL); > } > > /* > blob - c293743050c128e12e9e0b28cdd8ab262f27ab12 > blob + 6ed619c5519784bad8a5566b007d2158c1c5e4c8 > --- usr.sbin/vmd/virtio.h > +++ usr.sbin/vmd/virtio.h > @@ -53,8 +53,8 @@ > #define VIONET_MAX_TXLEN VIONET_HARD_MTU + ETHER_HDR_LEN > > /* VMM Control Interface shutdown timeout (in seconds) */ > -#define VMMCI_TIMEOUT 3 > -#define VMMCI_SHUTDOWN_TIMEOUT 120 > +#define VMMCI_TIMEOUT_SHORT 3 > +#define VMMCI_TIMEOUT_LONG 120 > > /* All the devices we support have either 1, 2 or 3 queues */ > /* viornd - 1 queue > @@ -321,8 +321,10 @@ struct vmmci_dev { > enum vmmci_cmd cmd; > uint32_t vm_id; > int irq; > - > uint8_t pci_id; > + > + pthread_mutex_t mutex; > + struct vm_dev_pipe dev_pipe; > }; > > /* XXX to be removed once vioscsi is adapted to vectorized io. */ > blob - b84656fc20c20c9a6cdfff4a1b7b420b2f472bac > blob + c8f4b5d909049cd52d5139f36169f132759d27b4 > --- usr.sbin/vmd/vmd.h > +++ usr.sbin/vmd/vmd.h > @@ -424,6 +424,8 @@ enum pipe_msg_type { > VIRTIO_THREAD_PAUSE, > VIRTIO_THREAD_STOP, > VIRTIO_THREAD_ACK, > + VMMCI_SET_TIMEOUT_SHORT, > + VMMCI_SET_TIMEOUT_LONG, > }; > > static inline struct sockaddr_in *