From: Stuart Henderson Subject: Re: Improving time keeping in x86 VMs To: Mark Kettenis Cc: Stefan Fritsch , mlarkin@nested.page, dv@sisu.io, tech@openbsd.org, scottcheloha@gmail.com, tedu@tedunangst.com Date: Tue, 3 Jun 2025 14:13:52 +0100 fwiw, I've been running this on proxmox VE since posted (after removing the kern.timecounter.hardware=acpihpet0 which I was previously using due to timing problems), not a wide sample range but no problems seen. 6.8.12-11-pve #1 SMP PREEMPT_DYNAMIC PMX 6.8.12-11 (2025-05-22T09:39Z) x86_64 pve-qemu-kvm/stable,now 9.2.0-5 amd64 On 2025/05/21 17:12, Mark Kettenis wrote: > > Date: Wed, 21 May 2025 17:00:00 +0200 (CEST) > > From: Stefan Fritsch > > > > Hi Mark, > > > > On Tue, 20 May 2025, Mark Kettenis wrote: > > > Thanks, that cleans up things nicely. > > > > > > > The updated diff below is only the non-vmm(4) part, because the vmm(4) > > > > part needs more discussion. I will reply to mlarkin's mail about that. > > > > > > I think splitting up the diff that way makes a lot of sense. > > > > > > A few (mostly cosmetic) nits below. > > > > thanks for your review. Updated diff below. > > Thanks that looks good to me. > > ok kettenis@, but you probably also want an ok from someone with more > virtualization knowledge. > > > diff --git a/sys/arch/i386/include/cpufunc.h b/sys/arch/i386/include/cpufunc.h > > index 098b5982d20..dca6a8ee473 100644 > > --- a/sys/arch/i386/include/cpufunc.h > > +++ b/sys/arch/i386/include/cpufunc.h > > @@ -229,6 +229,15 @@ rdtsc(void) > > return (tsc); > > } > > > > +static inline uint64_t > > +rdtsc_lfence(void) > > +{ > > + uint64_t tsc; > > + > > + __asm volatile("lfence; rdtsc" : "=A" (tsc)); > > + return tsc; > > +} > > + > > static __inline void > > wrmsr(u_int msr, u_int64_t newval) > > { > > diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c > > index 6d8dd432900..467c641d348 100644 > > --- a/sys/dev/pv/pvclock.c > > +++ b/sys/dev/pv/pvclock.c > > @@ -22,6 +22,7 @@ > > > > #include > > #include > > +#include > > #include > > > > #include > > @@ -35,13 +36,65 @@ > > #define PMAP_NOCRYPT 0 > > #endif > > > > -uint pvclock_lastcount; > > +#if defined(__amd64__) > > + > > +static inline uint64_t > > +pvclock_atomic_load(volatile uint64_t *ptr) > > +{ > > + return *ptr; > > +} > > + > > +static inline uint64_t > > +pvclock_atomic_cas(volatile uint64_t *p, uint64_t e, > > + uint64_t n) > > +{ > > + return atomic_cas_ulong((volatile unsigned long *)p, e, n); > > +} > > + > > +#elif defined(__i386__) > > + > > +/* > > + * We are running on virtualization. Therefore we can assume that we > > + * have cmpxchg8b, available on pentium and newer. > > + */ > > +static inline uint64_t > > +pvclock_atomic_load(volatile uint64_t *ptr) > > +{ > > + uint64_t val; > > + __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; " > > + "lock cmpxchg8b %1" : "=&A" (val) : "m" (*ptr)); > > + return val; > > +} > > + > > +static inline uint64_t > > +pvclock_atomic_cas(volatile uint64_t *p, uint64_t e, > > + uint64_t n) > > +{ > > + __asm volatile("lock cmpxchg8b %1" : "+A" (e), "+m" (*p) > > + : "b" ((uint32_t)n), "c" ((uint32_t)(n >> 32))); > > + return (e); > > +} > > + > > +#else > > +#error "pvclock: unsupported x86 architecture?" > > +#endif > > + > > + > > +uint64_t pvclock_lastcount; > > + > > +struct pvpage { > > + struct pvclock_time_info ti; > > + struct pvclock_wall_clock wc; > > +}; > > > > struct pvclock_softc { > > struct device sc_dev; > > - void *sc_time; > > + struct pvpage *sc_page; > > paddr_t sc_paddr; > > struct timecounter *sc_tc; > > + struct ksensordev sc_sensordev; > > + struct ksensor sc_sensor; > > + struct timeout sc_tick; > > }; > > > > #define DEVNAME(_s) ((_s)->sc_dev.dv_xname) > > @@ -50,12 +103,16 @@ int pvclock_match(struct device *, void *, void *); > > void pvclock_attach(struct device *, struct device *, void *); > > int pvclock_activate(struct device *, int); > > > > +uint64_t pvclock_get(struct timecounter *); > > uint pvclock_get_timecount(struct timecounter *); > > +void pvclock_tick_hook(struct device *); > > > > static inline uint32_t > > pvclock_read_begin(const struct pvclock_time_info *); > > static inline int > > pvclock_read_done(const struct pvclock_time_info *, uint32_t); > > +static inline uint64_t > > + pvclock_scale_delta(uint64_t, uint32_t, int); > > > > const struct cfattach pvclock_ca = { > > sizeof(struct pvclock_softc), > > @@ -123,28 +180,31 @@ void > > pvclock_attach(struct device *parent, struct device *self, void *aux) > > { > > struct pvclock_softc *sc = (struct pvclock_softc *)self; > > + struct pv_attach_args *pva = aux; > > struct pvclock_time_info *ti; > > - paddr_t pa; > > + paddr_t pa; > > uint32_t version; > > uint8_t flags; > > struct vm_page *page; > > + struct pvbus_hv *kvm; > > > > page = uvm_pagealloc(NULL, 0, NULL, 0); > > if (page == NULL) > > goto err; > > - sc->sc_time = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait); > > - if (sc->sc_time == NULL) > > + sc->sc_page = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait); > > + if (sc->sc_page == NULL) > > goto err; > > > > pa = VM_PAGE_TO_PHYS(page); > > - pmap_kenter_pa((vaddr_t)sc->sc_time, pa | PMAP_NOCRYPT, > > + pmap_kenter_pa((vaddr_t)sc->sc_page, pa | PMAP_NOCRYPT, > > PROT_READ | PROT_WRITE); > > - memset(sc->sc_time, 0, PAGE_SIZE); > > + pmap_update(pmap_kernel()); > > + memset(sc->sc_page, 0, PAGE_SIZE); > > > > wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); > > sc->sc_paddr = pa; > > > > - ti = sc->sc_time; > > + ti = &sc->sc_page->ti; > > do { > > version = pvclock_read_begin(ti); > > flags = ti->ti_flags; > > @@ -168,6 +228,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux) > > > > tc_init(sc->sc_tc); > > > > + /* > > + * The openbsd vmm pvclock does not support the WALL_CLOCK msr, > > + * therefore we look only for kvm. > > + */ > > + kvm = &pva->pva_hv[PVBUS_KVM]; > > + if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) { > > + strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname, > > + sizeof(sc->sc_sensordev.xname)); > > + sc->sc_sensor.type = SENSOR_TIMEDELTA; > > + sc->sc_sensor.status = SENSOR_S_UNKNOWN; > > + sensor_attach(&sc->sc_sensordev, &sc->sc_sensor); > > + sensordev_install(&sc->sc_sensordev); > > + > > + config_mountroot(self, pvclock_tick_hook); > > + } > > + > > printf("\n"); > > return; > > err: > > @@ -211,8 +287,36 @@ pvclock_read_done(const struct pvclock_time_info *ti, > > return (ti->ti_version == version); > > } > > > > -uint > > -pvclock_get_timecount(struct timecounter *tc) > > +static inline uint64_t > > +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) > > +{ > > + uint64_t lower, upper; > > + > > + if (shift < 0) > > + delta >>= -shift; > > + else > > + delta <<= shift; > > + > > + lower = ((uint64_t)mul_frac * ((uint32_t)delta)) >> 32; > > + upper = (uint64_t)mul_frac * (delta >> 32); > > + return lower + upper; > > +} > > + > > +static uint64_t > > +pvclock_cmp_last(uint64_t ctr) > > +{ > > + uint64_t last; > > + > > + do { > > + last = pvclock_atomic_load(&pvclock_lastcount); > > + if (ctr < last) > > + return (last); > > + } while (pvclock_atomic_cas(&pvclock_lastcount, last, ctr) != last); > > + return (ctr); > > +} > > + > > +uint64_t > > +pvclock_get(struct timecounter *tc) > > { > > struct pvclock_softc *sc = tc->tc_priv; > > struct pvclock_time_info *ti; > > @@ -220,8 +324,10 @@ pvclock_get_timecount(struct timecounter *tc) > > uint32_t version, mul_frac; > > int8_t shift; > > uint8_t flags; > > + int s; > > > > - ti = sc->sc_time; > > + ti = &sc->sc_page->ti; > > + s = splhigh(); > > do { > > version = pvclock_read_begin(ti); > > system_time = ti->ti_system_time; > > @@ -229,26 +335,63 @@ pvclock_get_timecount(struct timecounter *tc) > > mul_frac = ti->ti_tsc_to_system_mul; > > shift = ti->ti_tsc_shift; > > flags = ti->ti_flags; > > + delta = rdtsc_lfence(); > > } while (!pvclock_read_done(ti, version)); > > + splx(s); > > > > /* > > * The algorithm is described in > > - * linux/Documentation/virtual/kvm/msr.txt > > + * linux/Documentation/virt/kvm/x86/msr.rst > > */ > > - delta = rdtsc() - tsc_timestamp; > > - if (shift < 0) > > - delta >>= -shift; > > + if (delta > tsc_timestamp) > > + delta -= tsc_timestamp; > > else > > - delta <<= shift; > > - ctr = ((delta * mul_frac) >> 32) + system_time; > > + delta = 0; > > + ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time; > > > > if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0) > > return (ctr); > > > > - if (ctr < pvclock_lastcount) > > - return (pvclock_lastcount); > > + return pvclock_cmp_last(ctr); > > +} > > > > - atomic_swap_uint(&pvclock_lastcount, ctr); > > +uint > > +pvclock_get_timecount(struct timecounter *tc) > > +{ > > + return (pvclock_get(tc)); > > +} > > > > - return (ctr); > > +void > > +pvclock_tick(void *arg) > > +{ > > + struct pvclock_softc *sc = arg; > > + struct timespec ts; > > + struct pvclock_wall_clock *wc = &sc->sc_page->wc; > > + int64_t value; > > + > > + wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc)); > > + while (wc->wc_version & 0x1) > > + virtio_membar_sync(); > > + if (wc->wc_sec) { > > + nanotime(&ts); > > + value = TIMESPEC_TO_NSEC(&ts) - > > + SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec - > > + pvclock_get(&pvclock_timecounter); > > + > > + TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts); > > + sc->sc_sensor.value = value; > > + sc->sc_sensor.status = SENSOR_S_OK; > > + } else > > + sc->sc_sensor.status = SENSOR_S_UNKNOWN; > > + > > + timeout_add_sec(&sc->sc_tick, 15); > > +} > > + > > +void > > +pvclock_tick_hook(struct device *self) > > +{ > > + struct pvclock_softc *sc = (struct pvclock_softc *)self; > > + > > + timeout_set(&sc->sc_tick, pvclock_tick, sc); > > + pvclock_tick(sc); > > } > > >