From: Stefan Fritsch Subject: Re: Improving time keeping in x86 VMs To: Mark Kettenis Cc: mlarkin@nested.page, dv@sisu.io, tech@openbsd.org, scottcheloha@gmail.com, Ted Unangst Date: Mon, 19 May 2025 21:49:59 +0200 On Sat, 17 May 2025, Mark Kettenis wrote: > > Date: Sat, 17 May 2025 17:39:35 +0200 (CEST) > > From: Stefan Fritsch > > > > On Sat, 12 Apr 2025, Stefan Fritsch wrote: > > > > > On Fri, 11 Apr 2025, Mike Larkin wrote: > > > > > > > On Wed, Mar 12, 2025 at 09:27:50PM +0100, Stefan Fritsch wrote: > > > > > On Tue, 11 Mar 2025, Dave Voutila wrote: > > > > > > > > > > > Stefan Fritsch writes: > > > > > > > > > > > > > Hi, > > > > > > > > > > > > > > after talking to bluhm@ about time keeping issues of linux guests on > > > > > > > vmm, I have looked into making linux attach to vmm's pvclock. In doing > > > > > > > so, I also found some issues in the pvclock(4) guest driver. So, here > > > > > > > comes a larger diff that hopefully improves time keeping in VMs, both > > > > > > > with openbsd as guest and as hypervisor. > > > > > > > > > > > > My kingdom for something to kill off my hacky Linux pvclock module. > > > > > > > > > > > > Some questions / comments below. > > > > > > > > > > > > > > > > > > > > The diff changes in pvclock(4): > > > > > > > > > > > > > > * Fix integer overflows during multiplication by taking an assembler > > > > > > > implementation from FreeBSD that uses 128 bit intermediate results. > > > > > > > > > > > > > > * Increase accuracy by disabling interrupts while reading the clock > > > > > > > > > > > > > > * Fix the not-TSC_STABLE handling which was busted because it wrongly > > > > > > > compared 32 and 64 bit values. (This requires some atomic hackery on > > > > > > > i386). > > > > > > > > > > > > > > * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the > > > > > > > sensor in vmmci(4) > > > > > > > > > > > > Lovely... > > > > > > > > > > Today, I found a diff by Scott Cheloha from 2022 that was never > > > > > committed and that I have missed before. It has some overlap with my > > > > > pvclock diff. I will look at it some more and produce a combined diff. > > > > > > Sorry, I got side-tracked by other issues that were morge urgent. Updated > > > diff below. The vmm part is unchanged but the pvclock part has several > > > fixes, some inspired by Scott's earlier diff. > > > > > > This would need to be split up into several diffs before committing. I am > > > not 100% sure that the memory clobbers in i386_atomic_testset_uq and > > > i386_atomic_cas_uq are neccessary, but I think they are. If the new > > > i386_atomic_* functions are not wanted in atomic.h, they could of course > > > be moved into pvclock.c. But i386_atomic_testset_uq() is already in > > > atomic.h. > > > > There was something wrong with the i386 assembler atomics that caused > > havoc if the TSC stable bit was not set by the hypervisor. Updated diff > > below. > > I don't understand why we need so much inline asm and why you we need > all those #ifdefs. Yes we do need those oddly named wrappers for > cmpxchg8b in i386 since we have to be careful that we don't use them > anywhere else. So perhaps they shouldn't even live in atomic.h. > Maybe just put them in pvclock.c where they're used? > > But why do you need to have all that inline asm in > pvclock_scale_delta()? Is it really that difficult to write that code > in plain C? I took the assembler pvclock_scale_delta() from FreeBSD because I knew it to work. But you are right, the C version is infinitely more readable. The resulting code does one more mul, but I don't think that matters in practice. I have also moved the i386 cmpxchg8b wrappers to pvclock.c which also takes care of tedu's comment. The updated diff below is only the non-vmm(4) part, because the vmm(4) part needs more discussion. I will reply to mlarkin's mail about that. diff --git a/sys/arch/i386/include/cpufunc.h b/sys/arch/i386/include/cpufunc.h index 098b5982d20..dca6a8ee473 100644 --- a/sys/arch/i386/include/cpufunc.h +++ b/sys/arch/i386/include/cpufunc.h @@ -229,6 +229,15 @@ rdtsc(void) return (tsc); } +static inline uint64_t +rdtsc_lfence(void) +{ + uint64_t tsc; + + __asm volatile("lfence; rdtsc" : "=A" (tsc)); + return tsc; +} + static __inline void wrmsr(u_int msr, u_int64_t newval) { diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c index 6d8dd432900..7e9f7f11727 100644 --- a/sys/dev/pv/pvclock.c +++ b/sys/dev/pv/pvclock.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -35,13 +36,65 @@ #define PMAP_NOCRYPT 0 #endif -uint pvclock_lastcount; +#if defined(__amd64__) + +static __inline uint64_t +atomic_load(volatile uint64_t *ptr) +{ + return *ptr; +} + +static __inline u_int64_t +atomic_cas(volatile uint64_t *p, uint64_t e, + uint64_t n) +{ + return atomic_cas_ulong((volatile unsigned long *)p, e, n); +} + +#elif defined(__i386__) + +/* + * We are running on virtualization. Therefore we can assume that we + * have cmpxchg8b, available on pentium and newer. + */ +static __inline uint64_t +atomic_load(volatile uint64_t *ptr) +{ + uint64_t val; + __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; " + "lock cmpxchg8b %1" : "=&A" (val) : "m" (*ptr)); + return val; +} + +static __inline u_int64_t +atomic_cas(volatile uint64_t *p, uint64_t e, + uint64_t n) +{ + __asm volatile("lock cmpxchg8b %1" : "+A" (e), "+m" (*p) + : "b" ((uint32_t)n), "c" ((uint32_t)(n >> 32))); + return (e); +} + +#else +#error "pvclock: unsupported x86 architecture?" +#endif + + +uint64_t pvclock_lastcount; + +struct pvpage { + struct pvclock_time_info ti; + struct pvclock_wall_clock wc; +}; struct pvclock_softc { struct device sc_dev; - void *sc_time; + struct pvpage *sc_page; paddr_t sc_paddr; struct timecounter *sc_tc; + struct ksensordev sc_sensordev; + struct ksensor sc_sensor; + struct timeout sc_tick; }; #define DEVNAME(_s) ((_s)->sc_dev.dv_xname) @@ -50,12 +103,16 @@ int pvclock_match(struct device *, void *, void *); void pvclock_attach(struct device *, struct device *, void *); int pvclock_activate(struct device *, int); +uint64_t pvclock_get(struct timecounter *); uint pvclock_get_timecount(struct timecounter *); +void pvclock_tick_hook(struct device *); static inline uint32_t pvclock_read_begin(const struct pvclock_time_info *); static inline int pvclock_read_done(const struct pvclock_time_info *, uint32_t); +static inline uint64_t + pvclock_scale_delta(uint64_t, uint32_t, int); const struct cfattach pvclock_ca = { sizeof(struct pvclock_softc), @@ -123,28 +180,32 @@ void pvclock_attach(struct device *parent, struct device *self, void *aux) { struct pvclock_softc *sc = (struct pvclock_softc *)self; + struct pv_attach_args *pva = aux; struct pvclock_time_info *ti; - paddr_t pa; + paddr_t pa; uint32_t version; uint8_t flags; struct vm_page *page; + struct pvbus_hv *kvm; + page = uvm_pagealloc(NULL, 0, NULL, 0); if (page == NULL) goto err; - sc->sc_time = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait); - if (sc->sc_time == NULL) + sc->sc_page = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait); + if (sc->sc_page == NULL) goto err; pa = VM_PAGE_TO_PHYS(page); - pmap_kenter_pa((vaddr_t)sc->sc_time, pa | PMAP_NOCRYPT, + pmap_kenter_pa((vaddr_t)sc->sc_page, pa | PMAP_NOCRYPT, PROT_READ | PROT_WRITE); - memset(sc->sc_time, 0, PAGE_SIZE); + pmap_update(pmap_kernel()); + memset(sc->sc_page, 0, PAGE_SIZE); wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); sc->sc_paddr = pa; - ti = sc->sc_time; + ti = &sc->sc_page->ti; do { version = pvclock_read_begin(ti); flags = ti->ti_flags; @@ -168,6 +229,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux) tc_init(sc->sc_tc); + /* + * The openbsd vmm pvclock does not support the WALL_CLOCK msr, + * therefore we look only for kvm. + */ + kvm = &pva->pva_hv[PVBUS_KVM]; + if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) { + strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname, + sizeof(sc->sc_sensordev.xname)); + sc->sc_sensor.type = SENSOR_TIMEDELTA; + sc->sc_sensor.status = SENSOR_S_UNKNOWN; + sensor_attach(&sc->sc_sensordev, &sc->sc_sensor); + sensordev_install(&sc->sc_sensordev); + + config_mountroot(self, pvclock_tick_hook); + } + printf("\n"); return; err: @@ -211,8 +288,36 @@ pvclock_read_done(const struct pvclock_time_info *ti, return (ti->ti_version == version); } -uint -pvclock_get_timecount(struct timecounter *tc) +static inline uint64_t +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t lower, upper; + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + + lower = ((uint64_t)mul_frac * ((uint32_t)delta)) >> 32; + upper = (uint64_t)mul_frac * (delta >> 32); + return lower + upper; +} + +static uint64_t +pvclock_cmp_last(uint64_t ctr) +{ + uint64_t last; + + do { + last = atomic_load(&pvclock_lastcount); + if (ctr < last) + return (last); + } while (atomic_cas(&pvclock_lastcount, last, ctr) != last); + return (ctr); +} + +uint64_t +pvclock_get(struct timecounter *tc) { struct pvclock_softc *sc = tc->tc_priv; struct pvclock_time_info *ti; @@ -220,8 +325,10 @@ pvclock_get_timecount(struct timecounter *tc) uint32_t version, mul_frac; int8_t shift; uint8_t flags; + int s; - ti = sc->sc_time; + ti = &sc->sc_page->ti; + s = splhigh(); do { version = pvclock_read_begin(ti); system_time = ti->ti_system_time; @@ -229,26 +336,63 @@ pvclock_get_timecount(struct timecounter *tc) mul_frac = ti->ti_tsc_to_system_mul; shift = ti->ti_tsc_shift; flags = ti->ti_flags; + delta = rdtsc_lfence(); } while (!pvclock_read_done(ti, version)); + splx(s); /* * The algorithm is described in - * linux/Documentation/virtual/kvm/msr.txt + * linux/Documentation/virt/kvm/x86/msr.rst */ - delta = rdtsc() - tsc_timestamp; - if (shift < 0) - delta >>= -shift; + if (delta > tsc_timestamp) + delta -= tsc_timestamp; else - delta <<= shift; - ctr = ((delta * mul_frac) >> 32) + system_time; + delta = 0; + ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time; if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0) return (ctr); - if (ctr < pvclock_lastcount) - return (pvclock_lastcount); + return pvclock_cmp_last(ctr); +} - atomic_swap_uint(&pvclock_lastcount, ctr); +uint +pvclock_get_timecount(struct timecounter *tc) +{ + return (pvclock_get(tc)); +} - return (ctr); +void +pvclock_tick(void *arg) +{ + struct pvclock_softc *sc = arg; + struct timespec ts; + struct pvclock_wall_clock *wc = &sc->sc_page->wc; + int64_t value; + + wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc)); + while (wc->wc_version & 0x1) + virtio_membar_sync(); + if (wc->wc_sec) { + nanotime(&ts); + value = TIMESPEC_TO_NSEC(&ts) - + SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec - + pvclock_get(&pvclock_timecounter); + + TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts); + sc->sc_sensor.value = value; + sc->sc_sensor.status = SENSOR_S_OK; + } else + sc->sc_sensor.status = SENSOR_S_UNKNOWN; + + timeout_add_sec(&sc->sc_tick, 15); +} + +void +pvclock_tick_hook(struct device *self) +{ + struct pvclock_softc *sc = (struct pvclock_softc *)self; + + timeout_set(&sc->sc_tick, pvclock_tick, sc); + pvclock_tick(sc); }