From: Mike Larkin
Subject: Re: Improving time keeping in x86 VMs
To: Stefan Fritsch
Cc: Dave Voutila, tech@openbsd.org, scottcheloha@gmail.com
Date: Sun, 18 May 2025 23:18:07 -0700

On Sat, May 17, 2025 at 05:39:35PM +0200, Stefan Fritsch wrote:
> On Sat, 12 Apr 2025, Stefan Fritsch wrote:
> 
> > On Fri, 11 Apr 2025, Mike Larkin wrote:
> > 
> > > On Wed, Mar 12, 2025 at 09:27:50PM +0100, Stefan Fritsch wrote:
> > > > On Tue, 11 Mar 2025, Dave Voutila wrote:
> > > > 
> > > > > Stefan Fritsch writes:
> > > > > 
> > > > > > Hi,
> > > > > >
> > > > > > after talking to bluhm@ about time keeping issues of linux guests on
> > > > > > vmm, I have looked into making linux attach to vmm's pvclock. In doing
> > > > > > so, I also found some issues in the pvclock(4) guest driver. So, here
> > > > > > comes a larger diff that hopefully improves time keeping in VMs, both
> > > > > > with openbsd as guest and as hypervisor.
> > > > >
> > > > > My kingdom for something to kill off my hacky Linux pvclock module.
> > > > >
> > > > > Some questions / comments below.
> > > > >
> > > > > >
> > > > > > The diff changes in pvclock(4):
> > > > > >
> > > > > > * Fix integer overflows during multiplication by taking an assembler
> > > > > >   implementation from FreeBSD that uses 128 bit intermediate results.
> > > > > >
> > > > > > * Increase accuracy by disabling interrupts while reading the clock
> > > > > >
> > > > > > * Fix the not-TSC_STABLE handling which was busted because it wrongly
> > > > > >   compared 32 and 64 bit values. (This requires some atomic hackery on
> > > > > >   i386).
> > > > > >
> > > > > > * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
> > > > > >   sensor in vmmci(4)
> > > > >
> > > > > Lovely...
> > > > 
> > > > Today, I found a diff by Scott Cheloha from 2022 that was never
> > > > committed and that I have missed before. It has some overlap with my
> > > > pvclock diff. I will look at it some more and produce a combined diff.
> > 
> > Sorry, I got side-tracked by other issues that were more urgent. Updated
> > diff below. The vmm part is unchanged but the pvclock part has several
> > fixes, some inspired by Scott's earlier diff.
> > 
> > This would need to be split up into several diffs before committing. I am
> > not 100% sure that the memory clobbers in i386_atomic_testset_uq and
> > i386_atomic_cas_uq are necessary, but I think they are. If the new
> > i386_atomic_* functions are not wanted in atomic.h, they could of course
> > be moved into pvclock.c. But i386_atomic_testset_uq() is already in
> > atomic.h.
> 
> There was something wrong with the i386 assembler atomics that caused
> havoc if the TSC stable bit was not set by the hypervisor. Updated diff
> below.
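As context for the first bullet: the old guest code computed
(delta * mul_frac) >> 32 entirely in 64 bits, so the multiplication could
overflow long before the shift. What the scaling has to do is equivalent to
the following standalone sketch (illustration only; it uses a
compiler-provided __int128 instead of the assembler taken from FreeBSD, and
the function name is made up):

#include <stdint.h>

/*
 * Sketch of the pvclock scaling step: convert a TSC delta into
 * nanoseconds using the hypervisor-supplied multiplier and shift.
 * The (delta * mul_frac) product needs a 128-bit intermediate;
 * truncating it to 64 bits is the overflow being fixed.
 */
static inline uint64_t
scale_delta_sketch(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
    if (shift < 0)
        delta >>= -shift;
    else
        delta <<= shift;
    return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

On amd64 this is the same computation the mulq/shrd pair in the diff
performs; the i386 variant needs the longer assembler sequence because the
product has to be built from two 32x32->64 bit multiplies.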
> 
> diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> index 0a3be15cd64..5a1570aa8d9 100644
> --- a/sys/arch/amd64/amd64/vmm_machdep.c
> +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> @@ -146,6 +146,7 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
>  int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
>  void vmm_init_pvclock(struct vcpu *, paddr_t);
>  int vmm_update_pvclock(struct vcpu *);
> +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
>  int vmm_pat_is_valid(uint64_t);
>
>  #ifdef MULTIPROCESSOR
> @@ -185,6 +186,7 @@ extern uint64_t tsc_frequency;
>  extern int tsc_is_invariant;
>
>  const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
>
>  const struct kmem_pa_mode vmm_kp_contig = {
>      .kp_constraint = &no_constraint,
> @@ -5578,6 +5580,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
>          vmm_init_pvclock(vcpu,
>              (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
>          break;
> +    case KVM_MSR_WALL_CLOCK:
> +        vmm_pv_wall_clock(vcpu,
> +            (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> +        break;
>  #ifdef VMM_DEBUG
>      default:
>          /*
> @@ -5639,6 +5645,10 @@ svm_handle_msr(struct vcpu *vcpu)
>          vmm_init_pvclock(vcpu,
>              (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
>          break;
> +    case KVM_MSR_WALL_CLOCK:
> +        vmm_pv_wall_clock(vcpu,
> +            (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> +        break;
>      default:
>          /* Log the access, to be able to identify unknown MSRs */
>          DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> @@ -5994,6 +6004,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
>          *rcx = 0;
>          *rdx = 0;
>          break;
> +    case 0x40000100:    /* Hypervisor information KVM */
> +        *rax = 0x40000101;
> +        *rbx = *((uint32_t *)&kvm_hv_signature[0]);
> +        *rcx = *((uint32_t *)&kvm_hv_signature[4]);
> +        *rdx = *((uint32_t *)&kvm_hv_signature[8]);
> +        break;
> +    case 0x40000101:    /* KVM hypervisor features */
> +        *rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> +            (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> +            (1 << KVM_FEATURE_NOP_IO_DELAY);
> +        *rbx = 0;
> +        *rcx = 0;
> +        *rdx = 0;
> +        break;

This makes us claim we are kvm. Even though you are only advertising support
for these 3 features, that makes a big assumption that every linux kernel out
there respects a neutered/weird "KVM" like this. How many varieties of guest
kernels did you test this on?

I think we need to be very careful if we do this. We are basically claiming
to be something that doesn't really exist anywhere else.

-ml

>      case 0x80000000:    /* Extended function level */
>          /* We don't emulate past 0x8000001f currently. */
>          *rax = min(curcpu()->ci_pnfeatset, 0x8000001f);
> @@ -6508,7 +6532,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
>              (++vcpu->vc_pvclock_version << 1) | 0x1;
>
>          pvclock_ti->ti_tsc_timestamp = rdtsc();
> -        nanotime(&tv);
> +        nanouptime(&tv);
>          pvclock_ti->ti_system_time =
>              tv.tv_sec * 1000000000L + tv.tv_nsec;
>          pvclock_ti->ti_tsc_shift = 12;
> @@ -6522,6 +6546,36 @@ vmm_update_pvclock(struct vcpu *vcpu)
>      return (0);
>  }
>
> +void
> +vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa)
> +{
> +    struct pvclock_wall_clock *pvclock_wc;
> +    struct timespec tv;
> +    struct vm *vm = vcpu->vc_parent;
> +    paddr_t hpa;
> +
> +    if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock)))
> +        goto err;
> +
> +    /* XXX: handle case when this struct goes over page boundaries */
> +    if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE)
> +        goto err;
> +
> +    if (!pmap_extract(vm->vm_map->pmap, gpa, &hpa))
> +        goto err;
> +    pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa);
> +    pvclock_wc->wc_version |= 0x1;
> +    nanoboottime(&tv);
> +    pvclock_wc->wc_sec = tv.tv_sec;
> +    pvclock_wc->wc_nsec = tv.tv_nsec;
> +    pvclock_wc->wc_version += 1;
> +
> +    return;
> +err:
> +    vmm_inject_gp(vcpu);
> +}
> +
> +
>  int
>  vmm_pat_is_valid(uint64_t pat)
>  {
> diff --git a/sys/arch/i386/include/atomic.h b/sys/arch/i386/include/atomic.h
> index 45aedb819c8..9034104b91a 100644
> --- a/sys/arch/i386/include/atomic.h
> +++ b/sys/arch/i386/include/atomic.h
> @@ -278,6 +278,24 @@ i386_atomic_testset_uq(volatile u_int64_t *ptr, u_int64_t val)
>      return val;
>  }
>
> +static __inline u_int64_t
> +i386_atomic_load_uq(volatile u_int64_t *ptr)
> +{
> +    uint64_t val;
> +    __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; "
> +        _LOCK " cmpxchg8b %1" : "=&A" (val) : "m" (*ptr));
> +    return val;
> +}
> +
> +static __inline u_int64_t
> +i386_atomic_cas_uq(volatile u_int64_t *p, u_int64_t e,
> +    u_int64_t n)
> +{
> +    __asm volatile(_LOCK " cmpxchg8b %1" : "+A" (e), "+m" (*p)
> +        : "b" ((u_int32_t)n), "c" ((u_int32_t)(n >> 32)));
> +    return (e);
> +}
> +
>  static __inline u_int32_t
>  i386_atomic_testset_ul(volatile u_int32_t *ptr, unsigned long val)
>  {
> diff --git a/sys/arch/i386/include/cpufunc.h b/sys/arch/i386/include/cpufunc.h
> index 098b5982d20..dca6a8ee473 100644
> --- a/sys/arch/i386/include/cpufunc.h
> +++ b/sys/arch/i386/include/cpufunc.h
> @@ -229,6 +229,15 @@ rdtsc(void)
>      return (tsc);
>  }
>
> +static inline uint64_t
> +rdtsc_lfence(void)
> +{
> +    uint64_t tsc;
> +
> +    __asm volatile("lfence; rdtsc" : "=A" (tsc));
> +    return tsc;
> +}
> +
>  static __inline void
>  wrmsr(u_int msr, u_int64_t newval)
>  {
> diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c
> index 6d8dd432900..206cc418062 100644
> --- a/sys/dev/pv/pvclock.c
> +++ b/sys/dev/pv/pvclock.c
> @@ -22,6 +22,7 @@
>
>  #include
>  #include
> +#include
>  #include
>
>  #include
> @@ -35,13 +36,25 @@
>  #define PMAP_NOCRYPT 0
>  #endif
>
> -uint pvclock_lastcount;
> +#ifdef __amd64__
> +unsigned long pvclock_lastcount;
> +#else
> +uint64_t pvclock_lastcount;
> +#endif
> +
> +struct pvpage {
> +    struct pvclock_time_info ti;
> +    struct pvclock_wall_clock wc;
> +};
>
>  struct pvclock_softc {
>      struct device sc_dev;
> -    void *sc_time;
> +    struct pvpage *sc_page;
>      paddr_t sc_paddr;
>      struct timecounter *sc_tc;
> +    struct ksensordev sc_sensordev;
> +    struct ksensor sc_sensor;
> +    struct timeout sc_tick;
>  };
>
>  #define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
> @@ -50,12 +63,16 @@ int pvclock_match(struct device *, void *, void *);
>  void pvclock_attach(struct device *, struct device *, void *);
>  int pvclock_activate(struct device *, int);
>
> +uint64_t pvclock_get(struct timecounter *);
>  uint pvclock_get_timecount(struct timecounter *);
> +void pvclock_tick_hook(struct device *);
>
>  static inline uint32_t
>       pvclock_read_begin(const struct pvclock_time_info *);
>  static inline int
>       pvclock_read_done(const struct pvclock_time_info *, uint32_t);
> +static inline uint64_t
> +     pvclock_scale_delta(uint64_t, uint32_t, int);
>
>  const struct cfattach pvclock_ca = {
>      sizeof(struct pvclock_softc),
> @@ -123,28 +140,32 @@ void
>  pvclock_attach(struct device *parent, struct device *self, void *aux)
>  {
>      struct pvclock_softc *sc = (struct pvclock_softc *)self;
> +    struct pv_attach_args *pva = aux;
>      struct pvclock_time_info *ti;
> -    paddr_t pa;
> +    paddr_t pa;
>      uint32_t version;
>      uint8_t flags;
>      struct vm_page *page;
> +    struct pvbus_hv *kvm;
> +
>
>      page = uvm_pagealloc(NULL, 0, NULL, 0);
>      if (page == NULL)
>          goto err;
> -    sc->sc_time = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
> -    if (sc->sc_time == NULL)
> +    sc->sc_page = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
> +    if (sc->sc_page == NULL)
>          goto err;
>
>      pa = VM_PAGE_TO_PHYS(page);
> -    pmap_kenter_pa((vaddr_t)sc->sc_time, pa | PMAP_NOCRYPT,
> +    pmap_kenter_pa((vaddr_t)sc->sc_page, pa | PMAP_NOCRYPT,
>          PROT_READ | PROT_WRITE);
> -    memset(sc->sc_time, 0, PAGE_SIZE);
> +    pmap_update(pmap_kernel());
> +    memset(sc->sc_page, 0, PAGE_SIZE);
>
>      wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE);
>      sc->sc_paddr = pa;
>
> -    ti = sc->sc_time;
> +    ti = &sc->sc_page->ti;
>      do {
>          version = pvclock_read_begin(ti);
>          flags = ti->ti_flags;
> @@ -168,6 +189,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux)
>
>      tc_init(sc->sc_tc);
>
> +    /*
> +     * The openbsd vmm pvclock does not support the WALL_CLOCK msr,
> +     * therefore we look only for kvm.
> +     */
> +    kvm = &pva->pva_hv[PVBUS_KVM];
> +    if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
> +        strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname,
> +            sizeof(sc->sc_sensordev.xname));
> +        sc->sc_sensor.type = SENSOR_TIMEDELTA;
> +        sc->sc_sensor.status = SENSOR_S_UNKNOWN;
> +        sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
> +        sensordev_install(&sc->sc_sensordev);
> +
> +        config_mountroot(self, pvclock_tick_hook);
> +    }
> +
>      printf("\n");
>      return;
>  err:
> @@ -211,8 +248,82 @@ pvclock_read_done(const struct pvclock_time_info *ti,
>      return (ti->ti_version == version);
>  }
>
> -uint
> -pvclock_get_timecount(struct timecounter *tc)
> +static inline uint64_t
> +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
> +{
> +    uint64_t product;
> +
> +    if (shift < 0)
> +        delta >>= -shift;
> +    else
> +        delta <<= shift;
> +#if defined(__i386__)
> +    {
> +        uint32_t tmp1, tmp2;
> +
> +        /**
> +         * For i386, the formula looks like:
> +         *
> +         *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
> +         *   upper = mul_frac * (delta >> 32)
> +         *   product = lower + upper
> +         */
> +        __asm__ (
> +            "mul %5 ; "
> +            "mov %4,%%eax ; "
> +            "mov %%edx,%4 ; "
> +            "mul %5 ; "
> +            "xor %5,%5 ; "
> +            "add %4,%%eax ; "
> +            "adc %5,%%edx ; "
> +            : "=A" (product), "=r" (tmp1), "=r" (tmp2)
> +            : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
> +              "2" (mul_frac) );
> +    }
> +#elif defined(__amd64__)
> +    {
> +        unsigned long tmp;
> +
> +        __asm__ (
> +            "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
> +            : [lo]"=a" (product), [hi]"=d" (tmp)
> +            : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
> +    }
> +#else
> +#error "pvclock: unsupported x86 architecture?"
> +#endif
> +    return (product);
> +}
> +
> +static uint64_t
> +pvclock_cmp_last(uint64_t ctr)
> +{
> +#if defined(__i386__)
> +    uint64_t last;
> +
> +    /*
> +     * We are running on virtualization. Therefore we can assume that we
> +     * have cmpxchg8b, available on pentium and newer.
> +     */
> +    do {
> +        last = i386_atomic_load_uq(&pvclock_lastcount);
> +        if (ctr < last)
> +            return (last);
> +    } while (i386_atomic_cas_uq(&pvclock_lastcount, last, ctr) != last);
> +#else
> +    unsigned long last;
> +
> +    do {
> +        last = pvclock_lastcount;
> +        if (ctr < last)
> +            return (last);
> +    } while (atomic_cas_ulong(&pvclock_lastcount, last, ctr) != last);
> +#endif
> +    return (ctr);
> +}
> +
> +uint64_t
> +pvclock_get(struct timecounter *tc)
>  {
>      struct pvclock_softc *sc = tc->tc_priv;
>      struct pvclock_time_info *ti;
> @@ -220,8 +331,10 @@ pvclock_get_timecount(struct timecounter *tc)
>      uint32_t version, mul_frac;
>      int8_t shift;
>      uint8_t flags;
> +    int s;
>
> -    ti = sc->sc_time;
> +    ti = &sc->sc_page->ti;
> +    s = splhigh();
>      do {
>          version = pvclock_read_begin(ti);
>          system_time = ti->ti_system_time;
> @@ -229,26 +342,63 @@ pvclock_get_timecount(struct timecounter *tc)
>          mul_frac = ti->ti_tsc_to_system_mul;
>          shift = ti->ti_tsc_shift;
>          flags = ti->ti_flags;
> +        delta = rdtsc_lfence();
>      } while (!pvclock_read_done(ti, version));
> +    splx(s);
>
>      /*
>       * The algorithm is described in
> -     * linux/Documentation/virtual/kvm/msr.txt
> +     * linux/Documentation/virt/kvm/x86/msr.rst
>       */
> -    delta = rdtsc() - tsc_timestamp;
> -    if (shift < 0)
> -        delta >>= -shift;
> +    if (delta > tsc_timestamp)
> +        delta -= tsc_timestamp;
>      else
> -        delta <<= shift;
> -    ctr = ((delta * mul_frac) >> 32) + system_time;
> +        delta = 0;
> +    ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time;
>
>      if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
>          return (ctr);
>
> -    if (ctr < pvclock_lastcount)
> -        return (pvclock_lastcount);
> +    return pvclock_cmp_last(ctr);
> +}
>
> -    atomic_swap_uint(&pvclock_lastcount, ctr);
> +uint
> +pvclock_get_timecount(struct timecounter *tc)
> +{
> +    return (pvclock_get(tc));
> +}
>
> -    return (ctr);
> +void
> +pvclock_tick(void *arg)
> +{
> +    struct pvclock_softc *sc = arg;
> +    struct timespec ts;
> +    struct pvclock_wall_clock *wc = &sc->sc_page->wc;
> +    int64_t value;
> +
> +    wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc));
> +    while (wc->wc_version & 0x1)
> +        virtio_membar_sync();
> +    if (wc->wc_sec) {
> +        nanotime(&ts);
> +        value = TIMESPEC_TO_NSEC(&ts) -
> +            SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec -
> +            pvclock_get(&pvclock_timecounter);
> +
> +        TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts);
> +        sc->sc_sensor.value = value;
> +        sc->sc_sensor.status = SENSOR_S_OK;
> +    } else
> +        sc->sc_sensor.status = SENSOR_S_UNKNOWN;
> +
> +    timeout_add_sec(&sc->sc_tick, 15);
> +}
> +
> +void
> +pvclock_tick_hook(struct device *self)
> +{
> +    struct pvclock_softc *sc = (struct pvclock_softc *)self;
> +
> +    timeout_set(&sc->sc_tick, pvclock_tick, sc);
> +    pvclock_tick(sc);
>  }
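One piece the diff relies on but does not show is the version handshake on
the shared structures: the hypervisor sets the low bit of
ti_version/wc_version while it rewrites the page and bumps the version again
when it is done, so a guest has to retry whenever the version is odd or
changes under it. That is what pvclock_read_begin()/pvclock_read_done() and
the wc_version busy-wait above are for. A standalone sketch of that reader
convention, with a simplified struct and made-up names, not the driver's
actual helpers:

#include <stdint.h>

/* Simplified layout of the KVM pvclock time info shared with the guest. */
struct time_info_sketch {
    volatile uint32_t version;      /* odd while the host is updating */
    uint32_t pad;
    uint64_t tsc_timestamp;
    uint64_t system_time;
    uint32_t tsc_to_system_mul;
    int8_t   tsc_shift;
    uint8_t  flags;
};

/* Snapshot system_time, retrying if the host updated the page meanwhile. */
static uint64_t
read_system_time_sketch(const struct time_info_sketch *ti)
{
    uint32_t version;
    uint64_t system_time;

    do {
        version = ti->version & ~0x1U;   /* mask the "update in progress" bit */
        __sync_synchronize();            /* read payload only after version */
        system_time = ti->system_time;
        __sync_synchronize();            /* re-check version only after payload */
    } while (ti->version != version);

    return system_time;
}

If the host was mid-update (odd version) or finished an update during the
reads, the final comparison fails and the loop starts over, which is why the
snapshot taken inside the loop in pvclock_get() is self-consistent.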