Download raw body.
Improving time keeping in x86 VMs
On Sat, May 17, 2025 at 05:39:35PM +0200, Stefan Fritsch wrote:
> On Sat, 12 Apr 2025, Stefan Fritsch wrote:
>
> > On Fri, 11 Apr 2025, Mike Larkin wrote:
> >
> > > On Wed, Mar 12, 2025 at 09:27:50PM +0100, Stefan Fritsch wrote:
> > > > On Tue, 11 Mar 2025, Dave Voutila wrote:
> > > >
> > > > > Stefan Fritsch <sf@openbsd.org> writes:
> > > > >
> > > > > > Hi,
> > > > > >
> > > > > > after talking to bluhm@ about time keeping issues of linux guests on
> > > > > > vmm, I have looked into making linux attach to vmm's pvclock. In doing
> > > > > > so, I also found some issues in the pvclock(4) guest driver. So, here
> > > > > > comes a larger diff that hopefully improves time keeping in VMs, both
> > > > > > with openbsd as guest and as hypervisor.
> > > > >
> > > > > My kingdom for something to kill off my hacky Linux pvclock module.
> > > > >
> > > > > Some questions / comments below.
> > > > >
> > > > > >
> > > > > > The diff changes in pvclock(4):
> > > > > >
> > > > > > * Fix integer overflows during multiplication by taking an assembler
> > > > > > implementation from FreeBSD that uses 128 bit intermediate results.
> > > > > >
> > > > > > * Increase accuracy by disabling interrupts while reading the clock
> > > > > >
> > > > > > * Fix the not-TSC_STABLE handling which was busted because it wrongly
> > > > > > compared 32 and 64 bit values. (This requires some atomic hackery on
> > > > > > i386).
> > > > > >
> > > > > > * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
> > > > > > sensor in vmmci(4)
> > > > >
> > > > > Lovely...
> > > >
> > > > Today, I found a diff by Scott Cheloha from 2022 that was never
> > > > committed and that I have missed before. It has some overlap with my
> > > > pvclock diff. I will look at it some more and produce a combined diff.
> >
> > Sorry, I got side-tracked by other issues that were more urgent. Updated
> > diff below. The vmm part is unchanged but the pvclock part has several
> > fixes, some inspired by Scott's earlier diff.
> >
> > This would need to be split up into several diffs before committing. I am
> > not 100% sure that the memory clobbers in i386_atomic_testset_uq and
> > i386_atomic_cas_uq are necessary, but I think they are. If the new
> > i386_atomic_* functions are not wanted in atomic.h, they could of course
> > be moved into pvclock.c. But i386_atomic_testset_uq() is already in
> > atomic.h.
>
> There was something wrong with the i386 assembler atomics that caused
> havoc if the TSC stable bit was not set by the hypervisor. Updated diff
> below.
>
> diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> index 0a3be15cd64..5a1570aa8d9 100644
> --- a/sys/arch/amd64/amd64/vmm_machdep.c
> +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> @@ -146,6 +146,7 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
> int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
> void vmm_init_pvclock(struct vcpu *, paddr_t);
> int vmm_update_pvclock(struct vcpu *);
> +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
> int vmm_pat_is_valid(uint64_t);
>
> #ifdef MULTIPROCESSOR
> @@ -185,6 +186,7 @@ extern uint64_t tsc_frequency;
> extern int tsc_is_invariant;
>
> const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
>
> const struct kmem_pa_mode vmm_kp_contig = {
> .kp_constraint = &no_constraint,
> @@ -5578,6 +5580,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
> vmm_init_pvclock(vcpu,
> (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> break;
> + case KVM_MSR_WALL_CLOCK:
> + vmm_pv_wall_clock(vcpu,
> + (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> + break;
> #ifdef VMM_DEBUG
> default:
> /*
> @@ -5639,6 +5645,10 @@ svm_handle_msr(struct vcpu *vcpu)
> vmm_init_pvclock(vcpu,
> (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> break;
> + case KVM_MSR_WALL_CLOCK:
> + vmm_pv_wall_clock(vcpu,
> + (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> + break;
> default:
> /* Log the access, to be able to identify unknown MSRs */
> DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> @@ -5994,6 +6004,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
> *rcx = 0;
> *rdx = 0;
> break;
> + case 0x40000100: /* Hypervisor information KVM */
> + *rax = 0x40000101;
> + *rbx = *((uint32_t *)&kvm_hv_signature[0]);
> + *rcx = *((uint32_t *)&kvm_hv_signature[4]);
> + *rdx = *((uint32_t *)&kvm_hv_signature[8]);
> + break;
> + case 0x40000101: /* KVM hypervisor features */
> + *rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> + (1 << KVM_FEATURE_NOP_IO_DELAY);
> + *rbx = 0;
> + *rcx = 0;
> + *rdx = 0;
> + break;
This makes us claim we are kvm. Even though you are only advertising support
for these 3 features, that makes a big assumption that every linux kernel out
there respects a neutered/weird "KVM" like this.
How many varieties of guest kernels did you test this on? I think we need to
be very careful if we do this. We are basically claiming to be something
that doesn't really exist anywhere else.
-ml
> case 0x80000000: /* Extended function level */
> /* We don't emulate past 0x8000001f currently. */
> *rax = min(curcpu()->ci_pnfeatset, 0x8000001f);
> @@ -6508,7 +6532,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
> (++vcpu->vc_pvclock_version << 1) | 0x1;
>
> pvclock_ti->ti_tsc_timestamp = rdtsc();
> - nanotime(&tv);
> + nanouptime(&tv);
> pvclock_ti->ti_system_time =
> tv.tv_sec * 1000000000L + tv.tv_nsec;
> pvclock_ti->ti_tsc_shift = 12;
> @@ -6522,6 +6546,36 @@ vmm_update_pvclock(struct vcpu *vcpu)
> return (0);
> }
>
> +void
> +vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa)
> +{
> + struct pvclock_wall_clock *pvclock_wc;
> + struct timespec tv;
> + struct vm *vm = vcpu->vc_parent;
> + paddr_t hpa;
> +
> + if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock)))
> + goto err;
> +
> + /* XXX: handle case when this struct goes over page boundaries */
> + if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE)
> + goto err;
> +
> + if (!pmap_extract(vm->vm_map->pmap, gpa, &hpa))
> + goto err;
> + pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa);
> + pvclock_wc->wc_version |= 0x1;
> + nanoboottime(&tv);
> + pvclock_wc->wc_sec = tv.tv_sec;
> + pvclock_wc->wc_nsec = tv.tv_nsec;
> + pvclock_wc->wc_version += 1;
> +
> + return;
> +err:
> + vmm_inject_gp(vcpu);
> +}
> +
> +
> int
> vmm_pat_is_valid(uint64_t pat)
> {
> diff --git a/sys/arch/i386/include/atomic.h b/sys/arch/i386/include/atomic.h
> index 45aedb819c8..9034104b91a 100644
> --- a/sys/arch/i386/include/atomic.h
> +++ b/sys/arch/i386/include/atomic.h
> @@ -278,6 +278,24 @@ i386_atomic_testset_uq(volatile u_int64_t *ptr, u_int64_t val)
> return val;
> }
>
> +static __inline u_int64_t
> +i386_atomic_load_uq(volatile u_int64_t *ptr)
> +{
> + uint64_t val;
> + __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; "
> + _LOCK " cmpxchg8b %1" : "=&A" (val) : "m" (*ptr));
> + return val;
> +}
> +
> +static __inline u_int64_t
> +i386_atomic_cas_uq(volatile u_int64_t *p, u_int64_t e,
> + u_int64_t n)
> +{
> + __asm volatile(_LOCK " cmpxchg8b %1" : "+A" (e), "+m" (*p)
> + : "b" ((u_int32_t)n), "c" ((u_int32_t)(n >> 32)));
> + return (e);
> +}
> +
> static __inline u_int32_t
> i386_atomic_testset_ul(volatile u_int32_t *ptr, unsigned long val)
> {
> diff --git a/sys/arch/i386/include/cpufunc.h b/sys/arch/i386/include/cpufunc.h
> index 098b5982d20..dca6a8ee473 100644
> --- a/sys/arch/i386/include/cpufunc.h
> +++ b/sys/arch/i386/include/cpufunc.h
> @@ -229,6 +229,15 @@ rdtsc(void)
> return (tsc);
> }
>
> +static inline uint64_t
> +rdtsc_lfence(void)
> +{
> + uint64_t tsc;
> +
> + __asm volatile("lfence; rdtsc" : "=A" (tsc));
> + return tsc;
> +}
> +
> static __inline void
> wrmsr(u_int msr, u_int64_t newval)
> {
> diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c
> index 6d8dd432900..206cc418062 100644
> --- a/sys/dev/pv/pvclock.c
> +++ b/sys/dev/pv/pvclock.c
> @@ -22,6 +22,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/timeout.h>
> #include <sys/timetc.h>
>
> #include <machine/cpu.h>
> @@ -35,13 +36,25 @@
> #define PMAP_NOCRYPT 0
> #endif
>
> -uint pvclock_lastcount;
> +#ifdef __amd64__
> +unsigned long pvclock_lastcount;
> +#else
> +uint64_t pvclock_lastcount;
> +#endif
> +
> +struct pvpage {
> + struct pvclock_time_info ti;
> + struct pvclock_wall_clock wc;
> +};
>
> struct pvclock_softc {
> struct device sc_dev;
> - void *sc_time;
> + struct pvpage *sc_page;
> paddr_t sc_paddr;
> struct timecounter *sc_tc;
> + struct ksensordev sc_sensordev;
> + struct ksensor sc_sensor;
> + struct timeout sc_tick;
> };
>
> #define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
> @@ -50,12 +63,16 @@ int pvclock_match(struct device *, void *, void *);
> void pvclock_attach(struct device *, struct device *, void *);
> int pvclock_activate(struct device *, int);
>
> +uint64_t pvclock_get(struct timecounter *);
> uint pvclock_get_timecount(struct timecounter *);
> +void pvclock_tick_hook(struct device *);
>
> static inline uint32_t
> pvclock_read_begin(const struct pvclock_time_info *);
> static inline int
> pvclock_read_done(const struct pvclock_time_info *, uint32_t);
> +static inline uint64_t
> + pvclock_scale_delta(uint64_t, uint32_t, int);
>
> const struct cfattach pvclock_ca = {
> sizeof(struct pvclock_softc),
> @@ -123,28 +140,32 @@ void
> pvclock_attach(struct device *parent, struct device *self, void *aux)
> {
> struct pvclock_softc *sc = (struct pvclock_softc *)self;
> + struct pv_attach_args *pva = aux;
> struct pvclock_time_info *ti;
> - paddr_t pa;
> + paddr_t pa;
> uint32_t version;
> uint8_t flags;
> struct vm_page *page;
> + struct pvbus_hv *kvm;
> +
>
> page = uvm_pagealloc(NULL, 0, NULL, 0);
> if (page == NULL)
> goto err;
> - sc->sc_time = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
> - if (sc->sc_time == NULL)
> + sc->sc_page = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
> + if (sc->sc_page == NULL)
> goto err;
>
> pa = VM_PAGE_TO_PHYS(page);
> - pmap_kenter_pa((vaddr_t)sc->sc_time, pa | PMAP_NOCRYPT,
> + pmap_kenter_pa((vaddr_t)sc->sc_page, pa | PMAP_NOCRYPT,
> PROT_READ | PROT_WRITE);
> - memset(sc->sc_time, 0, PAGE_SIZE);
> + pmap_update(pmap_kernel());
> + memset(sc->sc_page, 0, PAGE_SIZE);
>
> wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE);
> sc->sc_paddr = pa;
>
> - ti = sc->sc_time;
> + ti = &sc->sc_page->ti;
> do {
> version = pvclock_read_begin(ti);
> flags = ti->ti_flags;
> @@ -168,6 +189,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux)
>
> tc_init(sc->sc_tc);
>
> + /*
> + * The openbsd vmm pvclock does not support the WALL_CLOCK msr,
> + * therefore we look only for kvm.
> + */
> + kvm = &pva->pva_hv[PVBUS_KVM];
> + if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
> + strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname,
> + sizeof(sc->sc_sensordev.xname));
> + sc->sc_sensor.type = SENSOR_TIMEDELTA;
> + sc->sc_sensor.status = SENSOR_S_UNKNOWN;
> + sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
> + sensordev_install(&sc->sc_sensordev);
> +
> + config_mountroot(self, pvclock_tick_hook);
> + }
> +
> printf("\n");
> return;
> err:
> @@ -211,8 +248,82 @@ pvclock_read_done(const struct pvclock_time_info *ti,
> return (ti->ti_version == version);
> }
>
> -uint
> -pvclock_get_timecount(struct timecounter *tc)
> +static inline uint64_t
> +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
> +{
> + uint64_t product;
> +
> + if (shift < 0)
> + delta >>= -shift;
> + else
> + delta <<= shift;
> +#if defined(__i386__)
> + {
> + uint32_t tmp1, tmp2;
> +
> + /**
> + * For i386, the formula looks like:
> + *
> + * lower = (mul_frac * (delta & UINT_MAX)) >> 32
> + * upper = mul_frac * (delta >> 32)
> + * product = lower + upper
> + */
> + __asm__ (
> + "mul %5 ; "
> + "mov %4,%%eax ; "
> + "mov %%edx,%4 ; "
> + "mul %5 ; "
> + "xor %5,%5 ; "
> + "add %4,%%eax ; "
> + "adc %5,%%edx ; "
> + : "=A" (product), "=r" (tmp1), "=r" (tmp2)
> + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
> + "2" (mul_frac) );
> + }
> +#elif defined(__amd64__)
> + {
> + unsigned long tmp;
> +
> + __asm__ (
> + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
> + : [lo]"=a" (product), [hi]"=d" (tmp)
> + : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
> + }
> +#else
> +#error "pvclock: unsupported x86 architecture?"
> +#endif
> + return (product);
> +}
> +
> +static uint64_t
> +pvclock_cmp_last(uint64_t ctr)
> +{
> +#if defined(__i386__)
> + uint64_t last;
> +
> + /*
> + * We are running on virtualization. Therefore we can assume that we
> + * have cmpxchg8b, available on pentium and newer.
> + */
> + do {
> + last = i386_atomic_load_uq(&pvclock_lastcount);
> + if (ctr < last)
> + return (last);
> + } while (i386_atomic_cas_uq(&pvclock_lastcount, last, ctr) != last);
> +#else
> + unsigned long last;
> +
> + do {
> + last = pvclock_lastcount;
> + if (ctr < last)
> + return (last);
> + } while (atomic_cas_ulong(&pvclock_lastcount, last, ctr) != last);
> +#endif
> + return (ctr);
> +}
> +
> +uint64_t
> +pvclock_get(struct timecounter *tc)
> {
> struct pvclock_softc *sc = tc->tc_priv;
> struct pvclock_time_info *ti;
> @@ -220,8 +331,10 @@ pvclock_get_timecount(struct timecounter *tc)
> uint32_t version, mul_frac;
> int8_t shift;
> uint8_t flags;
> + int s;
>
> - ti = sc->sc_time;
> + ti = &sc->sc_page->ti;
> + s = splhigh();
> do {
> version = pvclock_read_begin(ti);
> system_time = ti->ti_system_time;
> @@ -229,26 +342,63 @@ pvclock_get_timecount(struct timecounter *tc)
> mul_frac = ti->ti_tsc_to_system_mul;
> shift = ti->ti_tsc_shift;
> flags = ti->ti_flags;
> + delta = rdtsc_lfence();
> } while (!pvclock_read_done(ti, version));
> + splx(s);
>
> /*
> * The algorithm is described in
> - * linux/Documentation/virtual/kvm/msr.txt
> + * linux/Documentation/virt/kvm/x86/msr.rst
> */
> - delta = rdtsc() - tsc_timestamp;
> - if (shift < 0)
> - delta >>= -shift;
> + if (delta > tsc_timestamp)
> + delta -= tsc_timestamp;
> else
> - delta <<= shift;
> - ctr = ((delta * mul_frac) >> 32) + system_time;
> + delta = 0;
> + ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time;
>
> if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
> return (ctr);
>
> - if (ctr < pvclock_lastcount)
> - return (pvclock_lastcount);
> + return pvclock_cmp_last(ctr);
> +}
>
> - atomic_swap_uint(&pvclock_lastcount, ctr);
> +uint
> +pvclock_get_timecount(struct timecounter *tc)
> +{
> + return (pvclock_get(tc));
> +}
>
> - return (ctr);
> +void
> +pvclock_tick(void *arg)
> +{
> + struct pvclock_softc *sc = arg;
> + struct timespec ts;
> + struct pvclock_wall_clock *wc = &sc->sc_page->wc;
> + int64_t value;
> +
> + wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc));
> + while (wc->wc_version & 0x1)
> + virtio_membar_sync();
> + if (wc->wc_sec) {
> + nanotime(&ts);
> + value = TIMESPEC_TO_NSEC(&ts) -
> + SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec -
> + pvclock_get(&pvclock_timecounter);
> +
> + TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts);
> + sc->sc_sensor.value = value;
> + sc->sc_sensor.status = SENSOR_S_OK;
> + } else
> + sc->sc_sensor.status = SENSOR_S_UNKNOWN;
> +
> + timeout_add_sec(&sc->sc_tick, 15);
> +}
> +
> +void
> +pvclock_tick_hook(struct device *self)
> +{
> + struct pvclock_softc *sc = (struct pvclock_softc *)self;
> +
> + timeout_set(&sc->sc_tick, pvclock_tick, sc);
> + pvclock_tick(sc);
> }
Improving time keeping in x86 VMs