Improving time keeping in x86 VMs
On Sat, 17 May 2025, Mark Kettenis wrote:
> > Date: Sat, 17 May 2025 17:39:35 +0200 (CEST)
> > From: Stefan Fritsch <sf@openbsd.org>
> >
> > On Sat, 12 Apr 2025, Stefan Fritsch wrote:
> >
> > > On Fri, 11 Apr 2025, Mike Larkin wrote:
> > >
> > > > On Wed, Mar 12, 2025 at 09:27:50PM +0100, Stefan Fritsch wrote:
> > > > > On Tue, 11 Mar 2025, Dave Voutila wrote:
> > > > >
> > > > > > Stefan Fritsch <sf@openbsd.org> writes:
> > > > > >
> > > > > > > Hi,
> > > > > > >
> > > > > > > after talking to bluhm@ about time keeping issues of linux guests on
> > > > > > > vmm, I have looked into making linux attach to vmm's pvclock. In doing
> > > > > > > so, I also found some issues in the pvclock(4) guest driver. So, here
> > > > > > > comes a larger diff that hopefully improves time keeping in VMs, both
> > > > > > > with openbsd as guest and as hypervisor.
> > > > > >
> > > > > > My kingdom for something to kill off my hacky Linux pvclock module.
> > > > > >
> > > > > > Some questions / comments below.
> > > > > >
> > > > > > >
> > > > > > > The diff changes in pvclock(4):
> > > > > > >
> > > > > > > * Fix integer overflows during multiplication by taking an assembler
> > > > > > > implementation from FreeBSD that uses 128 bit intermediate results.
> > > > > > >
> > > > > > > * Increase accuracy by disabling interrupts while reading the clock
> > > > > > >
> > > > > > > * Fix the not-TSC_STABLE handling which was busted because it wrongly
> > > > > > > compared 32 and 64 bit values. (This requires some atomic hackery on
> > > > > > > i386).
> > > > > > >
> > > > > > > * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
> > > > > > > sensor in vmmci(4)
> > > > > >
> > > > > > Lovely...
> > > > >
> > > > > Today, I found a diff by Scott Cheloha from 2022 that was never
> > > > > committed and that I have missed before. It has some overlap with my
> > > > > pvclock diff. I will look at it some more and produce a combined diff.
> > >
> > > Sorry, I got side-tracked by other issues that were more urgent. Updated
> > > diff below. The vmm part is unchanged but the pvclock part has several
> > > fixes, some inspired by Scott's earlier diff.
> > >
> > > This would need to be split up into several diffs before committing. I am
> > > not 100% sure that the memory clobbers in i386_atomic_testset_uq and
> > > i386_atomic_cas_uq are necessary, but I think they are. If the new
> > > i386_atomic_* functions are not wanted in atomic.h, they could of course
> > > be moved into pvclock.c. But i386_atomic_testset_uq() is already in
> > > atomic.h.
> >
> > There was something wrong with the i386 assembler atomics that caused
> > havoc if the TSC stable bit was not set by the hypervisor. Updated diff
> > below.
>
> I don't understand why we need so much inline asm and why we need
> all those #ifdefs. Yes we do need those oddly named wrappers for
> cmpxchg8b in i386 since we have to be careful that we don't use them
> anywhere else. So perhaps they shouldn't even live in atomic.h.
> Maybe just put them in pvclock.c where they're used?
>
> But why do you need to have all that inline asm in
> pvclock_scale_delta()? Is it really that difficult to write that code
> in plain C?
I took the assembler pvclock_scale_delta() from FreeBSD because I knew it
worked. But you are right, the C version is infinitely more readable. The
resulting code does one more mul, but I don't think that matters in
practice. I have also moved the i386 cmpxchg8b wrappers to pvclock.c,
which takes care of tedu's comment as well.
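
For illustration (not part of the diff): the conversion is
ns = system_time + (((tsc - tsc_timestamp) << shift) * mul_frac) >> 32,
and the 64x32 bit product needs up to 96 bits, which is why doing it in
plain 64 bits could overflow. A standalone sketch of the split
multiplication, cross-checked against a 128-bit reference (gcc/clang
__int128, 64-bit hosts only, made-up input values):

/*
 * Illustration only: scale_delta() mirrors the C version in the diff
 * below, the reference uses a full 128-bit intermediate result.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	uint64_t lower, upper;

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* split the 64x32 bit multiplication into two 64-bit-safe parts */
	lower = ((uint64_t)mul_frac * (uint32_t)delta) >> 32;
	upper = (uint64_t)mul_frac * (delta >> 32);
	return (lower + upper);
}

int
main(void)
{
	/* made-up TSC delta and host-provided scaling parameters */
	uint64_t delta = 123456789012345ULL;
	uint32_t mul_frac = 0x55555555;
	int shift = -1;
	uint64_t d, ref;

	d = delta >> 1;		/* apply shift = -1 by hand */
	ref = (uint64_t)(((unsigned __int128)d * mul_frac) >> 32);
	assert(scale_delta(delta, mul_frac, shift) == ref);
	printf("scaled delta: %llu ns\n", (unsigned long long)ref);
	return (0);
}

The split is exact because
floor(((hi << 32) + lo) * mul_frac / 2^32) = hi * mul_frac + floor(lo * mul_frac / 2^32).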
The updated diff below is only the non-vmm(4) part, because the vmm(4)
part needs more discussion. I will reply to mlarkin's mail about that.
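
Since the breakage in the earlier i386 version only showed up when the
hypervisor does not set PVCLOCK_FLAG_TSC_STABLE, a simple userland check
that the monotonic clock never steps backwards is handy for catching
regressions in that path. A minimal sketch (illustration only, not part
of the diff), best run in a few parallel instances so the reads spread
across CPUs:

#include <err.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct timespec prev, now;
	unsigned long long reads = 0;

	if (clock_gettime(CLOCK_MONOTONIC, &prev) == -1)
		err(1, "clock_gettime");
	for (;;) {
		if (clock_gettime(CLOCK_MONOTONIC, &now) == -1)
			err(1, "clock_gettime");
		reads++;
		/* report any backwards step of the monotonic clock */
		if (now.tv_sec < prev.tv_sec ||
		    (now.tv_sec == prev.tv_sec && now.tv_nsec < prev.tv_nsec))
			printf("clock went backwards after %llu reads\n",
			    reads);
		prev = now;
	}
	/* NOTREACHED */
}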
diff --git a/sys/arch/i386/include/cpufunc.h b/sys/arch/i386/include/cpufunc.h
index 098b5982d20..dca6a8ee473 100644
--- a/sys/arch/i386/include/cpufunc.h
+++ b/sys/arch/i386/include/cpufunc.h
@@ -229,6 +229,15 @@ rdtsc(void)
return (tsc);
}
+static inline uint64_t
+rdtsc_lfence(void)
+{
+ uint64_t tsc;
+
+ __asm volatile("lfence; rdtsc" : "=A" (tsc));
+ return tsc;
+}
+
static __inline void
wrmsr(u_int msr, u_int64_t newval)
{
diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c
index 6d8dd432900..7e9f7f11727 100644
--- a/sys/dev/pv/pvclock.c
+++ b/sys/dev/pv/pvclock.c
@@ -22,6 +22,7 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/timeout.h>
#include <sys/timetc.h>
#include <machine/cpu.h>
@@ -35,13 +36,65 @@
#define PMAP_NOCRYPT 0
#endif
-uint pvclock_lastcount;
+#if defined(__amd64__)
+
+static __inline uint64_t
+atomic_load(volatile uint64_t *ptr)
+{
+ return *ptr;
+}
+
+static __inline u_int64_t
+atomic_cas(volatile uint64_t *p, uint64_t e,
+ uint64_t n)
+{
+ return atomic_cas_ulong((volatile unsigned long *)p, e, n);
+}
+
+#elif defined(__i386__)
+
+/*
+ * We are running on virtualization. Therefore we can assume that we
+ * have cmpxchg8b, available on pentium and newer.
+ */
+static __inline uint64_t
+atomic_load(volatile uint64_t *ptr)
+{
+ uint64_t val;
+ __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; "
+ "lock cmpxchg8b %1" : "=&A" (val) : "m" (*ptr));
+ return val;
+}
+
+static __inline u_int64_t
+atomic_cas(volatile uint64_t *p, uint64_t e,
+ uint64_t n)
+{
+ __asm volatile("lock cmpxchg8b %1" : "+A" (e), "+m" (*p)
+ : "b" ((uint32_t)n), "c" ((uint32_t)(n >> 32)));
+ return (e);
+}
+
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+
+
+uint64_t pvclock_lastcount;
+
+struct pvpage {
+ struct pvclock_time_info ti;
+ struct pvclock_wall_clock wc;
+};
struct pvclock_softc {
struct device sc_dev;
- void *sc_time;
+ struct pvpage *sc_page;
paddr_t sc_paddr;
struct timecounter *sc_tc;
+ struct ksensordev sc_sensordev;
+ struct ksensor sc_sensor;
+ struct timeout sc_tick;
};
#define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
@@ -50,12 +103,16 @@ int pvclock_match(struct device *, void *, void *);
void pvclock_attach(struct device *, struct device *, void *);
int pvclock_activate(struct device *, int);
+uint64_t pvclock_get(struct timecounter *);
uint pvclock_get_timecount(struct timecounter *);
+void pvclock_tick_hook(struct device *);
static inline uint32_t
pvclock_read_begin(const struct pvclock_time_info *);
static inline int
pvclock_read_done(const struct pvclock_time_info *, uint32_t);
+static inline uint64_t
+ pvclock_scale_delta(uint64_t, uint32_t, int);
const struct cfattach pvclock_ca = {
sizeof(struct pvclock_softc),
@@ -123,28 +180,32 @@ void
pvclock_attach(struct device *parent, struct device *self, void *aux)
{
struct pvclock_softc *sc = (struct pvclock_softc *)self;
+ struct pv_attach_args *pva = aux;
struct pvclock_time_info *ti;
- paddr_t pa;
+ paddr_t pa;
uint32_t version;
uint8_t flags;
struct vm_page *page;
+ struct pvbus_hv *kvm;
+
page = uvm_pagealloc(NULL, 0, NULL, 0);
if (page == NULL)
goto err;
- sc->sc_time = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
- if (sc->sc_time == NULL)
+ sc->sc_page = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
+ if (sc->sc_page == NULL)
goto err;
pa = VM_PAGE_TO_PHYS(page);
- pmap_kenter_pa((vaddr_t)sc->sc_time, pa | PMAP_NOCRYPT,
+ pmap_kenter_pa((vaddr_t)sc->sc_page, pa | PMAP_NOCRYPT,
PROT_READ | PROT_WRITE);
- memset(sc->sc_time, 0, PAGE_SIZE);
+ pmap_update(pmap_kernel());
+ memset(sc->sc_page, 0, PAGE_SIZE);
wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE);
sc->sc_paddr = pa;
- ti = sc->sc_time;
+ ti = &sc->sc_page->ti;
do {
version = pvclock_read_begin(ti);
flags = ti->ti_flags;
@@ -168,6 +229,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux)
tc_init(sc->sc_tc);
+ /*
+ * The openbsd vmm pvclock does not support the WALL_CLOCK msr,
+ * therefore we look only for kvm.
+ */
+ kvm = &pva->pva_hv[PVBUS_KVM];
+ if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
+ strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname,
+ sizeof(sc->sc_sensordev.xname));
+ sc->sc_sensor.type = SENSOR_TIMEDELTA;
+ sc->sc_sensor.status = SENSOR_S_UNKNOWN;
+ sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
+ sensordev_install(&sc->sc_sensordev);
+
+ config_mountroot(self, pvclock_tick_hook);
+ }
+
printf("\n");
return;
err:
@@ -211,8 +288,36 @@ pvclock_read_done(const struct pvclock_time_info *ti,
return (ti->ti_version == version);
}
-uint
-pvclock_get_timecount(struct timecounter *tc)
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+ uint64_t lower, upper;
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+ lower = ((uint64_t)mul_frac * ((uint32_t)delta)) >> 32;
+ upper = (uint64_t)mul_frac * (delta >> 32);
+ return lower + upper;
+}
+
+static uint64_t
+pvclock_cmp_last(uint64_t ctr)
+{
+ uint64_t last;
+
+ do {
+ last = atomic_load(&pvclock_lastcount);
+ if (ctr < last)
+ return (last);
+ } while (atomic_cas(&pvclock_lastcount, last, ctr) != last);
+ return (ctr);
+}
+
+uint64_t
+pvclock_get(struct timecounter *tc)
{
struct pvclock_softc *sc = tc->tc_priv;
struct pvclock_time_info *ti;
@@ -220,8 +325,10 @@ pvclock_get_timecount(struct timecounter *tc)
uint32_t version, mul_frac;
int8_t shift;
uint8_t flags;
+ int s;
- ti = sc->sc_time;
+ ti = &sc->sc_page->ti;
+ s = splhigh();
do {
version = pvclock_read_begin(ti);
system_time = ti->ti_system_time;
@@ -229,26 +336,63 @@ pvclock_get_timecount(struct timecounter *tc)
mul_frac = ti->ti_tsc_to_system_mul;
shift = ti->ti_tsc_shift;
flags = ti->ti_flags;
+ delta = rdtsc_lfence();
} while (!pvclock_read_done(ti, version));
+ splx(s);
/*
* The algorithm is described in
- * linux/Documentation/virtual/kvm/msr.txt
+ * linux/Documentation/virt/kvm/x86/msr.rst
*/
- delta = rdtsc() - tsc_timestamp;
- if (shift < 0)
- delta >>= -shift;
+ if (delta > tsc_timestamp)
+ delta -= tsc_timestamp;
else
- delta <<= shift;
- ctr = ((delta * mul_frac) >> 32) + system_time;
+ delta = 0;
+ ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time;
if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
return (ctr);
- if (ctr < pvclock_lastcount)
- return (pvclock_lastcount);
+ return pvclock_cmp_last(ctr);
+}
- atomic_swap_uint(&pvclock_lastcount, ctr);
+uint
+pvclock_get_timecount(struct timecounter *tc)
+{
+ return (pvclock_get(tc));
+}
- return (ctr);
+void
+pvclock_tick(void *arg)
+{
+ struct pvclock_softc *sc = arg;
+ struct timespec ts;
+ struct pvclock_wall_clock *wc = &sc->sc_page->wc;
+ int64_t value;
+
+ wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc));
+ while (wc->wc_version & 0x1)
+ virtio_membar_sync();
+ if (wc->wc_sec) {
+ nanotime(&ts);
+ value = TIMESPEC_TO_NSEC(&ts) -
+ SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec -
+ pvclock_get(&pvclock_timecounter);
+
+ TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts);
+ sc->sc_sensor.value = value;
+ sc->sc_sensor.status = SENSOR_S_OK;
+ } else
+ sc->sc_sensor.status = SENSOR_S_UNKNOWN;
+
+ timeout_add_sec(&sc->sc_tick, 15);
+}
+
+void
+pvclock_tick_hook(struct device *self)
+{
+ struct pvclock_softc *sc = (struct pvclock_softc *)self;
+
+ timeout_set(&sc->sc_tick, pvclock_tick, sc);
+ pvclock_tick(sc);
}