Index | Thread | Search

From:
Stefan Fritsch <sf@openbsd.org>
Subject:
Improving time keeping in x86 VMs
To:
tech@openbsd.org
Date:
Tue, 11 Mar 2025 20:36:58 +0100

Download raw body.

Thread
Hi,

after talking to bluhm@ about time keeping issues of linux guests on
vmm, I have looked into making linux attach to vmm's pvclock. In doing
so, I also found some issues in the pvclock(4) guest driver. So, here
comes a larger diff that hopefully improves time keeping in VMs, both
with openbsd as guest and as hypervisor.

The diff changes in pvclock(4):

* Fix integer overflows during multiplication by taking an assembler
  implementation from FreeBSD that uses 128 bit intermediate results.

* Increase accuracy by disabling interrupts while reading the clock

* Fix the not-TSC_STABLE handling which was busted because it wrongly
  compared 32 and 64 bit values. (This requires some atomic hackery on
  i386).

* Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
  sensor in vmmci(4)

In vmm(4):

* Expose pvclock to linux guests by adding KVM compatible cpuid pages.
  While there, also expose the NOP_IO_DELAY feature bit which may make
  linux skip some ioport 80 reads. This should be ok at least until
  vmm(4) implements device pass through.

* As there is only a single feature bit for both KVM_MSR_SYSTEM_TIME
  and KVM_MSR_WALL_CLOCK, implement the latter in vmm.

If the integer overflows in pvclock(4) cause problems or not depends
on the multiplicator and shift values chosen by the hypervisor, which
in turn depend on the CPU's TSC frequency. So there may be systems
where this is not a problem at all, while on other systems it may
cause great havoc.

The vmm and pvclock parts of the diff work independently from each other. 
Testers and comments are welcome.

Cheers,
Stefan


diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
index e3205f48eed..f2ecb0668b0 100644
--- a/sys/arch/amd64/amd64/vmm_machdep.c
+++ b/sys/arch/amd64/amd64/vmm_machdep.c
@@ -142,6 +142,7 @@ void svm_set_dirty(struct vcpu *, uint32_t);
 int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
 void vmm_init_pvclock(struct vcpu *, paddr_t);
 int vmm_update_pvclock(struct vcpu *);
+void vmm_pv_wall_clock(struct vcpu *, paddr_t);
 int vmm_pat_is_valid(uint64_t);
 
 #ifdef MULTIPROCESSOR
@@ -181,6 +182,7 @@ extern uint64_t tsc_frequency;
 extern int tsc_is_invariant;
 
 const char *vmm_hv_signature = VMM_HV_SIGNATURE;
+const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
 
 const struct kmem_pa_mode vmm_kp_contig = {
 	.kp_constraint = &no_constraint,
@@ -5555,6 +5557,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
 		vmm_init_pvclock(vcpu,
 		    (*rax & 0xFFFFFFFFULL) | (*rdx  << 32));
 		break;
+	case KVM_MSR_WALL_CLOCK:
+		vmm_pv_wall_clock(vcpu,
+		    (*rax & 0xFFFFFFFFULL) | (*rdx  << 32));
+		break;
 #ifdef VMM_DEBUG
 	default:
 		/*
@@ -5616,6 +5622,10 @@ svm_handle_msr(struct vcpu *vcpu)
 			vmm_init_pvclock(vcpu,
 			    (*rax & 0xFFFFFFFFULL) | (*rdx  << 32));
 			break;
+		case KVM_MSR_WALL_CLOCK:
+			vmm_pv_wall_clock(vcpu,
+			    (*rax & 0xFFFFFFFFULL) | (*rdx  << 32));
+			break;
 		default:
 			/* Log the access, to be able to identify unknown MSRs */
 			DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
@@ -5971,6 +5981,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
 		*rcx = 0;
 		*rdx = 0;
 		break;
+	case 0x40000100:	/* Hypervisor information KVM */
+		*rax = 0x40000101;
+		*rbx = *((uint32_t *)&kvm_hv_signature[0]);
+		*rcx = *((uint32_t *)&kvm_hv_signature[4]);
+		*rdx = *((uint32_t *)&kvm_hv_signature[8]);
+		break;
+	case 0x40000101:	/* KVM hypervisor features */
+		*rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
+		    (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+		    (1 << KVM_FEATURE_NOP_IO_DELAY);
+		*rbx = 0;
+		*rcx = 0;
+		*rdx = 0;
+		break;
 	case 0x80000000:	/* Extended function level */
 		/* We don't emulate past 0x8000001f currently. */
 		*rax = min(curcpu()->ci_pnfeatset, 0x8000001f);
@@ -6468,7 +6492,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
 		    (++vcpu->vc_pvclock_version << 1) | 0x1;
 
 		pvclock_ti->ti_tsc_timestamp = rdtsc();
-		nanotime(&tv);
+		nanouptime(&tv);
 		pvclock_ti->ti_system_time =
 		    tv.tv_sec * 1000000000L + tv.tv_nsec;
 		pvclock_ti->ti_tsc_shift = 12;
@@ -6482,6 +6506,36 @@ vmm_update_pvclock(struct vcpu *vcpu)
 	return (0);
 }
 
+void
+vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa)
+{
+	struct pvclock_wall_clock *pvclock_wc;
+	struct timespec tv;
+        struct vm *vm = vcpu->vc_parent;
+        paddr_t hpa;
+
+	if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock)))
+		goto err;
+
+	/* XXX: handle case when this struct goes over page boundaries */
+	if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE)
+		goto err;
+
+	if (!pmap_extract(vm->vm_map->pmap, gpa, &hpa))
+		goto err;
+	pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa);
+	pvclock_wc->wc_version |= 0x1;
+	nanoboottime(&tv);
+	pvclock_wc->wc_sec = tv.tv_sec;
+	pvclock_wc->wc_nsec = tv.tv_nsec;
+	pvclock_wc->wc_version += 1;
+
+	return;
+err:
+	vmm_inject_gp(vcpu);
+}
+
+
 int
 vmm_pat_is_valid(uint64_t pat)
 {
diff --git a/sys/arch/i386/include/atomic.h b/sys/arch/i386/include/atomic.h
index 45aedb819c8..e779b51869f 100644
--- a/sys/arch/i386/include/atomic.h
+++ b/sys/arch/i386/include/atomic.h
@@ -274,7 +274,18 @@ static __inline u_int64_t
 i386_atomic_testset_uq(volatile u_int64_t *ptr, u_int64_t val)
 {
 	__asm__ volatile ("\n1:\t" _LOCK " cmpxchg8b (%1); jnz 1b" : "+A" (val) :
-	    "r" (ptr), "b" ((u_int32_t)val), "c" ((u_int32_t)(val >> 32)));
+	    "r" (ptr), "b" ((u_int32_t)val), "c" ((u_int32_t)(val >> 32)) :
+	    "cc");
+	return val;
+}
+
+static __inline u_int64_t
+i386_atomic_load_uq(volatile u_int64_t *ptr)
+{
+	uint64_t val;
+	__asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; "
+	    _LOCK " cmpxchg8b (%1)" : "=&A" (val) : "r" (ptr) :
+	    "cc");
 	return val;
 }
 
diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c
index 994fc4a337c..c3f4c0049e0 100644
--- a/sys/dev/pv/pvclock.c
+++ b/sys/dev/pv/pvclock.c
@@ -22,6 +22,7 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/timeout.h>
 #include <sys/timetc.h>
 
 #include <machine/cpu.h>
@@ -31,13 +32,21 @@
 #include <dev/pv/pvvar.h>
 #include <dev/pv/pvreg.h>
 
-uint pvclock_lastcount;
+uint64_t pvclock_lastcount;
+
+struct pvpage {
+	struct pvclock_time_info	ti;
+	struct pvclock_wall_clock	wc;
+};
 
 struct pvclock_softc {
 	struct device		 sc_dev;
-	void			*sc_time;
+	struct pvpage		*sc_page;
 	paddr_t			 sc_paddr;
 	struct timecounter	*sc_tc;
+	struct ksensordev	 sc_sensordev;
+	struct ksensor		 sc_sensor;
+	struct timeout		 sc_tick;
 };
 
 #define DEVNAME(_s)			((_s)->sc_dev.dv_xname)
@@ -46,12 +55,16 @@ int	 pvclock_match(struct device *, void *, void *);
 void	 pvclock_attach(struct device *, struct device *, void *);
 int	 pvclock_activate(struct device *, int);
 
+uint64_t pvclock_get(struct timecounter *);
 uint	 pvclock_get_timecount(struct timecounter *);
+void	 pvclock_tick_hook(struct device *);
 
 static inline uint32_t
 	 pvclock_read_begin(const struct pvclock_time_info *);
 static inline int
 	 pvclock_read_done(const struct pvclock_time_info *, uint32_t);
+static inline uint64_t
+	 pvclock_scale_delta(uint64_t, uint32_t, int);
 
 const struct cfattach pvclock_ca = {
 	sizeof(struct pvclock_softc),
@@ -119,26 +132,28 @@ void
 pvclock_attach(struct device *parent, struct device *self, void *aux)
 {
 	struct pvclock_softc		*sc = (struct pvclock_softc *)self;
+	struct pv_attach_args		*pva = aux;
 	struct pvclock_time_info	*ti;
 	paddr_t			 	 pa;
 	uint32_t			 version;
 	uint8_t				 flags;
+	struct pvbus_hv			*kvm;
 
-	if ((sc->sc_time = km_alloc(PAGE_SIZE,
+	if ((sc->sc_page = km_alloc(PAGE_SIZE,
 	    &kv_any, &kp_zero, &kd_nowait)) == NULL) {
 		printf(": time page allocation failed\n");
 		return;
 	}
-	if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_time, &pa)) {
+	if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_page, &pa)) {
 		printf(": time page PA extraction failed\n");
-		km_free(sc->sc_time, PAGE_SIZE, &kv_any, &kp_zero);
+		km_free(sc->sc_page, PAGE_SIZE, &kv_any, &kp_zero);
 		return;
 	}
 
 	wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE);
 	sc->sc_paddr = pa;
 
-	ti = sc->sc_time;
+	ti = &sc->sc_page->ti;
 	do {
 		version = pvclock_read_begin(ti);
 		flags = ti->ti_flags;
@@ -162,6 +177,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux)
 
 	tc_init(sc->sc_tc);
 
+	/*
+	 * The openbsd vmm pvclock does not support the WALL_CLOCK msr,
+	 * therefore we look only for kvm.
+	 */
+	kvm = &pva->pva_hv[PVBUS_KVM];
+	if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
+		strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname,
+		    sizeof(sc->sc_sensordev.xname));
+		sc->sc_sensor.type = SENSOR_TIMEDELTA;
+		sc->sc_sensor.status = SENSOR_S_UNKNOWN;
+		sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
+		sensordev_install(&sc->sc_sensordev);
+
+		config_mountroot(self, pvclock_tick_hook);
+	}
+
 	printf("\n");
 }
 
@@ -200,17 +231,67 @@ pvclock_read_done(const struct pvclock_time_info *ti,
 	return (ti->ti_version == version);
 }
 
-uint
-pvclock_get_timecount(struct timecounter *tc)
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+	uint64_t product;
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+#if defined(__i386__)
+	{
+		uint32_t tmp1, tmp2;
+
+		/**
+		 * For i386, the formula looks like:
+		 *
+		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
+		 *   upper = mul_frac * (delta >> 32)
+		 *   product = lower + upper
+		 */
+		__asm__ (
+			"mul  %5       ; "
+			"mov  %4,%%eax ; "
+			"mov  %%edx,%4 ; "
+			"mul  %5       ; "
+			"xor  %5,%5    ; "
+			"add  %4,%%eax ; "
+			"adc  %5,%%edx ; "
+			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+			  "2" (mul_frac) );
+	}
+#elif defined(__amd64__)
+	{
+		unsigned long tmp;
+
+		__asm__ (
+			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+			: [lo]"=a" (product), [hi]"=d" (tmp)
+			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+	}
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+	return (product);
+}
+
+uint64_t
+pvclock_get(struct timecounter *tc)
 {
 	struct pvclock_softc		*sc = tc->tc_priv;
 	struct pvclock_time_info	*ti;
 	uint64_t			 tsc_timestamp, system_time, delta, ctr;
+	uint64_t			 lastcount;
 	uint32_t			 version, mul_frac;
 	int8_t				 shift;
 	uint8_t				 flags;
+	int				 s;
 
-	ti = sc->sc_time;
+	ti = &sc->sc_page->ti;
+	s = splhigh();
 	do {
 		version = pvclock_read_begin(ti);
 		system_time = ti->ti_system_time;
@@ -218,26 +299,82 @@ pvclock_get_timecount(struct timecounter *tc)
 		mul_frac = ti->ti_tsc_to_system_mul;
 		shift = ti->ti_tsc_shift;
 		flags = ti->ti_flags;
+		delta = rdtsc();
 	} while (!pvclock_read_done(ti, version));
+	splx(s);
 
 	/*
 	 * The algorithm is described in
-	 * linux/Documentation/virtual/kvm/msr.txt
+	 * linux/Documentation/virt/kvm/x86/msr.rst
 	 */
-	delta = rdtsc() - tsc_timestamp;
-	if (shift < 0)
-		delta >>= -shift;
-	else
-		delta <<= shift;
-	ctr = ((delta * mul_frac) >> 32) + system_time;
+	delta -= tsc_timestamp;
+	ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time;
 
 	if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
 		return (ctr);
 
-	if (ctr < pvclock_lastcount)
-		return (pvclock_lastcount);
-
-	atomic_swap_uint(&pvclock_lastcount, ctr);
+#if defined(__i386__)
+	/*
+	 * We are running on virtualization. Therefore we can assume that we
+	 * have cmpxchg8b, available on pentium and newer.
+	 */
+	lastcount = i386_atomic_load_uq(&pvclock_lastcount);
+#elif defined(__amd64__)
+	lastcount = pvclock_lastcount;
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+	if (ctr < lastcount)
+		return (lastcount);
+
+#if defined(__i386__)
+	ctr = i386_atomic_testset_uq(&pvclock_lastcount, ctr);
+#elif defined(__amd64__)
+	atomic_swap_64(&pvclock_lastcount, ctr);
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
 
 	return (ctr);
 }
+
+uint
+pvclock_get_timecount(struct timecounter *tc)
+{
+	return (pvclock_get(tc));
+}
+
+void
+pvclock_tick(void *arg)
+{
+	struct pvclock_softc		*sc = arg;
+	struct timespec			 ts;
+	struct pvclock_wall_clock	*wc = &sc->sc_page->wc;
+	int64_t			 	 value;
+
+	wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc));
+	while (wc->wc_version & 0x1)
+		virtio_membar_sync();
+	if (wc->wc_sec) {
+		nanotime(&ts);
+		value = TIMESPEC_TO_NSEC(&ts) -
+		    SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec -
+		    pvclock_get(&pvclock_timecounter);
+
+		TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts);
+		sc->sc_sensor.value = value;
+		sc->sc_sensor.status = SENSOR_S_OK;
+	} else
+		sc->sc_sensor.status = SENSOR_S_UNKNOWN;
+
+	timeout_add_sec(&sc->sc_tick, 15);
+}
+
+void
+pvclock_tick_hook(struct device *self)
+{
+	struct pvclock_softc	*sc = (struct pvclock_softc *)self;
+
+	timeout_set(&sc->sc_tick, pvclock_tick, sc);
+	pvclock_tick(sc);
+}