From: Stefan Fritsch Subject: Improving time keeping in x86 VMs To: tech@openbsd.org Date: Tue, 11 Mar 2025 20:36:58 +0100 Hi, after talking to bluhm@ about time keeping issues of linux guests on vmm, I have looked into making linux attach to vmm's pvclock. In doing so, I also found some issues in the pvclock(4) guest driver. So, here comes a larger diff that hopefully improves time keeping in VMs, both with openbsd as guest and as hypervisor. The diff changes in pvclock(4): * Fix integer overflows during multiplication by taking an assembler implementation from FreeBSD that uses 128 bit intermediate results. * Increase accuracy by disabling interrupts while reading the clock * Fix the not-TSC_STABLE handling which was busted because it wrongly compared 32 and 64 bit values. (This requires some atomic hackery on i386). * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the sensor in vmmci(4) In vmm(4): * Expose pvclock to linux guests by adding KVM compatible cpuid pages. While there, also expose the NOP_IO_DELAY feature bit which may make linux skip some ioport 80 reads. This should be ok at least until vmm(4) implements device pass through. * As there is only a single feature bit for both KVM_MSR_SYSTEM_TIME and KVM_MSR_WALL_CLOCK, implement the latter in vmm. If the integer overflows in pvclock(4) cause problems or not depends on the multiplicator and shift values chosen by the hypervisor, which in turn depend on the CPU's TSC frequency. So there may be systems where this is not a problem at all, while on other systems it may cause great havoc. The vmm and pvclock parts of the diff work independently from each other. Testers and comments are welcome. Cheers, Stefan diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c index e3205f48eed..f2ecb0668b0 100644 --- a/sys/arch/amd64/amd64/vmm_machdep.c +++ b/sys/arch/amd64/amd64/vmm_machdep.c @@ -142,6 +142,7 @@ void svm_set_dirty(struct vcpu *, uint32_t); int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size); void vmm_init_pvclock(struct vcpu *, paddr_t); int vmm_update_pvclock(struct vcpu *); +void vmm_pv_wall_clock(struct vcpu *, paddr_t); int vmm_pat_is_valid(uint64_t); #ifdef MULTIPROCESSOR @@ -181,6 +182,7 @@ extern uint64_t tsc_frequency; extern int tsc_is_invariant; const char *vmm_hv_signature = VMM_HV_SIGNATURE; +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0"; const struct kmem_pa_mode vmm_kp_contig = { .kp_constraint = &no_constraint, @@ -5555,6 +5557,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu) vmm_init_pvclock(vcpu, (*rax & 0xFFFFFFFFULL) | (*rdx << 32)); break; + case KVM_MSR_WALL_CLOCK: + vmm_pv_wall_clock(vcpu, + (*rax & 0xFFFFFFFFULL) | (*rdx << 32)); + break; #ifdef VMM_DEBUG default: /* @@ -5616,6 +5622,10 @@ svm_handle_msr(struct vcpu *vcpu) vmm_init_pvclock(vcpu, (*rax & 0xFFFFFFFFULL) | (*rdx << 32)); break; + case KVM_MSR_WALL_CLOCK: + vmm_pv_wall_clock(vcpu, + (*rax & 0xFFFFFFFFULL) | (*rdx << 32)); + break; default: /* Log the access, to be able to identify unknown MSRs */ DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data " @@ -5971,6 +5981,20 @@ vmm_handle_cpuid(struct vcpu *vcpu) *rcx = 0; *rdx = 0; break; + case 0x40000100: /* Hypervisor information KVM */ + *rax = 0x40000101; + *rbx = *((uint32_t *)&kvm_hv_signature[0]); + *rcx = *((uint32_t *)&kvm_hv_signature[4]); + *rdx = *((uint32_t *)&kvm_hv_signature[8]); + break; + case 0x40000101: /* KVM hypervisor features */ + *rax = (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_NOP_IO_DELAY); + *rbx = 0; + *rcx = 0; + *rdx = 0; + break; case 0x80000000: /* Extended function level */ /* We don't emulate past 0x8000001f currently. */ *rax = min(curcpu()->ci_pnfeatset, 0x8000001f); @@ -6468,7 +6492,7 @@ vmm_update_pvclock(struct vcpu *vcpu) (++vcpu->vc_pvclock_version << 1) | 0x1; pvclock_ti->ti_tsc_timestamp = rdtsc(); - nanotime(&tv); + nanouptime(&tv); pvclock_ti->ti_system_time = tv.tv_sec * 1000000000L + tv.tv_nsec; pvclock_ti->ti_tsc_shift = 12; @@ -6482,6 +6506,36 @@ vmm_update_pvclock(struct vcpu *vcpu) return (0); } +void +vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa) +{ + struct pvclock_wall_clock *pvclock_wc; + struct timespec tv; + struct vm *vm = vcpu->vc_parent; + paddr_t hpa; + + if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock))) + goto err; + + /* XXX: handle case when this struct goes over page boundaries */ + if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE) + goto err; + + if (!pmap_extract(vm->vm_map->pmap, gpa, &hpa)) + goto err; + pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa); + pvclock_wc->wc_version |= 0x1; + nanoboottime(&tv); + pvclock_wc->wc_sec = tv.tv_sec; + pvclock_wc->wc_nsec = tv.tv_nsec; + pvclock_wc->wc_version += 1; + + return; +err: + vmm_inject_gp(vcpu); +} + + int vmm_pat_is_valid(uint64_t pat) { diff --git a/sys/arch/i386/include/atomic.h b/sys/arch/i386/include/atomic.h index 45aedb819c8..e779b51869f 100644 --- a/sys/arch/i386/include/atomic.h +++ b/sys/arch/i386/include/atomic.h @@ -274,7 +274,18 @@ static __inline u_int64_t i386_atomic_testset_uq(volatile u_int64_t *ptr, u_int64_t val) { __asm__ volatile ("\n1:\t" _LOCK " cmpxchg8b (%1); jnz 1b" : "+A" (val) : - "r" (ptr), "b" ((u_int32_t)val), "c" ((u_int32_t)(val >> 32))); + "r" (ptr), "b" ((u_int32_t)val), "c" ((u_int32_t)(val >> 32)) : + "cc"); + return val; +} + +static __inline u_int64_t +i386_atomic_load_uq(volatile u_int64_t *ptr) +{ + uint64_t val; + __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; " + _LOCK " cmpxchg8b (%1)" : "=&A" (val) : "r" (ptr) : + "cc"); return val; } diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c index 994fc4a337c..c3f4c0049e0 100644 --- a/sys/dev/pv/pvclock.c +++ b/sys/dev/pv/pvclock.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -31,13 +32,21 @@ #include #include -uint pvclock_lastcount; +uint64_t pvclock_lastcount; + +struct pvpage { + struct pvclock_time_info ti; + struct pvclock_wall_clock wc; +}; struct pvclock_softc { struct device sc_dev; - void *sc_time; + struct pvpage *sc_page; paddr_t sc_paddr; struct timecounter *sc_tc; + struct ksensordev sc_sensordev; + struct ksensor sc_sensor; + struct timeout sc_tick; }; #define DEVNAME(_s) ((_s)->sc_dev.dv_xname) @@ -46,12 +55,16 @@ int pvclock_match(struct device *, void *, void *); void pvclock_attach(struct device *, struct device *, void *); int pvclock_activate(struct device *, int); +uint64_t pvclock_get(struct timecounter *); uint pvclock_get_timecount(struct timecounter *); +void pvclock_tick_hook(struct device *); static inline uint32_t pvclock_read_begin(const struct pvclock_time_info *); static inline int pvclock_read_done(const struct pvclock_time_info *, uint32_t); +static inline uint64_t + pvclock_scale_delta(uint64_t, uint32_t, int); const struct cfattach pvclock_ca = { sizeof(struct pvclock_softc), @@ -119,26 +132,28 @@ void pvclock_attach(struct device *parent, struct device *self, void *aux) { struct pvclock_softc *sc = (struct pvclock_softc *)self; + struct pv_attach_args *pva = aux; struct pvclock_time_info *ti; paddr_t pa; uint32_t version; uint8_t flags; + struct pvbus_hv *kvm; - if ((sc->sc_time = km_alloc(PAGE_SIZE, + if ((sc->sc_page = km_alloc(PAGE_SIZE, &kv_any, &kp_zero, &kd_nowait)) == NULL) { printf(": time page allocation failed\n"); return; } - if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_time, &pa)) { + if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_page, &pa)) { printf(": time page PA extraction failed\n"); - km_free(sc->sc_time, PAGE_SIZE, &kv_any, &kp_zero); + km_free(sc->sc_page, PAGE_SIZE, &kv_any, &kp_zero); return; } wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); sc->sc_paddr = pa; - ti = sc->sc_time; + ti = &sc->sc_page->ti; do { version = pvclock_read_begin(ti); flags = ti->ti_flags; @@ -162,6 +177,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux) tc_init(sc->sc_tc); + /* + * The openbsd vmm pvclock does not support the WALL_CLOCK msr, + * therefore we look only for kvm. + */ + kvm = &pva->pva_hv[PVBUS_KVM]; + if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) { + strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname, + sizeof(sc->sc_sensordev.xname)); + sc->sc_sensor.type = SENSOR_TIMEDELTA; + sc->sc_sensor.status = SENSOR_S_UNKNOWN; + sensor_attach(&sc->sc_sensordev, &sc->sc_sensor); + sensordev_install(&sc->sc_sensordev); + + config_mountroot(self, pvclock_tick_hook); + } + printf("\n"); } @@ -200,17 +231,67 @@ pvclock_read_done(const struct pvclock_time_info *ti, return (ti->ti_version == version); } -uint -pvclock_get_timecount(struct timecounter *tc) +static inline uint64_t +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t product; + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; +#if defined(__i386__) + { + uint32_t tmp1, tmp2; + + /** + * For i386, the formula looks like: + * + * lower = (mul_frac * (delta & UINT_MAX)) >> 32 + * upper = mul_frac * (delta >> 32) + * product = lower + upper + */ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), + "2" (mul_frac) ); + } +#elif defined(__amd64__) + { + unsigned long tmp; + + __asm__ ( + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" + : [lo]"=a" (product), [hi]"=d" (tmp) + : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac)); + } +#else +#error "pvclock: unsupported x86 architecture?" +#endif + return (product); +} + +uint64_t +pvclock_get(struct timecounter *tc) { struct pvclock_softc *sc = tc->tc_priv; struct pvclock_time_info *ti; uint64_t tsc_timestamp, system_time, delta, ctr; + uint64_t lastcount; uint32_t version, mul_frac; int8_t shift; uint8_t flags; + int s; - ti = sc->sc_time; + ti = &sc->sc_page->ti; + s = splhigh(); do { version = pvclock_read_begin(ti); system_time = ti->ti_system_time; @@ -218,26 +299,82 @@ pvclock_get_timecount(struct timecounter *tc) mul_frac = ti->ti_tsc_to_system_mul; shift = ti->ti_tsc_shift; flags = ti->ti_flags; + delta = rdtsc(); } while (!pvclock_read_done(ti, version)); + splx(s); /* * The algorithm is described in - * linux/Documentation/virtual/kvm/msr.txt + * linux/Documentation/virt/kvm/x86/msr.rst */ - delta = rdtsc() - tsc_timestamp; - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - ctr = ((delta * mul_frac) >> 32) + system_time; + delta -= tsc_timestamp; + ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time; if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0) return (ctr); - if (ctr < pvclock_lastcount) - return (pvclock_lastcount); - - atomic_swap_uint(&pvclock_lastcount, ctr); +#if defined(__i386__) + /* + * We are running on virtualization. Therefore we can assume that we + * have cmpxchg8b, available on pentium and newer. + */ + lastcount = i386_atomic_load_uq(&pvclock_lastcount); +#elif defined(__amd64__) + lastcount = pvclock_lastcount; +#else +#error "pvclock: unsupported x86 architecture?" +#endif + if (ctr < lastcount) + return (lastcount); + +#if defined(__i386__) + ctr = i386_atomic_testset_uq(&pvclock_lastcount, ctr); +#elif defined(__amd64__) + atomic_swap_64(&pvclock_lastcount, ctr); +#else +#error "pvclock: unsupported x86 architecture?" +#endif return (ctr); } + +uint +pvclock_get_timecount(struct timecounter *tc) +{ + return (pvclock_get(tc)); +} + +void +pvclock_tick(void *arg) +{ + struct pvclock_softc *sc = arg; + struct timespec ts; + struct pvclock_wall_clock *wc = &sc->sc_page->wc; + int64_t value; + + wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc)); + while (wc->wc_version & 0x1) + virtio_membar_sync(); + if (wc->wc_sec) { + nanotime(&ts); + value = TIMESPEC_TO_NSEC(&ts) - + SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec - + pvclock_get(&pvclock_timecounter); + + TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts); + sc->sc_sensor.value = value; + sc->sc_sensor.status = SENSOR_S_OK; + } else + sc->sc_sensor.status = SENSOR_S_UNKNOWN; + + timeout_add_sec(&sc->sc_tick, 15); +} + +void +pvclock_tick_hook(struct device *self) +{ + struct pvclock_softc *sc = (struct pvclock_softc *)self; + + timeout_set(&sc->sc_tick, pvclock_tick, sc); + pvclock_tick(sc); +}