Download raw body.
Improving time keeping in x86 VMs
Stefan Fritsch <sf@openbsd.org> writes:
> Hi,
>
> after talking to bluhm@ about time keeping issues of linux guests on
> vmm, I have looked into making linux attach to vmm's pvclock. In doing
> so, I also found some issues in the pvclock(4) guest driver. So, here
> comes a larger diff that hopefully improves time keeping in VMs, both
> with openbsd as guest and as hypervisor.
My kingdom for something to kill off my hacky Linux pvclock module.
Some questions / comments below.
>
> The diff changes in pvclock(4):
>
> * Fix integer overflows during multiplication by taking an assembler
> implementation from FreeBSD that uses 128 bit intermediate results.
>
> * Increase accuracy by disabling interrupts while reading the clock
>
> * Fix the not-TSC_STABLE handling which was busted because it wrongly
> compared 32 and 64 bit values. (This requires some atomic hackery on
> i386).
>
> * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
> sensor in vmmci(4)
Lovely...
>
> In vmm(4):
>
> * Expose pvclock to linux guests by adding KVM compatible cpuid pages.
> While there, also expose the NOP_IO_DELAY feature bit which may make
> linux skip some ioport 80 reads. This should be ok at least until
> vmm(4) implements device pass through.
>
...however, this makes me nervous.
We've purposely not advertised ourselves as KVM in the past to avoid
possibly triggering any KVM-based code paths in Linux that assume
blindly that we are KVM.
I'd like to see testing on a variety of Linux kernel versions to see if
there's fallout. Any modules that trust kvm_para_available() to detect
if the host is actually KVM may make assumptions that don't hold true.
> * As there is only a single feature bit for both KVM_MSR_SYSTEM_TIME
> and KVM_MSR_WALL_CLOCK, implement the latter in vmm.
>
> If the integer overflows in pvclock(4) cause problems or not depends
> on the multiplicator and shift values chosen by the hypervisor, which
> in turn depend on the CPU's TSC frequency. So there may be systems
> where this is not a problem at all, while on other systems it may
> cause great havoc.
>
> The vmm and pvclock parts of the diff work independently from each other.
> Testers and comments are welcome.
>
> Cheers,
> Stefan
>
>
> diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> index e3205f48eed..f2ecb0668b0 100644
> --- a/sys/arch/amd64/amd64/vmm_machdep.c
> +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> @@ -142,6 +142,7 @@ void svm_set_dirty(struct vcpu *, uint32_t);
> int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
> void vmm_init_pvclock(struct vcpu *, paddr_t);
> int vmm_update_pvclock(struct vcpu *);
> +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
> int vmm_pat_is_valid(uint64_t);
>
> #ifdef MULTIPROCESSOR
> @@ -181,6 +182,7 @@ extern uint64_t tsc_frequency;
> extern int tsc_is_invariant;
>
> const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
>
> const struct kmem_pa_mode vmm_kp_contig = {
> .kp_constraint = &no_constraint,
> @@ -5555,6 +5557,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
> vmm_init_pvclock(vcpu,
> (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> break;
> + case KVM_MSR_WALL_CLOCK:
> + vmm_pv_wall_clock(vcpu,
> + (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> + break;
> #ifdef VMM_DEBUG
> default:
> /*
> @@ -5616,6 +5622,10 @@ svm_handle_msr(struct vcpu *vcpu)
> vmm_init_pvclock(vcpu,
> (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> break;
> + case KVM_MSR_WALL_CLOCK:
> + vmm_pv_wall_clock(vcpu,
> + (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> + break;
> default:
> /* Log the access, to be able to identify unknown MSRs */
> DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> @@ -5971,6 +5981,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
> *rcx = 0;
> *rdx = 0;
> break;
> + case 0x40000100: /* Hypervisor information KVM */
Where did you get these CPUIDs from? I thought 0x40000000 was what Linux
used for KVM detection per what we already do in VMM and:
https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html
I see there's something in linux/arch/x86/kvm/cpuid.c about masking off
0x100 and using those values...but still not sure how this use of
0x40000100 would somehow supersede what vmm(4) is already doing. Is this
some trick to not convince OpenBSD guests that we're KVM?
> + *rax = 0x40000101;
> + *rbx = *((uint32_t *)&kvm_hv_signature[0]);
> + *rcx = *((uint32_t *)&kvm_hv_signature[4]);
> + *rdx = *((uint32_t *)&kvm_hv_signature[8]);
> + break;
> + case 0x40000101: /* KVM hypervisor features */
> + *rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> + (1 << KVM_FEATURE_NOP_IO_DELAY);
> + *rbx = 0;
> + *rcx = 0;
> + *rdx = 0;
> + break;
> case 0x80000000: /* Extended function level */
> /* We don't emulate past 0x8000001f currently. */
> *rax = min(curcpu()->ci_pnfeatset, 0x8000001f);
> @@ -6468,7 +6492,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
> (++vcpu->vc_pvclock_version << 1) | 0x1;
>
> pvclock_ti->ti_tsc_timestamp = rdtsc();
> - nanotime(&tv);
> + nanouptime(&tv);
> pvclock_ti->ti_system_time =
> tv.tv_sec * 1000000000L + tv.tv_nsec;
> pvclock_ti->ti_tsc_shift = 12;
> @@ -6482,6 +6506,36 @@ vmm_update_pvclock(struct vcpu *vcpu)
> return (0);
> }
>
> +void
> +vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa)
> +{
> + struct pvclock_wall_clock *pvclock_wc;
> + struct timespec tv;
^^^^^ spaces vs. tabs
> + struct vm *vm = vcpu->vc_parent;
> + paddr_t hpa;
> +
> + if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock)))
> + goto err;
> +
> + /* XXX: handle case when this struct goes over page boundaries */
> + if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE)
> + goto err;
> +
> + if (!pmap_extract(vm->vm_map->pmap, gpa, &hpa))
> + goto err;
> + pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa);
> + pvclock_wc->wc_version |= 0x1;
> + nanoboottime(&tv);
> + pvclock_wc->wc_sec = tv.tv_sec;
> + pvclock_wc->wc_nsec = tv.tv_nsec;
> + pvclock_wc->wc_version += 1;
> +
> + return;
> +err:
> + vmm_inject_gp(vcpu);
> +}
> +
> +
> int
> vmm_pat_is_valid(uint64_t pat)
> {
> diff --git a/sys/arch/i386/include/atomic.h b/sys/arch/i386/include/atomic.h
> index 45aedb819c8..e779b51869f 100644
> --- a/sys/arch/i386/include/atomic.h
> +++ b/sys/arch/i386/include/atomic.h
> @@ -274,7 +274,18 @@ static __inline u_int64_t
> i386_atomic_testset_uq(volatile u_int64_t *ptr, u_int64_t val)
> {
> __asm__ volatile ("\n1:\t" _LOCK " cmpxchg8b (%1); jnz 1b" : "+A" (val) :
> - "r" (ptr), "b" ((u_int32_t)val), "c" ((u_int32_t)(val >> 32)));
> + "r" (ptr), "b" ((u_int32_t)val), "c" ((u_int32_t)(val >> 32)) :
> + "cc");
> + return val;
> +}
> +
> +static __inline u_int64_t
> +i386_atomic_load_uq(volatile u_int64_t *ptr)
> +{
> + uint64_t val;
> + __asm__ volatile ("movl %%ebx,%%eax; movl %%ecx, %%edx; "
> + _LOCK " cmpxchg8b (%1)" : "=&A" (val) : "r" (ptr) :
> + "cc");
> return val;
> }
>
> diff --git a/sys/dev/pv/pvclock.c b/sys/dev/pv/pvclock.c
> index 994fc4a337c..c3f4c0049e0 100644
> --- a/sys/dev/pv/pvclock.c
> +++ b/sys/dev/pv/pvclock.c
> @@ -22,6 +22,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/timeout.h>
> #include <sys/timetc.h>
>
> #include <machine/cpu.h>
> @@ -31,13 +32,21 @@
> #include <dev/pv/pvvar.h>
> #include <dev/pv/pvreg.h>
>
> -uint pvclock_lastcount;
> +uint64_t pvclock_lastcount;
> +
> +struct pvpage {
> + struct pvclock_time_info ti;
> + struct pvclock_wall_clock wc;
> +};
>
> struct pvclock_softc {
> struct device sc_dev;
> - void *sc_time;
> + struct pvpage *sc_page;
> paddr_t sc_paddr;
> struct timecounter *sc_tc;
> + struct ksensordev sc_sensordev;
> + struct ksensor sc_sensor;
> + struct timeout sc_tick;
> };
>
> #define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
> @@ -46,12 +55,16 @@ int pvclock_match(struct device *, void *, void *);
> void pvclock_attach(struct device *, struct device *, void *);
> int pvclock_activate(struct device *, int);
>
> +uint64_t pvclock_get(struct timecounter *);
> uint pvclock_get_timecount(struct timecounter *);
> +void pvclock_tick_hook(struct device *);
>
> static inline uint32_t
> pvclock_read_begin(const struct pvclock_time_info *);
> static inline int
> pvclock_read_done(const struct pvclock_time_info *, uint32_t);
> +static inline uint64_t
> + pvclock_scale_delta(uint64_t, uint32_t, int);
>
> const struct cfattach pvclock_ca = {
> sizeof(struct pvclock_softc),
> @@ -119,26 +132,28 @@ void
> pvclock_attach(struct device *parent, struct device *self, void *aux)
> {
> struct pvclock_softc *sc = (struct pvclock_softc *)self;
> + struct pv_attach_args *pva = aux;
> struct pvclock_time_info *ti;
> paddr_t pa;
> uint32_t version;
> uint8_t flags;
> + struct pvbus_hv *kvm;
>
> - if ((sc->sc_time = km_alloc(PAGE_SIZE,
> + if ((sc->sc_page = km_alloc(PAGE_SIZE,
> &kv_any, &kp_zero, &kd_nowait)) == NULL) {
> printf(": time page allocation failed\n");
> return;
> }
> - if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_time, &pa)) {
> + if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_page, &pa)) {
> printf(": time page PA extraction failed\n");
> - km_free(sc->sc_time, PAGE_SIZE, &kv_any, &kp_zero);
> + km_free(sc->sc_page, PAGE_SIZE, &kv_any, &kp_zero);
> return;
> }
>
> wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE);
> sc->sc_paddr = pa;
>
> - ti = sc->sc_time;
> + ti = &sc->sc_page->ti;
> do {
> version = pvclock_read_begin(ti);
> flags = ti->ti_flags;
> @@ -162,6 +177,22 @@ pvclock_attach(struct device *parent, struct device *self, void *aux)
>
> tc_init(sc->sc_tc);
>
> + /*
> + * The openbsd vmm pvclock does not support the WALL_CLOCK msr,
> + * therefore we look only for kvm.
> + */
> + kvm = &pva->pva_hv[PVBUS_KVM];
> + if (kvm->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
> + strlcpy(sc->sc_sensordev.xname, sc->sc_dev.dv_xname,
> + sizeof(sc->sc_sensordev.xname));
> + sc->sc_sensor.type = SENSOR_TIMEDELTA;
> + sc->sc_sensor.status = SENSOR_S_UNKNOWN;
> + sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
> + sensordev_install(&sc->sc_sensordev);
> +
> + config_mountroot(self, pvclock_tick_hook);
> + }
> +
> printf("\n");
> }
>
> @@ -200,17 +231,67 @@ pvclock_read_done(const struct pvclock_time_info *ti,
> return (ti->ti_version == version);
> }
>
> -uint
> -pvclock_get_timecount(struct timecounter *tc)
> +static inline uint64_t
> +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
> +{
> + uint64_t product;
> +
> + if (shift < 0)
> + delta >>= -shift;
> + else
> + delta <<= shift;
> +#if defined(__i386__)
> + {
> + uint32_t tmp1, tmp2;
> +
> + /**
> + * For i386, the formula looks like:
> + *
> + * lower = (mul_frac * (delta & UINT_MAX)) >> 32
> + * upper = mul_frac * (delta >> 32)
> + * product = lower + upper
> + */
> + __asm__ (
> + "mul %5 ; "
> + "mov %4,%%eax ; "
> + "mov %%edx,%4 ; "
> + "mul %5 ; "
> + "xor %5,%5 ; "
> + "add %4,%%eax ; "
> + "adc %5,%%edx ; "
> + : "=A" (product), "=r" (tmp1), "=r" (tmp2)
> + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
> + "2" (mul_frac) );
> + }
> +#elif defined(__amd64__)
> + {
> + unsigned long tmp;
> +
> + __asm__ (
> + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
> + : [lo]"=a" (product), [hi]"=d" (tmp)
> + : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
> + }
> +#else
> +#error "pvclock: unsupported x86 architecture?"
> +#endif
> + return (product);
> +}
> +
> +uint64_t
> +pvclock_get(struct timecounter *tc)
> {
> struct pvclock_softc *sc = tc->tc_priv;
> struct pvclock_time_info *ti;
> uint64_t tsc_timestamp, system_time, delta, ctr;
> + uint64_t lastcount;
> uint32_t version, mul_frac;
> int8_t shift;
> uint8_t flags;
> + int s;
>
> - ti = sc->sc_time;
> + ti = &sc->sc_page->ti;
> + s = splhigh();
> do {
> version = pvclock_read_begin(ti);
> system_time = ti->ti_system_time;
> @@ -218,26 +299,82 @@ pvclock_get_timecount(struct timecounter *tc)
> mul_frac = ti->ti_tsc_to_system_mul;
> shift = ti->ti_tsc_shift;
> flags = ti->ti_flags;
> + delta = rdtsc();
> } while (!pvclock_read_done(ti, version));
> + splx(s);
>
> /*
> * The algorithm is described in
> - * linux/Documentation/virtual/kvm/msr.txt
> + * linux/Documentation/virt/kvm/x86/msr.rst
> */
> - delta = rdtsc() - tsc_timestamp;
> - if (shift < 0)
> - delta >>= -shift;
> - else
> - delta <<= shift;
> - ctr = ((delta * mul_frac) >> 32) + system_time;
> + delta -= tsc_timestamp;
> + ctr = pvclock_scale_delta(delta, mul_frac, shift) + system_time;
>
> if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
> return (ctr);
>
> - if (ctr < pvclock_lastcount)
> - return (pvclock_lastcount);
> -
> - atomic_swap_uint(&pvclock_lastcount, ctr);
> +#if defined(__i386__)
> + /*
> + * We are running on virtualization. Therefore we can assume that we
> + * have cmpxchg8b, available on pentium and newer.
> + */
> + lastcount = i386_atomic_load_uq(&pvclock_lastcount);
> +#elif defined(__amd64__)
> + lastcount = pvclock_lastcount;
> +#else
> +#error "pvclock: unsupported x86 architecture?"
> +#endif
> + if (ctr < lastcount)
> + return (lastcount);
> +
> +#if defined(__i386__)
> + ctr = i386_atomic_testset_uq(&pvclock_lastcount, ctr);
> +#elif defined(__amd64__)
> + atomic_swap_64(&pvclock_lastcount, ctr);
> +#else
> +#error "pvclock: unsupported x86 architecture?"
> +#endif
>
> return (ctr);
> }
> +
> +uint
> +pvclock_get_timecount(struct timecounter *tc)
> +{
> + return (pvclock_get(tc));
> +}
> +
> +void
> +pvclock_tick(void *arg)
> +{
> + struct pvclock_softc *sc = arg;
> + struct timespec ts;
> + struct pvclock_wall_clock *wc = &sc->sc_page->wc;
> + int64_t value;
> +
> + wrmsr(KVM_MSR_WALL_CLOCK, sc->sc_paddr + offsetof(struct pvpage, wc));
> + while (wc->wc_version & 0x1)
> + virtio_membar_sync();
> + if (wc->wc_sec) {
> + nanotime(&ts);
> + value = TIMESPEC_TO_NSEC(&ts) -
> + SEC_TO_NSEC(wc->wc_sec) - wc->wc_nsec -
> + pvclock_get(&pvclock_timecounter);
> +
> + TIMESPEC_TO_TIMEVAL(&sc->sc_sensor.tv, &ts);
> + sc->sc_sensor.value = value;
> + sc->sc_sensor.status = SENSOR_S_OK;
> + } else
> + sc->sc_sensor.status = SENSOR_S_UNKNOWN;
> +
> + timeout_add_sec(&sc->sc_tick, 15);
> +}
> +
> +void
> +pvclock_tick_hook(struct device *self)
> +{
> + struct pvclock_softc *sc = (struct pvclock_softc *)self;
> +
> + timeout_set(&sc->sc_tick, pvclock_tick, sc);
> + pvclock_tick(sc);
> +}
Improving time keeping in x86 VMs