From: Mike Larkin
Subject: Re: Improving time keeping in x86 VMs
To: Stefan Fritsch
Cc: tech@openbsd.org
Date: Sun, 1 Jun 2025 21:20:31 -0700

On Fri, May 30, 2025 at 09:23:27AM +0200, Stefan Fritsch wrote:
> On Mon, 19 May 2025, Mike Larkin wrote:
>
> > On Mon, May 19, 2025 at 10:11:54PM +0200, Stefan Fritsch wrote:
> > > On Sun, 18 May 2025, Mike Larkin wrote:
> > >
> > > > On Sat, May 17, 2025 at 05:39:35PM +0200, Stefan Fritsch wrote:
> > > > > On Sat, 12 Apr 2025, Stefan Fritsch wrote:
> > > > >
> > > > > > On Fri, 11 Apr 2025, Mike Larkin wrote:
> > > > > >
> > > > > > > On Wed, Mar 12, 2025 at 09:27:50PM +0100, Stefan Fritsch wrote:
> > > > > > > > On Tue, 11 Mar 2025, Dave Voutila wrote:
> > > > > > > >
> > > > > > > > > Stefan Fritsch writes:
> > > > > > > > >
> > > > > > > > > > Hi,
> > > > > > > > > >
> > > > > > > > > > after talking to bluhm@ about time keeping issues of linux guests on
> > > > > > > > > > vmm, I have looked into making linux attach to vmm's pvclock. In doing
> > > > > > > > > > so, I also found some issues in the pvclock(4) guest driver. So, here
> > > > > > > > > > comes a larger diff that hopefully improves time keeping in VMs, both
> > > > > > > > > > with openbsd as guest and as hypervisor.
> > > > > > > > >
> > > > > > > > > My kingdom for something to kill off my hacky Linux pvclock module.
> > > > > > > > >
> > > > > > > > > Some questions / comments below.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The diff changes in pvclock(4):
> > > > > > > > > >
> > > > > > > > > > * Fix integer overflows during multiplication by taking an assembler
> > > > > > > > > >   implementation from FreeBSD that uses 128 bit intermediate results.
> > > > > > > > > >
> > > > > > > > > > * Increase accuracy by disabling interrupts while reading the clock
> > > > > > > > > >
> > > > > > > > > > * Fix the not-TSC_STABLE handling which was busted because it wrongly
> > > > > > > > > >   compared 32 and 64 bit values. (This requires some atomic hackery on
> > > > > > > > > >   i386).
> > > > > > > > > >
> > > > > > > > > > * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
> > > > > > > > > >   sensor in vmmci(4)
> > > > > > > > >
> > > > > > > > > Lovely...
> > > > > > > >
> > > > > > > > Today, I found a diff by Scott Cheloha from 2022 that was never
> > > > > > > > committed and that I had missed before. It has some overlap with my
> > > > > > > > pvclock diff. I will look at it some more and produce a combined diff.
> > > > > >
> > > > > > Sorry, I got side-tracked by other issues that were more urgent. Updated
> > > > > > diff below. The vmm part is unchanged but the pvclock part has several
> > > > > > fixes, some inspired by Scott's earlier diff.
> > > > > >
> > > > > > This would need to be split up into several diffs before committing. I am
> > > > > > not 100% sure that the memory clobbers in i386_atomic_testset_uq and
> > > > > > i386_atomic_cas_uq are necessary, but I think they are. If the new
> > > > > > i386_atomic_* functions are not wanted in atomic.h, they could of course
> > > > > > be moved into pvclock.c. But i386_atomic_testset_uq() is already in
> > > > > > atomic.h.
> > > > >
> > > > > There was something wrong with the i386 assembler atomics that caused
> > > > > havoc if the TSC stable bit was not set by the hypervisor. Updated diff
> > > > > below.
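For reference, the overflow fix in the first pvclock(4) bullet above comes
down to doing the TSC-delta scaling with a 128-bit intermediate product. A
minimal C sketch of that calculation, assuming a standalone helper (the
actual fix uses the assembler implementation taken from FreeBSD; the helper
name and prototype here are illustrative only):

	/*
	 * Illustrative only: scale a TSC delta into nanoseconds using the
	 * pvclock 32.32 fixed-point multiplier.  Needs <sys/types.h> for the
	 * fixed-width types.  The 128-bit intermediate keeps
	 * delta * mul_frac from overflowing 64 bits.
	 */
	static inline uint64_t
	pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
	{
		if (shift < 0)
			delta >>= -shift;
		else
			delta <<= shift;

		/* (delta * mul_frac) >> 32, computed in 128 bits */
		return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
	}

The guest's clock value is then ti_system_time plus this scaled delta; the
product delta * mul_frac is the multiplication the bullet above refers to,
which can overflow 64 bits for large TSC deltas.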
> > > > >
> > > > > diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> > > > > index 0a3be15cd64..5a1570aa8d9 100644
> > > > > --- a/sys/arch/amd64/amd64/vmm_machdep.c
> > > > > +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> > > > > @@ -146,6 +146,7 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
> > > > >  int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
> > > > >  void vmm_init_pvclock(struct vcpu *, paddr_t);
> > > > >  int vmm_update_pvclock(struct vcpu *);
> > > > > +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
> > > > >  int vmm_pat_is_valid(uint64_t);
> > > > >
> > > > >  #ifdef MULTIPROCESSOR
> > > > > @@ -185,6 +186,7 @@ extern uint64_t tsc_frequency;
> > > > >  extern int tsc_is_invariant;
> > > > >
> > > > >  const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> > > > > +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
> > > > >
> > > > >  const struct kmem_pa_mode vmm_kp_contig = {
> > > > >  	.kp_constraint = &no_constraint,
> > > > > @@ -5578,6 +5580,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
> > > > >  		vmm_init_pvclock(vcpu,
> > > > >  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > >  		break;
> > > > > +	case KVM_MSR_WALL_CLOCK:
> > > > > +		vmm_pv_wall_clock(vcpu,
> > > > > +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > > +		break;
> > > > >  #ifdef VMM_DEBUG
> > > > >  	default:
> > > > >  		/*
> > > > > @@ -5639,6 +5645,10 @@ svm_handle_msr(struct vcpu *vcpu)
> > > > >  		vmm_init_pvclock(vcpu,
> > > > >  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > >  		break;
> > > > > +	case KVM_MSR_WALL_CLOCK:
> > > > > +		vmm_pv_wall_clock(vcpu,
> > > > > +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > > +		break;
> > > > >  	default:
> > > > >  		/* Log the access, to be able to identify unknown MSRs */
> > > > >  		DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> > > > > @@ -5994,6 +6004,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
> > > > >  		*rcx = 0;
> > > > >  		*rdx = 0;
> > > > >  		break;
> > > > > +	case 0x40000100: /* Hypervisor information KVM */
> > > > > +		*rax = 0x40000101;
> > > > > +		*rbx = *((uint32_t *)&kvm_hv_signature[0]);
> > > > > +		*rcx = *((uint32_t *)&kvm_hv_signature[4]);
> > > > > +		*rdx = *((uint32_t *)&kvm_hv_signature[8]);
> > > > > +		break;
> > > > > +	case 0x40000101: /* KVM hypervisor features */
> > > > > +		*rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> > > > > +		    (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> > > > > +		    (1 << KVM_FEATURE_NOP_IO_DELAY);
> > > > > +		*rbx = 0;
> > > > > +		*rcx = 0;
> > > > > +		*rdx = 0;
> > > > > +		break;
> > > >
> > > > This makes us claim we are kvm. Even though you are only advertising support
> > > > for these 3 features, that makes a big assumption that every linux kernel out
> > > > there respects a neutered/weird "KVM" like this.
> > > >
> > > > How many varieties of guest kernels did you test this on? I think we need to
> > > > be very careful if we do this. We are basically claiming to be something
> > > > that doesn't really exist anywhere else.
> > >
> > > I have only tested this with relatively new linux kernels. One hypervisor
> > > implementing some pv features of another in order to take advantage of
> > > existing guest pv support is nothing unusual. For example qemu/KVM can
> > > emulate features from Hyper-V and vmware. However, this is configurable.
> > > If you don't want more knobs in vmd(4), I see two ways forward:
> > >
> > > One could commit it anyway and see if there are any complaints until the
> > > release. If there are major problems, one could still revert it. If there
> > > are minor problems, maybe one could point users to Linux's nopv command
> > > line option, which exists since 2019 / 5.3.
> >
> > I don't think "commit and wait for complaints" is the way I'd go. I think as
> > long as we can test some popular i686/x86_64 distributions from the past few
> > years, and everything works, we'd be more confident. For example, LTS versions
> > of various things like:
> >
> > Centos 7, Centos 8
> > Ubuntu 14, 18, 20, 24
> >
> > ... and certain other popular distributions like:
> >
> > Fedora latest
> > Alpine 3.10, Alpine 3.21
> >
> > If all those work, I would say we've covered enough historic versions to be
> > reasonably certain we haven't broken anything. That list above covers these
> > kernel versions, which is a good representative sample of things that vmm
> > supports.
> >
> > 3.10
> > 4.4
> > 4.15
> > 4.18
> > 4.19
> > 5.4
> > 6.8
> > 6.12
> > 6.14
> >
> > All you need to do is boot up the relevant installer, break into the shell,
> > and verify that time is advancing. No need to do a full install.
>
> I don't usually use vmm and don't know what normally works on vmm and what
> does not. Therefore I have some doubts that I am the right person to do
> the testing, but I have tried booting the installer ISOs on openbsd
> -current with my vmm patch on a Core i5-8365U. I have looked at the
> output of "while sleep 1; do date; done" (or something equivalent if
> "date" was not available). I have successfully checked these distros
> (kernel versions taken from uname -a):
>
> centos 6.10	2.6.32-754.el6.x86_64
>
> debian 8.8	3.16.0-4-amd64
> debian 12.10	6.1.0-32-amd64
>
> fedora 39	6.5.6-300.fc39.x86_64
> fedora 41	6.11.4-301.fc41.x86_64
>
> alpine 3.21	6.12.13-0-virt
>
> ubuntu 18.04	4.15.0-20-generic
> ubuntu 20.04.6	5.4.0-144-generic
> ubuntu 22.04.5	5.15.0-119-generic
> ubuntu 25.04	6.14.0-15-generic
>
> For centos 6.10, debian 8.8, fedora 39+41, I have also tried this without
> my patch. On centos 6.10 and debian 8.8, "sleep 1" takes ~ 5 seconds
> without my patch, but only 1 second with my patch. fedora 39+41 seem to
> use clocksource tsc without my patch, so they don't have that problem.
>
> Debian 8.8 does not find its cdrom due to a missing virtio-scsi driver but
> the rescue shell works. I remember that this was a problem in old Debian
> installers.
>
> Fedora 39+41 don't find the cdrom either; with fedora 41 I have debugged
> this down to
>
> systemd-coredump[1172]: Process 1157 (cdrom_id) of user 0 terminated
> abnormally with signal 6/ABRT, processing...
>
> After a long wait I get the rescue shell. The behavior is identical
> with/without my patch.
>
> I could not get ubuntu 16.04 to boot without vga, the bootloader fails
> already.
>
> The list above covers a wide range of linux versions. As only very few kvm
> features were present in the first kvm versions, linux as a guest
> dutifully checks the cpuid feature bits. Therefore I don't expect that
> more casual testing would reveal more issues. Also, sometimes linux uses
> tsc as clocksource at boot but is later unhappy with it and switches to a
> different clock source. I saw this on one alpine linux VM on bluhm's
> system when I started looking at this. I also expect that behaviour may be
> different depending on the cpu type and the number of sockets in the host
> system. One would need many more testers to cover these cases.
>
> An updated diff to account for changes in -current is attached below. This
> includes only the vmm(4) part.
> The pvclock(4) part is covered in a separate sub-thread.
>
> Cheers,
> Stefan
>

Thanks, this was a good deal of testing. I'll take a look at the diff below
tomorrow!

-ml

> > > >
> > > Or one could redefine the current VMM_HV_SIGNATURE feature bits in a way
> > > that allows one to detect the new KVM_MSR_WALL_CLOCK support and then try
> > > to get a patch into linux to detect VMM_HV_SIGNATURE and those feature
> > > bits.
> > >
> > > What do you think?
> > >
> > As long as we are convinced that we haven't broken anything from the list
> > above, I don't think this extra work (trying to upstream a VMM_HV_SIGNATURE
> > check in Linux) is worthwhile.
> >
>
> diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> index 037cf650cb9..cd23d8acf00 100644
> --- a/sys/arch/amd64/amd64/vmm_machdep.c
> +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> @@ -154,6 +154,7 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
>  int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
>  void vmm_init_pvclock(struct vcpu *, paddr_t);
>  int vmm_update_pvclock(struct vcpu *);
> +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
>  int vmm_pat_is_valid(uint64_t);
>
>  #ifdef MULTIPROCESSOR
> @@ -193,6 +194,7 @@ extern uint64_t tsc_frequency;
>  extern int tsc_is_invariant;
>
>  const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
>
>  const struct kmem_pa_mode vmm_kp_contig = {
>  	.kp_constraint = &no_constraint,
> @@ -6043,6 +6045,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
>  		vmm_init_pvclock(vcpu,
>  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
>  		break;
> +	case KVM_MSR_WALL_CLOCK:
> +		vmm_pv_wall_clock(vcpu,
> +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> +		break;
>  #ifdef VMM_DEBUG
>  	default:
>  		/*
> @@ -6104,6 +6110,10 @@ svm_handle_msr(struct vcpu *vcpu)
>  		vmm_init_pvclock(vcpu,
>  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
>  		break;
> +	case KVM_MSR_WALL_CLOCK:
> +		vmm_pv_wall_clock(vcpu,
> +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> +		break;
>  	default:
>  		/* Log the access, to be able to identify unknown MSRs */
>  		DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> @@ -6459,6 +6469,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
>  		*rcx = 0;
>  		*rdx = 0;
>  		break;
> +	case 0x40000100: /* Hypervisor information KVM */
> +		*rax = 0x40000101;
> +		*rbx = *((uint32_t *)&kvm_hv_signature[0]);
> +		*rcx = *((uint32_t *)&kvm_hv_signature[4]);
> +		*rdx = *((uint32_t *)&kvm_hv_signature[8]);
> +		break;
> +	case 0x40000101: /* KVM hypervisor features */
> +		*rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> +		    (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> +		    (1 << KVM_FEATURE_NOP_IO_DELAY);
> +		*rbx = 0;
> +		*rcx = 0;
> +		*rdx = 0;
> +		break;
>  	case 0x80000000: /* Extended function level */
>  		/* We don't emulate past 0x8000001f currently. */
>  		*rax = min(curcpu()->ci_pnfeatset, 0x8000001f);
> @@ -6980,7 +7004,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
>  	    (++vcpu->vc_pvclock_version << 1) | 0x1;
>
>  	pvclock_ti->ti_tsc_timestamp = rdtsc();
> -	nanotime(&tv);
> +	nanouptime(&tv);
>  	pvclock_ti->ti_system_time =
>  	    tv.tv_sec * 1000000000L + tv.tv_nsec;
>  	pvclock_ti->ti_tsc_shift = 12;
> @@ -6994,6 +7018,36 @@
>  	return (0);
>  }
>
> +void
> +vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa)
> +{
> +	struct pvclock_wall_clock *pvclock_wc;
> +	struct timespec tv;
> +	struct vm *vm = vcpu->vc_parent;
> +	paddr_t hpa;
> +
> +	if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock)))
> +		goto err;
> +
> +	/* XXX: handle case when this struct goes over page boundaries */
> +	if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE)
> +		goto err;
> +
> +	if (!pmap_extract(vm->vm_pmap, gpa, &hpa))
> +		goto err;
> +	pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa);
> +	pvclock_wc->wc_version |= 0x1;
> +	nanoboottime(&tv);
> +	pvclock_wc->wc_sec = tv.tv_sec;
> +	pvclock_wc->wc_nsec = tv.tv_nsec;
> +	pvclock_wc->wc_version += 1;
> +
> +	return;
> +err:
> +	vmm_inject_gp(vcpu);
> +}
> +
> +
>  int
>  vmm_pat_is_valid(uint64_t pat)
>  {
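A note on the wc_version handling in vmm_pv_wall_clock() above: the version
is forced odd while the fields are being written and bumped back to an even
value afterwards, so a guest has to retry its read until it sees the same
even version before and after copying the fields. A minimal sketch of that
guest-side loop, assuming the usual three 32-bit fields for struct
pvclock_wall_clock (the reader function and barrier choice here are
illustrative, not part of the diff):

	/* Needs <sys/types.h> and <sys/atomic.h> for membar_consumer(). */
	struct pvclock_wall_clock {
		uint32_t	wc_version;
		uint32_t	wc_sec;
		uint32_t	wc_nsec;
	};

	/*
	 * Illustrative only: read a consistent snapshot of the wall clock
	 * published by the hypervisor.  An odd wc_version means an update
	 * is in progress; a changed version means the read raced an update
	 * and must be retried.
	 */
	static void
	pvclock_read_wall_clock(volatile struct pvclock_wall_clock *wc,
	    uint32_t *sec, uint32_t *nsec)
	{
		uint32_t version;

		do {
			version = wc->wc_version;
			membar_consumer();	/* read version before fields */
			*sec = wc->wc_sec;
			*nsec = wc->wc_nsec;
			membar_consumer();	/* read fields before re-check */
		} while ((version & 1) != 0 || version != wc->wc_version);
	}

Since the diff fills wc_sec/wc_nsec from nanoboottime() and ti_system_time
from nanouptime(), a guest can derive the current real time by adding its
pvclock system time to this wall-clock value.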