From: Mike Larkin
Subject: Re: Improving time keeping in x86 VMs
To: Stefan Fritsch
Cc: tech@openbsd.org
Date: Sun, 1 Jun 2025 21:20:31 -0700

On Fri, May 30, 2025 at 09:23:27AM +0200, Stefan Fritsch wrote:
> On Mon, 19 May 2025, Mike Larkin wrote:
>
> > On Mon, May 19, 2025 at 10:11:54PM +0200, Stefan Fritsch wrote:
> > > On Sun, 18 May 2025, Mike Larkin wrote:
> > >
> > > > On Sat, May 17, 2025 at 05:39:35PM +0200, Stefan Fritsch wrote:
> > > > > On Sat, 12 Apr 2025, Stefan Fritsch wrote:
> > > > >
> > > > > > On Fri, 11 Apr 2025, Mike Larkin wrote:
> > > > > >
> > > > > > > On Wed, Mar 12, 2025 at 09:27:50PM +0100, Stefan Fritsch wrote:
> > > > > > > > On Tue, 11 Mar 2025, Dave Voutila wrote:
> > > > > > > >
> > > > > > > > > Stefan Fritsch writes:
> > > > > > > > >
> > > > > > > > > > Hi,
> > > > > > > > > >
> > > > > > > > > > after talking to bluhm@ about time keeping issues of linux guests on
> > > > > > > > > > vmm, I have looked into making linux attach to vmm's pvclock. In doing
> > > > > > > > > > so, I also found some issues in the pvclock(4) guest driver. So, here
> > > > > > > > > > comes a larger diff that hopefully improves time keeping in VMs, both
> > > > > > > > > > with openbsd as guest and as hypervisor.
> > > > > > > > >
> > > > > > > > > My kingdom for something to kill off my hacky Linux pvclock module.
> > > > > > > > >
> > > > > > > > > Some questions / comments below.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The diff changes in pvclock(4):
> > > > > > > > > >
> > > > > > > > > > * Fix integer overflows during multiplication by taking an assembler
> > > > > > > > > >   implementation from FreeBSD that uses 128 bit intermediate results.
> > > > > > > > > >
> > > > > > > > > > * Increase accuracy by disabling interrupts while reading the clock
> > > > > > > > > >
> > > > > > > > > > * Fix the not-TSC_STABLE handling which was busted because it wrongly
> > > > > > > > > >   compared 32 and 64 bit values. (This requires some atomic hackery on
> > > > > > > > > >   i386).
> > > > > > > > > >
> > > > > > > > > > * Add a timedelta sensor using the KVM WALL_CLOCK MSR, similar to the
> > > > > > > > > >   sensor in vmmci(4)
> > > > > > > > >
> > > > > > > > > Lovely...
> > > > > > > >
> > > > > > > > Today, I found a diff by Scott Cheloha from 2022 that was never
> > > > > > > > committed and that I had missed before. It has some overlap with my
> > > > > > > > pvclock diff. I will look at it some more and produce a combined diff.
> > > > > >
> > > > > > Sorry, I got side-tracked by other issues that were more urgent. Updated
> > > > > > diff below. The vmm part is unchanged but the pvclock part has several
> > > > > > fixes, some inspired by Scott's earlier diff.
> > > > > >
> > > > > > This would need to be split up into several diffs before committing. I am
> > > > > > not 100% sure that the memory clobbers in i386_atomic_testset_uq and
> > > > > > i386_atomic_cas_uq are necessary, but I think they are. If the new
> > > > > > i386_atomic_* functions are not wanted in atomic.h, they could of course
> > > > > > be moved into pvclock.c. But i386_atomic_testset_uq() is already in
> > > > > > atomic.h.
> > > > >
> > > > > There was something wrong with the i386 assembler atomics that caused
> > > > > havoc if the TSC stable bit was not set by the hypervisor. Updated diff
> > > > > below.
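For reference, the overflow fix in the first pvclock(4) bullet above comes
down to doing the TSC-delta scaling with a 128-bit intermediate product. A
minimal C sketch of that calculation, assuming a standalone helper (the
actual fix uses the assembler implementation taken from FreeBSD; the helper
name and prototype here are illustrative only):

	/*
	 * Illustrative only: scale a TSC delta into nanoseconds using the
	 * pvclock 32.32 fixed-point multiplier.  Needs <sys/types.h> for the
	 * fixed-width types.  The 128-bit intermediate keeps
	 * delta * mul_frac from overflowing 64 bits.
	 */
	static inline uint64_t
	pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
	{
		if (shift < 0)
			delta >>= -shift;
		else
			delta <<= shift;

		/* (delta * mul_frac) >> 32, computed in 128 bits */
		return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
	}

The guest's clock value is then ti_system_time plus this scaled delta; the
product delta * mul_frac is the multiplication the bullet above refers to,
which can overflow 64 bits for large TSC deltas.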
> > > > >
> > > > > diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> > > > > index 0a3be15cd64..5a1570aa8d9 100644
> > > > > --- a/sys/arch/amd64/amd64/vmm_machdep.c
> > > > > +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> > > > > @@ -146,6 +146,7 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
> > > > >  int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
> > > > >  void vmm_init_pvclock(struct vcpu *, paddr_t);
> > > > >  int vmm_update_pvclock(struct vcpu *);
> > > > > +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
> > > > >  int vmm_pat_is_valid(uint64_t);
> > > > >
> > > > >  #ifdef MULTIPROCESSOR
> > > > > @@ -185,6 +186,7 @@ extern uint64_t tsc_frequency;
> > > > >  extern int tsc_is_invariant;
> > > > >
> > > > >  const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> > > > > +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
> > > > >
> > > > >  const struct kmem_pa_mode vmm_kp_contig = {
> > > > >  	.kp_constraint = &no_constraint,
> > > > > @@ -5578,6 +5580,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
> > > > >  		vmm_init_pvclock(vcpu,
> > > > >  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > >  		break;
> > > > > +	case KVM_MSR_WALL_CLOCK:
> > > > > +		vmm_pv_wall_clock(vcpu,
> > > > > +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > > +		break;
> > > > >  #ifdef VMM_DEBUG
> > > > >  	default:
> > > > >  		/*
> > > > > @@ -5639,6 +5645,10 @@ svm_handle_msr(struct vcpu *vcpu)
> > > > >  		vmm_init_pvclock(vcpu,
> > > > >  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > >  		break;
> > > > > +	case KVM_MSR_WALL_CLOCK:
> > > > > +		vmm_pv_wall_clock(vcpu,
> > > > > +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> > > > > +		break;
> > > > >  	default:
> > > > >  		/* Log the access, to be able to identify unknown MSRs */
> > > > >  		DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> > > > > @@ -5994,6 +6004,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
> > > > >  		*rcx = 0;
> > > > >  		*rdx = 0;
> > > > >  		break;
> > > > > +	case 0x40000100: /* Hypervisor information KVM */
> > > > > +		*rax = 0x40000101;
> > > > > +		*rbx = *((uint32_t *)&kvm_hv_signature[0]);
> > > > > +		*rcx = *((uint32_t *)&kvm_hv_signature[4]);
> > > > > +		*rdx = *((uint32_t *)&kvm_hv_signature[8]);
> > > > > +		break;
> > > > > +	case 0x40000101: /* KVM hypervisor features */
> > > > > +		*rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> > > > > +		    (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> > > > > +		    (1 << KVM_FEATURE_NOP_IO_DELAY);
> > > > > +		*rbx = 0;
> > > > > +		*rcx = 0;
> > > > > +		*rdx = 0;
> > > > > +		break;
> > > >
> > > > This makes us claim we are kvm. Even though you are only advertising support
> > > > for these 3 features, that makes a big assumption that every linux kernel out
> > > > there respects a neutered/weird "KVM" like this.
> > > >
> > > > How many varieties of guest kernels did you test this on? I think we need to
> > > > be very careful if we do this. We are basically claiming to be something
> > > > that doesn't really exist anywhere else.
> > >
> > > I have only tested this with relatively new linux kernels. One hypervisor
> > > implementing some pv features of another in order to take advantage of
> > > existing guest pv support is nothing unusual. For example qemu/KVM can
> > > emulate features from Hyper-V and vmware. However, this is configurable.
> > > If you don't want more knobs in vmd(4), I see two ways forward:
> > >
> > > One could commit it anyway and see if there are any complaints until the
> > > release. If there are major problems, one could still revert it. If there
> > > are minor problems, maybe one could point users to Linux's nopv command
> > > line option, which exists since 2019 / 5.3.
> >
> > I don't think "commit and wait for complaints" is the way I'd go. I think as
> > long as we can test some popular i686/x86_64 distributions from the past few
> > years, and everything works, we'd be more confident. For example, LTS versions
> > of various things like:
> >
> > Centos 7, Centos 8
> > Ubuntu 14, 18, 20, 24
> >
> > ... and certain other popular distributions like:
> >
> > Fedora latest
> > Alpine 3.10, Alpine 3.21
> >
> > If all those work, I would say we've covered enough historic versions to be
> > reasonably certain we haven't broken anything. That list above covers these
> > kernel versions, which is a good representative sample of things that vmm
> > supports.
> >
> > 3.10
> > 4.4
> > 4.15
> > 4.18
> > 4.19
> > 5.4
> > 6.8
> > 6.12
> > 6.14
> >
> > All you need to do is boot up the relevant installer, break into the shell,
> > and verify that time is advancing. No need to do a full install.
>
> I don't usually use vmm and don't know what normally works on vmm and what
> does not. Therefore I have some doubts that I am the right person to do
> the testing, but I have tried booting the installer ISOs on openbsd
> -current with my vmm patch on a Core i5-8365U. I have looked at the
> output of "while sleep 1; do date; done" (or something equivalent if
> "date" was not available). I have successfully checked these distros
> (kernel versions taken from uname -a):
>
> centos 6.10	2.6.32-754.el6.x86_64
>
> debian 8.8	3.16.0-4-amd64
> debian 12.10	6.1.0-32-amd64
>
> fedora 39	6.5.6-300.fc39.x86_64
> fedora 41	6.11.4-301.fc41.x86_64
>
> alpine 3.21	6.12.13-0-virt
>
> ubuntu 18.04	4.15.0-20-generic
> ubuntu 20.04.6	5.4.0-144-generic
> ubuntu 22.04.5	5.15.0-119-generic
> ubuntu 25.04	6.14.0-15-generic
>
> For centos 6.10, debian 8.8, fedora 39+41, I have also tried this without
> my patch. On centos 6.10 and debian 8.8, "sleep 1" takes ~ 5 seconds
> without my patch, but only 1 second with my patch. fedora 39+41 seem to
> use clocksource tsc without my patch, so they don't have that problem.
>
> Debian 8.8 does not find its cdrom due to a missing virtio-scsi driver but
> the rescue shell works. I remember that this was a problem in old Debian
> installers.
>
> Fedora 39+41 don't find the cdrom either; with fedora 41 I have debugged
> this down to
>
> systemd-coredump[1172]: Process 1157 (cdrom_id) of user 0 terminated
> abnormally with signal 6/ABRT, processing...
>
> After a long wait I get the rescue shell. The behavior is identical
> with/without my patch.
>
> I could not get ubuntu 16.04 to boot without vga, the bootloader fails
> already.
>
> The list above covers a wide range of linux versions. As only very few kvm
> features were present in the first kvm versions, linux as a guest
> dutifully checks the cpuid feature bits. Therefore I don't expect that
> more casual testing would reveal more issues. Also, sometimes linux uses
> tsc as clocksource at boot but is later unhappy with it and switches to a
> different clock source. I saw this on one alpine linux VM on bluhm's
> system when I started looking at this. I also expect that behaviour may be
> different depending on the cpu type and the number of sockets in the host
> system. One would need many more testers to cover these cases.
>
> An updated diff to account for changes in -current is attached below. This
> includes only the vmm(4) part.
> The pvclock(4) part is covered in a separate sub-thread.
>
> Cheers,
> Stefan
>

Thanks, this was a good deal of testing. I'll take a look at the diff below
tomorrow!

-ml

> > > >
> > > Or one could redefine the current VMM_HV_SIGNATURE feature bits in a way
> > > that allows one to detect the new KVM_MSR_WALL_CLOCK support and then try
> > > to get a patch into linux to detect VMM_HV_SIGNATURE and those feature
> > > bits.
> > >
> > > What do you think?
> > >
> > As long as we are convinced that we haven't broken anything from the list
> > above, I don't think this extra work (trying to upstream a VMM_HV_SIGNATURE
> > check in Linux) is worthwhile.
> >
>
> diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c
> index 037cf650cb9..cd23d8acf00 100644
> --- a/sys/arch/amd64/amd64/vmm_machdep.c
> +++ b/sys/arch/amd64/amd64/vmm_machdep.c
> @@ -154,6 +154,7 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
>  int vmm_gpa_is_valid(struct vcpu *vcpu, paddr_t gpa, size_t obj_size);
>  void vmm_init_pvclock(struct vcpu *, paddr_t);
>  int vmm_update_pvclock(struct vcpu *);
> +void vmm_pv_wall_clock(struct vcpu *, paddr_t);
>  int vmm_pat_is_valid(uint64_t);
>
>  #ifdef MULTIPROCESSOR
> @@ -193,6 +194,7 @@ extern uint64_t tsc_frequency;
>  extern int tsc_is_invariant;
>
>  const char *vmm_hv_signature = VMM_HV_SIGNATURE;
> +const char *kvm_hv_signature = "KVMKVMKVM\0\0\0";
>
>  const struct kmem_pa_mode vmm_kp_contig = {
>  	.kp_constraint = &no_constraint,
> @@ -6043,6 +6045,10 @@ vmx_handle_wrmsr(struct vcpu *vcpu)
>  		vmm_init_pvclock(vcpu,
>  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
>  		break;
> +	case KVM_MSR_WALL_CLOCK:
> +		vmm_pv_wall_clock(vcpu,
> +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> +		break;
>  #ifdef VMM_DEBUG
>  	default:
>  		/*
> @@ -6104,6 +6110,10 @@ svm_handle_msr(struct vcpu *vcpu)
>  		vmm_init_pvclock(vcpu,
>  		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
>  		break;
> +	case KVM_MSR_WALL_CLOCK:
> +		vmm_pv_wall_clock(vcpu,
> +		    (*rax & 0xFFFFFFFFULL) | (*rdx << 32));
> +		break;
>  	default:
>  		/* Log the access, to be able to identify unknown MSRs */
>  		DPRINTF("%s: wrmsr exit, msr=0x%llx, discarding data "
> @@ -6459,6 +6469,20 @@ vmm_handle_cpuid(struct vcpu *vcpu)
>  		*rcx = 0;
>  		*rdx = 0;
>  		break;
> +	case 0x40000100: /* Hypervisor information KVM */
> +		*rax = 0x40000101;
> +		*rbx = *((uint32_t *)&kvm_hv_signature[0]);
> +		*rcx = *((uint32_t *)&kvm_hv_signature[4]);
> +		*rdx = *((uint32_t *)&kvm_hv_signature[8]);
> +		break;
> +	case 0x40000101: /* KVM hypervisor features */
> +		*rax = (1 << KVM_FEATURE_CLOCKSOURCE2) |
> +		    (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> +		    (1 << KVM_FEATURE_NOP_IO_DELAY);
> +		*rbx = 0;
> +		*rcx = 0;
> +		*rdx = 0;
> +		break;
>  	case 0x80000000: /* Extended function level */
>  		/* We don't emulate past 0x8000001f currently. */
>  		*rax = min(curcpu()->ci_pnfeatset, 0x8000001f);
> @@ -6980,7 +7004,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
>  	    (++vcpu->vc_pvclock_version << 1) | 0x1;
>
>  	pvclock_ti->ti_tsc_timestamp = rdtsc();
> -	nanotime(&tv);
> +	nanouptime(&tv);
>  	pvclock_ti->ti_system_time =
>  	    tv.tv_sec * 1000000000L + tv.tv_nsec;
>  	pvclock_ti->ti_tsc_shift = 12;
> @@ -6994,6 +7018,36 @@
>  	return (0);
>  }
>
> +void
> +vmm_pv_wall_clock(struct vcpu *vcpu, paddr_t gpa)
> +{
> +	struct pvclock_wall_clock *pvclock_wc;
> +	struct timespec tv;
> +	struct vm *vm = vcpu->vc_parent;
> +	paddr_t hpa;
> +
> +	if (!vmm_gpa_is_valid(vcpu, gpa, sizeof(struct pvclock_wall_clock)))
> +		goto err;
> +
> +	/* XXX: handle case when this struct goes over page boundaries */
> +	if ((gpa & PAGE_MASK) + sizeof(struct pvclock_wall_clock) > PAGE_SIZE)
> +		goto err;
> +
> +	if (!pmap_extract(vm->vm_pmap, gpa, &hpa))
> +		goto err;
> +	pvclock_wc = (void*) PMAP_DIRECT_MAP(hpa);
> +	pvclock_wc->wc_version |= 0x1;
> +	nanoboottime(&tv);
> +	pvclock_wc->wc_sec = tv.tv_sec;
> +	pvclock_wc->wc_nsec = tv.tv_nsec;
> +	pvclock_wc->wc_version += 1;
> +
> +	return;
> +err:
> +	vmm_inject_gp(vcpu);
> +}
> +
> +
>  int
>  vmm_pat_is_valid(uint64_t pat)
>  {
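A note on the wc_version handling in vmm_pv_wall_clock() above: the version
is forced odd while the fields are being written and bumped back to an even
value afterwards, so a guest has to retry its read until it sees the same
even version before and after copying the fields. A minimal sketch of that
guest-side loop, assuming the usual three 32-bit fields for struct
pvclock_wall_clock (the reader function and barrier choice here are
illustrative, not part of the diff):

	/* Needs <sys/types.h> and <sys/atomic.h> for membar_consumer(). */
	struct pvclock_wall_clock {
		uint32_t	wc_version;
		uint32_t	wc_sec;
		uint32_t	wc_nsec;
	};

	/*
	 * Illustrative only: read a consistent snapshot of the wall clock
	 * published by the hypervisor.  An odd wc_version means an update
	 * is in progress; a changed version means the read raced an update
	 * and must be retried.
	 */
	static void
	pvclock_read_wall_clock(volatile struct pvclock_wall_clock *wc,
	    uint32_t *sec, uint32_t *nsec)
	{
		uint32_t version;

		do {
			version = wc->wc_version;
			membar_consumer();	/* read version before fields */
			*sec = wc->wc_sec;
			*nsec = wc->wc_nsec;
			membar_consumer();	/* read fields before re-check */
		} while ((version & 1) != 0 || version != wc->wc_version);
	}

Since the diff fills wc_sec/wc_nsec from nanoboottime() and ti_system_time
from nanouptime(), a guest can derive the current real time by adding its
pvclock system time to this wall-clock value.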