Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: wired pages vs sysctl(2)
To:
tech@openbsd.org, kettenis@openbsd.org
Date:
Mon, 17 Mar 2025 19:30:13 +0100

Download raw body.

Thread
  • Alexander Bluhm:

    wired pages vs sysctl(2)

  • On Thu, Feb 27, 2025 at 02:44:13PM +0100, Martin Pieuchot wrote:
    > Since the 90s sysctl(2) has been wiring pages that contain the syscall
    > arguments. 
    > 
    > In uvm/uvm_fault.c r1.129 kettenis@ fixed one of the races exposed by
    > syzkaller where a sibling thread munmap(2)ed the range currently wired
    > by sysctl(2).
    > 
    > Sadly the fix is incomplete and the reproducer attached triggers the
    > following panic:
    > 
    > panic: uvm_fault_unwire_locked: address not in map
    > Stopped at      db_enter+0x14:  popq    %rbp
    >     TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
    > *129832   2744      0         0x3          0    0  sysctl_wired
    >   84116  96377      0        0x10        0x8    2  sshd-session
    >    6897  50184      0     0x14000      0x200    1  softnet0
    > db_enter() at db_enter+0x14
    > panic(ffffffff8261d529) at panic+0xdd
    > uvm_fault_unwire_locked(fffffd878156ab58,20000000,20001000) at uvm_fault_unwire
    > _locked+0x246
    > uvm_fault_unwire(fffffd878156ab58,20000000,20001000) at uvm_fault_unwire+0x44
    > sysctl_vsunlock(20000040,0) at sysctl_vsunlock+0x46
    > net_sysctl(ffff800055d411b4,3,20000040,ffff800055d411e8,20000080,c,cf641d03c835
    > bbeb) at net_sysctl+0x309
    > sys_sysctl(ffff800055c99c78,ffff800055d41300,ffff800055d41270) at sys_sysctl+0x
    > 1f8
    > syscall(ffff800055d41300) at syscall+0x5ec
    > Xsyscall() at Xsyscall+0x128
    >  
    > Syzkaller has two reports for this:
    >   https://syzkaller.appspot.com/bug?extid=d9b926edfd5f64a66c58
    >   https://syzkaller.appspot.com/bug?extid=a12540517e3a76a6a490
    > 
    > It is not clear to my why vslock(9) is needed today.
    
    We have sysctl that grab net lock around copyout(9).  To avoid
    potential NFS activity during copyout(9), wire the memory before
    calling NET_LOCK().  Maybe there are more resons to call vslock(9).
    
    > That said the diff
    > below completes kettenis@'s approach by making vslock similar to mlock(2).
    > With it, vsunlock() now silently fails without crashing the kernel and I
    > can no longer reproduce the panic.  
    > 
    > ok?
    
    Diff passes full regress on amd64 with witness.  And I have a machine
    based on 7.4 that runs some telegraf daemon and prints 'pmap_unwire:
    wiring for pmap 0xfffffd8e45525378 va 0xc0014fc000 didnt change!'
    regulary.  With this patch the message disappered.
    
    I am not an UVM expert, but for me it is OK bluhm@
    
    > diff --git sys/uvm/uvm_fault.c sys/uvm/uvm_fault.c
    > index cf6ea625571..e0f2ee8e08e 100644
    > --- sys/uvm/uvm_fault.c
    > +++ sys/uvm/uvm_fault.c
    > @@ -1782,16 +1782,12 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end)
    >  		 * find the map entry for the current address.
    >  		 */
    >  		KASSERT(va >= entry->start);
    > -		while (entry && va >= entry->end) {
    > +		while (va >= entry->end) {
    >  			next = RBT_NEXT(uvm_map_addr, entry);
    > +			KASSERT(next != NULL && next->start <= entry->end);
    >  			entry = next;
    >  		}
    >  
    > -		if (entry == NULL)
    > -			return;
    > -		if (va < entry->start)
    > -			continue;
    > -
    >  		/*
    >  		 * lock it.
    >  		 */
    > @@ -1819,7 +1815,7 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end)
    >  	}
    >  
    >  	if (oentry != NULL) {
    > -		uvm_map_unlock_entry(oentry);
    > +		uvm_map_unlock_entry(entry);
    >  	}
    >  }
    >  
    > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
    > index 0d7bd9ea10b..63c6f8393d6 100644
    > --- sys/uvm/uvm_glue.c
    > +++ sys/uvm/uvm_glue.c
    > @@ -114,7 +114,7 @@ uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
    >  	if (end <= start)
    >  		return (EINVAL);
    >  
    > -	return uvm_fault_wire(map, start, end, access_type);
    > +	return uvm_map_pageable(map, start, end, FALSE, 0);
    >  }
    >  
    >  /*
    > @@ -125,13 +125,14 @@ uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
    >  void
    >  uvm_vsunlock(struct proc *p, caddr_t addr, size_t len)
    >  {
    > +	struct vm_map *map = &p->p_vmspace->vm_map;
    >  	vaddr_t start, end;
    >  
    >  	start = trunc_page((vaddr_t)addr);
    >  	end = round_page((vaddr_t)addr + len);
    >  	KASSERT(end > start);
    >  
    > -	uvm_fault_unwire(&p->p_vmspace->vm_map, start, end);
    > +	uvm_map_pageable(map, start, end, TRUE, 0);
    >  }
    >  
    >  /*
    
    > /*	$OpenBSD$ */
    > 
    > /*
    >  * MT race reproducer: unmap memory range while sysctl(2) marked it wired.
    >  */
    > 
    > #include <netinet/in.h>
    > #include <netinet/icmp6.h>
    > #include <sys/mman.h>
    > #include <sys/types.h>
    > #include <sys/socket.h>
    > #include <sys/sysctl.h>
    > 
    > #include <err.h>
    > #include <pthread.h>
    > #include <stdint.h>
    > #include <stdio.h>
    > #include <string.h>
    > #include <unistd.h>
    > 
    > static pthread_mutex_t gmtx = PTHREAD_MUTEX_INITIALIZER;
    > 
    > size_t addr = 0x20000000UL;
    > size_t len = 0x01000000UL;
    > 
    > void *
    > thr_unmap(void *arg)
    > {
    > 	pthread_mutex_lock(&gmtx);
    > 
    > 	for (;;)
    > 		munmap((void *)addr, 0x4000UL);
    > 
    > 	return NULL;
    > }
    > 
    > int
    > main(void)
    > {
    > 	pthread_t th;
    > 	size_t *olenp;
    > 	void *oldp, *newp;
    > 	char *pg;
    > 	int i, error, pgsz, *mib;
    > 
    > 	pgsz = getpagesize();
    > 
    > 	mmap((void *)addr, len, PROT_READ|PROT_WRITE,
    > 	    MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0);
    > 
    > 	/* fault in all pages. */
    > 	for (i = 0; i < len; i+= pgsz) {
    > 		pg = (char *)(addr + i);
    > 		pg[0] = 1;
    > 	}
    > 
    > 	mib = (int *)(addr + 0xc0);
    > 	mib[0] = CTL_NET;
    > 	mib[1] = AF_INET6;
    > 	mib[2] = IPPROTO_ICMPV6;
    > 	mib[3] = ICMPV6CTL_REDIRTIMEOUT;
    > 
    > 	oldp = (void *)(addr + 0x40);
    > 
    > 	olenp = (size_t *)(addr + 0x100);
    > 	*olenp = 0;
    > 
    > 	/* new arguments */
    > 	newp = (char *)(addr + 0x80);
    > 	memcpy(newp, "somegarbage", 12);
    > 
    > 	pthread_mutex_lock(&gmtx);
    > 	/* competing thread. */
    > 	error = pthread_create(&th, NULL, thr_unmap, NULL);
    > 	if (error)
    > 		warn("pthread_create");
    > 
    > 	/* Ensure memory is still mapped. */
    > 	printf("%s\n", (char *)newp);
    > 	pthread_mutex_unlock(&gmtx);
    > 	for (i = 0; i < (100 * 100); i++) {
    > 		error = sysctl(mib, 4, oldp, olenp, newp, 12);
    > 	}
    > 
    > 	return 0;
    > }
    
    
  • Alexander Bluhm:

    wired pages vs sysctl(2)