Index | Thread | Search

From:
Martin Pieuchot <mpi@grenadille.net>
Subject:
Re: wired pages vs sysctl(2)
To:
tech@openbsd.org, kettenis@openbsd.org
Date:
Tue, 4 Mar 2025 14:45:42 +0100

Download raw body.

Thread
On 27/02/25(Thu) 14:44, Martin Pieuchot wrote:
> Since the 90s sysctl(2) has been wiring pages that contain the syscall
> arguments. 
> 
> In uvm/uvm_fault.c r1.129 kettenis@ fixed one of the races exposed by
> syzkaller where a sibling thread munmap(2)ed the range currently wired
> by sysctl(2).
> 
> Sadly the fix is incomplete and the reproducer attached triggers the
> following panic:
> 
> panic: uvm_fault_unwire_locked: address not in map
> Stopped at      db_enter+0x14:  popq    %rbp
>     TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
> *129832   2744      0         0x3          0    0  sysctl_wired
>   84116  96377      0        0x10        0x8    2  sshd-session
>    6897  50184      0     0x14000      0x200    1  softnet0
> db_enter() at db_enter+0x14
> panic(ffffffff8261d529) at panic+0xdd
> uvm_fault_unwire_locked(fffffd878156ab58,20000000,20001000) at uvm_fault_unwire
> _locked+0x246
> uvm_fault_unwire(fffffd878156ab58,20000000,20001000) at uvm_fault_unwire+0x44
> sysctl_vsunlock(20000040,0) at sysctl_vsunlock+0x46
> net_sysctl(ffff800055d411b4,3,20000040,ffff800055d411e8,20000080,c,cf641d03c835
> bbeb) at net_sysctl+0x309
> sys_sysctl(ffff800055c99c78,ffff800055d41300,ffff800055d41270) at sys_sysctl+0x
> 1f8
> syscall(ffff800055d41300) at syscall+0x5ec
> Xsyscall() at Xsyscall+0x128
>  
> Syzkaller has two reports for this:
>   https://syzkaller.appspot.com/bug?extid=d9b926edfd5f64a66c58
>   https://syzkaller.appspot.com/bug?extid=a12540517e3a76a6a490
> 
> It is not clear to my why vslock(9) is needed today.  That said the diff
> below completes kettenis@'s approach by making vslock similar to mlock(2).
> With it, vsunlock() now silently fails without crashing the kernel and I
> can no longer reproduce the panic.  

Note that there's another issue with the logic that wires pages during
sysctl(2).  If a page has been previously mlock(2)ed and we pass it as
arguments to sysctl(2), it will be munlock(2)ed when returning to
userland.
As long as we need vslock(9), which is something that I don't know, might
want to distinguish between userland vs kernel wiring like FreeBSD do.

This diff is a step in this direction anyway.

So ok?

> diff --git sys/uvm/uvm_fault.c sys/uvm/uvm_fault.c
> index cf6ea625571..e0f2ee8e08e 100644
> --- sys/uvm/uvm_fault.c
> +++ sys/uvm/uvm_fault.c
> @@ -1782,16 +1782,12 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end)
>  		 * find the map entry for the current address.
>  		 */
>  		KASSERT(va >= entry->start);
> -		while (entry && va >= entry->end) {
> +		while (va >= entry->end) {
>  			next = RBT_NEXT(uvm_map_addr, entry);
> +			KASSERT(next != NULL && next->start <= entry->end);
>  			entry = next;
>  		}
>  
> -		if (entry == NULL)
> -			return;
> -		if (va < entry->start)
> -			continue;
> -
>  		/*
>  		 * lock it.
>  		 */
> @@ -1819,7 +1815,7 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end)
>  	}
>  
>  	if (oentry != NULL) {
> -		uvm_map_unlock_entry(oentry);
> +		uvm_map_unlock_entry(entry);
>  	}
>  }
>  
> diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
> index 0d7bd9ea10b..63c6f8393d6 100644
> --- sys/uvm/uvm_glue.c
> +++ sys/uvm/uvm_glue.c
> @@ -114,7 +114,7 @@ uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
>  	if (end <= start)
>  		return (EINVAL);
>  
> -	return uvm_fault_wire(map, start, end, access_type);
> +	return uvm_map_pageable(map, start, end, FALSE, 0);
>  }
>  
>  /*
> @@ -125,13 +125,14 @@ uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
>  void
>  uvm_vsunlock(struct proc *p, caddr_t addr, size_t len)
>  {
> +	struct vm_map *map = &p->p_vmspace->vm_map;
>  	vaddr_t start, end;
>  
>  	start = trunc_page((vaddr_t)addr);
>  	end = round_page((vaddr_t)addr + len);
>  	KASSERT(end > start);
>  
> -	uvm_fault_unwire(&p->p_vmspace->vm_map, start, end);
> +	uvm_map_pageable(map, start, end, TRUE, 0);
>  }
>  
>  /*

> /*	$OpenBSD$ */
> 
> /*
>  * MT race reproducer: unmap memory range while sysctl(2) marked it wired.
>  */
> 
> #include <netinet/in.h>
> #include <netinet/icmp6.h>
> #include <sys/mman.h>
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <sys/sysctl.h>
> 
> #include <err.h>
> #include <pthread.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <string.h>
> #include <unistd.h>
> 
> static pthread_mutex_t gmtx = PTHREAD_MUTEX_INITIALIZER;
> 
> size_t addr = 0x20000000UL;
> size_t len = 0x01000000UL;
> 
> void *
> thr_unmap(void *arg)
> {
> 	pthread_mutex_lock(&gmtx);
> 
> 	for (;;)
> 		munmap((void *)addr, 0x4000UL);
> 
> 	return NULL;
> }
> 
> int
> main(void)
> {
> 	pthread_t th;
> 	size_t *olenp;
> 	void *oldp, *newp;
> 	char *pg;
> 	int i, error, pgsz, *mib;
> 
> 	pgsz = getpagesize();
> 
> 	mmap((void *)addr, len, PROT_READ|PROT_WRITE,
> 	    MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0);
> 
> 	/* fault in all pages. */
> 	for (i = 0; i < len; i+= pgsz) {
> 		pg = (char *)(addr + i);
> 		pg[0] = 1;
> 	}
> 
> 	mib = (int *)(addr + 0xc0);
> 	mib[0] = CTL_NET;
> 	mib[1] = AF_INET6;
> 	mib[2] = IPPROTO_ICMPV6;
> 	mib[3] = ICMPV6CTL_REDIRTIMEOUT;
> 
> 	oldp = (void *)(addr + 0x40);
> 
> 	olenp = (size_t *)(addr + 0x100);
> 	*olenp = 0;
> 
> 	/* new arguments */
> 	newp = (char *)(addr + 0x80);
> 	memcpy(newp, "somegarbage", 12);
> 
> 	pthread_mutex_lock(&gmtx);
> 	/* competing thread. */
> 	error = pthread_create(&th, NULL, thr_unmap, NULL);
> 	if (error)
> 		warn("pthread_create");
> 
> 	/* Ensure memory is still mapped. */
> 	printf("%s\n", (char *)newp);
> 	pthread_mutex_unlock(&gmtx);
> 	for (i = 0; i < (100 * 100); i++) {
> 		error = sysctl(mib, 4, oldp, olenp, newp, 12);
> 	}
> 
> 	return 0;
> }