Index | Thread | Search

From:
Mark Kettenis <mark.kettenis@xs4all.nl>
Subject:
Re: update on pinsyscalls(2)
To:
"Theo de Raadt" <deraadt@openbsd.org>
Cc:
tech@cvs.openbsd.org
Date:
Sat, 13 Jan 2024 15:39:04 +0100

Download raw body.

Thread
> From: "Theo de Raadt" <deraadt@openbsd.org>
> Date: Sat, 30 Dec 2023 11:56:35 -0700
> 
> The pinsyscalls(2) diff is now much smaller, since many pieces it depends
> upon have been committed.
> 
> All the DSOs containing system call entries have the proper annotations for
> kernel and ld.so to do the right thing.
> 
> This diff can be applied to a -current system, if people want to play
> along:
> 
> 	cd /usr/src
> 	make includes   ;; to update sys/proc.h mostly
> 	build new kernel
> 	build new libexec/ld.so
> 	build new bin/ps
> 
> ps:
> 	- for static binaries, ps will show 'l' to indicate the
> 	  binary's text segment is doing syscall pinning.
> 	- for dynamic binaries, ps will show 'l' to indicate that
> 	  ld.so's text segment is doing syscall pinning, and 'L'
> 	  to indicate libc.so's text segment is doing syscall pinning
> 
> 
> There's a long tail with this diff.  Perhaps in a release or two when
> all binaries are known to follow the pinsyscalls(2) rules, we'll be able
> to turn msyscall() and the less powerful pinsyscall(2) into NOPs, and
> eventually remove them.
> 
> The more precise pinsyscalls(2) check in syscall_mi.h is O(1) but
> slightly more expensive than the msyscall(2) check which also is O(1) in
> the general case but has a special case when text msyscall-allowed
> segments get crossed (such as when doing ld.so GOT/PLT resolution, or
> signal handlers), then uvm locks occur twice.  But we don't need both.

A few small things and one important mistake I spotted below.
Otherwise this looks pretty good to me now.

> Index: sys/kern/exec_elf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/exec_elf.c,v
> diff -u -p -u -r1.183 exec_elf.c
> --- sys/kern/exec_elf.c	12 Jul 2023 19:34:14 -0000	1.183
> +++ sys/kern/exec_elf.c	19 Dec 2023 21:45:31 -0000
> @@ -81,6 +81,7 @@
>  #include <sys/ptrace.h>
>  #include <sys/signalvar.h>
>  #include <sys/pledge.h>
> +#include <sys/syscall.h>
>  
>  #include <sys/mman.h>
>  
> @@ -97,6 +98,8 @@ void	elf_load_psection(struct exec_vmcmd
>  	    Elf_Phdr *, Elf_Addr *, Elf_Addr *, int *, int);
>  int	elf_os_pt_note_name(Elf_Note *);
>  int	elf_os_pt_note(struct proc *, struct exec_package *, Elf_Ehdr *, int *);
> +int	elf_read_pintable(struct proc *p, struct vnode *vp, Elf_Phdr *pp,
> +	    u_int **pinp, int is_ldso, size_t len);
>  
>  /* round up and down to page boundaries. */
>  #define ELF_ROUND(a, b)		(((a) + (b) - 1) & ~((b) - 1))
> @@ -266,6 +269,74 @@ elf_read_from(struct proc *p, struct vno
>  }
>  
>  /*
> + * rebase the pin offsets inside a base,len window for the text segment only.
> + */
> +void
> +elf_adjustpins(vaddr_t *basep, size_t *lenp, u_int *pins, int npins, u_int offset)
> +{
> +	int i;
> +
> +	/* Adjust offsets, base, len */
> +	for (i = 0; i < npins; i++) {
> +		if (pins[i] == -1 || pins[i] == 0)
> +			continue;
> +		pins[i] -= offset;
> +	}
> +	*basep += offset;
> +	*lenp -= offset;
> +}
> +
> +int
> +elf_read_pintable(struct proc *p, struct vnode *vp, Elf_Phdr *pp,
> +    u_int **pinp, int is_ldso, size_t len)
> +{
> +	struct pinsyscalls {
> +		u_int offset;
> +		u_int sysno;
> +	} *syscalls = NULL;
> +	int i, nsyscalls = 0, npins = 0;
> +	u_int *pins = NULL;
> +
> +	if (pp->p_filesz > SYS_MAXSYSCALL * 2 * sizeof(*syscalls) ||
> +	    pp->p_filesz % sizeof(*syscalls) != 0)
> +		goto bad;
> +	nsyscalls = pp->p_filesz / sizeof(*syscalls);
> +	syscalls = malloc(pp->p_filesz, M_PINSYSCALL, M_WAITOK);
> +	if (elf_read_from(p, vp, pp->p_offset, syscalls,
> +	    pp->p_filesz) != 0)
> +		goto bad;
> +
> +	/* Validate, and calculate pintable size */
> +	for (i = 0; i < nsyscalls; i++) {
> +		if (syscalls[i].sysno <= 0 ||
> +		    syscalls[i].sysno >= SYS_MAXSYSCALL ||
> +		    syscalls[i].offset > len)
> +			goto bad;
> +		npins = MAX(npins, syscalls[i].sysno);
> +	}
> +	if (is_ldso)
> +		npins = MAX(npins, SYS_kbind);	/* XXX see ld.so/loader.c */
> +	npins++;
> +
> +	/* Fill pintable: 0 = invalid, -1 = allowed, else offset from base */
> +	pins = mallocarray(npins, sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
> +	for (i = 0; i < nsyscalls; i++) {
> +		if (pins[syscalls[i].sysno])
> +			pins[syscalls[i].sysno] = -1;	/* duplicated */
> +		else
> +			pins[syscalls[i].sysno] = syscalls[i].offset;
> +	}
> +	if (is_ldso)
> +		pins[SYS_kbind] = -1;	/* XXX see ld.so/loader.c */
> +	*pinp = pins;
> +	pins = NULL;
> +bad:
> +	free(syscalls, M_PINSYSCALL, nsyscalls * sizeof(*syscalls));
> +	free(pins, M_PINSYSCALL, npins * sizeof(u_int));
> +	return npins;
> +}
> +
> +/*
>   * Load a file (interpreter/library) pointed to by path [stolen from
>   * coff_load_shlib()]. Made slightly generic so it might be used externally.
>   */
> @@ -276,7 +347,7 @@ elf_load_file(struct proc *p, char *path
>  	int error, i;
>  	struct nameidata nd;
>  	Elf_Ehdr eh;
> -	Elf_Phdr *ph = NULL;
> +	Elf_Phdr *ph = NULL, *syscall_ph = NULL;
>  	u_long phsize = 0;
>  	Elf_Addr addr;
>  	struct vnode *vp;
> @@ -290,6 +361,7 @@ elf_load_file(struct proc *p, char *path
>  	int file_align;
>  	int loop;
>  	size_t randomizequota = ELF_RANDOMIZE_LIMIT;
> +	vaddr_t text_start = -1, text_end = 0;
>  
>  	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
>  	nd.ni_pledge = PLEDGE_RPATH;
> @@ -432,6 +504,12 @@ elf_load_file(struct proc *p, char *path
>  					epp->ep_entry += pos;
>  				ap->arg_interp = pos;
>  			}
> +			if (prot & PROT_EXEC) {
> +				if (addr < text_start)
> +					text_start = addr;
> +				if (addr+size >= text_end)
> +					text_end = addr + size;
> +			}
>  			addr += size;
>  			break;
>  
> @@ -461,12 +539,34 @@ elf_load_file(struct proc *p, char *path
>  			NEW_VMCMD(&epp->ep_vmcmds, vmcmd_mutable,
>  			    ph[i].p_memsz, ph[i].p_vaddr + pos, NULLVP, 0, 0);
>  			break;
> -
> +		case PT_OPENBSD_SYSCALLS:
> +			syscall_ph = &ph[i];
> +			break;
>  		default:
>  			break;
>  		}
>  	}
>  
> +	if (syscall_ph) {
> +		struct process *pr = p->p_p;
> +		vaddr_t base = pos;
> +		size_t len = text_end;
> +		u_int *pins;
> +		int npins;
> +
> +		npins = elf_read_pintable(p, nd.ni_vp, syscall_ph,
> +		    &pins, 1, len);
> +		if (npins) {
> +			elf_adjustpins(&base, &len, pins, npins,
> +			    text_start);
> +			pr->ps_pin.pn_start = base;
> +			pr->ps_pin.pn_end = base + len;
> +			pr->ps_pin.pn_pins = pins;
> +			pr->ps_pin.pn_npins = npins;
> +			pr->ps_flags |= PS_PIN;
> +		}
> +	}
> +
>  	vn_marktext(nd.ni_vp);
>  
>  bad1:
> @@ -491,8 +591,8 @@ int
>  exec_elf_makecmds(struct proc *p, struct exec_package *epp)
>  {
>  	Elf_Ehdr *eh = epp->ep_hdr;
> -	Elf_Phdr *ph, *pp, *base_ph = NULL;
> -	Elf_Addr phdr = 0, exe_base = 0;
> +	Elf_Phdr *ph, *pp, *base_ph = NULL, *syscall_ph = NULL;
> +	Elf_Addr phdr = 0, exe_base = 0, exe_end = 0;
>  	int error, i, has_phdr = 0, names = 0, textrel = 0;
>  	char *interp = NULL;
>  	u_long phsize;
> @@ -633,11 +733,13 @@ exec_elf_makecmds(struct proc *p, struct
>  
>  			/*
>  			 * Permit system calls in main-text static binaries.
> -			 * Also block the ld.so syscall-grant
> +			 * static binaries may not call msyscall() or
> +			 * pinsyscalls()
>  			 */
>  			if (interp == NULL) {
>  				syscall = VMCMD_SYSCALL;
>  				p->p_vmspace->vm_map.flags |= VM_MAP_SYSCALL_ONCE;
> +				p->p_vmspace->vm_map.flags |= VM_MAP_PINSYSCALL_ONCE;
>  			}
>  
>  			/*
> @@ -696,6 +798,9 @@ exec_elf_makecmds(struct proc *p, struct
>  						epp->ep_tsize = addr+size -
>  						    epp->ep_taddr;
>  				}
> +				if (interp == NULL)
> +					exe_end = epp->ep_taddr +
> +					    epp->ep_tsize;	/* end of TEXT */
>  			}
>  			break;
>  
> @@ -735,13 +840,35 @@ exec_elf_makecmds(struct proc *p, struct
>  			NEW_VMCMD(&epp->ep_vmcmds, vmcmd_mutable,
>  			    ph[i].p_memsz, ph[i].p_vaddr + exe_base, NULLVP, 0, 0);
>  			break;
> -
> +		case PT_OPENBSD_SYSCALLS:
> +			if (interp == NULL)
> +				syscall_ph = &ph[i];
> +			break;
>  		default:
>  			/*
>  			 * Not fatal, we don't need to understand everything
>  			 * :-)
>  			 */
>  			break;
> +		}
> +	}
> +
> +	if (syscall_ph) {
> +		vaddr_t base = exe_base;
> +		size_t len = exe_end - exe_base;
> +		u_int *pins;
> +		int npins;
> +
> +		npins = elf_read_pintable(p, epp->ep_vp, syscall_ph,
> +		    &pins, 0, len);
> +		if (npins) {
> +			elf_adjustpins(&base, &len, pins, npins,
> +			    epp->ep_taddr - exe_base);
> +			epp->ep_pinstart = base;
> +			epp->ep_pinend = base + len;
> +			epp->ep_pins = pins;
> +			epp->ep_npins = npins;
> +			p->p_p->ps_flags |= PS_PIN;
>  		}
>  	}
>  
> Index: sys/kern/kern_exec.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_exec.c,v
> diff -u -p -u -r1.252 kern_exec.c
> --- sys/kern/kern_exec.c	30 Oct 2023 07:13:10 -0000	1.252
> +++ sys/kern/kern_exec.c	19 Dec 2023 21:45:31 -0000
> @@ -314,6 +314,8 @@ sys_execve(struct proc *p, void *v, regi
>  	VMCMDSET_INIT(&pack.ep_vmcmds);
>  	pack.ep_vap = &attr;
>  	pack.ep_flags = 0;
> +	pack.ep_pins = NULL;
> +	pack.ep_npins = 0;
>  
>  	/* see if we can run it. */
>  	if ((error = check_exec(p, &pack)) != 0) {
> @@ -514,6 +516,30 @@ sys_execve(struct proc *p, void *v, regi
>  	if (copyout(&arginfo, (char *)pr->ps_strings, sizeof(arginfo)))
>  		goto exec_abort;
>  
> +	free(pr->ps_pin.pn_pins, M_PINSYSCALL,
> +	    pr->ps_pin.pn_npins * sizeof(u_int));
> +	if (pack.ep_npins) {
> +		pr->ps_pin.pn_start = pack.ep_pinstart;
> +		pr->ps_pin.pn_end = pack.ep_pinend;
> +		pr->ps_pin.pn_pins = pack.ep_pins;
> +		pack.ep_pins = NULL;
> +		pr->ps_pin.pn_npins = pack.ep_npins;
> +		pr->ps_flags |= PS_PIN;
> +	} else {
> +		pr->ps_pin.pn_start = pr->ps_pin.pn_end = 0;
> +		pr->ps_pin.pn_pins = NULL;
> +		pr->ps_pin.pn_npins = 0;
> +		pr->ps_flags &= ~PS_PIN;
> +	}
> +	if (pr->ps_libcpin.pn_pins) {
> +		free(pr->ps_libcpin.pn_pins, M_PINSYSCALL,
> +		    pr->ps_libcpin.pn_npins * sizeof(u_int));
> +		pr->ps_libcpin.pn_start = pr->ps_libcpin.pn_end = 0;
> +		pr->ps_libcpin.pn_pins = NULL;
> +		pr->ps_libcpin.pn_npins = 0;
> +		pr->ps_flags &= ~PS_LIBCPIN;
> +	}
> +
>  	stopprofclock(pr);	/* stop profiling */
>  	fdcloseexec(p);		/* handle close on exec */
>  	execsigs(p);		/* reset caught signals */
> @@ -752,6 +778,7 @@ bad:
>  	if (pack.ep_interp != NULL)
>  		pool_put(&namei_pool, pack.ep_interp);
>  	free(pack.ep_args, M_TEMP, sizeof *pack.ep_args);
> +	free(pack.ep_pins, M_PINSYSCALL, pack.ep_npins * sizeof(u_int));
>  	/* close and put the exec'd file */
>  	vn_close(pack.ep_vp, FREAD, cred, p);
>  	pool_put(&namei_pool, nid.ni_cnd.cn_pnbuf);
> Index: sys/kern/kern_exit.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_exit.c,v
> diff -u -p -u -r1.217 kern_exit.c
> --- sys/kern/kern_exit.c	29 Sep 2023 12:47:34 -0000	1.217
> +++ sys/kern/kern_exit.c	19 Dec 2023 21:45:31 -0000
> @@ -215,6 +215,11 @@ exit1(struct proc *p, int xexit, int xsi
>  
>  		unveil_destroy(pr);
>  
> +		free(pr->ps_pin.pn_pins, M_PINSYSCALL,
> +		    pr->ps_pin.pn_npins * sizeof(u_int));
> +		free(pr->ps_libcpin.pn_pins, M_PINSYSCALL,
> +		    pr->ps_libcpin.pn_npins * sizeof(u_int));
> +
>  		/*
>  		 * If parent has the SAS_NOCLDWAIT flag set, we're not
>  		 * going to become a zombie.
> Index: sys/kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> diff -u -p -u -r1.253 kern_fork.c
> --- sys/kern/kern_fork.c	24 Oct 2023 13:20:11 -0000	1.253
> +++ sys/kern/kern_fork.c	19 Dec 2023 21:45:31 -0000
> @@ -248,6 +248,21 @@ process_new(struct proc *p, struct proce
>  	if (parent->ps_session->s_ttyvp != NULL)
>  		pr->ps_flags |= parent->ps_flags & PS_CONTROLT;
>  
> +	if (parent->ps_pin.pn_pins) {
> +		pr->ps_pin.pn_pins = mallocarray(parent->ps_pin.pn_npins,
> +		    sizeof(u_int), M_PINSYSCALL, M_WAITOK);
> +		memcpy(pr->ps_pin.pn_pins, parent->ps_pin.pn_pins,
> +		    parent->ps_pin.pn_npins * sizeof(u_int));
> +		pr->ps_flags |= PS_PIN;
> +	}
> +	if (parent->ps_libcpin.pn_pins) {
> +		pr->ps_libcpin.pn_pins = mallocarray(parent->ps_libcpin.pn_npins,
> +		    sizeof(u_int), M_PINSYSCALL, M_WAITOK);
> +		memcpy(pr->ps_libcpin.pn_pins, parent->ps_libcpin.pn_pins,
> +		    parent->ps_libcpin.pn_npins * sizeof(u_int));
> +		pr->ps_flags |= PS_LIBCPIN;
> +	}
> +
>  	/*
>  	 * Duplicate sub-structures as needed.
>  	 * Increase reference counts on shared objects.
> Index: sys/sys/exec.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/exec.h,v
> diff -u -p -u -r1.52 exec.h
> --- sys/sys/exec.h	19 Apr 2023 15:37:36 -0000	1.52
> +++ sys/sys/exec.h	19 Dec 2023 21:45:31 -0000
> @@ -131,6 +131,9 @@ struct exec_package {
>  	struct	elf_args *ep_args;	/* ELF info */
>  	void	*ep_auxinfo;		/* userspace auxinfo address */
>  	char	*ep_interp;		/* name of interpreter if any */
> +	vaddr_t	ep_pinstart, ep_pinend;	/* executable region */
> +	u_int	*ep_pins;		/* array of system call offsets */
> +	int	ep_npins;		/* entries in array */
>  };
>  #define	EXEC_INDIR	0x0001		/* script handling already done */
>  #define	EXEC_HASFD	0x0002		/* holding a shell script */
> Index: sys/sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> diff -u -p -u -r1.352 proc.h
> --- sys/sys/proc.h	29 Sep 2023 12:47:34 -0000	1.352
> +++ sys/sys/proc.h	19 Dec 2023 21:45:31 -0000
> @@ -117,6 +117,13 @@ struct tslpentry;
>  TAILQ_HEAD(tslpqueue, tslpentry);
>  struct unveil;
>  
> +struct pinsyscall {
> +	vaddr_t		pn_start;
> +	vaddr_t		pn_end;
> +	u_int		*pn_pins; /* array of offset indexed by syscall# */

s/offset/offsets/?

> +	int		pn_npins; /* number of entries in table */
> +};
> +
>  /*
>   * Locks used to protect struct members in this file:
>   *	I	immutable after creation
> @@ -240,6 +247,9 @@ struct process {
>  /* an address that can't be in userspace or kernelspace */
>  #define	BOGO_PC	(u_long)-1
>  
> +	struct pinsyscall ps_pin;	/* static or ld.so */
> +	struct pinsyscall ps_libcpin;	/* libc.so, from pinsyscalls(2) */
> +
>  /* End area that is copied on creation. */
>  #define ps_endcopy	ps_threadcnt
>  	u_int	ps_threadcnt;		/* Number of threads. */
> @@ -283,6 +293,8 @@ struct process {
>  #define	PS_CHROOT	0x01000000	/* Process is chrooted */
>  #define	PS_NOBTCFI	0x02000000	/* No Branch Target CFI */
>  #define	PS_ITIMER	0x04000000	/* Virtual interval timers running */
> +#define	PS_PIN		0x08000000	/* ld.so or static syscall pin */
> +#define	PS_LIBCPIN	0x10000000	/* libc.so syscall pin */
>  
>  #define	PS_BITS \
>      ("\20" "\01CONTROLT" "\02EXEC" "\03INEXEC" "\04EXITING" "\05SUGID" \
> Index: sys/sys/syscall_mi.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/syscall_mi.h,v
> diff -u -p -u -r1.29 syscall_mi.h
> --- sys/sys/syscall_mi.h	12 Dec 2023 15:30:55 -0000	1.29
> +++ sys/sys/syscall_mi.h	19 Dec 2023 21:45:31 -0000
> @@ -33,8 +33,11 @@
>  
>  #include <sys/param.h>
>  #include <sys/pledge.h>
> +#include <sys/acct.h>
> +#include <sys/syslog.h>
>  #include <sys/tracepoint.h>
>  #include <sys/syscall.h>
> +#include <sys/signalvar.h>
>  #include <uvm/uvm_extern.h>
>  
>  #ifdef KTRACE
> @@ -46,6 +49,79 @@
>  #include <dev/dt/dtvar.h>
>  #endif
>  
> +/*
> + * Check if a system call is entered from precisely correct location
> + */
> +static inline int
> +pin_check(struct proc *p, register_t code)
> +{
> +	extern char sigcodecall[], sigcoderet[], sigcodecall[];
> +	struct pinsyscall *pin = NULL, *ppin, *plibcpin;
> +	struct process *pr = p->p_p;
> +	vaddr_t addr;
> +	int error = 0;
> +
> +	/* point at start of syscall instruction */
> +	addr = (vaddr_t)PROC_PC(p) - (vaddr_t)(sigcoderet - sigcodecall);
> +	ppin = &pr->ps_pin;
> +	plibcpin = &pr->ps_libcpin;
> +
> +	/*
> +	 * System calls come from the following places, checks are ordered
> +	 * by most common case:
> +	 * 1) dynamic binary: syscalls in libc.so (in the ps_libcpin region)
> +	 * 2a) static binary: syscalls in main program (in the ps_pin region)
> +	 * 2b) dynamic binary: sysalls in ld.so (in the ps_pin region)
> +	 * 3) sigtramp, containing only sigreturn(2)
> +	 */
> +	if (plibcpin->pn_pins &&
> +	    addr >= plibcpin->pn_start && addr < plibcpin->pn_end)
> +		pin = plibcpin;
> +	else if (ppin->pn_pins &&
> +	    addr >= ppin->pn_start && addr < ppin->pn_end)
> +		pin = ppin;
> +	else if (PROC_PC(p) == pr->ps_sigcoderet) {
> +		if (code == SYS_sigreturn)
> +			return (0);
> +		error = EPERM;
> +	}
> +	if (pin) {
> +		if (code >= pin->pn_npins || pin->pn_pins[code] == 0)
> +			error = ENOSYS;
> +		else if (pin->pn_pins[code] + pin->pn_start == addr)
> +			; /* correct location */
> +		else if (pin->pn_pins[code] == (u_int)-1)
> +			; /* multiple locations, hopefully a boring operation */
> +		else
> +			error = ENOSYS;
> +	}
> +	if (error == 0)
> +		return (0);
> +#ifdef KTRACE
> +	if (KTRPOINT(p, KTR_PINSYSCALL))
> +		ktrpinsyscall(p, error, code, addr);
> +#endif
> +	KERNEL_LOCK();
> +	log(LOG_ERR,
> +	    "%s[%d]: pinsyscalls addr %lx code %ld, pinoff 0x%x "
> +	    "(pin%s %d %lx-%lx %lx) (libcpin%s %d %lx-%lx %lx) error %d\n",
> +	    p->p_p->ps_comm, p->p_p->ps_pid, addr, code,
> +	    (pin && code < pin->pn_npins) ? pin->pn_pins[code] : -1,
> +	    pin == ppin ? "(Y)" : "", ppin->pn_npins,
> +	    ppin->pn_start, ppin->pn_end, ppin->pn_end - ppin->pn_start,
> +	    pin == plibcpin ? "(Y)" : "", plibcpin->pn_npins,
> +	    plibcpin->pn_start, plibcpin->pn_end, plibcpin->pn_end - plibcpin->pn_start,
> +	    error);
> +        p->p_p->ps_acflag |= APINSYS;
> +
> +	/* Try to stop threads immediately, because this process is suspect */
> +	if (P_HASSIBLING(p))
> +		single_thread_set(p, SINGLE_UNWIND | SINGLE_DEEP);
> +	/* Send uncatchable SIGABRT for coredump */
> +	sigabort(p);
> +	KERNEL_UNLOCK();
> +	return (error);
> +}
>  
>  /*
>   * The MD setup for a system call has been done; here's the MI part.
> @@ -90,6 +166,9 @@ mi_syscall(struct proc *p, register_t co
>  	    "[%s]%d/%d pc=%lx inside %lx-%lx: bogus syscall\n",
>  	    uvm_map_inentry_pc, p->p_vmspace->vm_map.wserial))
>  		return (EPERM);
> +
> +	if ((error = pin_check(p, code)))
> +		return (error);
>  
>  	pledged = (p->p_p->ps_flags & PS_PLEDGE);
>  	if (pledged && (error = pledge_syscall(p, code, &tval))) {
> Index: sys/uvm/uvm_map.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> diff -u -p -u -r1.319 uvm_map.c
> --- sys/uvm/uvm_map.c	2 Aug 2023 09:19:47 -0000	1.319
> +++ sys/uvm/uvm_map.c	21 Dec 2023 17:55:14 -0000
> @@ -3407,7 +3407,8 @@ uvmspace_exec(struct proc *p, vaddr_t st
>  		 * when a process execs another program image.
>  		 */
>  		vm_map_lock(map);
> -		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE|VM_MAP_SYSCALL_ONCE);
> +		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE |
> +		    VM_MAP_SYSCALL_ONCE | VM_MAP_PINSYSCALL_ONCE);
>  
>  		/*
>  		 * now unmap the old program
> @@ -3944,7 +3945,8 @@ uvmspace_fork(struct process *pr)
>  			    new_map, new_entry->start, new_entry->end);
>  		}
>  	}
> -	new_map->flags |= old_map->flags & VM_MAP_SYSCALL_ONCE;
> +	new_map->flags |= old_map->flags &
> +	    (VM_MAP_SYSCALL_ONCE | VM_MAP_PINSYSCALL_ONCE);
>  #ifdef PMAP_CHECK_COPYIN
>  	if (PMAP_CHECK_COPYIN) {
>  		memcpy(&new_map->check_copyin, &old_map->check_copyin,
> Index: sys/uvm/uvm_map.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_map.h,v
> diff -u -p -u -r1.87 uvm_map.h
> --- sys/uvm/uvm_map.h	2 Aug 2023 09:19:47 -0000	1.87
> +++ sys/uvm/uvm_map.h	21 Dec 2023 15:31:23 -0000
> @@ -329,6 +329,7 @@ struct vm_map {
>  #define	VM_MAP_GUARDPAGES	0x20		/* rw: add guard pgs to map */
>  #define	VM_MAP_ISVMSPACE	0x40		/* ro: map is a vmspace */
>  #define	VM_MAP_SYSCALL_ONCE	0x80		/* rw: libc syscall registered */
> +#define	VM_MAP_PINSYSCALL_ONCE	0x100		/* rw: pinsyscall done */
>  
>  /* Number of kernel maps and entries to statically allocate */
>  #define	MAX_KMAPENT	1024	/* Sufficient to make it to the scheduler. */
> Index: sys/uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> diff -u -p -u -r1.183 uvm_mmap.c
> --- sys/uvm/uvm_mmap.c	7 Dec 2023 13:59:05 -0000	1.183
> +++ sys/uvm/uvm_mmap.c	19 Dec 2023 21:45:31 -0000
> @@ -644,13 +644,65 @@ sys_pinsyscall(struct proc *p, void *v, 
>  	return (0);
>  }
>  
> - /*
> - * sys_pinsyscalls
> +/*
> + * sys_pinsyscalls.  The caller is required to normalize base,len
> + * to the minimum .text region, and adjust pintable offsets relative
> + * to that base.
>   */
>  int
>  sys_pinsyscalls(struct proc *p, void *v, register_t *retval)
>  {
> -	/* STUB until other parts are ready */
> +	struct sys_pinsyscalls_args /* {
> +		syscallarg(void *) base;
> +		syscallarg(size_t) len;
> +		syscallarg(u_int *) pins;
> +		syscallarg(int) npins;
> +	} */ *uap = v;
> +	struct process *pr = p->p_p;
> +	int npins, error = 0, i;
> +	vaddr_t base;
> +	size_t len;
> +	u_int *pins;
> +
> +	if (pr->ps_libcpin.pn_start ||
> +	    (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE))
> +		return (EPERM);
> +	base = (vaddr_t)SCARG(uap, base);
> +	len = (vsize_t)SCARG(uap, len);
> +	if (base > SIZE_MAX - len)
> +		return (EINVAL);	/* disallow wrap-around. */
> +
> +	/* XXX MP unlock */
> +
> +	npins = SCARG(uap, npins);
> +	if (npins < 1 || npins > SYS_MAXSYSCALL * 2)
> +		return (E2BIG);

Since pinsyscalls(2) now takes an array of offsets indexed by
syscall#, the above check should be

    if (npins < 1 || npins > SYS_MAXSYSCALL)

> +	pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
> +	if (pins == NULL)
> +		return (ENOMEM);
> +	error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int));
> +	if (error)
> +		goto err;
> +
> +	/* Range-check pintable offsets */
> +	for (i = 0; i < npins; i++) {
> +		if (pins[i] == (u_int)-1 || pins[i] == 0)
> +			continue;
> +		if (pins[i] > SCARG(uap, len)) {
> +			error = ERANGE;
> +			break;
> +		}
> +	}
> +	if (error) {
> +err:
> +		free(pins, M_PINSYSCALL, npins * sizeof(u_int));
> +		return (error);
> +	}
> +	pr->ps_libcpin.pn_start = base;
> +	pr->ps_libcpin.pn_end = base + len;
> +	pr->ps_libcpin.pn_pins = pins;
> +	pr->ps_libcpin.pn_npins = npins;
> +	pr->ps_flags |= PS_LIBCPIN;
>  	return (0);
>  }
>  
> Index: libexec/ld.so/library.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/library.c,v
> diff -u -p -u -r1.93 library.c
> --- libexec/ld.so/library.c	19 Dec 2023 16:13:22 -0000	1.93
> +++ libexec/ld.so/library.c	19 Dec 2023 21:45:31 -0000
> @@ -99,7 +99,7 @@ elf_object_t *
>  _dl_tryload_shlib(const char *libname, int type, int flags, int nodelete)
>  {
>  	struct range_vector imut, mut;
> -	int	libfile, i;
> +	int	libfile, libc = -1, i;
>  	struct load_list *next_load, *load_list = NULL;
>  	Elf_Addr maxva = 0, minva = ELF_NO_ADDR;
>  	Elf_Addr libaddr, loff, align = _dl_pagesz - 1;
> @@ -109,8 +109,8 @@ _dl_tryload_shlib(const char *libname, i
>  	size_t exec_size = 0;
>  	Elf_Dyn *dynp = NULL;
>  	Elf_Ehdr *ehdr;
> -	Elf_Phdr *phdp;
> -	Elf_Phdr *ptls = NULL;
> +	Elf_Phdr *phdp, *ptls = NULL;
> +	Elf_Phdr *syscall_phdp = NULL;
>  	struct stat sb;
>  
>  #define powerof2(x) ((((x) - 1) & (x)) == 0)
> @@ -139,7 +139,6 @@ _dl_tryload_shlib(const char *libname, i
>  	if (flags & DF_1_NOOPEN) {
>  		_dl_close(libfile);
>  		return NULL;
> -
>  	}
>  
>  	_dl_read(libfile, hbuf, sizeof(hbuf));
> @@ -316,11 +315,30 @@ _dl_tryload_shlib(const char *libname, i
>  			_dl_push_range_size(&mut, phdp->p_vaddr + loff,
>  			    phdp->p_memsz);
>  			break;
> +		case PT_OPENBSD_SYSCALLS:
> +			syscall_phdp = phdp;
> +			break;
>  		default:
>  			break;
>  		}
>  	}
>  
> +	libc = _dl_islibc(dynp, loff);
> +	if (libc) {
> +		if (syscall_phdp)
> +			_dl_pin(libfile, syscall_phdp, (void *)libaddr,
> +			    (size_t)((exec_start + exec_size) - libaddr),
> +			    exec_start, exec_size);
> +
> +		/*
> +		 * XXX msyscall() can be removed once pinsyscalls()
> +		 * is fully operational
> +		 */
> +		/* Request permission for system calls in libc.so's text segment */
> +		if (_dl_msyscall(exec_start, exec_size) == -1)
> +			_dl_printf("msyscall %lx %lx error\n",
> +			    exec_start, exec_size);
> +	}
>  	_dl_close(libfile);
>  
>  	dynp = (Elf_Dyn *)((unsigned long)dynp + loff);
> @@ -328,8 +346,6 @@ _dl_tryload_shlib(const char *libname, i
>  	    (Elf_Phdr *)((char *)libaddr + ehdr->e_phoff), ehdr->e_phnum,type,
>  	    libaddr, loff);
>  	if (object) {
> -		char *soname = (char *)object->Dyn.info[DT_SONAME];
> -
>  		object->load_size = maxva - minva;	/*XXX*/
>  		object->load_list = load_list;
>  		/* set inode, dev from stat info */
> @@ -339,17 +355,10 @@ _dl_tryload_shlib(const char *libname, i
>  		object->nodelete = nodelete;
>  		object->relro_addr = relro_addr;
>  		object->relro_size = relro_size;
> +		object->islibc = libc;
>  		_dl_set_sod(object->load_name, &object->sod);
>  		if (ptls != NULL && ptls->p_memsz)
>  			_dl_set_tls(object, ptls, libaddr, libname);
> -
> -		/* Request permission for system calls in libc.so's text segment */
> -		if (soname != NULL && !_dl_traceld &&
> -		    _dl_strncmp(soname, "libc.so.", 8) == 0) {
> -			if (_dl_msyscall(exec_start, exec_size) == -1)
> -				_dl_printf("msyscall %lx %lx error\n",
> -				    exec_start, exec_size);
> -		}
>  		_dl_bcopy(&mut, &object->mut, sizeof mut);
>  		_dl_bcopy(&imut, &object->imut, sizeof imut);
>  	} else {
> Index: libexec/ld.so/library_mquery.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/library_mquery.c,v
> diff -u -p -u -r1.73 library_mquery.c
> --- libexec/ld.so/library_mquery.c	19 Dec 2023 16:13:22 -0000	1.73
> +++ libexec/ld.so/library_mquery.c	20 Dec 2023 14:56:19 -0000
> @@ -102,15 +102,15 @@ elf_object_t *
>  _dl_tryload_shlib(const char *libname, int type, int flags, int nodelete)
>  {
>  	struct range_vector imut, mut;
> -	int libfile, i;
> +	int libfile, libc = -1, i;
>  	struct load_list *ld, *lowld = NULL;
>  	elf_object_t *object;
>  	Elf_Dyn *dynp = NULL;
>  	Elf_Ehdr *ehdr;
> -	Elf_Phdr *phdp;
> +	Elf_Phdr *phdp, *ptls = NULL;
> +	Elf_Phdr *syscall_phdp = NULL;
>  	Elf_Addr load_end = 0;
>  	Elf_Addr align = _dl_pagesz - 1, off, size;
> -	Elf_Phdr *ptls = NULL;
>  	Elf_Addr relro_addr = 0, relro_size = 0;
>  	struct stat sb;
>  	char hbuf[4096], *exec_start;
> @@ -325,9 +325,28 @@ retry:
>  			_dl_push_range_size(&mut, phdp->p_vaddr + LOFF,
>  			    phdp->p_memsz);
>  			break;
> +		case PT_OPENBSD_SYSCALLS:
> +			syscall_phdp = phdp;
> +			break;
>  		}
>  	}
>  
> +	libc = _dl_islibc(dynp, LOFF);
> +	if (libc) {
> +		if (syscall_phdp)
> +			_dl_pin(libfile, syscall_phdp, lowld->start,
> +			    (size_t)((exec_start + exec_size) - LOFF),
> +			    exec_start, exec_size);
> +
> +		/*
> +		 * XXX msyscall() can be removed once pinsyscalls()
> +		 * is fully operational
> +		 */
> +		/* Request permission for system calls in libc.so's text segment */
> +		if (_dl_msyscall(exec_start, exec_size) == -1)
> +			_dl_printf("msyscall %lx %lx error\n",
> +			    exec_start, exec_size);
> +	}
>  	_dl_close(libfile);
>  
>  	dynp = (Elf_Dyn *)((unsigned long)dynp + LOFF);
> @@ -335,8 +354,6 @@ retry:
>  	    (Elf_Phdr *)((char *)lowld->start + ehdr->e_phoff), ehdr->e_phnum,
>  	    type, (Elf_Addr)lowld->start, LOFF);
>  	if (object) {
> -		char *soname = (char *)object->Dyn.info[DT_SONAME];
> -
>  		object->load_size = (Elf_Addr)load_end - (Elf_Addr)lowld->start;
>  		object->load_list = lowld;
>  		/* set inode, dev from stat info */
> @@ -346,18 +363,11 @@ retry:
>  		object->nodelete = nodelete;
>  		object->relro_addr = relro_addr;
>  		object->relro_size = relro_size;
> +		object->islibc = libc;
>  		_dl_set_sod(object->load_name, &object->sod);
>  		if (ptls != NULL && ptls->p_memsz)
>  			_dl_set_tls(object, ptls, (Elf_Addr)lowld->start,
>  			    libname);
> -
> -		/* Request permission for system calls in libc.so's text segment */
> -		if (soname != NULL && !_dl_traceld &&
> -		    _dl_strncmp(soname, "libc.so.", 8) == 0) {
> -			if (_dl_msyscall(exec_start, exec_size) == -1)
> -				_dl_printf("msyscall %lx %lx error\n",
> -				    exec_start, exec_size);
> -		}
>  		_dl_bcopy(&mut, &object->mut, sizeof mut);
>  		_dl_bcopy(&imut, &object->imut, sizeof imut);
>  	} else {
> Index: libexec/ld.so/loader.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/loader.c,v
> diff -u -p -u -r1.218 loader.c
> --- libexec/ld.so/loader.c	19 Dec 2023 16:13:22 -0000	1.218
> +++ libexec/ld.so/loader.c	19 Dec 2023 21:45:31 -0000
> @@ -410,11 +410,14 @@ _dl_load_dep_libs(elf_object_t *object, 
>  
>  	_dl_cache_grpsym_list_setup(object);
>  
> +	/*
> +	 * XXX pinsyscall(SYS_execve,...) can be removed once pinsyscalls()
> +	 * is fully operational
> +	 */
>  	for (obj = _dl_objects; booting && obj != NULL; obj = obj->next) {
> -		char *soname = (char *)obj->Dyn.info[DT_SONAME];
>  		struct sym_res sr;
>  
> -		if (!soname || _dl_strncmp(soname, "libc.so.", 8))
> +		if (obj->islibc == 0)

Since islibc is treated as a boolean, "if (!obj->islibc)" is probably better.

>  			continue;
>  		sr = _dl_find_symbol("execve",
>  		    SYM_SEARCH_SELF|SYM_PLT|SYM_WARNNOTFOUND, NULL, obj);
> Index: libexec/ld.so/resolve.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/resolve.c,v
> diff -u -p -u -r1.100 resolve.c
> --- libexec/ld.so/resolve.c	8 Jul 2023 14:09:43 -0000	1.100
> +++ libexec/ld.so/resolve.c	19 Dec 2023 21:45:31 -0000
> @@ -29,6 +29,8 @@
>  #define _DYN_LOADER
>  
>  #include <sys/types.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
>  
>  #include <limits.h>
>  #include <link.h>
> @@ -36,6 +38,7 @@
>  #include "util.h"
>  #include "path.h"
>  #include "resolve.h"
> +#include "syscall.h"
>  
>  /* substitution types */
>  typedef enum {
> @@ -744,4 +747,83 @@ void
>  _dl_debug_state(void)
>  {
>  	/* Debugger stub */
> +}
> +
> +/*
> + * Search for DT_SONAME, and check if this is libc
> + */
> +int
> +_dl_islibc(Elf_Dyn *_dynp, Elf_Addr loff)
> +{
> +	Elf_Dyn *d, *dynp = (Elf_Dyn *)((unsigned long)_dynp + loff);
> +	long base = 0;
> +
> +	for (d = dynp; d->d_tag != DT_NULL; d++)
> +		if (d->d_tag == DT_STRTAB) {
> +			base = d->d_un.d_ptr + loff;
> +			break;
> +		}
> +	if (base == 0)
> +		return 0;
> +	for (d = dynp; d->d_tag != DT_NULL; d++)
> +		if (d->d_tag == DT_SONAME) {
> +			if (_dl_strncmp((char *)(base + d->d_un.d_ptr),
> +			    "libc.so.", 8) == 0)
> +				return 1;
> +			break;
> +		}
> +	return 0;
> +}
> +
> +void
> +_dl_pin(int file, Elf_Phdr *phdp, void *base, size_t len,
> +    void *exec_base, size_t exec_size)
> +{
> +	struct pinsyscalls {
> +		u_int offset;
> +		u_int sysno;
> +	} *syscalls;
> +	int npins = 0, nsyscalls, i;
> +	u_int *pins = NULL;
> +	vaddr_t offset;
> +
> +	if (phdp->p_filesz > SYS_MAXSYSCALL * 2 * sizeof(*syscalls) ||
> +	    phdp->p_filesz % sizeof(*syscalls) != 0 ||
> +	    phdp->p_offset & 0x3)
> +		return;
> +	syscalls = _dl_mmap(NULL, phdp->p_filesz, PROT_READ,
> +	    MAP_PRIVATE|MAP_FILE, file, phdp->p_offset);
> +	if (syscalls == MAP_FAILED)
> +		return;
> +
> +	/* Validate, and calculate pintable size */
> +	nsyscalls = phdp->p_filesz / sizeof(*syscalls);
> +	for (i = 0; i < nsyscalls; i++) {
> +		if (syscalls[i].sysno < 0 ||
> +		    syscalls[i].sysno >= SYS_MAXSYSCALL ||
> +		    syscalls[i].offset >= len)
> +			goto bad;
> +		npins = MAXIMUM(npins, syscalls[i].sysno);
> +	}
> +	npins++;
> +
> +	/*
> +	 * Fill pintable: 0 = invalid, -1 = accept, else offset
> +	 * from base, rebase to text_start while at it
> +	 */
> +	pins = _dl_calloc(npins, sizeof(u_int));
> +	offset = exec_base - base;
> +	for (i = 0; i < nsyscalls; i++) {
> +		if (pins[syscalls[i].sysno])
> +			pins[syscalls[i].sysno] = (u_int)-1; /* duplicated */
> +		else
> +			pins[syscalls[i].sysno] = syscalls[i].offset - offset;
> +	}
> +	base += offset;
> +	len = len - offset;
> +bad:
> +	_dl_munmap(syscalls, phdp->p_filesz);
> +	if (pins)
> +		_dl_pinsyscalls(base, len, pins, npins);
> +	_dl_free(pins);
>  }
> Index: libexec/ld.so/resolve.h
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/resolve.h,v
> diff -u -p -u -r1.106 resolve.h
> --- libexec/ld.so/resolve.h	19 Dec 2023 16:13:22 -0000	1.106
> +++ libexec/ld.so/resolve.h	19 Dec 2023 21:45:31 -0000
> @@ -245,6 +245,7 @@ struct elf_object {
>  
>  	struct range_vector imut;
>  	struct range_vector mut;
> +	int islibc;
>  };
>  
>  struct dep_node {
> @@ -339,6 +340,9 @@ void _dl_apply_immutable(elf_object_t *o
>  typedef void lock_cb(int);
>  void	_dl_thread_kern_go(lock_cb *);
>  lock_cb	*_dl_thread_kern_stop(void);
> +
> +int	_dl_islibc(Elf_Dyn *_dynp, Elf_Addr loff);
> +void	_dl_pin(int, Elf_Phdr *, void *, size_t, void *, size_t);
>  
>  char	*_dl_getenv(const char *, char **) __boot;
>  void	_dl_unsetenv(const char *, char **) __boot;
> Index: bin/ps/print.c
> ===================================================================
> RCS file: /cvs/src/bin/ps/print.c,v
> diff -u -p -u -r1.86 print.c
> --- bin/ps/print.c	8 Mar 2023 14:47:02 -0000	1.86
> +++ bin/ps/print.c	19 Dec 2023 21:45:31 -0000
> @@ -303,6 +303,10 @@ printstate(const struct pinfo *pi, VAREN
>  		*cp++ = '+';
>  	if (kp->p_psflags & PS_PLEDGE)
>  		*cp++ = 'p';
> +	if (kp->p_psflags & PS_PIN)
> +		*cp++ = 'l';
> +	if (kp->p_psflags & PS_LIBCPIN)
> +		*cp++ = 'L';
>  	if (kp->p_eflag & EPROC_UNVEIL) {
>  		if (kp->p_eflag & EPROC_LKUNVEIL)
>  			*cp++ = 'U';
> Index: bin/ps/ps.1
> ===================================================================
> RCS file: /cvs/src/bin/ps/ps.1,v
> diff -u -p -u -r1.131 ps.1
> --- bin/ps/ps.1	10 Nov 2023 09:17:02 -0000	1.131
> +++ bin/ps/ps.1	19 Dec 2023 21:45:31 -0000
> @@ -359,6 +359,9 @@ PS_EXECPLEDGE	0x00400000 has exec pledge
>  PS_ORPHAN	0x00800000 process is on an orphan list
>  PS_CHROOT	0x01000000 process is chrooted
>  PS_NOBTCFI	0x02000000 no Branch Target CFI
> +PS_PIN          0x08000000 ld.so or static executable that
> +                           has syscalls pinned
> +PS_LIBCPIN      0x10000000 libc.so has syscalls pinned
>  .Ed
>  .It Cm re
>  Core residency time (in seconds; 127 = infinity).
> @@ -475,6 +478,11 @@ scheduling priority.
>  .It p
>  The process has called
>  .Xr pledge 2 .
> +.It l
> +.Xr ld.so 1
> +or a static executable has syscall pinning.
> +.It L
> +libc.so has syscall pinning.
>  .\" .It S
>  .\" The process has asked for FIFO
>  .\" page replacement
> 
>