Download raw body.
update on pinsyscalls(2)
> From: "Theo de Raadt" <deraadt@openbsd.org>
> Date: Sat, 30 Dec 2023 11:56:35 -0700
>
> The pinsyscalls(2) diff is now much smaller, since many pieces it depends
> upon have been committed.
>
> All the DSOs containing system call entries have the proper annotations for
> kernel and ld.so to do the right thing.
>
> This diff can be applied to a -current system, if people want to play
> along:
>
> cd /usr/src
> make includes ;; to update sys/proc.h mostly
> build new kernel
> build new libexec/ld.so
> build new bin/ps
>
> ps:
> - for static binaries, ps will show 'l' to indicate the
> binary's text segment is doing syscall pinning.
> - for dynamic binaries, ps will show 'l' to indicate that
> ld.so's text segment is doing syscall pinning, and 'L'
> to indicate libc.so's text segment is doing syscall pinning
>
>
> There's a long tail with this diff. Perhaps in a release or two when
> all binaries are known to follow the pinsyscalls(2) rules, we'll be able
> to turn msyscall() and the less powerful pinsyscall(2) into NOPs, and
> eventually remove them.
>
> The more precise pinsyscalls(2) check in syscall_mi.h is O(1) but
> slightly more expensive than the msyscall(2) check which also is O(1) in
> the general case but has a special case when text msyscall-allowed
> segments get crossed (such as when doing ld.so GOT/PLT resolution, or
> signal handlers), then uvm locks occur twice. But we don't need both.
A few small things and one important mistake I spotted below.
Otherwise this looks pretty good to me now.
> Index: sys/kern/exec_elf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/exec_elf.c,v
> diff -u -p -u -r1.183 exec_elf.c
> --- sys/kern/exec_elf.c 12 Jul 2023 19:34:14 -0000 1.183
> +++ sys/kern/exec_elf.c 19 Dec 2023 21:45:31 -0000
> @@ -81,6 +81,7 @@
> #include <sys/ptrace.h>
> #include <sys/signalvar.h>
> #include <sys/pledge.h>
> +#include <sys/syscall.h>
>
> #include <sys/mman.h>
>
> @@ -97,6 +98,8 @@ void elf_load_psection(struct exec_vmcmd
> Elf_Phdr *, Elf_Addr *, Elf_Addr *, int *, int);
> int elf_os_pt_note_name(Elf_Note *);
> int elf_os_pt_note(struct proc *, struct exec_package *, Elf_Ehdr *, int *);
> +int elf_read_pintable(struct proc *p, struct vnode *vp, Elf_Phdr *pp,
> + u_int **pinp, int is_ldso, size_t len);
>
> /* round up and down to page boundaries. */
> #define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1))
> @@ -266,6 +269,74 @@ elf_read_from(struct proc *p, struct vno
> }
>
> /*
> + * rebase the pin offsets inside a base,len window for the text segment only.
> + */
> +void
> +elf_adjustpins(vaddr_t *basep, size_t *lenp, u_int *pins, int npins, u_int offset)
> +{
> + int i;
> +
> + /* Adjust offsets, base, len */
> + for (i = 0; i < npins; i++) {
> + if (pins[i] == -1 || pins[i] == 0)
> + continue;
> + pins[i] -= offset;
> + }
> + *basep += offset;
> + *lenp -= offset;
> +}
> +
> +int
> +elf_read_pintable(struct proc *p, struct vnode *vp, Elf_Phdr *pp,
> + u_int **pinp, int is_ldso, size_t len)
> +{
> + struct pinsyscalls {
> + u_int offset;
> + u_int sysno;
> + } *syscalls = NULL;
> + int i, nsyscalls = 0, npins = 0;
> + u_int *pins = NULL;
> +
> + if (pp->p_filesz > SYS_MAXSYSCALL * 2 * sizeof(*syscalls) ||
> + pp->p_filesz % sizeof(*syscalls) != 0)
> + goto bad;
> + nsyscalls = pp->p_filesz / sizeof(*syscalls);
> + syscalls = malloc(pp->p_filesz, M_PINSYSCALL, M_WAITOK);
> + if (elf_read_from(p, vp, pp->p_offset, syscalls,
> + pp->p_filesz) != 0)
> + goto bad;
> +
> + /* Validate, and calculate pintable size */
> + for (i = 0; i < nsyscalls; i++) {
> + if (syscalls[i].sysno <= 0 ||
> + syscalls[i].sysno >= SYS_MAXSYSCALL ||
> + syscalls[i].offset > len)
> + goto bad;
> + npins = MAX(npins, syscalls[i].sysno);
> + }
> + if (is_ldso)
> + npins = MAX(npins, SYS_kbind); /* XXX see ld.so/loader.c */
> + npins++;
> +
> + /* Fill pintable: 0 = invalid, -1 = allowed, else offset from base */
> + pins = mallocarray(npins, sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
> + for (i = 0; i < nsyscalls; i++) {
> + if (pins[syscalls[i].sysno])
> + pins[syscalls[i].sysno] = -1; /* duplicated */
> + else
> + pins[syscalls[i].sysno] = syscalls[i].offset;
> + }
> + if (is_ldso)
> + pins[SYS_kbind] = -1; /* XXX see ld.so/loader.c */
> + *pinp = pins;
> + pins = NULL;
> +bad:
> + free(syscalls, M_PINSYSCALL, nsyscalls * sizeof(*syscalls));
> + free(pins, M_PINSYSCALL, npins * sizeof(u_int));
> + return npins;
> +}
> +
> +/*
> * Load a file (interpreter/library) pointed to by path [stolen from
> * coff_load_shlib()]. Made slightly generic so it might be used externally.
> */
> @@ -276,7 +347,7 @@ elf_load_file(struct proc *p, char *path
> int error, i;
> struct nameidata nd;
> Elf_Ehdr eh;
> - Elf_Phdr *ph = NULL;
> + Elf_Phdr *ph = NULL, *syscall_ph = NULL;
> u_long phsize = 0;
> Elf_Addr addr;
> struct vnode *vp;
> @@ -290,6 +361,7 @@ elf_load_file(struct proc *p, char *path
> int file_align;
> int loop;
> size_t randomizequota = ELF_RANDOMIZE_LIMIT;
> + vaddr_t text_start = -1, text_end = 0;
>
> NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
> nd.ni_pledge = PLEDGE_RPATH;
> @@ -432,6 +504,12 @@ elf_load_file(struct proc *p, char *path
> epp->ep_entry += pos;
> ap->arg_interp = pos;
> }
> + if (prot & PROT_EXEC) {
> + if (addr < text_start)
> + text_start = addr;
> + if (addr+size >= text_end)
> + text_end = addr + size;
> + }
> addr += size;
> break;
>
> @@ -461,12 +539,34 @@ elf_load_file(struct proc *p, char *path
> NEW_VMCMD(&epp->ep_vmcmds, vmcmd_mutable,
> ph[i].p_memsz, ph[i].p_vaddr + pos, NULLVP, 0, 0);
> break;
> -
> + case PT_OPENBSD_SYSCALLS:
> + syscall_ph = &ph[i];
> + break;
> default:
> break;
> }
> }
>
> + if (syscall_ph) {
> + struct process *pr = p->p_p;
> + vaddr_t base = pos;
> + size_t len = text_end;
> + u_int *pins;
> + int npins;
> +
> + npins = elf_read_pintable(p, nd.ni_vp, syscall_ph,
> + &pins, 1, len);
> + if (npins) {
> + elf_adjustpins(&base, &len, pins, npins,
> + text_start);
> + pr->ps_pin.pn_start = base;
> + pr->ps_pin.pn_end = base + len;
> + pr->ps_pin.pn_pins = pins;
> + pr->ps_pin.pn_npins = npins;
> + pr->ps_flags |= PS_PIN;
> + }
> + }
> +
> vn_marktext(nd.ni_vp);
>
> bad1:
> @@ -491,8 +591,8 @@ int
> exec_elf_makecmds(struct proc *p, struct exec_package *epp)
> {
> Elf_Ehdr *eh = epp->ep_hdr;
> - Elf_Phdr *ph, *pp, *base_ph = NULL;
> - Elf_Addr phdr = 0, exe_base = 0;
> + Elf_Phdr *ph, *pp, *base_ph = NULL, *syscall_ph = NULL;
> + Elf_Addr phdr = 0, exe_base = 0, exe_end = 0;
> int error, i, has_phdr = 0, names = 0, textrel = 0;
> char *interp = NULL;
> u_long phsize;
> @@ -633,11 +733,13 @@ exec_elf_makecmds(struct proc *p, struct
>
> /*
> * Permit system calls in main-text static binaries.
> - * Also block the ld.so syscall-grant
> + * static binaries may not call msyscall() or
> + * pinsyscalls()
> */
> if (interp == NULL) {
> syscall = VMCMD_SYSCALL;
> p->p_vmspace->vm_map.flags |= VM_MAP_SYSCALL_ONCE;
> + p->p_vmspace->vm_map.flags |= VM_MAP_PINSYSCALL_ONCE;
> }
>
> /*
> @@ -696,6 +798,9 @@ exec_elf_makecmds(struct proc *p, struct
> epp->ep_tsize = addr+size -
> epp->ep_taddr;
> }
> + if (interp == NULL)
> + exe_end = epp->ep_taddr +
> + epp->ep_tsize; /* end of TEXT */
> }
> break;
>
> @@ -735,13 +840,35 @@ exec_elf_makecmds(struct proc *p, struct
> NEW_VMCMD(&epp->ep_vmcmds, vmcmd_mutable,
> ph[i].p_memsz, ph[i].p_vaddr + exe_base, NULLVP, 0, 0);
> break;
> -
> + case PT_OPENBSD_SYSCALLS:
> + if (interp == NULL)
> + syscall_ph = &ph[i];
> + break;
> default:
> /*
> * Not fatal, we don't need to understand everything
> * :-)
> */
> break;
> + }
> + }
> +
> + if (syscall_ph) {
> + vaddr_t base = exe_base;
> + size_t len = exe_end - exe_base;
> + u_int *pins;
> + int npins;
> +
> + npins = elf_read_pintable(p, epp->ep_vp, syscall_ph,
> + &pins, 0, len);
> + if (npins) {
> + elf_adjustpins(&base, &len, pins, npins,
> + epp->ep_taddr - exe_base);
> + epp->ep_pinstart = base;
> + epp->ep_pinend = base + len;
> + epp->ep_pins = pins;
> + epp->ep_npins = npins;
> + p->p_p->ps_flags |= PS_PIN;
> }
> }
>
> Index: sys/kern/kern_exec.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_exec.c,v
> diff -u -p -u -r1.252 kern_exec.c
> --- sys/kern/kern_exec.c 30 Oct 2023 07:13:10 -0000 1.252
> +++ sys/kern/kern_exec.c 19 Dec 2023 21:45:31 -0000
> @@ -314,6 +314,8 @@ sys_execve(struct proc *p, void *v, regi
> VMCMDSET_INIT(&pack.ep_vmcmds);
> pack.ep_vap = &attr;
> pack.ep_flags = 0;
> + pack.ep_pins = NULL;
> + pack.ep_npins = 0;
>
> /* see if we can run it. */
> if ((error = check_exec(p, &pack)) != 0) {
> @@ -514,6 +516,30 @@ sys_execve(struct proc *p, void *v, regi
> if (copyout(&arginfo, (char *)pr->ps_strings, sizeof(arginfo)))
> goto exec_abort;
>
> + free(pr->ps_pin.pn_pins, M_PINSYSCALL,
> + pr->ps_pin.pn_npins * sizeof(u_int));
> + if (pack.ep_npins) {
> + pr->ps_pin.pn_start = pack.ep_pinstart;
> + pr->ps_pin.pn_end = pack.ep_pinend;
> + pr->ps_pin.pn_pins = pack.ep_pins;
> + pack.ep_pins = NULL;
> + pr->ps_pin.pn_npins = pack.ep_npins;
> + pr->ps_flags |= PS_PIN;
> + } else {
> + pr->ps_pin.pn_start = pr->ps_pin.pn_end = 0;
> + pr->ps_pin.pn_pins = NULL;
> + pr->ps_pin.pn_npins = 0;
> + pr->ps_flags &= ~PS_PIN;
> + }
> + if (pr->ps_libcpin.pn_pins) {
> + free(pr->ps_libcpin.pn_pins, M_PINSYSCALL,
> + pr->ps_libcpin.pn_npins * sizeof(u_int));
> + pr->ps_libcpin.pn_start = pr->ps_libcpin.pn_end = 0;
> + pr->ps_libcpin.pn_pins = NULL;
> + pr->ps_libcpin.pn_npins = 0;
> + pr->ps_flags &= ~PS_LIBCPIN;
> + }
> +
> stopprofclock(pr); /* stop profiling */
> fdcloseexec(p); /* handle close on exec */
> execsigs(p); /* reset caught signals */
> @@ -752,6 +778,7 @@ bad:
> if (pack.ep_interp != NULL)
> pool_put(&namei_pool, pack.ep_interp);
> free(pack.ep_args, M_TEMP, sizeof *pack.ep_args);
> + free(pack.ep_pins, M_PINSYSCALL, pack.ep_npins * sizeof(u_int));
> /* close and put the exec'd file */
> vn_close(pack.ep_vp, FREAD, cred, p);
> pool_put(&namei_pool, nid.ni_cnd.cn_pnbuf);
> Index: sys/kern/kern_exit.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_exit.c,v
> diff -u -p -u -r1.217 kern_exit.c
> --- sys/kern/kern_exit.c 29 Sep 2023 12:47:34 -0000 1.217
> +++ sys/kern/kern_exit.c 19 Dec 2023 21:45:31 -0000
> @@ -215,6 +215,11 @@ exit1(struct proc *p, int xexit, int xsi
>
> unveil_destroy(pr);
>
> + free(pr->ps_pin.pn_pins, M_PINSYSCALL,
> + pr->ps_pin.pn_npins * sizeof(u_int));
> + free(pr->ps_libcpin.pn_pins, M_PINSYSCALL,
> + pr->ps_libcpin.pn_npins * sizeof(u_int));
> +
> /*
> * If parent has the SAS_NOCLDWAIT flag set, we're not
> * going to become a zombie.
> Index: sys/kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> diff -u -p -u -r1.253 kern_fork.c
> --- sys/kern/kern_fork.c 24 Oct 2023 13:20:11 -0000 1.253
> +++ sys/kern/kern_fork.c 19 Dec 2023 21:45:31 -0000
> @@ -248,6 +248,21 @@ process_new(struct proc *p, struct proce
> if (parent->ps_session->s_ttyvp != NULL)
> pr->ps_flags |= parent->ps_flags & PS_CONTROLT;
>
> + if (parent->ps_pin.pn_pins) {
> + pr->ps_pin.pn_pins = mallocarray(parent->ps_pin.pn_npins,
> + sizeof(u_int), M_PINSYSCALL, M_WAITOK);
> + memcpy(pr->ps_pin.pn_pins, parent->ps_pin.pn_pins,
> + parent->ps_pin.pn_npins * sizeof(u_int));
> + pr->ps_flags |= PS_PIN;
> + }
> + if (parent->ps_libcpin.pn_pins) {
> + pr->ps_libcpin.pn_pins = mallocarray(parent->ps_libcpin.pn_npins,
> + sizeof(u_int), M_PINSYSCALL, M_WAITOK);
> + memcpy(pr->ps_libcpin.pn_pins, parent->ps_libcpin.pn_pins,
> + parent->ps_libcpin.pn_npins * sizeof(u_int));
> + pr->ps_flags |= PS_LIBCPIN;
> + }
> +
> /*
> * Duplicate sub-structures as needed.
> * Increase reference counts on shared objects.
> Index: sys/sys/exec.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/exec.h,v
> diff -u -p -u -r1.52 exec.h
> --- sys/sys/exec.h 19 Apr 2023 15:37:36 -0000 1.52
> +++ sys/sys/exec.h 19 Dec 2023 21:45:31 -0000
> @@ -131,6 +131,9 @@ struct exec_package {
> struct elf_args *ep_args; /* ELF info */
> void *ep_auxinfo; /* userspace auxinfo address */
> char *ep_interp; /* name of interpreter if any */
> + vaddr_t ep_pinstart, ep_pinend; /* executable region */
> + u_int *ep_pins; /* array of system call offsets */
> + int ep_npins; /* entries in array */
> };
> #define EXEC_INDIR 0x0001 /* script handling already done */
> #define EXEC_HASFD 0x0002 /* holding a shell script */
> Index: sys/sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> diff -u -p -u -r1.352 proc.h
> --- sys/sys/proc.h 29 Sep 2023 12:47:34 -0000 1.352
> +++ sys/sys/proc.h 19 Dec 2023 21:45:31 -0000
> @@ -117,6 +117,13 @@ struct tslpentry;
> TAILQ_HEAD(tslpqueue, tslpentry);
> struct unveil;
>
> +struct pinsyscall {
> + vaddr_t pn_start;
> + vaddr_t pn_end;
> + u_int *pn_pins; /* array of offset indexed by syscall# */
s/offset/offsets/?
> + int pn_npins; /* number of entries in table */
> +};
> +
> /*
> * Locks used to protect struct members in this file:
> * I immutable after creation
> @@ -240,6 +247,9 @@ struct process {
> /* an address that can't be in userspace or kernelspace */
> #define BOGO_PC (u_long)-1
>
> + struct pinsyscall ps_pin; /* static or ld.so */
> + struct pinsyscall ps_libcpin; /* libc.so, from pinsyscalls(2) */
> +
> /* End area that is copied on creation. */
> #define ps_endcopy ps_threadcnt
> u_int ps_threadcnt; /* Number of threads. */
> @@ -283,6 +293,8 @@ struct process {
> #define PS_CHROOT 0x01000000 /* Process is chrooted */
> #define PS_NOBTCFI 0x02000000 /* No Branch Target CFI */
> #define PS_ITIMER 0x04000000 /* Virtual interval timers running */
> +#define PS_PIN 0x08000000 /* ld.so or static syscall pin */
> +#define PS_LIBCPIN 0x10000000 /* libc.so syscall pin */
>
> #define PS_BITS \
> ("\20" "\01CONTROLT" "\02EXEC" "\03INEXEC" "\04EXITING" "\05SUGID" \
> Index: sys/sys/syscall_mi.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/syscall_mi.h,v
> diff -u -p -u -r1.29 syscall_mi.h
> --- sys/sys/syscall_mi.h 12 Dec 2023 15:30:55 -0000 1.29
> +++ sys/sys/syscall_mi.h 19 Dec 2023 21:45:31 -0000
> @@ -33,8 +33,11 @@
>
> #include <sys/param.h>
> #include <sys/pledge.h>
> +#include <sys/acct.h>
> +#include <sys/syslog.h>
> #include <sys/tracepoint.h>
> #include <sys/syscall.h>
> +#include <sys/signalvar.h>
> #include <uvm/uvm_extern.h>
>
> #ifdef KTRACE
> @@ -46,6 +49,79 @@
> #include <dev/dt/dtvar.h>
> #endif
>
> +/*
> + * Check if a system call is entered from precisely correct location
> + */
> +static inline int
> +pin_check(struct proc *p, register_t code)
> +{
> + extern char sigcodecall[], sigcoderet[], sigcodecall[];
> + struct pinsyscall *pin = NULL, *ppin, *plibcpin;
> + struct process *pr = p->p_p;
> + vaddr_t addr;
> + int error = 0;
> +
> + /* point at start of syscall instruction */
> + addr = (vaddr_t)PROC_PC(p) - (vaddr_t)(sigcoderet - sigcodecall);
> + ppin = &pr->ps_pin;
> + plibcpin = &pr->ps_libcpin;
> +
> + /*
> + * System calls come from the following places, checks are ordered
> + * by most common case:
> + * 1) dynamic binary: syscalls in libc.so (in the ps_libcpin region)
> + * 2a) static binary: syscalls in main program (in the ps_pin region)
> + * 2b) dynamic binary: sysalls in ld.so (in the ps_pin region)
> + * 3) sigtramp, containing only sigreturn(2)
> + */
> + if (plibcpin->pn_pins &&
> + addr >= plibcpin->pn_start && addr < plibcpin->pn_end)
> + pin = plibcpin;
> + else if (ppin->pn_pins &&
> + addr >= ppin->pn_start && addr < ppin->pn_end)
> + pin = ppin;
> + else if (PROC_PC(p) == pr->ps_sigcoderet) {
> + if (code == SYS_sigreturn)
> + return (0);
> + error = EPERM;
> + }
> + if (pin) {
> + if (code >= pin->pn_npins || pin->pn_pins[code] == 0)
> + error = ENOSYS;
> + else if (pin->pn_pins[code] + pin->pn_start == addr)
> + ; /* correct location */
> + else if (pin->pn_pins[code] == (u_int)-1)
> + ; /* multiple locations, hopefully a boring operation */
> + else
> + error = ENOSYS;
> + }
> + if (error == 0)
> + return (0);
> +#ifdef KTRACE
> + if (KTRPOINT(p, KTR_PINSYSCALL))
> + ktrpinsyscall(p, error, code, addr);
> +#endif
> + KERNEL_LOCK();
> + log(LOG_ERR,
> + "%s[%d]: pinsyscalls addr %lx code %ld, pinoff 0x%x "
> + "(pin%s %d %lx-%lx %lx) (libcpin%s %d %lx-%lx %lx) error %d\n",
> + p->p_p->ps_comm, p->p_p->ps_pid, addr, code,
> + (pin && code < pin->pn_npins) ? pin->pn_pins[code] : -1,
> + pin == ppin ? "(Y)" : "", ppin->pn_npins,
> + ppin->pn_start, ppin->pn_end, ppin->pn_end - ppin->pn_start,
> + pin == plibcpin ? "(Y)" : "", plibcpin->pn_npins,
> + plibcpin->pn_start, plibcpin->pn_end, plibcpin->pn_end - plibcpin->pn_start,
> + error);
> + p->p_p->ps_acflag |= APINSYS;
> +
> + /* Try to stop threads immediately, because this process is suspect */
> + if (P_HASSIBLING(p))
> + single_thread_set(p, SINGLE_UNWIND | SINGLE_DEEP);
> + /* Send uncatchable SIGABRT for coredump */
> + sigabort(p);
> + KERNEL_UNLOCK();
> + return (error);
> +}
>
> /*
> * The MD setup for a system call has been done; here's the MI part.
> @@ -90,6 +166,9 @@ mi_syscall(struct proc *p, register_t co
> "[%s]%d/%d pc=%lx inside %lx-%lx: bogus syscall\n",
> uvm_map_inentry_pc, p->p_vmspace->vm_map.wserial))
> return (EPERM);
> +
> + if ((error = pin_check(p, code)))
> + return (error);
>
> pledged = (p->p_p->ps_flags & PS_PLEDGE);
> if (pledged && (error = pledge_syscall(p, code, &tval))) {
> Index: sys/uvm/uvm_map.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> diff -u -p -u -r1.319 uvm_map.c
> --- sys/uvm/uvm_map.c 2 Aug 2023 09:19:47 -0000 1.319
> +++ sys/uvm/uvm_map.c 21 Dec 2023 17:55:14 -0000
> @@ -3407,7 +3407,8 @@ uvmspace_exec(struct proc *p, vaddr_t st
> * when a process execs another program image.
> */
> vm_map_lock(map);
> - vm_map_modflags(map, 0, VM_MAP_WIREFUTURE|VM_MAP_SYSCALL_ONCE);
> + vm_map_modflags(map, 0, VM_MAP_WIREFUTURE |
> + VM_MAP_SYSCALL_ONCE | VM_MAP_PINSYSCALL_ONCE);
>
> /*
> * now unmap the old program
> @@ -3944,7 +3945,8 @@ uvmspace_fork(struct process *pr)
> new_map, new_entry->start, new_entry->end);
> }
> }
> - new_map->flags |= old_map->flags & VM_MAP_SYSCALL_ONCE;
> + new_map->flags |= old_map->flags &
> + (VM_MAP_SYSCALL_ONCE | VM_MAP_PINSYSCALL_ONCE);
> #ifdef PMAP_CHECK_COPYIN
> if (PMAP_CHECK_COPYIN) {
> memcpy(&new_map->check_copyin, &old_map->check_copyin,
> Index: sys/uvm/uvm_map.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_map.h,v
> diff -u -p -u -r1.87 uvm_map.h
> --- sys/uvm/uvm_map.h 2 Aug 2023 09:19:47 -0000 1.87
> +++ sys/uvm/uvm_map.h 21 Dec 2023 15:31:23 -0000
> @@ -329,6 +329,7 @@ struct vm_map {
> #define VM_MAP_GUARDPAGES 0x20 /* rw: add guard pgs to map */
> #define VM_MAP_ISVMSPACE 0x40 /* ro: map is a vmspace */
> #define VM_MAP_SYSCALL_ONCE 0x80 /* rw: libc syscall registered */
> +#define VM_MAP_PINSYSCALL_ONCE 0x100 /* rw: pinsyscall done */
>
> /* Number of kernel maps and entries to statically allocate */
> #define MAX_KMAPENT 1024 /* Sufficient to make it to the scheduler. */
> Index: sys/uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> diff -u -p -u -r1.183 uvm_mmap.c
> --- sys/uvm/uvm_mmap.c 7 Dec 2023 13:59:05 -0000 1.183
> +++ sys/uvm/uvm_mmap.c 19 Dec 2023 21:45:31 -0000
> @@ -644,13 +644,65 @@ sys_pinsyscall(struct proc *p, void *v,
> return (0);
> }
>
> - /*
> - * sys_pinsyscalls
> +/*
> + * sys_pinsyscalls. The caller is required to normalize base,len
> + * to the minimum .text region, and adjust pintable offsets relative
> + * to that base.
> */
> int
> sys_pinsyscalls(struct proc *p, void *v, register_t *retval)
> {
> - /* STUB until other parts are ready */
> + struct sys_pinsyscalls_args /* {
> + syscallarg(void *) base;
> + syscallarg(size_t) len;
> + syscallarg(u_int *) pins;
> + syscallarg(int) npins;
> + } */ *uap = v;
> + struct process *pr = p->p_p;
> + int npins, error = 0, i;
> + vaddr_t base;
> + size_t len;
> + u_int *pins;
> +
> + if (pr->ps_libcpin.pn_start ||
> + (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE))
> + return (EPERM);
> + base = (vaddr_t)SCARG(uap, base);
> + len = (vsize_t)SCARG(uap, len);
> + if (base > SIZE_MAX - len)
> + return (EINVAL); /* disallow wrap-around. */
> +
> + /* XXX MP unlock */
> +
> + npins = SCARG(uap, npins);
> + if (npins < 1 || npins > SYS_MAXSYSCALL * 2)
> + return (E2BIG);
Since pinsyscalls(2) now takes an array of offsets indexed by
syscall#, the above check should be
if (npins < 1 || npins > SYS_MAXSYSCALL)
> + pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
> + if (pins == NULL)
> + return (ENOMEM);
> + error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int));
> + if (error)
> + goto err;
> +
> + /* Range-check pintable offsets */
> + for (i = 0; i < npins; i++) {
> + if (pins[i] == (u_int)-1 || pins[i] == 0)
> + continue;
> + if (pins[i] > SCARG(uap, len)) {
> + error = ERANGE;
> + break;
> + }
> + }
> + if (error) {
> +err:
> + free(pins, M_PINSYSCALL, npins * sizeof(u_int));
> + return (error);
> + }
> + pr->ps_libcpin.pn_start = base;
> + pr->ps_libcpin.pn_end = base + len;
> + pr->ps_libcpin.pn_pins = pins;
> + pr->ps_libcpin.pn_npins = npins;
> + pr->ps_flags |= PS_LIBCPIN;
> return (0);
> }
>
> Index: libexec/ld.so/library.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/library.c,v
> diff -u -p -u -r1.93 library.c
> --- libexec/ld.so/library.c 19 Dec 2023 16:13:22 -0000 1.93
> +++ libexec/ld.so/library.c 19 Dec 2023 21:45:31 -0000
> @@ -99,7 +99,7 @@ elf_object_t *
> _dl_tryload_shlib(const char *libname, int type, int flags, int nodelete)
> {
> struct range_vector imut, mut;
> - int libfile, i;
> + int libfile, libc = -1, i;
> struct load_list *next_load, *load_list = NULL;
> Elf_Addr maxva = 0, minva = ELF_NO_ADDR;
> Elf_Addr libaddr, loff, align = _dl_pagesz - 1;
> @@ -109,8 +109,8 @@ _dl_tryload_shlib(const char *libname, i
> size_t exec_size = 0;
> Elf_Dyn *dynp = NULL;
> Elf_Ehdr *ehdr;
> - Elf_Phdr *phdp;
> - Elf_Phdr *ptls = NULL;
> + Elf_Phdr *phdp, *ptls = NULL;
> + Elf_Phdr *syscall_phdp = NULL;
> struct stat sb;
>
> #define powerof2(x) ((((x) - 1) & (x)) == 0)
> @@ -139,7 +139,6 @@ _dl_tryload_shlib(const char *libname, i
> if (flags & DF_1_NOOPEN) {
> _dl_close(libfile);
> return NULL;
> -
> }
>
> _dl_read(libfile, hbuf, sizeof(hbuf));
> @@ -316,11 +315,30 @@ _dl_tryload_shlib(const char *libname, i
> _dl_push_range_size(&mut, phdp->p_vaddr + loff,
> phdp->p_memsz);
> break;
> + case PT_OPENBSD_SYSCALLS:
> + syscall_phdp = phdp;
> + break;
> default:
> break;
> }
> }
>
> + libc = _dl_islibc(dynp, loff);
> + if (libc) {
> + if (syscall_phdp)
> + _dl_pin(libfile, syscall_phdp, (void *)libaddr,
> + (size_t)((exec_start + exec_size) - libaddr),
> + exec_start, exec_size);
> +
> + /*
> + * XXX msyscall() can be removed once pinsyscalls()
> + * is fully operational
> + */
> + /* Request permission for system calls in libc.so's text segment */
> + if (_dl_msyscall(exec_start, exec_size) == -1)
> + _dl_printf("msyscall %lx %lx error\n",
> + exec_start, exec_size);
> + }
> _dl_close(libfile);
>
> dynp = (Elf_Dyn *)((unsigned long)dynp + loff);
> @@ -328,8 +346,6 @@ _dl_tryload_shlib(const char *libname, i
> (Elf_Phdr *)((char *)libaddr + ehdr->e_phoff), ehdr->e_phnum,type,
> libaddr, loff);
> if (object) {
> - char *soname = (char *)object->Dyn.info[DT_SONAME];
> -
> object->load_size = maxva - minva; /*XXX*/
> object->load_list = load_list;
> /* set inode, dev from stat info */
> @@ -339,17 +355,10 @@ _dl_tryload_shlib(const char *libname, i
> object->nodelete = nodelete;
> object->relro_addr = relro_addr;
> object->relro_size = relro_size;
> + object->islibc = libc;
> _dl_set_sod(object->load_name, &object->sod);
> if (ptls != NULL && ptls->p_memsz)
> _dl_set_tls(object, ptls, libaddr, libname);
> -
> - /* Request permission for system calls in libc.so's text segment */
> - if (soname != NULL && !_dl_traceld &&
> - _dl_strncmp(soname, "libc.so.", 8) == 0) {
> - if (_dl_msyscall(exec_start, exec_size) == -1)
> - _dl_printf("msyscall %lx %lx error\n",
> - exec_start, exec_size);
> - }
> _dl_bcopy(&mut, &object->mut, sizeof mut);
> _dl_bcopy(&imut, &object->imut, sizeof imut);
> } else {
> Index: libexec/ld.so/library_mquery.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/library_mquery.c,v
> diff -u -p -u -r1.73 library_mquery.c
> --- libexec/ld.so/library_mquery.c 19 Dec 2023 16:13:22 -0000 1.73
> +++ libexec/ld.so/library_mquery.c 20 Dec 2023 14:56:19 -0000
> @@ -102,15 +102,15 @@ elf_object_t *
> _dl_tryload_shlib(const char *libname, int type, int flags, int nodelete)
> {
> struct range_vector imut, mut;
> - int libfile, i;
> + int libfile, libc = -1, i;
> struct load_list *ld, *lowld = NULL;
> elf_object_t *object;
> Elf_Dyn *dynp = NULL;
> Elf_Ehdr *ehdr;
> - Elf_Phdr *phdp;
> + Elf_Phdr *phdp, *ptls = NULL;
> + Elf_Phdr *syscall_phdp = NULL;
> Elf_Addr load_end = 0;
> Elf_Addr align = _dl_pagesz - 1, off, size;
> - Elf_Phdr *ptls = NULL;
> Elf_Addr relro_addr = 0, relro_size = 0;
> struct stat sb;
> char hbuf[4096], *exec_start;
> @@ -325,9 +325,28 @@ retry:
> _dl_push_range_size(&mut, phdp->p_vaddr + LOFF,
> phdp->p_memsz);
> break;
> + case PT_OPENBSD_SYSCALLS:
> + syscall_phdp = phdp;
> + break;
> }
> }
>
> + libc = _dl_islibc(dynp, LOFF);
> + if (libc) {
> + if (syscall_phdp)
> + _dl_pin(libfile, syscall_phdp, lowld->start,
> + (size_t)((exec_start + exec_size) - LOFF),
> + exec_start, exec_size);
> +
> + /*
> + * XXX msyscall() can be removed once pinsyscalls()
> + * is fully operational
> + */
> + /* Request permission for system calls in libc.so's text segment */
> + if (_dl_msyscall(exec_start, exec_size) == -1)
> + _dl_printf("msyscall %lx %lx error\n",
> + exec_start, exec_size);
> + }
> _dl_close(libfile);
>
> dynp = (Elf_Dyn *)((unsigned long)dynp + LOFF);
> @@ -335,8 +354,6 @@ retry:
> (Elf_Phdr *)((char *)lowld->start + ehdr->e_phoff), ehdr->e_phnum,
> type, (Elf_Addr)lowld->start, LOFF);
> if (object) {
> - char *soname = (char *)object->Dyn.info[DT_SONAME];
> -
> object->load_size = (Elf_Addr)load_end - (Elf_Addr)lowld->start;
> object->load_list = lowld;
> /* set inode, dev from stat info */
> @@ -346,18 +363,11 @@ retry:
> object->nodelete = nodelete;
> object->relro_addr = relro_addr;
> object->relro_size = relro_size;
> + object->islibc = libc;
> _dl_set_sod(object->load_name, &object->sod);
> if (ptls != NULL && ptls->p_memsz)
> _dl_set_tls(object, ptls, (Elf_Addr)lowld->start,
> libname);
> -
> - /* Request permission for system calls in libc.so's text segment */
> - if (soname != NULL && !_dl_traceld &&
> - _dl_strncmp(soname, "libc.so.", 8) == 0) {
> - if (_dl_msyscall(exec_start, exec_size) == -1)
> - _dl_printf("msyscall %lx %lx error\n",
> - exec_start, exec_size);
> - }
> _dl_bcopy(&mut, &object->mut, sizeof mut);
> _dl_bcopy(&imut, &object->imut, sizeof imut);
> } else {
> Index: libexec/ld.so/loader.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/loader.c,v
> diff -u -p -u -r1.218 loader.c
> --- libexec/ld.so/loader.c 19 Dec 2023 16:13:22 -0000 1.218
> +++ libexec/ld.so/loader.c 19 Dec 2023 21:45:31 -0000
> @@ -410,11 +410,14 @@ _dl_load_dep_libs(elf_object_t *object,
>
> _dl_cache_grpsym_list_setup(object);
>
> + /*
> + * XXX pinsyscall(SYS_execve,...) can be removed once pinsyscalls()
> + * is fully operational
> + */
> for (obj = _dl_objects; booting && obj != NULL; obj = obj->next) {
> - char *soname = (char *)obj->Dyn.info[DT_SONAME];
> struct sym_res sr;
>
> - if (!soname || _dl_strncmp(soname, "libc.so.", 8))
> + if (obj->islibc == 0)
Since islibc is treated as a boolean, "if (!obj->islibc)" is probably better.
> continue;
> sr = _dl_find_symbol("execve",
> SYM_SEARCH_SELF|SYM_PLT|SYM_WARNNOTFOUND, NULL, obj);
> Index: libexec/ld.so/resolve.c
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/resolve.c,v
> diff -u -p -u -r1.100 resolve.c
> --- libexec/ld.so/resolve.c 8 Jul 2023 14:09:43 -0000 1.100
> +++ libexec/ld.so/resolve.c 19 Dec 2023 21:45:31 -0000
> @@ -29,6 +29,8 @@
> #define _DYN_LOADER
>
> #include <sys/types.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
>
> #include <limits.h>
> #include <link.h>
> @@ -36,6 +38,7 @@
> #include "util.h"
> #include "path.h"
> #include "resolve.h"
> +#include "syscall.h"
>
> /* substitution types */
> typedef enum {
> @@ -744,4 +747,83 @@ void
> _dl_debug_state(void)
> {
> /* Debugger stub */
> +}
> +
> +/*
> + * Search for DT_SONAME, and check if this is libc
> + */
> +int
> +_dl_islibc(Elf_Dyn *_dynp, Elf_Addr loff)
> +{
> + Elf_Dyn *d, *dynp = (Elf_Dyn *)((unsigned long)_dynp + loff);
> + long base = 0;
> +
> + for (d = dynp; d->d_tag != DT_NULL; d++)
> + if (d->d_tag == DT_STRTAB) {
> + base = d->d_un.d_ptr + loff;
> + break;
> + }
> + if (base == 0)
> + return 0;
> + for (d = dynp; d->d_tag != DT_NULL; d++)
> + if (d->d_tag == DT_SONAME) {
> + if (_dl_strncmp((char *)(base + d->d_un.d_ptr),
> + "libc.so.", 8) == 0)
> + return 1;
> + break;
> + }
> + return 0;
> +}
> +
> +void
> +_dl_pin(int file, Elf_Phdr *phdp, void *base, size_t len,
> + void *exec_base, size_t exec_size)
> +{
> + struct pinsyscalls {
> + u_int offset;
> + u_int sysno;
> + } *syscalls;
> + int npins = 0, nsyscalls, i;
> + u_int *pins = NULL;
> + vaddr_t offset;
> +
> + if (phdp->p_filesz > SYS_MAXSYSCALL * 2 * sizeof(*syscalls) ||
> + phdp->p_filesz % sizeof(*syscalls) != 0 ||
> + phdp->p_offset & 0x3)
> + return;
> + syscalls = _dl_mmap(NULL, phdp->p_filesz, PROT_READ,
> + MAP_PRIVATE|MAP_FILE, file, phdp->p_offset);
> + if (syscalls == MAP_FAILED)
> + return;
> +
> + /* Validate, and calculate pintable size */
> + nsyscalls = phdp->p_filesz / sizeof(*syscalls);
> + for (i = 0; i < nsyscalls; i++) {
> + if (syscalls[i].sysno < 0 ||
> + syscalls[i].sysno >= SYS_MAXSYSCALL ||
> + syscalls[i].offset >= len)
> + goto bad;
> + npins = MAXIMUM(npins, syscalls[i].sysno);
> + }
> + npins++;
> +
> + /*
> + * Fill pintable: 0 = invalid, -1 = accept, else offset
> + * from base, rebase to text_start while at it
> + */
> + pins = _dl_calloc(npins, sizeof(u_int));
> + offset = exec_base - base;
> + for (i = 0; i < nsyscalls; i++) {
> + if (pins[syscalls[i].sysno])
> + pins[syscalls[i].sysno] = (u_int)-1; /* duplicated */
> + else
> + pins[syscalls[i].sysno] = syscalls[i].offset - offset;
> + }
> + base += offset;
> + len = len - offset;
> +bad:
> + _dl_munmap(syscalls, phdp->p_filesz);
> + if (pins)
> + _dl_pinsyscalls(base, len, pins, npins);
> + _dl_free(pins);
> }
> Index: libexec/ld.so/resolve.h
> ===================================================================
> RCS file: /cvs/src/libexec/ld.so/resolve.h,v
> diff -u -p -u -r1.106 resolve.h
> --- libexec/ld.so/resolve.h 19 Dec 2023 16:13:22 -0000 1.106
> +++ libexec/ld.so/resolve.h 19 Dec 2023 21:45:31 -0000
> @@ -245,6 +245,7 @@ struct elf_object {
>
> struct range_vector imut;
> struct range_vector mut;
> + int islibc;
> };
>
> struct dep_node {
> @@ -339,6 +340,9 @@ void _dl_apply_immutable(elf_object_t *o
> typedef void lock_cb(int);
> void _dl_thread_kern_go(lock_cb *);
> lock_cb *_dl_thread_kern_stop(void);
> +
> +int _dl_islibc(Elf_Dyn *_dynp, Elf_Addr loff);
> +void _dl_pin(int, Elf_Phdr *, void *, size_t, void *, size_t);
>
> char *_dl_getenv(const char *, char **) __boot;
> void _dl_unsetenv(const char *, char **) __boot;
> Index: bin/ps/print.c
> ===================================================================
> RCS file: /cvs/src/bin/ps/print.c,v
> diff -u -p -u -r1.86 print.c
> --- bin/ps/print.c 8 Mar 2023 14:47:02 -0000 1.86
> +++ bin/ps/print.c 19 Dec 2023 21:45:31 -0000
> @@ -303,6 +303,10 @@ printstate(const struct pinfo *pi, VAREN
> *cp++ = '+';
> if (kp->p_psflags & PS_PLEDGE)
> *cp++ = 'p';
> + if (kp->p_psflags & PS_PIN)
> + *cp++ = 'l';
> + if (kp->p_psflags & PS_LIBCPIN)
> + *cp++ = 'L';
> if (kp->p_eflag & EPROC_UNVEIL) {
> if (kp->p_eflag & EPROC_LKUNVEIL)
> *cp++ = 'U';
> Index: bin/ps/ps.1
> ===================================================================
> RCS file: /cvs/src/bin/ps/ps.1,v
> diff -u -p -u -r1.131 ps.1
> --- bin/ps/ps.1 10 Nov 2023 09:17:02 -0000 1.131
> +++ bin/ps/ps.1 19 Dec 2023 21:45:31 -0000
> @@ -359,6 +359,9 @@ PS_EXECPLEDGE 0x00400000 has exec pledge
> PS_ORPHAN 0x00800000 process is on an orphan list
> PS_CHROOT 0x01000000 process is chrooted
> PS_NOBTCFI 0x02000000 no Branch Target CFI
> +PS_PIN 0x08000000 ld.so or static executable that
> + has syscalls pinned
> +PS_LIBCPIN 0x10000000 libc.so has syscalls pinned
> .Ed
> .It Cm re
> Core residency time (in seconds; 127 = infinity).
> @@ -475,6 +478,11 @@ scheduling priority.
> .It p
> The process has called
> .Xr pledge 2 .
> +.It l
> +.Xr ld.so 1
> +or a static executable has syscall pinning.
> +.It L
> +libc.so has syscall pinning.
> .\" .It S
> .\" The process has asked for FIFO
> .\" page replacement
>
>
update on pinsyscalls(2)