From: Mark Kettenis Subject: Re: update on pinsyscalls(2) To: "Theo de Raadt" Cc: tech@cvs.openbsd.org Date: Sat, 13 Jan 2024 15:39:04 +0100 > From: "Theo de Raadt" > Date: Sat, 30 Dec 2023 11:56:35 -0700 > > The pinsyscalls(2) diff is now much smaller, since many pieces it depends > upon have been committed. > > All the DSOs containing system call entries have the proper annotations for > kernel and ld.so to do the right thing. > > This diff can be applied to a -current system, if people want to play > along: > > cd /usr/src > make includes ;; to update sys/proc.h mostly > build new kernel > build new libexec/ld.so > build new bin/ps > > ps: > - for static binaries, ps will show 'l' to indicate the > binary's text segment is doing syscall pinning. > - for dynamic binaries, ps will show 'l' to indicate that > ld.so's text segment is doing syscall pinning, and 'L' > to indicate libc.so's text segment is doing syscall pinning > > > There's a long tail with this diff. Perhaps in a release or two when > all binaries are known to follow the pinsyscalls(2) rules, we'll be able > to turn msyscall() and the less powerful pinsyscall(2) into NOPs, and > eventually remove them. > > The more precise pinsyscalls(2) check in syscall_mi.h is O(1) but > slightly more expensive than the msyscall(2) check which also is O(1) in > the general case but has a special case when text msyscall-allowed > segments get crossed (such as when doing ld.so GOT/PLT resolution, or > signal handlers), then uvm locks occur twice. But we don't need both. A few small things and one important mistake I spotted below. Otherwise this looks pretty good to me now. 
> Index: sys/kern/exec_elf.c > =================================================================== > RCS file: /cvs/src/sys/kern/exec_elf.c,v > diff -u -p -u -r1.183 exec_elf.c > --- sys/kern/exec_elf.c 12 Jul 2023 19:34:14 -0000 1.183 > +++ sys/kern/exec_elf.c 19 Dec 2023 21:45:31 -0000 > @@ -81,6 +81,7 @@ > #include > #include > #include > +#include > > #include > > @@ -97,6 +98,8 @@ void elf_load_psection(struct exec_vmcmd > Elf_Phdr *, Elf_Addr *, Elf_Addr *, int *, int); > int elf_os_pt_note_name(Elf_Note *); > int elf_os_pt_note(struct proc *, struct exec_package *, Elf_Ehdr *, int *); > +int elf_read_pintable(struct proc *p, struct vnode *vp, Elf_Phdr *pp, > + u_int **pinp, int is_ldso, size_t len); > > /* round up and down to page boundaries. */ > #define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1)) > @@ -266,6 +269,74 @@ elf_read_from(struct proc *p, struct vno > } > > /* > + * rebase the pin offsets inside a base,len window for the text segment only. > + */ > +void > +elf_adjustpins(vaddr_t *basep, size_t *lenp, u_int *pins, int npins, u_int offset) > +{ > + int i; > + > + /* Adjust offsets, base, len */ > + for (i = 0; i < npins; i++) { > + if (pins[i] == -1 || pins[i] == 0) > + continue; > + pins[i] -= offset; > + } > + *basep += offset; > + *lenp -= offset; > +} > + > +int > +elf_read_pintable(struct proc *p, struct vnode *vp, Elf_Phdr *pp, > + u_int **pinp, int is_ldso, size_t len) > +{ > + struct pinsyscalls { > + u_int offset; > + u_int sysno; > + } *syscalls = NULL; > + int i, nsyscalls = 0, npins = 0; > + u_int *pins = NULL; > + > + if (pp->p_filesz > SYS_MAXSYSCALL * 2 * sizeof(*syscalls) || > + pp->p_filesz % sizeof(*syscalls) != 0) > + goto bad; > + nsyscalls = pp->p_filesz / sizeof(*syscalls); > + syscalls = malloc(pp->p_filesz, M_PINSYSCALL, M_WAITOK); > + if (elf_read_from(p, vp, pp->p_offset, syscalls, > + pp->p_filesz) != 0) > + goto bad; > + > + /* Validate, and calculate pintable size */ > + for (i = 0; i < nsyscalls; i++) { > + if 
(syscalls[i].sysno <= 0 || > + syscalls[i].sysno >= SYS_MAXSYSCALL || > + syscalls[i].offset > len) > + goto bad; > + npins = MAX(npins, syscalls[i].sysno); > + } > + if (is_ldso) > + npins = MAX(npins, SYS_kbind); /* XXX see ld.so/loader.c */ > + npins++; > + > + /* Fill pintable: 0 = invalid, -1 = allowed, else offset from base */ > + pins = mallocarray(npins, sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO); > + for (i = 0; i < nsyscalls; i++) { > + if (pins[syscalls[i].sysno]) > + pins[syscalls[i].sysno] = -1; /* duplicated */ > + else > + pins[syscalls[i].sysno] = syscalls[i].offset; > + } > + if (is_ldso) > + pins[SYS_kbind] = -1; /* XXX see ld.so/loader.c */ > + *pinp = pins; > + pins = NULL; > +bad: > + free(syscalls, M_PINSYSCALL, nsyscalls * sizeof(*syscalls)); > + free(pins, M_PINSYSCALL, npins * sizeof(u_int)); > + return npins; > +} > + > +/* > * Load a file (interpreter/library) pointed to by path [stolen from > * coff_load_shlib()]. Made slightly generic so it might be used externally. 
> */ > @@ -276,7 +347,7 @@ elf_load_file(struct proc *p, char *path > int error, i; > struct nameidata nd; > Elf_Ehdr eh; > - Elf_Phdr *ph = NULL; > + Elf_Phdr *ph = NULL, *syscall_ph = NULL; > u_long phsize = 0; > Elf_Addr addr; > struct vnode *vp; > @@ -290,6 +361,7 @@ elf_load_file(struct proc *p, char *path > int file_align; > int loop; > size_t randomizequota = ELF_RANDOMIZE_LIMIT; > + vaddr_t text_start = -1, text_end = 0; > > NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p); > nd.ni_pledge = PLEDGE_RPATH; > @@ -432,6 +504,12 @@ elf_load_file(struct proc *p, char *path > epp->ep_entry += pos; > ap->arg_interp = pos; > } > + if (prot & PROT_EXEC) { > + if (addr < text_start) > + text_start = addr; > + if (addr+size >= text_end) > + text_end = addr + size; > + } > addr += size; > break; > > @@ -461,12 +539,34 @@ elf_load_file(struct proc *p, char *path > NEW_VMCMD(&epp->ep_vmcmds, vmcmd_mutable, > ph[i].p_memsz, ph[i].p_vaddr + pos, NULLVP, 0, 0); > break; > - > + case PT_OPENBSD_SYSCALLS: > + syscall_ph = &ph[i]; > + break; > default: > break; > } > } > > + if (syscall_ph) { > + struct process *pr = p->p_p; > + vaddr_t base = pos; > + size_t len = text_end; > + u_int *pins; > + int npins; > + > + npins = elf_read_pintable(p, nd.ni_vp, syscall_ph, > + &pins, 1, len); > + if (npins) { > + elf_adjustpins(&base, &len, pins, npins, > + text_start); > + pr->ps_pin.pn_start = base; > + pr->ps_pin.pn_end = base + len; > + pr->ps_pin.pn_pins = pins; > + pr->ps_pin.pn_npins = npins; > + pr->ps_flags |= PS_PIN; > + } > + } > + > vn_marktext(nd.ni_vp); > > bad1: > @@ -491,8 +591,8 @@ int > exec_elf_makecmds(struct proc *p, struct exec_package *epp) > { > Elf_Ehdr *eh = epp->ep_hdr; > - Elf_Phdr *ph, *pp, *base_ph = NULL; > - Elf_Addr phdr = 0, exe_base = 0; > + Elf_Phdr *ph, *pp, *base_ph = NULL, *syscall_ph = NULL; > + Elf_Addr phdr = 0, exe_base = 0, exe_end = 0; > int error, i, has_phdr = 0, names = 0, textrel = 0; > char *interp = NULL; > u_long phsize; > 
@@ -633,11 +733,13 @@ exec_elf_makecmds(struct proc *p, struct > > /* > * Permit system calls in main-text static binaries. > - * Also block the ld.so syscall-grant > + * static binaries may not call msyscall() or > + * pinsyscalls() > */ > if (interp == NULL) { > syscall = VMCMD_SYSCALL; > p->p_vmspace->vm_map.flags |= VM_MAP_SYSCALL_ONCE; > + p->p_vmspace->vm_map.flags |= VM_MAP_PINSYSCALL_ONCE; > } > > /* > @@ -696,6 +798,9 @@ exec_elf_makecmds(struct proc *p, struct > epp->ep_tsize = addr+size - > epp->ep_taddr; > } > + if (interp == NULL) > + exe_end = epp->ep_taddr + > + epp->ep_tsize; /* end of TEXT */ > } > break; > > @@ -735,13 +840,35 @@ exec_elf_makecmds(struct proc *p, struct > NEW_VMCMD(&epp->ep_vmcmds, vmcmd_mutable, > ph[i].p_memsz, ph[i].p_vaddr + exe_base, NULLVP, 0, 0); > break; > - > + case PT_OPENBSD_SYSCALLS: > + if (interp == NULL) > + syscall_ph = &ph[i]; > + break; > default: > /* > * Not fatal, we don't need to understand everything > * :-) > */ > break; > + } > + } > + > + if (syscall_ph) { > + vaddr_t base = exe_base; > + size_t len = exe_end - exe_base; > + u_int *pins; > + int npins; > + > + npins = elf_read_pintable(p, epp->ep_vp, syscall_ph, > + &pins, 0, len); > + if (npins) { > + elf_adjustpins(&base, &len, pins, npins, > + epp->ep_taddr - exe_base); > + epp->ep_pinstart = base; > + epp->ep_pinend = base + len; > + epp->ep_pins = pins; > + epp->ep_npins = npins; > + p->p_p->ps_flags |= PS_PIN; > } > } > > Index: sys/kern/kern_exec.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_exec.c,v > diff -u -p -u -r1.252 kern_exec.c > --- sys/kern/kern_exec.c 30 Oct 2023 07:13:10 -0000 1.252 > +++ sys/kern/kern_exec.c 19 Dec 2023 21:45:31 -0000 > @@ -314,6 +314,8 @@ sys_execve(struct proc *p, void *v, regi > VMCMDSET_INIT(&pack.ep_vmcmds); > pack.ep_vap = &attr; > pack.ep_flags = 0; > + pack.ep_pins = NULL; > + pack.ep_npins = 0; > > /* see if we can run it. 
*/ > if ((error = check_exec(p, &pack)) != 0) { > @@ -514,6 +516,30 @@ sys_execve(struct proc *p, void *v, regi > if (copyout(&arginfo, (char *)pr->ps_strings, sizeof(arginfo))) > goto exec_abort; > > + free(pr->ps_pin.pn_pins, M_PINSYSCALL, > + pr->ps_pin.pn_npins * sizeof(u_int)); > + if (pack.ep_npins) { > + pr->ps_pin.pn_start = pack.ep_pinstart; > + pr->ps_pin.pn_end = pack.ep_pinend; > + pr->ps_pin.pn_pins = pack.ep_pins; > + pack.ep_pins = NULL; > + pr->ps_pin.pn_npins = pack.ep_npins; > + pr->ps_flags |= PS_PIN; > + } else { > + pr->ps_pin.pn_start = pr->ps_pin.pn_end = 0; > + pr->ps_pin.pn_pins = NULL; > + pr->ps_pin.pn_npins = 0; > + pr->ps_flags &= ~PS_PIN; > + } > + if (pr->ps_libcpin.pn_pins) { > + free(pr->ps_libcpin.pn_pins, M_PINSYSCALL, > + pr->ps_libcpin.pn_npins * sizeof(u_int)); > + pr->ps_libcpin.pn_start = pr->ps_libcpin.pn_end = 0; > + pr->ps_libcpin.pn_pins = NULL; > + pr->ps_libcpin.pn_npins = 0; > + pr->ps_flags &= ~PS_LIBCPIN; > + } > + > stopprofclock(pr); /* stop profiling */ > fdcloseexec(p); /* handle close on exec */ > execsigs(p); /* reset caught signals */ > @@ -752,6 +778,7 @@ bad: > if (pack.ep_interp != NULL) > pool_put(&namei_pool, pack.ep_interp); > free(pack.ep_args, M_TEMP, sizeof *pack.ep_args); > + free(pack.ep_pins, M_PINSYSCALL, pack.ep_npins * sizeof(u_int)); > /* close and put the exec'd file */ > vn_close(pack.ep_vp, FREAD, cred, p); > pool_put(&namei_pool, nid.ni_cnd.cn_pnbuf); > Index: sys/kern/kern_exit.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_exit.c,v > diff -u -p -u -r1.217 kern_exit.c > --- sys/kern/kern_exit.c 29 Sep 2023 12:47:34 -0000 1.217 > +++ sys/kern/kern_exit.c 19 Dec 2023 21:45:31 -0000 > @@ -215,6 +215,11 @@ exit1(struct proc *p, int xexit, int xsi > > unveil_destroy(pr); > > + free(pr->ps_pin.pn_pins, M_PINSYSCALL, > + pr->ps_pin.pn_npins * sizeof(u_int)); > + free(pr->ps_libcpin.pn_pins, M_PINSYSCALL, > + pr->ps_libcpin.pn_npins * 
sizeof(u_int)); > + > /* > * If parent has the SAS_NOCLDWAIT flag set, we're not > * going to become a zombie. > Index: sys/kern/kern_fork.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_fork.c,v > diff -u -p -u -r1.253 kern_fork.c > --- sys/kern/kern_fork.c 24 Oct 2023 13:20:11 -0000 1.253 > +++ sys/kern/kern_fork.c 19 Dec 2023 21:45:31 -0000 > @@ -248,6 +248,21 @@ process_new(struct proc *p, struct proce > if (parent->ps_session->s_ttyvp != NULL) > pr->ps_flags |= parent->ps_flags & PS_CONTROLT; > > + if (parent->ps_pin.pn_pins) { > + pr->ps_pin.pn_pins = mallocarray(parent->ps_pin.pn_npins, > + sizeof(u_int), M_PINSYSCALL, M_WAITOK); > + memcpy(pr->ps_pin.pn_pins, parent->ps_pin.pn_pins, > + parent->ps_pin.pn_npins * sizeof(u_int)); > + pr->ps_flags |= PS_PIN; > + } > + if (parent->ps_libcpin.pn_pins) { > + pr->ps_libcpin.pn_pins = mallocarray(parent->ps_libcpin.pn_npins, > + sizeof(u_int), M_PINSYSCALL, M_WAITOK); > + memcpy(pr->ps_libcpin.pn_pins, parent->ps_libcpin.pn_pins, > + parent->ps_libcpin.pn_npins * sizeof(u_int)); > + pr->ps_flags |= PS_LIBCPIN; > + } > + > /* > * Duplicate sub-structures as needed. > * Increase reference counts on shared objects. 
> Index: sys/sys/exec.h > =================================================================== > RCS file: /cvs/src/sys/sys/exec.h,v > diff -u -p -u -r1.52 exec.h > --- sys/sys/exec.h 19 Apr 2023 15:37:36 -0000 1.52 > +++ sys/sys/exec.h 19 Dec 2023 21:45:31 -0000 > @@ -131,6 +131,9 @@ struct exec_package { > struct elf_args *ep_args; /* ELF info */ > void *ep_auxinfo; /* userspace auxinfo address */ > char *ep_interp; /* name of interpreter if any */ > + vaddr_t ep_pinstart, ep_pinend; /* executable region */ > + u_int *ep_pins; /* array of system call offsets */ > + int ep_npins; /* entries in array */ > }; > #define EXEC_INDIR 0x0001 /* script handling already done */ > #define EXEC_HASFD 0x0002 /* holding a shell script */ > Index: sys/sys/proc.h > =================================================================== > RCS file: /cvs/src/sys/sys/proc.h,v > diff -u -p -u -r1.352 proc.h > --- sys/sys/proc.h 29 Sep 2023 12:47:34 -0000 1.352 > +++ sys/sys/proc.h 19 Dec 2023 21:45:31 -0000 > @@ -117,6 +117,13 @@ struct tslpentry; > TAILQ_HEAD(tslpqueue, tslpentry); > struct unveil; > > +struct pinsyscall { > + vaddr_t pn_start; > + vaddr_t pn_end; > + u_int *pn_pins; /* array of offset indexed by syscall# */ s/offset/offsets/? > + int pn_npins; /* number of entries in table */ > +}; > + > /* > * Locks used to protect struct members in this file: > * I immutable after creation > @@ -240,6 +247,9 @@ struct process { > /* an address that can't be in userspace or kernelspace */ > #define BOGO_PC (u_long)-1 > > + struct pinsyscall ps_pin; /* static or ld.so */ > + struct pinsyscall ps_libcpin; /* libc.so, from pinsyscalls(2) */ > + > /* End area that is copied on creation. */ > #define ps_endcopy ps_threadcnt > u_int ps_threadcnt; /* Number of threads. 
*/ > @@ -283,6 +293,8 @@ struct process { > #define PS_CHROOT 0x01000000 /* Process is chrooted */ > #define PS_NOBTCFI 0x02000000 /* No Branch Target CFI */ > #define PS_ITIMER 0x04000000 /* Virtual interval timers running */ > +#define PS_PIN 0x08000000 /* ld.so or static syscall pin */ > +#define PS_LIBCPIN 0x10000000 /* libc.so syscall pin */ > > #define PS_BITS \ > ("\20" "\01CONTROLT" "\02EXEC" "\03INEXEC" "\04EXITING" "\05SUGID" \ > Index: sys/sys/syscall_mi.h > =================================================================== > RCS file: /cvs/src/sys/sys/syscall_mi.h,v > diff -u -p -u -r1.29 syscall_mi.h > --- sys/sys/syscall_mi.h 12 Dec 2023 15:30:55 -0000 1.29 > +++ sys/sys/syscall_mi.h 19 Dec 2023 21:45:31 -0000 > @@ -33,8 +33,11 @@ > > #include > #include > +#include > +#include > #include > #include > +#include > #include > > #ifdef KTRACE > @@ -46,6 +49,79 @@ > #include > #endif > > +/* > + * Check if a system call is entered from precisely correct location > + */ > +static inline int > +pin_check(struct proc *p, register_t code) > +{ > + extern char sigcodecall[], sigcoderet[], sigcodecall[]; > + struct pinsyscall *pin = NULL, *ppin, *plibcpin; > + struct process *pr = p->p_p; > + vaddr_t addr; > + int error = 0; > + > + /* point at start of syscall instruction */ > + addr = (vaddr_t)PROC_PC(p) - (vaddr_t)(sigcoderet - sigcodecall); > + ppin = &pr->ps_pin; > + plibcpin = &pr->ps_libcpin; > + > + /* > + * System calls come from the following places, checks are ordered > + * by most common case: > + * 1) dynamic binary: syscalls in libc.so (in the ps_libcpin region) > + * 2a) static binary: syscalls in main program (in the ps_pin region) > + * 2b) dynamic binary: sysalls in ld.so (in the ps_pin region) > + * 3) sigtramp, containing only sigreturn(2) > + */ > + if (plibcpin->pn_pins && > + addr >= plibcpin->pn_start && addr < plibcpin->pn_end) > + pin = plibcpin; > + else if (ppin->pn_pins && > + addr >= ppin->pn_start && addr < ppin->pn_end) > + pin 
= ppin; > + else if (PROC_PC(p) == pr->ps_sigcoderet) { > + if (code == SYS_sigreturn) > + return (0); > + error = EPERM; > + } > + if (pin) { > + if (code >= pin->pn_npins || pin->pn_pins[code] == 0) > + error = ENOSYS; > + else if (pin->pn_pins[code] + pin->pn_start == addr) > + ; /* correct location */ > + else if (pin->pn_pins[code] == (u_int)-1) > + ; /* multiple locations, hopefully a boring operation */ > + else > + error = ENOSYS; > + } > + if (error == 0) > + return (0); > +#ifdef KTRACE > + if (KTRPOINT(p, KTR_PINSYSCALL)) > + ktrpinsyscall(p, error, code, addr); > +#endif > + KERNEL_LOCK(); > + log(LOG_ERR, > + "%s[%d]: pinsyscalls addr %lx code %ld, pinoff 0x%x " > + "(pin%s %d %lx-%lx %lx) (libcpin%s %d %lx-%lx %lx) error %d\n", > + p->p_p->ps_comm, p->p_p->ps_pid, addr, code, > + (pin && code < pin->pn_npins) ? pin->pn_pins[code] : -1, > + pin == ppin ? "(Y)" : "", ppin->pn_npins, > + ppin->pn_start, ppin->pn_end, ppin->pn_end - ppin->pn_start, > + pin == plibcpin ? "(Y)" : "", plibcpin->pn_npins, > + plibcpin->pn_start, plibcpin->pn_end, plibcpin->pn_end - plibcpin->pn_start, > + error); > + p->p_p->ps_acflag |= APINSYS; > + > + /* Try to stop threads immediately, because this process is suspect */ > + if (P_HASSIBLING(p)) > + single_thread_set(p, SINGLE_UNWIND | SINGLE_DEEP); > + /* Send uncatchable SIGABRT for coredump */ > + sigabort(p); > + KERNEL_UNLOCK(); > + return (error); > +} > > /* > * The MD setup for a system call has been done; here's the MI part. 
> @@ -90,6 +166,9 @@ mi_syscall(struct proc *p, register_t co > "[%s]%d/%d pc=%lx inside %lx-%lx: bogus syscall\n", > uvm_map_inentry_pc, p->p_vmspace->vm_map.wserial)) > return (EPERM); > + > + if ((error = pin_check(p, code))) > + return (error); > > pledged = (p->p_p->ps_flags & PS_PLEDGE); > if (pledged && (error = pledge_syscall(p, code, &tval))) { > Index: sys/uvm/uvm_map.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_map.c,v > diff -u -p -u -r1.319 uvm_map.c > --- sys/uvm/uvm_map.c 2 Aug 2023 09:19:47 -0000 1.319 > +++ sys/uvm/uvm_map.c 21 Dec 2023 17:55:14 -0000 > @@ -3407,7 +3407,8 @@ uvmspace_exec(struct proc *p, vaddr_t st > * when a process execs another program image. > */ > vm_map_lock(map); > - vm_map_modflags(map, 0, VM_MAP_WIREFUTURE|VM_MAP_SYSCALL_ONCE); > + vm_map_modflags(map, 0, VM_MAP_WIREFUTURE | > + VM_MAP_SYSCALL_ONCE | VM_MAP_PINSYSCALL_ONCE); > > /* > * now unmap the old program > @@ -3944,7 +3945,8 @@ uvmspace_fork(struct process *pr) > new_map, new_entry->start, new_entry->end); > } > } > - new_map->flags |= old_map->flags & VM_MAP_SYSCALL_ONCE; > + new_map->flags |= old_map->flags & > + (VM_MAP_SYSCALL_ONCE | VM_MAP_PINSYSCALL_ONCE); > #ifdef PMAP_CHECK_COPYIN > if (PMAP_CHECK_COPYIN) { > memcpy(&new_map->check_copyin, &old_map->check_copyin, > Index: sys/uvm/uvm_map.h > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_map.h,v > diff -u -p -u -r1.87 uvm_map.h > --- sys/uvm/uvm_map.h 2 Aug 2023 09:19:47 -0000 1.87 > +++ sys/uvm/uvm_map.h 21 Dec 2023 15:31:23 -0000 > @@ -329,6 +329,7 @@ struct vm_map { > #define VM_MAP_GUARDPAGES 0x20 /* rw: add guard pgs to map */ > #define VM_MAP_ISVMSPACE 0x40 /* ro: map is a vmspace */ > #define VM_MAP_SYSCALL_ONCE 0x80 /* rw: libc syscall registered */ > +#define VM_MAP_PINSYSCALL_ONCE 0x100 /* rw: pinsyscall done */ > > /* Number of kernel maps and entries to statically allocate */ > 
#define MAX_KMAPENT 1024 /* Sufficient to make it to the scheduler. */ > Index: sys/uvm/uvm_mmap.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v > diff -u -p -u -r1.183 uvm_mmap.c > --- sys/uvm/uvm_mmap.c 7 Dec 2023 13:59:05 -0000 1.183 > +++ sys/uvm/uvm_mmap.c 19 Dec 2023 21:45:31 -0000 > @@ -644,13 +644,65 @@ sys_pinsyscall(struct proc *p, void *v, > return (0); > } > > - /* > - * sys_pinsyscalls > +/* > + * sys_pinsyscalls. The caller is required to normalize base,len > + * to the minimum .text region, and adjust pintable offsets relative > + * to that base. > */ > int > sys_pinsyscalls(struct proc *p, void *v, register_t *retval) > { > - /* STUB until other parts are ready */ > + struct sys_pinsyscalls_args /* { > + syscallarg(void *) base; > + syscallarg(size_t) len; > + syscallarg(u_int *) pins; > + syscallarg(int) npins; > + } */ *uap = v; > + struct process *pr = p->p_p; > + int npins, error = 0, i; > + vaddr_t base; > + size_t len; > + u_int *pins; > + > + if (pr->ps_libcpin.pn_start || > + (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE)) > + return (EPERM); > + base = (vaddr_t)SCARG(uap, base); > + len = (vsize_t)SCARG(uap, len); > + if (base > SIZE_MAX - len) > + return (EINVAL); /* disallow wrap-around. 
*/ > + > + /* XXX MP unlock */ > + > + npins = SCARG(uap, npins); > + if (npins < 1 || npins > SYS_MAXSYSCALL * 2) > + return (E2BIG); Since pinsyscalls(2) now takes an array of offsets indexed by syscall#, the above check should be if (npins < 1 || npins > SYS_MAXSYSCALL) > + pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO); > + if (pins == NULL) > + return (ENOMEM); > + error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int)); > + if (error) > + goto err; > + > + /* Range-check pintable offsets */ > + for (i = 0; i < npins; i++) { > + if (pins[i] == (u_int)-1 || pins[i] == 0) > + continue; > + if (pins[i] > SCARG(uap, len)) { > + error = ERANGE; > + break; > + } > + } > + if (error) { > +err: > + free(pins, M_PINSYSCALL, npins * sizeof(u_int)); > + return (error); > + } > + pr->ps_libcpin.pn_start = base; > + pr->ps_libcpin.pn_end = base + len; > + pr->ps_libcpin.pn_pins = pins; > + pr->ps_libcpin.pn_npins = npins; > + pr->ps_flags |= PS_LIBCPIN; > return (0); > } > > Index: libexec/ld.so/library.c > =================================================================== > RCS file: /cvs/src/libexec/ld.so/library.c,v > diff -u -p -u -r1.93 library.c > --- libexec/ld.so/library.c 19 Dec 2023 16:13:22 -0000 1.93 > +++ libexec/ld.so/library.c 19 Dec 2023 21:45:31 -0000 > @@ -99,7 +99,7 @@ elf_object_t * > _dl_tryload_shlib(const char *libname, int type, int flags, int nodelete) > { > struct range_vector imut, mut; > - int libfile, i; > + int libfile, libc = -1, i; > struct load_list *next_load, *load_list = NULL; > Elf_Addr maxva = 0, minva = ELF_NO_ADDR; > Elf_Addr libaddr, loff, align = _dl_pagesz - 1; > @@ -109,8 +109,8 @@ _dl_tryload_shlib(const char *libname, i > size_t exec_size = 0; > Elf_Dyn *dynp = NULL; > Elf_Ehdr *ehdr; > - Elf_Phdr *phdp; > - Elf_Phdr *ptls = NULL; > + Elf_Phdr *phdp, *ptls = NULL; > + Elf_Phdr *syscall_phdp = NULL; > struct stat sb; > > #define powerof2(x) ((((x) - 1) & (x)) == 0) > @@ -139,7 +139,6 @@ 
_dl_tryload_shlib(const char *libname, i > if (flags & DF_1_NOOPEN) { > _dl_close(libfile); > return NULL; > - > } > > _dl_read(libfile, hbuf, sizeof(hbuf)); > @@ -316,11 +315,30 @@ _dl_tryload_shlib(const char *libname, i > _dl_push_range_size(&mut, phdp->p_vaddr + loff, > phdp->p_memsz); > break; > + case PT_OPENBSD_SYSCALLS: > + syscall_phdp = phdp; > + break; > default: > break; > } > } > > + libc = _dl_islibc(dynp, loff); > + if (libc) { > + if (syscall_phdp) > + _dl_pin(libfile, syscall_phdp, (void *)libaddr, > + (size_t)((exec_start + exec_size) - libaddr), > + exec_start, exec_size); > + > + /* > + * XXX msyscall() can be removed once pinsyscalls() > + * is fully operational > + */ > + /* Request permission for system calls in libc.so's text segment */ > + if (_dl_msyscall(exec_start, exec_size) == -1) > + _dl_printf("msyscall %lx %lx error\n", > + exec_start, exec_size); > + } > _dl_close(libfile); > > dynp = (Elf_Dyn *)((unsigned long)dynp + loff); > @@ -328,8 +346,6 @@ _dl_tryload_shlib(const char *libname, i > (Elf_Phdr *)((char *)libaddr + ehdr->e_phoff), ehdr->e_phnum,type, > libaddr, loff); > if (object) { > - char *soname = (char *)object->Dyn.info[DT_SONAME]; > - > object->load_size = maxva - minva; /*XXX*/ > object->load_list = load_list; > /* set inode, dev from stat info */ > @@ -339,17 +355,10 @@ _dl_tryload_shlib(const char *libname, i > object->nodelete = nodelete; > object->relro_addr = relro_addr; > object->relro_size = relro_size; > + object->islibc = libc; > _dl_set_sod(object->load_name, &object->sod); > if (ptls != NULL && ptls->p_memsz) > _dl_set_tls(object, ptls, libaddr, libname); > - > - /* Request permission for system calls in libc.so's text segment */ > - if (soname != NULL && !_dl_traceld && > - _dl_strncmp(soname, "libc.so.", 8) == 0) { > - if (_dl_msyscall(exec_start, exec_size) == -1) > - _dl_printf("msyscall %lx %lx error\n", > - exec_start, exec_size); > - } > _dl_bcopy(&mut, &object->mut, sizeof mut); > _dl_bcopy(&imut, 
&object->imut, sizeof imut); > } else { > Index: libexec/ld.so/library_mquery.c > =================================================================== > RCS file: /cvs/src/libexec/ld.so/library_mquery.c,v > diff -u -p -u -r1.73 library_mquery.c > --- libexec/ld.so/library_mquery.c 19 Dec 2023 16:13:22 -0000 1.73 > +++ libexec/ld.so/library_mquery.c 20 Dec 2023 14:56:19 -0000 > @@ -102,15 +102,15 @@ elf_object_t * > _dl_tryload_shlib(const char *libname, int type, int flags, int nodelete) > { > struct range_vector imut, mut; > - int libfile, i; > + int libfile, libc = -1, i; > struct load_list *ld, *lowld = NULL; > elf_object_t *object; > Elf_Dyn *dynp = NULL; > Elf_Ehdr *ehdr; > - Elf_Phdr *phdp; > + Elf_Phdr *phdp, *ptls = NULL; > + Elf_Phdr *syscall_phdp = NULL; > Elf_Addr load_end = 0; > Elf_Addr align = _dl_pagesz - 1, off, size; > - Elf_Phdr *ptls = NULL; > Elf_Addr relro_addr = 0, relro_size = 0; > struct stat sb; > char hbuf[4096], *exec_start; > @@ -325,9 +325,28 @@ retry: > _dl_push_range_size(&mut, phdp->p_vaddr + LOFF, > phdp->p_memsz); > break; > + case PT_OPENBSD_SYSCALLS: > + syscall_phdp = phdp; > + break; > } > } > > + libc = _dl_islibc(dynp, LOFF); > + if (libc) { > + if (syscall_phdp) > + _dl_pin(libfile, syscall_phdp, lowld->start, > + (size_t)((exec_start + exec_size) - LOFF), > + exec_start, exec_size); > + > + /* > + * XXX msyscall() can be removed once pinsyscalls() > + * is fully operational > + */ > + /* Request permission for system calls in libc.so's text segment */ > + if (_dl_msyscall(exec_start, exec_size) == -1) > + _dl_printf("msyscall %lx %lx error\n", > + exec_start, exec_size); > + } > _dl_close(libfile); > > dynp = (Elf_Dyn *)((unsigned long)dynp + LOFF); > @@ -335,8 +354,6 @@ retry: > (Elf_Phdr *)((char *)lowld->start + ehdr->e_phoff), ehdr->e_phnum, > type, (Elf_Addr)lowld->start, LOFF); > if (object) { > - char *soname = (char *)object->Dyn.info[DT_SONAME]; > - > object->load_size = (Elf_Addr)load_end - (Elf_Addr)lowld->start; 
> object->load_list = lowld; > /* set inode, dev from stat info */ > @@ -346,18 +363,11 @@ retry: > object->nodelete = nodelete; > object->relro_addr = relro_addr; > object->relro_size = relro_size; > + object->islibc = libc; > _dl_set_sod(object->load_name, &object->sod); > if (ptls != NULL && ptls->p_memsz) > _dl_set_tls(object, ptls, (Elf_Addr)lowld->start, > libname); > - > - /* Request permission for system calls in libc.so's text segment */ > - if (soname != NULL && !_dl_traceld && > - _dl_strncmp(soname, "libc.so.", 8) == 0) { > - if (_dl_msyscall(exec_start, exec_size) == -1) > - _dl_printf("msyscall %lx %lx error\n", > - exec_start, exec_size); > - } > _dl_bcopy(&mut, &object->mut, sizeof mut); > _dl_bcopy(&imut, &object->imut, sizeof imut); > } else { > Index: libexec/ld.so/loader.c > =================================================================== > RCS file: /cvs/src/libexec/ld.so/loader.c,v > diff -u -p -u -r1.218 loader.c > --- libexec/ld.so/loader.c 19 Dec 2023 16:13:22 -0000 1.218 > +++ libexec/ld.so/loader.c 19 Dec 2023 21:45:31 -0000 > @@ -410,11 +410,14 @@ _dl_load_dep_libs(elf_object_t *object, > > _dl_cache_grpsym_list_setup(object); > > + /* > + * XXX pinsyscall(SYS_execve,...) can be removed once pinsyscalls() > + * is fully operational > + */ > for (obj = _dl_objects; booting && obj != NULL; obj = obj->next) { > - char *soname = (char *)obj->Dyn.info[DT_SONAME]; > struct sym_res sr; > > - if (!soname || _dl_strncmp(soname, "libc.so.", 8)) > + if (obj->islibc == 0) Since islibc is treated as a boolean "if (!obj->islibc) is probably better. 
> continue; > sr = _dl_find_symbol("execve", > SYM_SEARCH_SELF|SYM_PLT|SYM_WARNNOTFOUND, NULL, obj); > Index: libexec/ld.so/resolve.c > =================================================================== > RCS file: /cvs/src/libexec/ld.so/resolve.c,v > diff -u -p -u -r1.100 resolve.c > --- libexec/ld.so/resolve.c 8 Jul 2023 14:09:43 -0000 1.100 > +++ libexec/ld.so/resolve.c 19 Dec 2023 21:45:31 -0000 > @@ -29,6 +29,8 @@ > #define _DYN_LOADER > > #include > +#include > +#include > > #include > #include > @@ -36,6 +38,7 @@ > #include "util.h" > #include "path.h" > #include "resolve.h" > +#include "syscall.h" > > /* substitution types */ > typedef enum { > @@ -744,4 +747,83 @@ void > _dl_debug_state(void) > { > /* Debugger stub */ > +} > + > +/* > + * Search for DT_SONAME, and check if this is libc > + */ > +int > +_dl_islibc(Elf_Dyn *_dynp, Elf_Addr loff) > +{ > + Elf_Dyn *d, *dynp = (Elf_Dyn *)((unsigned long)_dynp + loff); > + long base = 0; > + > + for (d = dynp; d->d_tag != DT_NULL; d++) > + if (d->d_tag == DT_STRTAB) { > + base = d->d_un.d_ptr + loff; > + break; > + } > + if (base == 0) > + return 0; > + for (d = dynp; d->d_tag != DT_NULL; d++) > + if (d->d_tag == DT_SONAME) { > + if (_dl_strncmp((char *)(base + d->d_un.d_ptr), > + "libc.so.", 8) == 0) > + return 1; > + break; > + } > + return 0; > +} > + > +void > +_dl_pin(int file, Elf_Phdr *phdp, void *base, size_t len, > + void *exec_base, size_t exec_size) > +{ > + struct pinsyscalls { > + u_int offset; > + u_int sysno; > + } *syscalls; > + int npins = 0, nsyscalls, i; > + u_int *pins = NULL; > + vaddr_t offset; > + > + if (phdp->p_filesz > SYS_MAXSYSCALL * 2 * sizeof(*syscalls) || > + phdp->p_filesz % sizeof(*syscalls) != 0 || > + phdp->p_offset & 0x3) > + return; > + syscalls = _dl_mmap(NULL, phdp->p_filesz, PROT_READ, > + MAP_PRIVATE|MAP_FILE, file, phdp->p_offset); > + if (syscalls == MAP_FAILED) > + return; > + > + /* Validate, and calculate pintable size */ > + nsyscalls = phdp->p_filesz / 
sizeof(*syscalls); > + for (i = 0; i < nsyscalls; i++) { > + if (syscalls[i].sysno < 0 || > + syscalls[i].sysno >= SYS_MAXSYSCALL || > + syscalls[i].offset >= len) > + goto bad; > + npins = MAXIMUM(npins, syscalls[i].sysno); > + } > + npins++; > + > + /* > + * Fill pintable: 0 = invalid, -1 = accept, else offset > + * from base, rebase to text_start while at it > + */ > + pins = _dl_calloc(npins, sizeof(u_int)); > + offset = exec_base - base; > + for (i = 0; i < nsyscalls; i++) { > + if (pins[syscalls[i].sysno]) > + pins[syscalls[i].sysno] = (u_int)-1; /* duplicated */ > + else > + pins[syscalls[i].sysno] = syscalls[i].offset - offset; > + } > + base += offset; > + len = len - offset; > +bad: > + _dl_munmap(syscalls, phdp->p_filesz); > + if (pins) > + _dl_pinsyscalls(base, len, pins, npins); > + _dl_free(pins); > } > Index: libexec/ld.so/resolve.h > =================================================================== > RCS file: /cvs/src/libexec/ld.so/resolve.h,v > diff -u -p -u -r1.106 resolve.h > --- libexec/ld.so/resolve.h 19 Dec 2023 16:13:22 -0000 1.106 > +++ libexec/ld.so/resolve.h 19 Dec 2023 21:45:31 -0000 > @@ -245,6 +245,7 @@ struct elf_object { > > struct range_vector imut; > struct range_vector mut; > + int islibc; > }; > > struct dep_node { > @@ -339,6 +340,9 @@ void _dl_apply_immutable(elf_object_t *o > typedef void lock_cb(int); > void _dl_thread_kern_go(lock_cb *); > lock_cb *_dl_thread_kern_stop(void); > + > +int _dl_islibc(Elf_Dyn *_dynp, Elf_Addr loff); > +void _dl_pin(int, Elf_Phdr *, void *, size_t, void *, size_t); > > char *_dl_getenv(const char *, char **) __boot; > void _dl_unsetenv(const char *, char **) __boot; > Index: bin/ps/print.c > =================================================================== > RCS file: /cvs/src/bin/ps/print.c,v > diff -u -p -u -r1.86 print.c > --- bin/ps/print.c 8 Mar 2023 14:47:02 -0000 1.86 > +++ bin/ps/print.c 19 Dec 2023 21:45:31 -0000 > @@ -303,6 +303,10 @@ printstate(const struct pinfo *pi, VAREN > *cp++ 
= '+'; > if (kp->p_psflags & PS_PLEDGE) > *cp++ = 'p'; > + if (kp->p_psflags & PS_PIN) > + *cp++ = 'l'; > + if (kp->p_psflags & PS_LIBCPIN) > + *cp++ = 'L'; > if (kp->p_eflag & EPROC_UNVEIL) { > if (kp->p_eflag & EPROC_LKUNVEIL) > *cp++ = 'U'; > Index: bin/ps/ps.1 > =================================================================== > RCS file: /cvs/src/bin/ps/ps.1,v > diff -u -p -u -r1.131 ps.1 > --- bin/ps/ps.1 10 Nov 2023 09:17:02 -0000 1.131 > +++ bin/ps/ps.1 19 Dec 2023 21:45:31 -0000 > @@ -359,6 +359,9 @@ PS_EXECPLEDGE 0x00400000 has exec pledge > PS_ORPHAN 0x00800000 process is on an orphan list > PS_CHROOT 0x01000000 process is chrooted > PS_NOBTCFI 0x02000000 no Branch Target CFI > +PS_PIN 0x08000000 ld.so or static executable that > + has syscalls pinned > +PS_LIBCPIN 0x10000000 libc.so has syscalls pinned > .Ed > .It Cm re > Core residency time (in seconds; 127 = infinity). > @@ -475,6 +478,11 @@ scheduling priority. > .It p > The process has called > .Xr pledge 2 . > +.It l > +.Xr ld.so 1 > +or a static executable has syscall pinning. > +.It L > +libc.so has syscall pinning. > .\" .It S > .\" The process has asked for FIFO > .\" page replacement > >