From: "Theo de Raadt" Subject: Re: patch: relax ni_pledge panic To: Mark Kettenis , semarie@kapouay.eu.org, tech@openbsd.org Date: Fri, 07 Feb 2025 10:18:56 -0700 Claudio Jeker wrote: > I like this, if this goes in many of those READ_ONCE wrappers can die > since p_pledge can be accessed without fear of concurrent changes. Yes, a later diff had that. Those parts included now. > I need to look closer at the single_thread_set calls. > The one in sys_pledge looks ok since the call is at the tail end of the > syscall. The sys_unveil one I need to double check when I have time. > > The problem with single_thread_set is that it can call exit1 or sleep > which must happen in a spot that is safe. Very scary. I now enforce it for the entire going-to-do-work part of sys_unveil, so that it leaks no objects in that case. Index: sys/sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v diff -u -p -u -r1.378 proc.h --- sys/sys/proc.h 25 Jan 2025 19:21:40 -0000 1.378 +++ sys/sys/proc.h 7 Feb 2025 14:12:00 -0000 @@ -395,6 +395,7 @@ struct proc { u_char p_usrpri; /* [S] Priority based on p_estcpu & ps_nice */ u_int p_estcpu; /* [S] Time averaged val of p_cpticks */ int p_pledge_syscall; /* Cache of current syscall */ + uint64_t p_pledge; /* [o] copy of p_p->ps_pledge */ struct ucred *p_ucred; /* [o] cached credentials */ struct sigaltstack p_sigstk; /* sp & on stack state variable */ Index: sys/kern/kern_event.c =================================================================== RCS file: /cvs/src/sys/kern/kern_event.c,v diff -u -p -u -r1.200 kern_event.c --- sys/kern/kern_event.c 6 Aug 2024 08:44:54 -0000 1.200 +++ sys/kern/kern_event.c 6 Feb 2025 18:52:15 -0000 @@ -343,7 +343,7 @@ filt_procattach(struct knote *kn) int nolock; if ((curproc->p_p->ps_flags & PS_PLEDGE) && - (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0) + (curproc->p_pledge & PLEDGE_PROC) == 0) return pledge_fail(curproc, EPERM, PLEDGE_PROC); if (kn->kn_id 
> PID_MAX) Index: sys/kern/kern_pledge.c =================================================================== RCS file: /cvs/src/sys/kern/kern_pledge.c,v diff -u -p -u -r1.321 kern_pledge.c --- sys/kern/kern_pledge.c 6 Oct 2024 23:39:24 -0000 1.321 +++ sys/kern/kern_pledge.c 7 Feb 2025 15:04:11 -0000 @@ -451,7 +451,7 @@ sys_pledge(struct proc *p, void *v, regi } */ *uap = v; struct process *pr = p->p_p; uint64_t promises, execpromises; - int error; + int error = 0; int unveil_cleanup = 0; /* Check for any error in user input */ @@ -481,16 +481,16 @@ sys_pledge(struct proc *p, void *v, regi /* Only permit reductions */ if (ISSET(pr->ps_flags, PS_PLEDGE) && (((promises | pr->ps_pledge) != pr->ps_pledge))) { - mtx_leave(&pr->ps_mtx); - return (EPERM); + error = EPERM; + goto fail; } } if (SCARG(uap, execpromises)) { /* Only permit reductions */ if (ISSET(pr->ps_flags, PS_EXECPLEDGE) && (((execpromises | pr->ps_execpledge) != pr->ps_execpledge))) { - mtx_leave(&pr->ps_mtx); - return (EPERM); + error = EPERM; + goto fail; } } @@ -509,19 +509,24 @@ sys_pledge(struct proc *p, void *v, regi atomic_setbits_int(&pr->ps_flags, PS_EXECPLEDGE); } +fail: mtx_leave(&pr->ps_mtx); if (unveil_cleanup) { /* * Kill off unveil and drop unveil vnode refs if we no - * longer are holding any path-accessing pledge + * longer are holding any path-accessing pledge, must + * be done single-threaded so that other sleeping syscalls + * finish first before we change this. 
*/ + single_thread_set(p, SINGLE_UNWIND); KERNEL_LOCK(); unveil_destroy(pr); KERNEL_UNLOCK(); + single_thread_clear(p); } - return (0); + return (error); } int @@ -536,7 +541,8 @@ pledge_syscall(struct proc *p, int code, if (pledge_syscalls[code] == PLEDGE_ALWAYS) return (0); - if (p->p_p->ps_pledge & pledge_syscalls[code]) + p->p_pledge = p->p_p->ps_pledge; /* pledge checks are per-thread */ + if (p->p_pledge & pledge_syscalls[code]) return (0); *tval = pledge_syscalls[code]; @@ -559,7 +565,7 @@ pledge_fail(struct proc *p, int error, u if (KTRPOINT(p, KTR_PLEDGE)) ktrpledge(p, error, code, p->p_pledge_syscall); #endif - if (p->p_p->ps_pledge & PLEDGE_ERROR) + if (p->p_pledge & PLEDGE_ERROR) return (ENOSYS); KERNEL_LOCK(); @@ -593,7 +599,7 @@ pledge_namei(struct proc *p, struct name if ((p->p_p->ps_flags & PS_PLEDGE) == 0 || (p->p_p->ps_flags & PS_COREDUMP)) return (0); - pledge = READ_ONCE(p->p_p->ps_pledge); + pledge = p->p_pledge; if (ni->ni_pledge == 0) panic("pledge_namei: ni_pledge"); @@ -604,8 +610,7 @@ pledge_namei(struct proc *p, struct name */ /* Doing a permitted execve() */ - if ((ni->ni_pledge & PLEDGE_EXEC) && - (pledge & PLEDGE_EXEC)) + if ((ni->ni_pledge & PLEDGE_EXEC) && (pledge & PLEDGE_EXEC)) return (0); error = canonpath(origpath, path, sizeof(path)); @@ -729,7 +734,7 @@ pledge_namei(struct proc *p, struct name /* * Ensure each flag of ni_pledge has counterpart allowing it in - * ps_pledge. + * p_pledge. 
*/ if (ni->ni_pledge & ~pledge) return (pledge_fail(p, EPERM, (ni->ni_pledge & ~pledge))); @@ -748,7 +753,7 @@ pledge_recvfd(struct proc *p, struct fil if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if ((p->p_p->ps_pledge & PLEDGE_RECVFD) == 0) + if ((p->p_pledge & PLEDGE_RECVFD) == 0) return pledge_fail(p, EPERM, PLEDGE_RECVFD); switch (fp->f_type) { @@ -776,7 +781,7 @@ pledge_sendfd(struct proc *p, struct fil if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if ((p->p_p->ps_pledge & PLEDGE_SENDFD) == 0) + if ((p->p_pledge & PLEDGE_SENDFD) == 0) return pledge_fail(p, EPERM, PLEDGE_SENDFD); switch (fp->f_type) { @@ -804,7 +809,7 @@ pledge_sysctl(struct proc *p, int miblen if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - pledge = READ_ONCE(p->p_p->ps_pledge); + pledge = p->p_pledge; if (new) return pledge_fail(p, EFAULT, 0); @@ -1010,7 +1015,7 @@ pledge_chown(struct proc *p, uid_t uid, if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if (p->p_p->ps_pledge & PLEDGE_CHOWNUID) + if (p->p_pledge & PLEDGE_CHOWNUID) return (0); if (uid != -1 && uid != p->p_ucred->cr_uid) @@ -1028,7 +1033,7 @@ pledge_adjtime(struct proc *p, const voi if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if ((p->p_p->ps_pledge & PLEDGE_SETTIME)) + if ((p->p_pledge & PLEDGE_SETTIME)) return (0); if (delta) return (EPERM); @@ -1041,7 +1046,7 @@ pledge_sendit(struct proc *p, const void if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if ((p->p_p->ps_pledge & (PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS))) + if ((p->p_pledge & (PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS))) return (0); /* may use address */ if (to == NULL) return (0); /* behaves just like write */ @@ -1057,7 +1062,7 @@ pledge_ioctl(struct proc *p, long com, s if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - pledge = READ_ONCE(p->p_p->ps_pledge); + pledge = p->p_pledge; /* * The ioctl's which are always allowed. 
@@ -1365,7 +1370,7 @@ pledge_sockopt(struct proc *p, int set, if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - pledge = READ_ONCE(p->p_p->ps_pledge); + pledge = p->p_pledge; /* Always allow these, which are too common to reject */ switch (level) { @@ -1502,7 +1507,7 @@ pledge_socket(struct proc *p, int domain if (!ISSET(p->p_p->ps_flags, PS_PLEDGE)) return 0; - pledge = READ_ONCE(p->p_p->ps_pledge); + pledge = p->p_pledge; if (ISSET(state, SS_DNS)) { if (ISSET(pledge, PLEDGE_DNS)) @@ -1534,7 +1539,7 @@ pledge_flock(struct proc *p) if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if ((p->p_p->ps_pledge & PLEDGE_FLOCK)) + if ((p->p_pledge & PLEDGE_FLOCK)) return (0); return (pledge_fail(p, EPERM, PLEDGE_FLOCK)); } @@ -1545,7 +1550,7 @@ pledge_swapctl(struct proc *p, int cmd) if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if (p->p_p->ps_pledge & PLEDGE_VMINFO) { + if (p->p_pledge & PLEDGE_VMINFO) { switch (cmd) { case SWAP_NSWAP: case SWAP_STATS: @@ -1580,7 +1585,7 @@ pledge_fcntl(struct proc *p, int cmd) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); - if ((p->p_p->ps_pledge & PLEDGE_PROC) == 0 && cmd == F_SETOWN) + if ((p->p_pledge & PLEDGE_PROC) == 0 && cmd == F_SETOWN) return pledge_fail(p, EPERM, PLEDGE_PROC); return (0); } @@ -1590,7 +1595,7 @@ pledge_kill(struct proc *p, pid_t pid) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return 0; - if (p->p_p->ps_pledge & PLEDGE_PROC) + if (p->p_pledge & PLEDGE_PROC) return 0; if (pid == 0 || pid == p->p_p->ps_pid) return 0; @@ -1615,7 +1620,7 @@ pledge_protexec(struct proc *p, int prot /* Before kbind(2) call, ld.so and crt may create EXEC mappings */ if (p->p_p->ps_kbind_addr == 0 && p->p_p->ps_kbind_cookie == 0) return 0; - if (!(p->p_p->ps_pledge & PLEDGE_PROTEXEC) && (prot & PROT_EXEC)) + if (!(p->p_pledge & PLEDGE_PROTEXEC) && (prot & PROT_EXEC)) return pledge_fail(p, EPERM, PLEDGE_PROTEXEC); return 0; } Index: sys/kern/vfs_syscalls.c 
=================================================================== RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v diff -u -p -u -r1.372 vfs_syscalls.c --- sys/kern/vfs_syscalls.c 29 Jan 2025 14:57:19 -0000 1.372 +++ sys/kern/vfs_syscalls.c 7 Feb 2025 17:12:25 -0000 @@ -966,6 +966,9 @@ sys_unveil(struct proc *p, void *v, regi sizeof(permissions), NULL); if (error) return (error); + + single_thread_set(p, SINGLE_UNWIND); + pathname = pool_get(&namei_pool, PR_WAITOK); error = copyinstr(SCARG(uap, path), pathname, MAXPATHLEN, &pathlen); if (error) @@ -1031,6 +1034,7 @@ sys_unveil(struct proc *p, void *v, regi end: pool_put(&namei_pool, pathname); + single_thread_clear(p); return (error); } Index: sys/dev/vmm/vmm.c =================================================================== RCS file: /cvs/src/sys/dev/vmm/vmm.c,v diff -u -p -u -r1.3 vmm.c --- sys/dev/vmm/vmm.c 27 Aug 2024 09:16:03 -0000 1.3 +++ sys/dev/vmm/vmm.c 6 Feb 2025 19:14:14 -0000 @@ -184,7 +184,7 @@ vm_find(uint32_t id, struct vm **res) * The managing vmm parent process can lookup all * all VMs and is indicated by PLEDGE_PROC. */ - if (((p->p_p->ps_pledge & + if (((p->p_pledge & (PLEDGE_VMM | PLEDGE_PROC)) == PLEDGE_VMM) && (vm->vm_creator_pid != p->p_p->ps_pid)) ret = EPERM; @@ -291,7 +291,7 @@ pledge_ioctl_vmm(struct proc *p, long co case VMM_IOC_INFO: case VMM_IOC_SHAREMEM: /* The "parent" process in vmd forks and manages VMs */ - if (p->p_p->ps_pledge & PLEDGE_PROC) + if (p->p_pledge & PLEDGE_PROC) return (0); break; case VMM_IOC_TERM: