Index | Thread | Search

From:
Ricardo Branco <rbranco@suse.de>
Subject:
[PATCH]: Add POSIX O_CLOFORK flag
To:
tech@openbsd.org
Date:
Sat, 21 Jun 2025 23:35:36 +0200

Download raw body.

Thread
This initial patch adds support for the POSIX O_CLOFORK (close-on-fork) flag.

If there's interest, I can update the manpages and complete the TODO list in the PR:
https://github.com/openbsd/src/pull/46

I uploaded the full test suite from illumos, adapted to OpenBSD, there.

Work is also being done to add this flag to:

- FreeBSD: https://github.com/freebsd/freebsd-src/pull/1698
- DragonFly BSD: https://github.com/DragonFlyBSD/DragonFlyBSD/pull/28

The discussion about adding this flag took place in the FreeBSD PR.

Best,
Ricardo
---
 sys/kern/kern_descrip.c  | 32 ++++++++++++++++++++++----------
 sys/kern/sys_pipe.c      | 11 ++++++-----
 sys/kern/uipc_syscalls.c | 29 ++++++++++++++++-------------
 sys/kern/uipc_usrreq.c   |  2 ++
 sys/kern/vfs_syscalls.c  | 24 ++++++++++++++----------
 sys/sys/fcntl.h          |  5 +++++
 sys/sys/filedesc.h       |  1 +
 sys/sys/socket.h         |  2 ++
 usr.bin/fstat/fstat.c    |  2 ++
 9 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 3e57566b820..550b1c3ae9e 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -80,6 +80,7 @@ int dodup3(struct proc *, int, int, int, register_t *);
 
 #define DUPF_CLOEXEC	0x01
 #define DUPF_DUP2	0x02
+#define DUPF_CLOFORK	0x04
 
 struct pool file_pool;
 struct pool fdesc_pool;
@@ -336,7 +337,7 @@ sys_dup3(struct proc *p, void *v, register_t *retval)
 
 	if (SCARG(uap, from) == SCARG(uap, to))
 		return (EINVAL);
-	if (SCARG(uap, flags) & ~O_CLOEXEC)
+	if (SCARG(uap, flags) & ~(O_CLOEXEC | O_CLOFORK))
 		return (EINVAL);
 	return (dodup3(p, SCARG(uap, from), SCARG(uap, to),
 	    SCARG(uap, flags), retval));
@@ -388,6 +389,8 @@ restart:
 	dupflags = DUPF_DUP2;
 	if (flags & O_CLOEXEC)
 		dupflags |= DUPF_CLOEXEC;
+	if (flags & O_CLOFORK)
+		dupflags |= DUPF_CLOFORK;
 
 	/* No need for FRELE(), finishdup() uses current ref. */
 	return (finishdup(p, fp, old, new, retval, dupflags));
@@ -423,6 +426,7 @@ restart:
 
 	case F_DUPFD:
 	case F_DUPFD_CLOEXEC:
+	case F_DUPFD_CLOFORK:
 		newmin = (long)SCARG(uap, arg);
 		if ((u_int)newmin >= lim_cur(RLIMIT_NOFILE) ||
 		    (u_int)newmin >= atomic_load_int(&maxfiles)) {
@@ -444,6 +448,8 @@ restart:
 
 			if (SCARG(uap, cmd) == F_DUPFD_CLOEXEC)
 				dupflags |= DUPF_CLOEXEC;
+			else if (SCARG(uap, cmd) == F_DUPFD_CLOFORK)
+				dupflags |= DUPF_CLOFORK;
 
 			/* No need for FRELE(), finishdup() uses current ref. */
 			error = finishdup(p, fp, fd, i, retval, dupflags);
@@ -452,16 +458,17 @@ restart:
 
 	case F_GETFD:
 		fdplock(fdp);
-		*retval = fdp->fd_ofileflags[fd] & UF_EXCLOSE ? 1 : 0;
+		*retval =
+		    ((fdp->fd_ofileflags[fd] & UF_EXCLOSE) ? FD_CLOEXEC : 0) |
+		    ((fdp->fd_ofileflags[fd] & UF_FOCLOSE) ? FD_CLOFORK : 0);
 		fdpunlock(fdp);
 		break;
 
 	case F_SETFD:
 		fdplock(fdp);
-		if ((long)SCARG(uap, arg) & 1)
-			fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
-		else
-			fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+		fdp->fd_ofileflags[fd] =
+		    (((long)SCARG(uap, arg) & FD_CLOEXEC) ? UF_EXCLOSE : 0) |
+		    (((long)SCARG(uap, arg) & FD_CLOFORK) ? UF_FOCLOSE : 0);
 		fdpunlock(fdp);
 		break;
 
@@ -667,9 +674,12 @@ finishdup(struct proc *p, struct file *fp, int old, int new,
 	fdp->fd_ofiles[new] = fp;
 	mtx_leave(&fdp->fd_fplock);
 
-	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+	fdp->fd_ofileflags[new] =
+	    fdp->fd_ofileflags[old] & ~(UF_EXCLOSE | UF_FOCLOSE);
 	if (dupflags & DUPF_CLOEXEC)
 		fdp->fd_ofileflags[new] |= UF_EXCLOSE;
+	if (dupflags & DUPF_CLOFORK)
+		fdp->fd_ofileflags[new] |= UF_FOCLOSE;
 	*retval = new;
 
 	if (oldfp != NULL) {
@@ -711,7 +721,7 @@ fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp)
 	fdp->fd_ofiles[fd] = fp;
 	mtx_leave(&fdp->fd_fplock);
 
-	fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
+	fdp->fd_ofileflags[fd] |= (flags & (UF_EXCLOSE | UF_FOCLOSE));
 }
 
 void
@@ -1150,6 +1160,7 @@ fdcopy(struct process *pr)
 			 * their internal consistency, so close them here.
 			 */
 			if (fp->f_count >= FDUP_MAX_COUNT ||
+			    (fdp->fd_ofileflags[i] & UF_FOCLOSE) != 0 ||
 			    fp->f_type == DTYPE_KQUEUE) {
 				if (i < newfdp->fd_freefile)
 					newfdp->fd_freefile = i;
@@ -1407,8 +1418,9 @@ dupfdopen(struct proc *p, int indx, int mode)
 	fdp->fd_ofiles[indx] = wfp;
 	mtx_leave(&fdp->fd_fplock);
 
-	fdp->fd_ofileflags[indx] = (fdp->fd_ofileflags[indx] & UF_EXCLOSE) |
-	    (fdp->fd_ofileflags[dupfd] & ~UF_EXCLOSE);
+	fdp->fd_ofileflags[indx] =
+	    (fdp->fd_ofileflags[indx] & (UF_EXCLOSE | UF_FOCLOSE)) |
+	    (fdp->fd_ofileflags[dupfd] & ~(UF_EXCLOSE | UF_FOCLOSE));
 
 	return (0);
 }
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index 12254a052da..d278647d382 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -162,7 +162,7 @@ sys_pipe2(struct proc *p, void *v, register_t *retval)
 		syscallarg(int) flags;
 	} */ *uap = v;
 
-	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
+	if (SCARG(uap, flags) & ~(O_CLOEXEC | O_CLOFORK | FNONBLOCK))
 		return (EINVAL);
 
 	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
@@ -175,9 +175,10 @@ dopipe(struct proc *p, int *ufds, int flags)
 	struct file *rf, *wf;
 	struct pipe_pair *pp;
 	struct pipe *rpipe, *wpipe = NULL;
-	int fds[2], cloexec, error;
+	int fds[2], fdflags, error;
 
-	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;
+	fdflags = ((flags & O_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((flags & O_CLOFORK) ? UF_FOCLOSE : 0);
 
 	pp = pipe_pair_create();
 	if (pp == NULL)
@@ -203,8 +204,8 @@ dopipe(struct proc *p, int *ufds, int flags)
 	wf->f_data = wpipe;
 	wf->f_ops = &pipeops;
 
-	fdinsert(fdp, fds[0], cloexec, rf);
-	fdinsert(fdp, fds[1], cloexec, wf);
+	fdinsert(fdp, fds[0], fdflags, rf);
+	fdinsert(fdp, fds[1], fdflags, wf);
 
 	error = copyout(fds, ufds, sizeof(fds));
 	if (error == 0) {
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 7a93c571a29..00833af2705 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -81,7 +81,7 @@ sys_socket(struct proc *p, void *v, register_t *retval)
 	struct file *fp;
 	int type = SCARG(uap, type);
 	int domain = SCARG(uap, domain);
-	int fd, cloexec, nonblock, fflag, error;
+	int fd, fdflags, nonblock, fflag, error;
 	unsigned int ss = 0;
 
 	if ((type & SOCK_DNS) && !(domain == AF_INET || domain == AF_INET6))
@@ -93,8 +93,9 @@ sys_socket(struct proc *p, void *v, register_t *retval)
 	if (error)
 		return (error);
 
-	type &= ~(SOCK_CLOEXEC | SOCK_NONBLOCK | SOCK_DNS);
-	cloexec = (SCARG(uap, type) & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
+	type &= ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK | SOCK_DNS);
+	fdflags = ((SCARG(uap, type) & SOCK_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((SCARG(uap, type) & SOCK_CLOFORK) ? UF_FOCLOSE : 0);
 	nonblock = SCARG(uap, type) & SOCK_NONBLOCK;
 	fflag = FREAD | FWRITE | (nonblock ? FNONBLOCK : 0);
 
@@ -113,7 +114,7 @@ sys_socket(struct proc *p, void *v, register_t *retval)
 		fp->f_ops = &socketops;
 		so->so_state |= ss;
 		fp->f_data = so;
-		fdinsert(fdp, fd, cloexec, fp);
+		fdinsert(fdp, fd, fdflags, fp);
 		fdpunlock(fdp);
 		FRELE(fp, p);
 		*retval = fd;
@@ -240,7 +241,7 @@ sys_accept4(struct proc *p, void *v, register_t *retval)
 		syscallarg(socklen_t *) int flags;
 	} */ *uap = v;
 
-	if (SCARG(uap, flags) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+	if (SCARG(uap, flags) & ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK))
 		return (EINVAL);
 
 	return (doaccept(p, SCARG(uap, s), SCARG(uap, name),
@@ -257,9 +258,10 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
 	socklen_t namelen;
 	int error, tmpfd;
 	struct socket *head, *so;
-	int cloexec, nflag;
+	int fdflags, nflag;
 
-	cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
+	fdflags = ((flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((flags & SOCK_CLOFORK) ? UF_FOCLOSE : 0);
 
 	if (name && (error = copyin(anamelen, &namelen, sizeof (namelen))))
 		return (error);
@@ -346,7 +348,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
 	}
 
 	fdplock(fdp);
-	fdinsert(fdp, tmpfd, cloexec, fp);
+	fdinsert(fdp, tmpfd, fdflags, fp);
 	fdpunlock(fdp);
 	FRELE(fp, p);
 	*retval = tmpfd;
@@ -457,10 +459,11 @@ sys_socketpair(struct proc *p, void *v, register_t *retval)
 	struct filedesc *fdp = p->p_fd;
 	struct file *fp1 = NULL, *fp2 = NULL;
 	struct socket *so1, *so2;
-	int type, cloexec, nonblock, fflag, error, sv[2];
+	int type, fdflags, nonblock, fflag, error, sv[2];
 
-	type  = SCARG(uap, type) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK);
-	cloexec = (SCARG(uap, type) & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
+	type = SCARG(uap, type) & ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK);
+	fdflags = ((SCARG(uap, type) & SOCK_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((SCARG(uap, type) & SOCK_CLOFORK) ? UF_FOCLOSE : 0);
 	nonblock = SCARG(uap, type) & SOCK_NONBLOCK;
 	fflag = FREAD | FWRITE | (nonblock ? FNONBLOCK : 0);
 
@@ -498,8 +501,8 @@ sys_socketpair(struct proc *p, void *v, register_t *retval)
 	fp2->f_data = so2;
 	error = copyout(sv, SCARG(uap, rsv), 2 * sizeof (int));
 	if (error == 0) {
-		fdinsert(fdp, sv[0], cloexec, fp1);
-		fdinsert(fdp, sv[1], cloexec, fp2);
+		fdinsert(fdp, sv[0], fdflags, fp1);
+		fdinsert(fdp, sv[1], fdflags, fp2);
 		fdpunlock(fdp);
 #ifdef KTRACE
 		if (KTRPOINT(p, KTR_STRUCT))
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index f50a040d1e8..b025c071f41 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1146,6 +1146,8 @@ restart:
 		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
 		if (flags & MSG_CMSG_CLOEXEC)
 			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;
+		if (flags & MSG_CMSG_CLOFORK)
+			fdp->fd_ofileflags[fds[i]] |= UF_FOCLOSE;
 
 		rp++;
 	}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 1f5731712a0..32734b2a3f4 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1086,7 +1086,7 @@ doopenat(struct proc *p, int fd, const char *path, int oflags, mode_t mode,
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vattr;
-	int flags, cloexec, cmode;
+	int flags, fdflags, cmode;
 	int type, indx, error, localtrunc = 0;
 	struct flock lf;
 	struct nameidata nd;
@@ -1099,7 +1099,8 @@ doopenat(struct proc *p, int fd, const char *path, int oflags, mode_t mode,
 			return (error);
 	}
 
-	cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0;
+	fdflags = ((oflags & O_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((oflags & O_CLOFORK) ? UF_FOCLOSE : 0);
 
 	fdplock(fdp);
 	if ((error = falloc(p, &fp, &indx)) != 0) {
@@ -1200,7 +1201,7 @@ doopenat(struct proc *p, int fd, const char *path, int oflags, mode_t mode,
 	KERNEL_UNLOCK();
 	*retval = indx;
 	fdplock(fdp);
-	fdinsert(fdp, indx, cloexec, fp);
+	fdinsert(fdp, indx, fdflags, fp);
 	fdpunlock(fdp);
 	FRELE(fp, p);
 	return (error);
@@ -1224,7 +1225,7 @@ sys___tmpfd(struct proc *p, void *v, register_t *retval)
 	struct file *fp;
 	struct vnode *vp;
 	int oflags = SCARG(uap, flags);
-	int flags, cloexec, cmode;
+	int flags, fdflags, cmode;
 	int indx, error;
 	unsigned int i;
 	struct nameidata nd;
@@ -1232,9 +1233,11 @@ sys___tmpfd(struct proc *p, void *v, register_t *retval)
 	static const char *letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-";
 
 	/* most flags are hardwired */
-	oflags = O_RDWR | O_CREAT | O_EXCL | O_NOFOLLOW | (oflags & O_CLOEXEC);
+	oflags = O_RDWR | O_CREAT | O_EXCL | O_NOFOLLOW |
+	    (oflags & (O_CLOEXEC | O_CLOFORK));
 
-	cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0;
+	fdflags = ((oflags & O_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((oflags & O_CLOFORK) ? UF_FOCLOSE : 0);
 
 	fdplock(fdp);
 	if ((error = falloc(p, &fp, &indx)) != 0) {
@@ -1270,7 +1273,7 @@ sys___tmpfd(struct proc *p, void *v, register_t *retval)
 	VOP_UNLOCK(vp);
 	*retval = indx;
 	fdplock(fdp);
-	fdinsert(fdp, indx, cloexec, fp);
+	fdinsert(fdp, indx, fdflags, fp);
 	fdpunlock(fdp);
 	FRELE(fp, p);
 
@@ -1352,7 +1355,7 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
 	struct vnode *vp = NULL;
 	struct mount *mp;
 	struct ucred *cred = p->p_ucred;
-	int flags, cloexec;
+	int flags, fdflags;
 	int type, indx, error=0;
 	struct flock lf;
 	struct vattr va;
@@ -1370,7 +1373,8 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
 	if ((flags & O_CREAT))
 		return (EINVAL);
 
-	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;
+	fdflags = ((flags & O_CLOEXEC) ? UF_EXCLOSE : 0) |
+	    ((flags & O_CLOFORK) ? UF_FOCLOSE : 0);
 
 	fdplock(fdp);
 	if ((error = falloc(p, &fp, &indx)) != 0) {
@@ -1456,7 +1460,7 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
 	VOP_UNLOCK(vp);
 	*retval = indx;
 	fdplock(fdp);
-	fdinsert(fdp, indx, cloexec, fp);
+	fdinsert(fdp, indx, fdflags, fp);
 	fdpunlock(fdp);
 	FRELE(fp, p);
 	return (0);
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index e964ea49dde..bd6e329afa1 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -106,6 +106,7 @@
 /* defined by POSIX Issue 7 */
 #define	O_CLOEXEC	0x10000		/* atomically set FD_CLOEXEC */
 #define	O_DIRECTORY	0x20000		/* fail if not a directory */
+#define	O_CLOFORK	0x40000		/* atomically set FD_CLOFORK */
 
 #ifdef _KERNEL
 /*
@@ -158,9 +159,13 @@
 #if __BSD_VISIBLE
 #define F_ISATTY	11		/* used by isatty(3) */
 #endif
+#if __POSIX_VISIBLE >= 202405
+#define	F_DUPFD_CLOFORK	12		/* duplicate with FD_CLOFORK set */
+#endif
 
 /* file descriptor flags (F_GETFD, F_SETFD) */
 #define	FD_CLOEXEC	1		/* close-on-exec flag */
+#define	FD_CLOFORK	2		/* close-on-fork flag */
 
 /* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */
 #define	F_RDLCK		1		/* shared or read lock */
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 50bc7734a02..38857d98fd7 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -115,6 +115,7 @@ struct filedesc0 {
  */
 #define	UF_EXCLOSE 	0x01		/* auto-close on exec */
 #define	UF_PLEDGED 	0x02		/* open after pledge(2) */
+#define	UF_FOCLOSE 	0x04		/* auto-close on fork */
 
 /*
  * Flags on the file descriptor table.
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 4fd50d29274..4e4987ca255 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -79,6 +79,7 @@ typedef	__sa_family_t	sa_family_t;	/* sockaddr address family type */
 #define	SOCK_NONBLOCK_INHERIT	0x2000	/* inherit O_NONBLOCK from listener */
 #endif
 #define	SOCK_DNS		0x1000	/* set SS_DNS */
+#define	SOCK_CLOFORK		0x0800	/* set FD_CLOFORK */
 #endif /* __BSD_VISIBLE */
 
 /*
@@ -511,6 +512,7 @@ struct timespec;
 #define	MSG_NOSIGNAL		0x400	/* do not send SIGPIPE */
 #define	MSG_CMSG_CLOEXEC	0x800	/* set FD_CLOEXEC on received fds */
 #define	MSG_WAITFORONE		0x1000	/* nonblocking but wait for one msg */
+#define	MSG_CMSG_CLOFORK	0x2000	/* set FD_CLOFORK on received fds */
 
 /*
  * Header for ancillary data objects in msg_control buffer.
diff --git a/usr.bin/fstat/fstat.c b/usr.bin/fstat/fstat.c
index a74d3a6e916..acd1ffe26cc 100644
--- a/usr.bin/fstat/fstat.c
+++ b/usr.bin/fstat/fstat.c
@@ -482,6 +482,8 @@ vtrans(struct kinfo_file *kf)
 		strlcat(rwep, "w", sizeof rwep);
 	if (kf->fd_ofileflags & UF_EXCLOSE)
 		strlcat(rwep, "e", sizeof rwep);
+	if (kf->fd_ofileflags & UF_FOCLOSE)
+		strlcat(rwep, "f", sizeof rwep);
 	if (kf->fd_ofileflags & UF_PLEDGED)
 		strlcat(rwep, "p", sizeof rwep);
 	printf(" %4s", rwep);
-- 
2.49.0