From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: sysctl_file(): rework FILLSO() macro
To: Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date: Sun, 12 Jan 2025 03:58:04 +0300

I don't like how KERN_FILE_BYFILE case of the sysctl_file() delivers
sockets data to the userland. It not only takes exclusive netlock
around all except divert socket tables walkthrough, but also does
copyout() with mutex(9) held. Sounds strange, but the context switch
is avoided because userland pages are wired.

The table is protected with `inpt_mtx' mutex(9), so the socket lock or
netlock should be take outside. Since we have no socket pointer, we
can't use solock(). We can't use the only shared netlock because this
left socket unprotected against concurrent threads which rely on
solock_shared().

Now it is possible to fix this precious gem. We can use reference
counting for all socket types, and bluhm@ introduced `inp_sofree_mtx'
mutex(9) to protect `inp_socket'. The INP tables have the special
iterator to safely release `inpt_mtx' during table walkthrough. 

The FILLSO() or FILLIT2() macros can't be used unrolled, because we
need to push mutexes re-locking and solock() deep within. I made
the FILLINPTABLE() macro which is the unrolling, but with the socket
related locking dances. FILLIT2() macro is not required anymore and
was merged with FILLIT().

Current implementation takes the reference on `inp_socket' and
releases  `inpt_mtx' mutex(9). Now it's possible to fairly use
shared_solock() on socket instead of netlock while performing
fill_file(). The copyout() is external for fill_file() and touches
nothing required to be protected so it could be made lockless.

The KERN_FILE_BYFILE case became mp-safe, but the rest sysctl_file()
cases still not, so the sysctl_vslock() dances left as is.

Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.460
diff -u -p -r1.460 kern_sysctl.c
--- sys/kern/kern_sysctl.c	4 Jan 2025 09:26:01 -0000	1.460
+++ sys/kern/kern_sysctl.c	12 Jan 2025 00:16:41 -0000
@@ -1672,9 +1672,10 @@ sysctl_file(int *name, u_int namelen, ch
 
 	kf = malloc(sizeof(*kf), M_TEMP, M_WAITOK);
 
-#define FILLIT2(fp, fdp, i, vp, pr, so) do {				\
+#define FILLIT(fp, fdp, i, vp, pr) do {					\
 	if (buflen >= elem_size && elem_count > 0) {			\
-		fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\
+		fill_file(kf, fp, fdp, i, vp, pr, p, NULL,		\
+		    show_pointers);					\
 		error = copyout(kf, dp, outsize);			\
 		if (error)						\
 			break;						\
@@ -1684,62 +1685,59 @@ sysctl_file(int *name, u_int namelen, ch
 	}								\
 	needed += elem_size;						\
 } while (0)
-#define FILLIT(fp, fdp, i, vp, pr) \
-	FILLIT2(fp, fdp, i, vp, pr, NULL)
-#define FILLSO(so) \
-	FILLIT2(NULL, NULL, 0, NULL, NULL, so)
+
+#define FILLINPTABLE(table)						\
+do {									\
+	struct inpcb_iterator iter = { .inp_table = NULL };		\
+	struct inpcb *inp = NULL;					\
+	struct socket *so;						\
+									\
+	mtx_enter(&(table)->inpt_mtx);					\
+	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {	\
+		if (buflen >= elem_size && elem_count > 0) {		\
+			mtx_enter(&inp->inp_sofree_mtx);		\
+			so = soref(inp->inp_socket);			\
+			mtx_leave(&inp->inp_sofree_mtx);		\
+			if (so == NULL)					\
+				continue;				\
+			mtx_leave(&(table)->inpt_mtx);			\
+			solock_shared(so);				\
+			fill_file(kf, NULL, NULL, 0, NULL, NULL, p,	\
+			    so, show_pointers);				\
+			sounlock_shared(so);				\
+			sorele(so);					\
+			error = copyout(kf, dp, outsize);		\
+			mtx_enter(&(table)->inpt_mtx);			\
+			if (error) {					\
+				in_pcb_iterator_abort((table), inp,	\
+				    &iter);				\
+				break;					\
+			}						\
+			dp += elem_size;				\
+			buflen -= elem_size;				\
+			elem_count--;					\
+		}							\
+		needed += elem_size;					\
+	}								\
+	mtx_leave(&(table)->inpt_mtx);					\
+} while (0)
 
 	switch (op) {
 	case KERN_FILE_BYFILE:
 		/* use the inp-tables to pick up closed connections, too */
 		if (arg == DTYPE_SOCKET) {
-			struct inpcb *inp;
-
-			NET_LOCK();
-			mtx_enter(&tcbtable.inpt_mtx);
-			TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
-				FILLSO(inp->inp_socket);
-			mtx_leave(&tcbtable.inpt_mtx);
+			FILLINPTABLE(&tcbtable);
 #ifdef INET6
-			mtx_enter(&tcb6table.inpt_mtx);
-			TAILQ_FOREACH(inp, &tcb6table.inpt_queue, inp_queue)
-				FILLSO(inp->inp_socket);
-			mtx_leave(&tcb6table.inpt_mtx);
+			FILLINPTABLE(&tcb6table);
 #endif
-			mtx_enter(&udbtable.inpt_mtx);
-			TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
-				if (in_pcb_is_iterator(inp))
-					continue;
-				FILLSO(inp->inp_socket);
-			}
-			mtx_leave(&udbtable.inpt_mtx);
+			FILLINPTABLE(&udbtable);
 #ifdef INET6
-			mtx_enter(&udb6table.inpt_mtx);
-			TAILQ_FOREACH(inp, &udb6table.inpt_queue, inp_queue) {
-				if (in_pcb_is_iterator(inp))
-					continue;
-				FILLSO(inp->inp_socket);
-			}
-			mtx_leave(&udb6table.inpt_mtx);
+			FILLINPTABLE(&udb6table);
 #endif
-			mtx_enter(&rawcbtable.inpt_mtx);
-			TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
-				if (in_pcb_is_iterator(inp))
-					continue;
-				FILLSO(inp->inp_socket);
-			}
-			mtx_leave(&rawcbtable.inpt_mtx);
+			FILLINPTABLE(&rawcbtable);
 #ifdef INET6
-			mtx_enter(&rawin6pcbtable.inpt_mtx);
-			TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue,
-			    inp_queue) {
-				if (in_pcb_is_iterator(inp))
-					continue;
-				FILLSO(inp->inp_socket);
-			}
-			mtx_leave(&rawin6pcbtable.inpt_mtx);
+			FILLINPTABLE(&rawin6pcbtable);
 #endif
-			NET_UNLOCK();
 		}
 		fp = NULL;
 		while ((fp = fd_iterfile(fp, p)) != NULL) {