Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Kill `inp_notify' list remains
To:
Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Fri, 20 Dec 2024 04:08:14 +0300

Download raw body.

Thread
This was the list, where PCBs were temporary linked to avoid sleep with 
`inpt_mtx' mutex(9) held. in_pcbnotifyall() and in6_pcbnotify are the
last list users, so I propose to switch them to in_pcb_iterator() too,
moreover they already do in_pcb_is_iterator() check.

Note, in_pcb_iterator() does in_pcbref() and in_pcbunref() for
corresponding `inp', unlocked dereference is safe and no extra
reference release required.

Outside divert(4) sockets search loop, sysctl_file() is the only
`inp_queue' foreach loops runner. It takes exclusive netlock and holds
it while processing all inet PCB tables. I want to switch these loops to
the iterators and push shared solock into `inp_queue' loops.

From this:

	NET_LOCK();
	mtx_enter(&tcbtable.inpt_mtx);
	TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
	FILLSO(inp->inp_socket);
	mtx_leave(&tcbtable.inpt_mtx);

to this:

	mtx_enter(&tcbtable.inpt_mtx);
	while ((inp = in_pcb_iterator(&tcbtable, inp, &iter)) != NULL) {
		mtx_leave(&tcbtable.inpt_mtx);
		NET_LOCK_SHARED();
		if (inp->inp_socket)
			FILLSO(inp->inp_socket);
		NET_UNLOCK_SHARED();
		mtx_enter(&tcbtable.inpt_mtx);
	}
	mtx_leave(&tcbtable.inpt_mtx);

In the future, net lock could be replaced with solock() and pushed down
to FILLSO(), so copyout() could be moved outside net lock too.

Index: sys/netinet/in_pcb.c
===================================================================
RCS file: /cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.305 in_pcb.c
--- sys/netinet/in_pcb.c	5 Nov 2024 22:44:20 -0000	1.305
+++ sys/netinet/in_pcb.c	20 Dec 2024 01:01:19 -0000
@@ -179,7 +179,6 @@ void
 in_pcbinit(struct inpcbtable *table, int hashsize)
 {
 	mtx_init(&table->inpt_mtx, IPL_SOFTNET);
-	rw_init(&table->inpt_notify, "inpnotify");
 	TAILQ_INIT(&table->inpt_queue);
 	table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
 	    &table->inpt_mask);
@@ -763,8 +762,8 @@ void
 in_pcbnotifyall(struct inpcbtable *table, const struct sockaddr_in *dst,
     u_int rtable, int errno, void (*notify)(struct inpcb *, int))
 {
-	SIMPLEQ_HEAD(, inpcb) inpcblist;
-	struct inpcb *inp;
+	struct inpcb_iterator iter = { .inp_table = NULL };
+	struct inpcb *inp = NULL;
 	u_int rdomain;
 
 	if (dst->sin_addr.s_addr == INADDR_ANY)
@@ -772,39 +771,20 @@ in_pcbnotifyall(struct inpcbtable *table
 	if (notify == NULL)
 		return;
 
-	/*
-	 * Use a temporary notify list protected by rwlock to run over
-	 * selected PCB.  This is necessary as the list of all PCB is
-	 * protected by a mutex.  Notify may call ip_output() eventually
-	 * which may sleep as pf lock is a rwlock.  Also the SRP
-	 * implementation of the routing table might sleep.
-	 * The same inp_notify list entry and inpt_notify rwlock are
-	 * used for UDP multicast and raw IP delivery.
-	 */
-	SIMPLEQ_INIT(&inpcblist);
 	rdomain = rtable_l2(rtable);
-	rw_enter_write(&table->inpt_notify);
 	mtx_enter(&table->inpt_mtx);
-	TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
-		if (in_pcb_is_iterator(inp))
-			continue;
+	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
 		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
 
 		if (inp->inp_faddr.s_addr != dst->sin_addr.s_addr ||
 		    rtable_l2(inp->inp_rtableid) != rdomain) {
 			continue;
 		}
-		in_pcbref(inp);
-		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
-	}
-	mtx_leave(&table->inpt_mtx);
-
-	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
-		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
+		mtx_leave(&table->inpt_mtx);
 		(*notify)(inp, errno);
-		in_pcbunref(inp);
+		mtx_enter(&table->inpt_mtx);
 	}
-	rw_exit_write(&table->inpt_notify);
+	mtx_leave(&table->inpt_mtx);
 }
 
 /*
Index: sys/netinet/in_pcb.h
===================================================================
RCS file: /cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.159 in_pcb.h
--- sys/netinet/in_pcb.h	5 Nov 2024 10:49:23 -0000	1.159
+++ sys/netinet/in_pcb.h	20 Dec 2024 01:01:19 -0000
@@ -79,7 +79,6 @@
  *	I	immutable after creation
  *	N	net lock
  *	t	inpt_mtx		pcb table mutex
- *	y	inpt_notify		pcb table rwlock for notify
  *	L	pf_inp_mtx		link pf to inp mutex
  *	s	so_lock			socket rwlock
  */
@@ -128,7 +127,6 @@ struct inpcb {
 	LIST_ENTRY(inpcb) inp_hash;		/* [t] local and foreign hash */
 	LIST_ENTRY(inpcb) inp_lhash;		/* [t] local port hash */
 	TAILQ_ENTRY(inpcb) inp_queue;		/* [t] inet PCB queue */
-	SIMPLEQ_ENTRY(inpcb) inp_notify;	/* [y] notify or udp append */
 	struct	  inpcbtable *inp_table;	/* [I] inet queue/hash table */
 	union	  inpaddru inp_faddru;		/* [t] Foreign address. */
 	union	  inpaddru inp_laddru;		/* [t] Local address. */
@@ -194,7 +192,6 @@ in_pcb_is_iterator(struct inpcb *inp)
 
 struct inpcbtable {
 	struct mutex inpt_mtx;			/* protect queue and hash */
-	struct rwlock inpt_notify;		/* protect inp_notify list */
 	TAILQ_HEAD(inpthead, inpcb) inpt_queue;	/* [t] inet PCB queue */
 	struct	inpcbhead *inpt_hashtbl;	/* [t] local and foreign hash */
 	struct	inpcbhead *inpt_lhashtbl;	/* [t] local port hash */
Index: sys/netinet6/in6_pcb.c
===================================================================
RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v
diff -u -p -r1.145 in6_pcb.c
--- sys/netinet6/in6_pcb.c	5 Nov 2024 10:49:23 -0000	1.145
+++ sys/netinet6/in6_pcb.c	20 Dec 2024 01:01:19 -0000
@@ -427,8 +427,8 @@ in6_pcbnotify(struct inpcbtable *table, 
     uint fport_arg, const struct sockaddr_in6 *src, uint lport_arg,
     u_int rtable, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int))
 {
-	SIMPLEQ_HEAD(, inpcb) inpcblist;
-	struct inpcb *inp;
+	struct inpcb_iterator iter = { .inp_table = NULL };
+	struct inpcb *inp = NULL;
 	u_short fport = fport_arg, lport = lport_arg;
 	struct sockaddr_in6 sa6_src;
 	int errno;
@@ -474,13 +474,9 @@ in6_pcbnotify(struct inpcbtable *table, 
 	if (notify == NULL)
 		return;
 
-	SIMPLEQ_INIT(&inpcblist);
 	rdomain = rtable_l2(rtable);
-	rw_enter_write(&table->inpt_notify);
 	mtx_enter(&table->inpt_mtx);
-	TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
-		if (in_pcb_is_iterator(inp))
-			continue;
+	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
 		KASSERT(ISSET(inp->inp_flags, INP_IPV6));
 
 		/*
@@ -546,17 +542,11 @@ in6_pcbnotify(struct inpcbtable *table, 
 			continue;
 		}
 	  do_notify:
-		in_pcbref(inp);
-		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
-	}
-	mtx_leave(&table->inpt_mtx);
-
-	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
-		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
+		mtx_leave(&table->inpt_mtx);
 		(*notify)(inp, errno);
-		in_pcbunref(inp);
+		mtx_enter(&table->inpt_mtx);
 	}
-	rw_exit_write(&table->inpt_notify);
+	mtx_leave(&table->inpt_mtx);
 }
 
 struct rtentry *