Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
raw IP input loop iterator
To:
tech@openbsd.org
Date:
Tue, 5 Nov 2024 13:58:07 +0100

Download raw body.

Thread
Hi,

Inspired by mvs@ idea with the iterator in the UDP multicast loop,
I implemented the same for raw IP input delivery.  This removes an
unneccesary rwlock and only uses the table mutex.

While there, I found some issues with my UDP implementation.  When
comparing the inp address and port, we must hold some lock.  So
assume that iterator already has the table mutex and hold it while
traversing the list and doing the checks.  We are free to release
the mutex when appending to the the socket buffer and doing the
upcalls.

I think the goto bad in udp_input does one in_pcbunref(inp) too
much.  If we exited the loop with in_pcb_iterator_abort() inp is
not NULL, but already unrefed.

In rip_input() move the actual work to rip_sbappend().  This can
be called without mutex during list traversal and for the final
element.

ok?

bluhm

Index: kern/kern_sysctl.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
diff -u -p -r1.452 kern_sysctl.c
--- kern/kern_sysctl.c	5 Nov 2024 10:49:23 -0000	1.452
+++ kern/kern_sysctl.c	5 Nov 2024 12:18:15 -0000
@@ -1705,8 +1705,11 @@ sysctl_file(int *name, u_int namelen, ch
 			mtx_leave(&udb6table.inpt_mtx);
 #endif
 			mtx_enter(&rawcbtable.inpt_mtx);
-			TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue)
+			TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
+				if (in_pcb_is_iterator(inp))
+					continue;
 				FILLSO(inp->inp_socket);
+			}
 			mtx_leave(&rawcbtable.inpt_mtx);
 #ifdef INET6
 			mtx_enter(&rawin6pcbtable.inpt_mtx);
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.304 in_pcb.c
--- netinet/in_pcb.c	5 Nov 2024 10:49:23 -0000	1.304
+++ netinet/in_pcb.c	5 Nov 2024 12:18:15 -0000
@@ -650,7 +650,7 @@ in_pcb_iterator(struct inpcbtable *table
 {
 	struct inpcb *tmp;
 
-	mtx_enter(&table->inpt_mtx);
+	MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
 
 	if (inp)
 		tmp = TAILQ_NEXT((struct inpcb *)iter, inp_queue);
@@ -663,6 +663,7 @@ in_pcb_iterator(struct inpcbtable *table
 	if (inp) {
 		TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter,
 		    inp_queue);
+		in_pcbunref(inp);
 	}
 	if (tmp) {
 		TAILQ_INSERT_AFTER(&table->inpt_queue, tmp,
@@ -670,10 +671,6 @@ in_pcb_iterator(struct inpcbtable *table
 		in_pcbref(tmp);
 	}
 
-	mtx_leave(&table->inpt_mtx);
-
-	in_pcbunref(inp);
-
 	return tmp;
 }
 
@@ -681,16 +678,13 @@ void
 in_pcb_iterator_abort(struct inpcbtable *table, struct inpcb *inp,
     struct inpcb_iterator *iter)
 {
-	mtx_enter(&table->inpt_mtx);
+	MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
 
 	if (inp) {
 		TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter,
 		    inp_queue);
+		in_pcbunref(inp);
 	}
-
-	mtx_leave(&table->inpt_mtx);
-
-	in_pcbunref(inp);
 }
 
 void
Index: netinet/raw_ip.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
diff -u -p -r1.160 raw_ip.c
--- netinet/raw_ip.c	12 Jul 2024 19:50:35 -0000	1.160
+++ netinet/raw_ip.c	5 Nov 2024 12:18:15 -0000
@@ -116,6 +116,9 @@ const struct pr_usrreqs rip_usrreqs = {
 	.pru_peeraddr	= in_peeraddr,
 };
 
+void    rip_sbappend(struct inpcb *, struct mbuf *, struct ip *,
+	    struct sockaddr_in *);
+
 /*
  * Initialize raw connection block q.
  */
@@ -130,8 +133,8 @@ rip_input(struct mbuf **mp, int *offp, i
 {
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
-	struct inpcb *inp;
-	SIMPLEQ_HEAD(, inpcb) inpcblist;
+	struct inpcb_iterator iter = { .inp_table = NULL };
+	struct inpcb *inp, *last;
 	struct in_addr *key;
 	struct counters_ref ref;
 	uint64_t *counters;
@@ -163,10 +166,9 @@ rip_input(struct mbuf **mp, int *offp, i
 		}
 	}
 #endif
-	SIMPLEQ_INIT(&inpcblist);
-	rw_enter_write(&rawcbtable.inpt_notify);
 	mtx_enter(&rawcbtable.inpt_mtx);
-	TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
+	last = inp = NULL;
+	while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
 		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
 
 		/*
@@ -190,14 +192,23 @@ rip_input(struct mbuf **mp, int *offp, i
 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 			continue;
 
-		in_pcbref(inp);
-		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
+		if (last != NULL) {
+			struct mbuf *n;
+
+			mtx_leave(&rawcbtable.inpt_mtx);
+
+			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+			if (n != NULL)
+				rip_sbappend(last, n, ip, &ripsrc);
+			in_pcbunref(last);
+
+			mtx_enter(&rawcbtable.inpt_mtx);
+		}
+		last = in_pcbref(inp);
 	}
 	mtx_leave(&rawcbtable.inpt_mtx);
 
-	if (SIMPLEQ_EMPTY(&inpcblist)) {
-		rw_exit_write(&rawcbtable.inpt_notify);
-
+	if (last == NULL) {
 		if (ip->ip_p != IPPROTO_ICMP)
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
 			    0, 0);
@@ -212,42 +223,34 @@ rip_input(struct mbuf **mp, int *offp, i
 		return IPPROTO_DONE;
 	}
 
-	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
-		struct mbuf *n, *opts = NULL;
-
-		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
-		if (SIMPLEQ_EMPTY(&inpcblist))
-			n = m;
-		else
-			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
-		if (n != NULL) {
-			struct socket *so = inp->inp_socket;
-			int ret = 0;
-
-			if (inp->inp_flags & INP_CONTROLOPTS ||
-			    so->so_options & SO_TIMESTAMP)
-				ip_savecontrol(inp, &opts, ip, n);
-
-			mtx_enter(&so->so_rcv.sb_mtx);
-			if (!ISSET(inp->inp_socket->so_rcv.sb_state,
-			    SS_CANTRCVMORE)) {
-				ret = sbappendaddr(so, &so->so_rcv,
-				    sintosa(&ripsrc), n, opts);
-			}
-			mtx_leave(&so->so_rcv.sb_mtx);
-
-			if (ret == 0) {
-				m_freem(n);
-				m_freem(opts);
-				ipstat_inc(ips_noproto);
-			} else
-				sorwakeup(so);
-		}
-		in_pcbunref(inp);
-	}
-	rw_exit_write(&rawcbtable.inpt_notify);
+	rip_sbappend(last, m, ip, &ripsrc);
+	in_pcbunref(last);
 
 	return IPPROTO_DONE;
+}
+
+void
+rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip,
+    struct sockaddr_in *ripsrc)
+{
+	struct socket *so = inp->inp_socket;
+	struct mbuf *opts = NULL;
+	int ret = 0;
+
+	if (inp->inp_flags & INP_CONTROLOPTS || so->so_options & SO_TIMESTAMP)
+		ip_savecontrol(inp, &opts, ip, m);
+
+	mtx_enter(&so->so_rcv.sb_mtx);
+	if (!ISSET(inp->inp_socket->so_rcv.sb_state, SS_CANTRCVMORE))
+		ret = sbappendaddr(so, &so->so_rcv, sintosa(ripsrc), m, opts);
+	mtx_leave(&so->so_rcv.sb_mtx);
+
+	if (ret == 0) {
+		m_freem(m);
+		m_freem(opts);
+		ipstat_inc(ips_noproto);
+	} else
+		sorwakeup(so);
 }
 
 /*
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
diff -u -p -r1.326 udp_usrreq.c
--- netinet/udp_usrreq.c	5 Nov 2024 10:49:23 -0000	1.326
+++ netinet/udp_usrreq.c	5 Nov 2024 12:18:15 -0000
@@ -409,6 +409,7 @@ udp_input(struct mbuf **mp, int *offp, i
 #endif
 			table = &udbtable;
 
+		mtx_enter(&table->inpt_mtx);
 		last = inp = NULL;
 		while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
 			if (ip6)
@@ -464,12 +465,16 @@ udp_input(struct mbuf **mp, int *offp, i
 			if (last != NULL) {
 				struct mbuf *n;
 
+				mtx_leave(&table->inpt_mtx);
+
 				n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 				if (n != NULL) {
 					udp_sbappend(last, n, ip, ip6, iphlen,
 					    uh, &srcsa.sa, 0);
 				}
 				in_pcbunref(last);
+
+				mtx_enter(&table->inpt_mtx);
 			}
 			last = in_pcbref(inp);
 
@@ -487,6 +492,7 @@ udp_input(struct mbuf **mp, int *offp, i
 				break;
 			}
 		}
+		mtx_leave(&table->inpt_mtx);
 
 		if (last == NULL) {
 			/*
@@ -495,7 +501,8 @@ udp_input(struct mbuf **mp, int *offp, i
 			 * for a broadcast or multicast datgram.)
 			 */
 			udpstat_inc(udps_noportbcast);
-			goto bad;
+			m_freem(m);
+			return IPPROTO_DONE;
 		}
 
 		udp_sbappend(last, m, ip, ip6, iphlen, uh, &srcsa.sa, 0);