Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: get rid of the SRP code in src/sys/net/pfkeyv2.c
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Thu, 21 Aug 2025 16:23:10 +1000

Download raw body.

Thread
On Wed, Aug 20, 2025 at 12:25:37PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Aug 20, 2025 at 02:17:30PM +1000, David Gwynne wrote:
> > this is like the rtsock change i just put in the tree in that it gets
> > rid of an SRPL, but it works a bit differently.
> > 
> > this code uses an SPRL to manage the list of pfkey sockets that
> > route messages can be "recved" on. the obvious replacement would
> > be an SMR list, but while you shouldn't sleep with an SRP ref, we
> > do here and you can't do that inside an SMR read critical section.
> > 
> > to avoid the sleep while in the SMR crit section, i'd have to add refcnt
> > operations, which means a bunch of atomic ops.
> > 
> > alternatively, we could do what i did with the ethernet sockets and use
> > an rwlock to protect the list of sockets, and take a read lock while
> > traversing the list. this reduces the overall number of atomic ops and
> > keeps the code in the recv loop much the same.
> > 
> > unlike the rtsock code, the pfkey sockets don't use solock to serialise
> > the recv code, they use the rcvbuf mutexes. this means we don't have the
> > same lock ordering problems that we had in rtsock, which makes this a
> > much more straightforward converson.
> > 
> > ok? tests?
> > 
> 
> This diff introduces lock order issue. You hold solock() while you
> acquire pkptable.pkp_lk rwlock in pfkeyv2_attach() or pfkeyv2_detach().
> Meanwhile you hold pkptable.pkp_lk rwlock while you acquire solock() in
> the PFKEYV2_SENDMESSAGE_REGISTERED case of pfkeyv2_sendmessage().

nice.

this has PFKEYV2_SENDMESSAGE_REGISTERED read kcb_reg without the lock.
it's testing a single bit in the word, so there's no inconsistent
intermediate states that can confuse it.

Index: pfkeyv2.c
===================================================================
RCS file: /cvs/src/sys/net/pfkeyv2.c,v
diff -u -p -r1.270 pfkeyv2.c
--- pfkeyv2.c	7 Jul 2025 02:28:50 -0000	1.270
+++ pfkeyv2.c	21 Aug 2025 06:18:28 -0000
@@ -135,8 +135,7 @@ const struct domain pfkeydomain;
 struct pkpcb {
 	struct socket		*kcb_socket;	/* [I] associated socket */
 
-	SRPL_ENTRY(pkpcb)	kcb_list;	/* [l] */
-	struct refcnt		kcb_refcnt;	/* [a] */
+	TAILQ_ENTRY(pkpcb)	kcb_list;	/* [l] */
 	int			kcb_flags;	/* [s] */
 	uint32_t		kcb_reg;	/* [s] Inc if SATYPE_MAX > 31 */
 	uint32_t		kcb_pid;	/* [I] */
@@ -153,8 +152,7 @@ struct dump_state {
 };
 
 struct pkptable {
-	SRPL_HEAD(, pkpcb)	pkp_list;
-	struct srpl_rc		pkp_rc;
+	TAILQ_HEAD(, pkpcb)	pkp_list;
 	struct rwlock		pkp_lk;
 };
 
@@ -229,28 +227,11 @@ const struct domain pfkeydomain = {
 };
 
 void
-keycb_ref(void *null, void *v)
-{
-	struct pkpcb *kp = v;
-
-	refcnt_take(&kp->kcb_refcnt);
-}
-
-void
-keycb_unref(void *null, void *v)
-{
-	struct pkpcb *kp = v;
-
-	refcnt_rele_wake(&kp->kcb_refcnt);
-}
-
-void
 pfkey_init(void)
 {
 	rn_init(sizeof(struct sockaddr_encap));
-	srpl_rc_init(&pkptable.pkp_rc, keycb_ref, keycb_unref, NULL);
 	rw_init(&pkptable.pkp_lk, "pfkey");
-	SRPL_INIT(&pkptable.pkp_list);
+	TAILQ_INIT(&pkptable.pkp_list);
 	pool_init(&pkpcb_pool, sizeof(struct pkpcb), 0,
 	    IPL_SOFTNET, PR_WAITOK, "pkpcb", NULL);
 	pool_init(&ipsec_policy_pool, sizeof(struct ipsec_policy), 0,
@@ -281,7 +262,6 @@ pfkeyv2_attach(struct socket *so, int pr
 	if (kp == NULL)
 		return (ENOBUFS);
 	so->so_pcb = kp;
-	refcnt_init(&kp->kcb_refcnt);
 	kp->kcb_socket = so;
 	kp->kcb_pid = curproc->p_p->ps_pid;
 	kp->kcb_rdomain = rtable_l2(curproc->p_p->ps_rtableid);
@@ -290,7 +270,7 @@ pfkeyv2_attach(struct socket *so, int pr
 	soisconnected(so);
 
 	rw_enter(&pkptable.pkp_lk, RW_WRITE);
-	SRPL_INSERT_HEAD_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, kcb_list);
+	TAILQ_INSERT_TAIL(&pkptable.pkp_list, kp, kcb_list);
 	rw_exit(&pkptable.pkp_lk);
 
 	return (0);
@@ -322,15 +302,9 @@ pfkeyv2_detach(struct socket *so)
 	}
 
 	rw_enter(&pkptable.pkp_lk, RW_WRITE);
-	SRPL_REMOVE_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, pkpcb,
-	    kcb_list);
+	TAILQ_REMOVE(&pkptable.pkp_list, kp, kcb_list);
 	rw_exit(&pkptable.pkp_lk);
 
-	sounlock(so);
-	/* wait for all references to drop */
-	refcnt_finalize(&kp->kcb_refcnt, "pfkeyrefs");
-	solock(so);
-
 	so->so_pcb = NULL;
 	KASSERT((so->so_state & SS_NOFDREF) == 0);
 	pool_put(&pkpcb_pool, kp);
@@ -475,7 +449,6 @@ pfkeyv2_sendmessage(void **headers, int 
 	struct mbuf *packet;
 	struct pkpcb *kp;
 	struct sadb_msg *smsg;
-	struct srp_ref sr;
 
 	/* Find out how much space we'll need... */
 	j = sizeof(struct sadb_msg);
@@ -538,14 +511,15 @@ pfkeyv2_sendmessage(void **headers, int 
 		 * Search for promiscuous listeners, skipping the
 		 * original destination.
 		 */
-		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
+		rw_enter_read(&pkptable.pkp_lk);
+		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
 			if (kp->kcb_socket == so || kp->kcb_rdomain != rdomain)
 				continue;
 
 			if (kp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC)
 				pfkey_sendup(kp, packet, 1);
 		}
-		SRPL_LEAVE(&sr);
+		rw_exit_read(&pkptable.pkp_lk);
 		m_freem(packet);
 		break;
 
@@ -554,7 +528,8 @@ pfkeyv2_sendmessage(void **headers, int 
 		 * Send the message to all registered sockets that match
 		 * the specified satype (e.g., all IPSEC-ESP negotiators)
 		 */
-		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
+		rw_enter_read(&pkptable.pkp_lk);
+		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
 			if (kp->kcb_rdomain != rdomain)
 				continue;
 
@@ -563,15 +538,16 @@ pfkeyv2_sendmessage(void **headers, int 
 					/* Just send to everyone registered */
 					pfkey_sendup(kp, packet, 1);
 				} else {
-					keylock(kp);
+					uint32_t kcb_reg;
+
+					kcb_reg = READ_ONCE(kp->kcb_reg);
 					/* Check for specified satype */
-					if ((1 << satype) & kp->kcb_reg)
+					if ((1 << satype) & kcb_reg)
 						pfkey_sendup(kp, packet, 1);
-					keyunlock(kp);
 				}
 			}
 		}
-		SRPL_LEAVE(&sr);
+		rw_exit_read(&pkptable.pkp_lk);
 		/* Free last/original copy of the packet */
 		m_freem(packet);
 
@@ -590,7 +566,8 @@ pfkeyv2_sendmessage(void **headers, int 
 			goto ret;
 
 		/* Send to all registered promiscuous listeners */
-		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
+		rw_enter_read(&pkptable.pkp_lk);
+		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
 			int flags = READ_ONCE(kp->kcb_flags);
 
 			if (kp->kcb_rdomain != rdomain)
@@ -600,19 +577,20 @@ pfkeyv2_sendmessage(void **headers, int 
 			    !(flags & PFKEYV2_SOCKETFLAGS_REGISTERED))
 				pfkey_sendup(kp, packet, 1);
 		}
-		SRPL_LEAVE(&sr);
+		rw_exit_read(&pkptable.pkp_lk);
 		m_freem(packet);
 		break;
 
 	case PFKEYV2_SENDMESSAGE_BROADCAST:
 		/* Send message to all sockets */
-		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
+		rw_enter_read(&pkptable.pkp_lk);
+		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
 			if (kp->kcb_rdomain != rdomain)
 				continue;
 
 			pfkey_sendup(kp, packet, 1);
 		}
-		SRPL_LEAVE(&sr);
+		rw_exit_read(&pkptable.pkp_lk);
 		m_freem(packet);
 		break;
 	}
@@ -1136,7 +1114,6 @@ pfkeyv2_dosend(struct socket *so, void *
 	struct sadb_sa *ssa;
 	struct sadb_supported *ssup;
 	struct sadb_ident *sid, *did;
-	struct srp_ref sr;
 	struct sadb_x_rdomain *srdomain;
 	u_int rdomain = 0;
 	int promisc;
@@ -1186,14 +1163,15 @@ pfkeyv2_dosend(struct socket *so, void *
 			goto ret;
 
 		/* Send to all promiscuous listeners */
-		SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) {
+		rw_enter_read(&pkptable.pkp_lk);
+		TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) {
 			if (bkp->kcb_rdomain != kp->kcb_rdomain)
 				continue;
 
 			if (bkp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC)
 				pfkey_sendup(bkp, packet, 1);
 		}
-		SRPL_LEAVE(&sr);
+		rw_exit_read(&pkptable.pkp_lk);
 
 		m_freem(packet);
 
@@ -2040,7 +2018,8 @@ pfkeyv2_dosend(struct socket *so, void *
 			if ((rval = pfdatatopacket(message, len, &packet)) != 0)
 				goto ret;
 
-			SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) {
+			rw_enter_read(&pkptable.pkp_lk);
+			TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) {
 				if (bkp == kp ||
 				    bkp->kcb_rdomain != kp->kcb_rdomain)
 					continue;
@@ -2050,7 +2029,7 @@ pfkeyv2_dosend(struct socket *so, void *
 					pfkey_sendup(bkp, packet, 1);
 				}
 			}
-			SRPL_LEAVE(&sr);
+			rw_exit_read(&pkptable.pkp_lk);
 
 			m_freem(packet);
 		} else {