From: David Gwynne Subject: Re: get rid of the SRP code in src/sys/net/pfkeyv2.c To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Thu, 21 Aug 2025 16:23:10 +1000 On Wed, Aug 20, 2025 at 12:25:37PM +0300, Vitaliy Makkoveev wrote: > On Wed, Aug 20, 2025 at 02:17:30PM +1000, David Gwynne wrote: > > this is like the rtsock change i just put in the tree in that it gets > > rid of an SRPL, but it works a bit differently. > > > > this code uses an SPRL to manage the list of pfkey sockets that > > route messages can be "recved" on. the obvious replacement would > > be an SMR list, but while you shouldn't sleep with an SRP ref, we > > do here and you can't do that inside an SMR read critical section. > > > > to avoid the sleep while in the SMR crit section, i'd have to add refcnt > > operations, which means a bunch of atomic ops. > > > > alternatively, we could do what i did with the ethernet sockets and use > > an rwlock to protect the list of sockets, and take a read lock while > > traversing the list. this reduces the overall number of atomic ops and > > keeps the code in the recv loop much the same. > > > > unlike the rtsock code, the pfkey sockets don't use solock to serialise > > the recv code, they use the rcvbuf mutexes. this means we don't have the > > same lock ordering problems that we had in rtsock, which makes this a > > much more straightforward converson. > > > > ok? tests? > > > > This diff introduces lock order issue. You hold solock() while you > acquire pkptable.pkp_lk rwlock in pfkeyv2_attach() or pfkeyv2_detach(). > Meanwhile you hold pkptable.pkp_lk rwlock while you acquire solock() in > the PFKEYV2_SENDMESSAGE_REGISTERED case of pfkeyv2_sendmessage(). nice. this has PFKEYV2_SENDMESSAGE_REGISTERED read kcb_reg without the lock. it's testing a single bit in the word, so there's no inconsistent intermediate states that can confuse it. Index: pfkeyv2.c =================================================================== RCS file: /cvs/src/sys/net/pfkeyv2.c,v diff -u -p -r1.270 pfkeyv2.c --- pfkeyv2.c 7 Jul 2025 02:28:50 -0000 1.270 +++ pfkeyv2.c 21 Aug 2025 06:18:28 -0000 @@ -135,8 +135,7 @@ const struct domain pfkeydomain; struct pkpcb { struct socket *kcb_socket; /* [I] associated socket */ - SRPL_ENTRY(pkpcb) kcb_list; /* [l] */ - struct refcnt kcb_refcnt; /* [a] */ + TAILQ_ENTRY(pkpcb) kcb_list; /* [l] */ int kcb_flags; /* [s] */ uint32_t kcb_reg; /* [s] Inc if SATYPE_MAX > 31 */ uint32_t kcb_pid; /* [I] */ @@ -153,8 +152,7 @@ struct dump_state { }; struct pkptable { - SRPL_HEAD(, pkpcb) pkp_list; - struct srpl_rc pkp_rc; + TAILQ_HEAD(, pkpcb) pkp_list; struct rwlock pkp_lk; }; @@ -229,28 +227,11 @@ const struct domain pfkeydomain = { }; void -keycb_ref(void *null, void *v) -{ - struct pkpcb *kp = v; - - refcnt_take(&kp->kcb_refcnt); -} - -void -keycb_unref(void *null, void *v) -{ - struct pkpcb *kp = v; - - refcnt_rele_wake(&kp->kcb_refcnt); -} - -void pfkey_init(void) { rn_init(sizeof(struct sockaddr_encap)); - srpl_rc_init(&pkptable.pkp_rc, keycb_ref, keycb_unref, NULL); rw_init(&pkptable.pkp_lk, "pfkey"); - SRPL_INIT(&pkptable.pkp_list); + TAILQ_INIT(&pkptable.pkp_list); pool_init(&pkpcb_pool, sizeof(struct pkpcb), 0, IPL_SOFTNET, PR_WAITOK, "pkpcb", NULL); pool_init(&ipsec_policy_pool, sizeof(struct ipsec_policy), 0, @@ -281,7 +262,6 @@ pfkeyv2_attach(struct socket *so, int pr if (kp == NULL) return (ENOBUFS); so->so_pcb = kp; - refcnt_init(&kp->kcb_refcnt); kp->kcb_socket = so; kp->kcb_pid = curproc->p_p->ps_pid; kp->kcb_rdomain = rtable_l2(curproc->p_p->ps_rtableid); @@ -290,7 +270,7 @@ pfkeyv2_attach(struct socket *so, int pr soisconnected(so); rw_enter(&pkptable.pkp_lk, RW_WRITE); - SRPL_INSERT_HEAD_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, kcb_list); + TAILQ_INSERT_TAIL(&pkptable.pkp_list, kp, kcb_list); rw_exit(&pkptable.pkp_lk); return (0); @@ -322,15 +302,9 @@ pfkeyv2_detach(struct socket *so) } rw_enter(&pkptable.pkp_lk, RW_WRITE); - SRPL_REMOVE_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, pkpcb, - kcb_list); + TAILQ_REMOVE(&pkptable.pkp_list, kp, kcb_list); rw_exit(&pkptable.pkp_lk); - sounlock(so); - /* wait for all references to drop */ - refcnt_finalize(&kp->kcb_refcnt, "pfkeyrefs"); - solock(so); - so->so_pcb = NULL; KASSERT((so->so_state & SS_NOFDREF) == 0); pool_put(&pkpcb_pool, kp); @@ -475,7 +449,6 @@ pfkeyv2_sendmessage(void **headers, int struct mbuf *packet; struct pkpcb *kp; struct sadb_msg *smsg; - struct srp_ref sr; /* Find out how much space we'll need... */ j = sizeof(struct sadb_msg); @@ -538,14 +511,15 @@ pfkeyv2_sendmessage(void **headers, int * Search for promiscuous listeners, skipping the * original destination. */ - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { + rw_enter_read(&pkptable.pkp_lk); + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { if (kp->kcb_socket == so || kp->kcb_rdomain != rdomain) continue; if (kp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC) pfkey_sendup(kp, packet, 1); } - SRPL_LEAVE(&sr); + rw_exit_read(&pkptable.pkp_lk); m_freem(packet); break; @@ -554,7 +528,8 @@ pfkeyv2_sendmessage(void **headers, int * Send the message to all registered sockets that match * the specified satype (e.g., all IPSEC-ESP negotiators) */ - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { + rw_enter_read(&pkptable.pkp_lk); + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { if (kp->kcb_rdomain != rdomain) continue; @@ -563,15 +538,16 @@ pfkeyv2_sendmessage(void **headers, int /* Just send to everyone registered */ pfkey_sendup(kp, packet, 1); } else { - keylock(kp); + uint32_t kcb_reg; + + kcb_reg = READ_ONCE(kp->kcb_reg); /* Check for specified satype */ - if ((1 << satype) & kp->kcb_reg) + if ((1 << satype) & kcb_reg) pfkey_sendup(kp, packet, 1); - keyunlock(kp); } } } - SRPL_LEAVE(&sr); + rw_exit_read(&pkptable.pkp_lk); /* Free last/original copy of the packet */ m_freem(packet); @@ -590,7 +566,8 @@ pfkeyv2_sendmessage(void **headers, int goto ret; /* Send to all registered promiscuous listeners */ - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { + rw_enter_read(&pkptable.pkp_lk); + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { int flags = READ_ONCE(kp->kcb_flags); if (kp->kcb_rdomain != rdomain) @@ -600,19 +577,20 @@ pfkeyv2_sendmessage(void **headers, int !(flags & PFKEYV2_SOCKETFLAGS_REGISTERED)) pfkey_sendup(kp, packet, 1); } - SRPL_LEAVE(&sr); + rw_exit_read(&pkptable.pkp_lk); m_freem(packet); break; case PFKEYV2_SENDMESSAGE_BROADCAST: /* Send message to all sockets */ - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { + rw_enter_read(&pkptable.pkp_lk); + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { if (kp->kcb_rdomain != rdomain) continue; pfkey_sendup(kp, packet, 1); } - SRPL_LEAVE(&sr); + rw_exit_read(&pkptable.pkp_lk); m_freem(packet); break; } @@ -1136,7 +1114,6 @@ pfkeyv2_dosend(struct socket *so, void * struct sadb_sa *ssa; struct sadb_supported *ssup; struct sadb_ident *sid, *did; - struct srp_ref sr; struct sadb_x_rdomain *srdomain; u_int rdomain = 0; int promisc; @@ -1186,14 +1163,15 @@ pfkeyv2_dosend(struct socket *so, void * goto ret; /* Send to all promiscuous listeners */ - SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) { + rw_enter_read(&pkptable.pkp_lk); + TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) { if (bkp->kcb_rdomain != kp->kcb_rdomain) continue; if (bkp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC) pfkey_sendup(bkp, packet, 1); } - SRPL_LEAVE(&sr); + rw_exit_read(&pkptable.pkp_lk); m_freem(packet); @@ -2040,7 +2018,8 @@ pfkeyv2_dosend(struct socket *so, void * if ((rval = pfdatatopacket(message, len, &packet)) != 0) goto ret; - SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) { + rw_enter_read(&pkptable.pkp_lk); + TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) { if (bkp == kp || bkp->kcb_rdomain != kp->kcb_rdomain) continue; @@ -2050,7 +2029,7 @@ pfkeyv2_dosend(struct socket *so, void * pfkey_sendup(bkp, packet, 1); } } - SRPL_LEAVE(&sr); + rw_exit_read(&pkptable.pkp_lk); m_freem(packet); } else {