From: Vitaliy Makkoveev Subject: Re: get rid of the SRP code in src/sys/net/pfkeyv2.c To: David Gwynne Cc: tech@openbsd.org Date: Wed, 20 Aug 2025 12:25:37 +0300 On Wed, Aug 20, 2025 at 02:17:30PM +1000, David Gwynne wrote: > this is like the rtsock change i just put in the tree in that it gets > rid of an SRPL, but it works a bit differently. > > this code uses an SPRL to manage the list of pfkey sockets that > route messages can be "recved" on. the obvious replacement would > be an SMR list, but while you shouldn't sleep with an SRP ref, we > do here and you can't do that inside an SMR read critical section. > > to avoid the sleep while in the SMR crit section, i'd have to add refcnt > operations, which means a bunch of atomic ops. > > alternatively, we could do what i did with the ethernet sockets and use > an rwlock to protect the list of sockets, and take a read lock while > traversing the list. this reduces the overall number of atomic ops and > keeps the code in the recv loop much the same. > > unlike the rtsock code, the pfkey sockets don't use solock to serialise > the recv code, they use the rcvbuf mutexes. this means we don't have the > same lock ordering problems that we had in rtsock, which makes this a > much more straightforward converson. > > ok? tests? > This diff introduces lock order issue. You hold solock() while you acquire pkptable.pkp_lk rwlock in pfkeyv2_attach() or pfkeyv2_detach(). Meanwhile you hold pkptable.pkp_lk rwlock while you acquire solock() in the PFKEYV2_SENDMESSAGE_REGISTERED case of pfkeyv2_sendmessage(). > Index: pfkeyv2.c > =================================================================== > RCS file: /cvs/src/sys/net/pfkeyv2.c,v > diff -u -p -r1.270 pfkeyv2.c > --- pfkeyv2.c 7 Jul 2025 02:28:50 -0000 1.270 > +++ pfkeyv2.c 8 Aug 2025 05:23:29 -0000 > @@ -135,8 +135,7 @@ const struct domain pfkeydomain; > struct pkpcb { > struct socket *kcb_socket; /* [I] associated socket */ > > - SRPL_ENTRY(pkpcb) kcb_list; /* [l] */ > - struct refcnt kcb_refcnt; /* [a] */ > + TAILQ_ENTRY(pkpcb) kcb_list; /* [l] */ > int kcb_flags; /* [s] */ > uint32_t kcb_reg; /* [s] Inc if SATYPE_MAX > 31 */ > uint32_t kcb_pid; /* [I] */ > @@ -153,8 +152,7 @@ struct dump_state { > }; > > struct pkptable { > - SRPL_HEAD(, pkpcb) pkp_list; > - struct srpl_rc pkp_rc; > + TAILQ_HEAD(, pkpcb) pkp_list; > struct rwlock pkp_lk; > }; > > @@ -229,28 +227,11 @@ const struct domain pfkeydomain = { > }; > > void > -keycb_ref(void *null, void *v) > -{ > - struct pkpcb *kp = v; > - > - refcnt_take(&kp->kcb_refcnt); > -} > - > -void > -keycb_unref(void *null, void *v) > -{ > - struct pkpcb *kp = v; > - > - refcnt_rele_wake(&kp->kcb_refcnt); > -} > - > -void > pfkey_init(void) > { > rn_init(sizeof(struct sockaddr_encap)); > - srpl_rc_init(&pkptable.pkp_rc, keycb_ref, keycb_unref, NULL); > rw_init(&pkptable.pkp_lk, "pfkey"); > - SRPL_INIT(&pkptable.pkp_list); > + TAILQ_INIT(&pkptable.pkp_list); > pool_init(&pkpcb_pool, sizeof(struct pkpcb), 0, > IPL_SOFTNET, PR_WAITOK, "pkpcb", NULL); > pool_init(&ipsec_policy_pool, sizeof(struct ipsec_policy), 0, > @@ -281,7 +262,6 @@ pfkeyv2_attach(struct socket *so, int pr > if (kp == NULL) > return (ENOBUFS); > so->so_pcb = kp; > - refcnt_init(&kp->kcb_refcnt); > kp->kcb_socket = so; > kp->kcb_pid = curproc->p_p->ps_pid; > kp->kcb_rdomain = rtable_l2(curproc->p_p->ps_rtableid); > @@ -290,7 +270,7 @@ pfkeyv2_attach(struct socket *so, int pr > soisconnected(so); > > rw_enter(&pkptable.pkp_lk, RW_WRITE); > - SRPL_INSERT_HEAD_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, kcb_list); > + TAILQ_INSERT_TAIL(&pkptable.pkp_list, kp, kcb_list); > rw_exit(&pkptable.pkp_lk); > > return (0); > @@ -322,15 +302,9 @@ pfkeyv2_detach(struct socket *so) > } > > rw_enter(&pkptable.pkp_lk, RW_WRITE); > - SRPL_REMOVE_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, pkpcb, > - kcb_list); > + TAILQ_REMOVE(&pkptable.pkp_list, kp, kcb_list); > rw_exit(&pkptable.pkp_lk); > > - sounlock(so); > - /* wait for all references to drop */ > - refcnt_finalize(&kp->kcb_refcnt, "pfkeyrefs"); > - solock(so); > - > so->so_pcb = NULL; > KASSERT((so->so_state & SS_NOFDREF) == 0); > pool_put(&pkpcb_pool, kp); > @@ -475,7 +449,6 @@ pfkeyv2_sendmessage(void **headers, int > struct mbuf *packet; > struct pkpcb *kp; > struct sadb_msg *smsg; > - struct srp_ref sr; > > /* Find out how much space we'll need... */ > j = sizeof(struct sadb_msg); > @@ -538,14 +511,15 @@ pfkeyv2_sendmessage(void **headers, int > * Search for promiscuous listeners, skipping the > * original destination. > */ > - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { > + rw_enter_read(&pkptable.pkp_lk); > + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { > if (kp->kcb_socket == so || kp->kcb_rdomain != rdomain) > continue; > > if (kp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC) > pfkey_sendup(kp, packet, 1); > } > - SRPL_LEAVE(&sr); > + rw_exit_read(&pkptable.pkp_lk); > m_freem(packet); > break; > > @@ -554,7 +528,8 @@ pfkeyv2_sendmessage(void **headers, int > * Send the message to all registered sockets that match > * the specified satype (e.g., all IPSEC-ESP negotiators) > */ > - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { > + rw_enter_read(&pkptable.pkp_lk); > + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { > if (kp->kcb_rdomain != rdomain) > continue; > > @@ -571,7 +546,7 @@ pfkeyv2_sendmessage(void **headers, int > } > } > } > - SRPL_LEAVE(&sr); > + rw_exit_read(&pkptable.pkp_lk); > /* Free last/original copy of the packet */ > m_freem(packet); > > @@ -590,7 +565,8 @@ pfkeyv2_sendmessage(void **headers, int > goto ret; > > /* Send to all registered promiscuous listeners */ > - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { > + rw_enter_read(&pkptable.pkp_lk); > + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { > int flags = READ_ONCE(kp->kcb_flags); > > if (kp->kcb_rdomain != rdomain) > @@ -600,19 +576,20 @@ pfkeyv2_sendmessage(void **headers, int > !(flags & PFKEYV2_SOCKETFLAGS_REGISTERED)) > pfkey_sendup(kp, packet, 1); > } > - SRPL_LEAVE(&sr); > + rw_exit_read(&pkptable.pkp_lk); > m_freem(packet); > break; > > case PFKEYV2_SENDMESSAGE_BROADCAST: > /* Send message to all sockets */ > - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) { > + rw_enter_read(&pkptable.pkp_lk); > + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) { > if (kp->kcb_rdomain != rdomain) > continue; > > pfkey_sendup(kp, packet, 1); > } > - SRPL_LEAVE(&sr); > + rw_exit_read(&pkptable.pkp_lk); > m_freem(packet); > break; > } > @@ -1136,7 +1113,6 @@ pfkeyv2_dosend(struct socket *so, void * > struct sadb_sa *ssa; > struct sadb_supported *ssup; > struct sadb_ident *sid, *did; > - struct srp_ref sr; > struct sadb_x_rdomain *srdomain; > u_int rdomain = 0; > int promisc; > @@ -1186,14 +1162,15 @@ pfkeyv2_dosend(struct socket *so, void * > goto ret; > > /* Send to all promiscuous listeners */ > - SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) { > + rw_enter_read(&pkptable.pkp_lk); > + TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) { > if (bkp->kcb_rdomain != kp->kcb_rdomain) > continue; > > if (bkp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC) > pfkey_sendup(bkp, packet, 1); > } > - SRPL_LEAVE(&sr); > + rw_exit_read(&pkptable.pkp_lk); > > m_freem(packet); > > @@ -2040,7 +2017,8 @@ pfkeyv2_dosend(struct socket *so, void * > if ((rval = pfdatatopacket(message, len, &packet)) != 0) > goto ret; > > - SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) { > + rw_enter_read(&pkptable.pkp_lk); > + TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) { > if (bkp == kp || > bkp->kcb_rdomain != kp->kcb_rdomain) > continue; > @@ -2050,7 +2028,7 @@ pfkeyv2_dosend(struct socket *so, void * > pfkey_sendup(bkp, packet, 1); > } > } > - SRPL_LEAVE(&sr); > + rw_exit_read(&pkptable.pkp_lk); > > m_freem(packet); > } else { >