Download raw body.
get rid of the SRP code in src/sys/net/pfkeyv2.c
On Wed, Aug 20, 2025 at 02:17:30PM +1000, David Gwynne wrote:
> this is like the rtsock change i just put in the tree in that it gets
> rid of an SRPL, but it works a bit differently.
>
> this code uses an SPRL to manage the list of pfkey sockets that
> route messages can be "recved" on. the obvious replacement would
> be an SMR list, but while you shouldn't sleep with an SRP ref, we
> do here and you can't do that inside an SMR read critical section.
>
> to avoid the sleep while in the SMR crit section, i'd have to add refcnt
> operations, which means a bunch of atomic ops.
>
> alternatively, we could do what i did with the ethernet sockets and use
> an rwlock to protect the list of sockets, and take a read lock while
> traversing the list. this reduces the overall number of atomic ops and
> keeps the code in the recv loop much the same.
>
> unlike the rtsock code, the pfkey sockets don't use solock to serialise
> the recv code, they use the rcvbuf mutexes. this means we don't have the
> same lock ordering problems that we had in rtsock, which makes this a
> much more straightforward converson.
>
> ok? tests?
>
This diff introduces lock order issue. You hold solock() while you
acquire pkptable.pkp_lk rwlock in pfkeyv2_attach() or pfkeyv2_detach().
Meanwhile you hold pkptable.pkp_lk rwlock while you acquire solock() in
the PFKEYV2_SENDMESSAGE_REGISTERED case of pfkeyv2_sendmessage().
> Index: pfkeyv2.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pfkeyv2.c,v
> diff -u -p -r1.270 pfkeyv2.c
> --- pfkeyv2.c 7 Jul 2025 02:28:50 -0000 1.270
> +++ pfkeyv2.c 8 Aug 2025 05:23:29 -0000
> @@ -135,8 +135,7 @@ const struct domain pfkeydomain;
> struct pkpcb {
> struct socket *kcb_socket; /* [I] associated socket */
>
> - SRPL_ENTRY(pkpcb) kcb_list; /* [l] */
> - struct refcnt kcb_refcnt; /* [a] */
> + TAILQ_ENTRY(pkpcb) kcb_list; /* [l] */
> int kcb_flags; /* [s] */
> uint32_t kcb_reg; /* [s] Inc if SATYPE_MAX > 31 */
> uint32_t kcb_pid; /* [I] */
> @@ -153,8 +152,7 @@ struct dump_state {
> };
>
> struct pkptable {
> - SRPL_HEAD(, pkpcb) pkp_list;
> - struct srpl_rc pkp_rc;
> + TAILQ_HEAD(, pkpcb) pkp_list;
> struct rwlock pkp_lk;
> };
>
> @@ -229,28 +227,11 @@ const struct domain pfkeydomain = {
> };
>
> void
> -keycb_ref(void *null, void *v)
> -{
> - struct pkpcb *kp = v;
> -
> - refcnt_take(&kp->kcb_refcnt);
> -}
> -
> -void
> -keycb_unref(void *null, void *v)
> -{
> - struct pkpcb *kp = v;
> -
> - refcnt_rele_wake(&kp->kcb_refcnt);
> -}
> -
> -void
> pfkey_init(void)
> {
> rn_init(sizeof(struct sockaddr_encap));
> - srpl_rc_init(&pkptable.pkp_rc, keycb_ref, keycb_unref, NULL);
> rw_init(&pkptable.pkp_lk, "pfkey");
> - SRPL_INIT(&pkptable.pkp_list);
> + TAILQ_INIT(&pkptable.pkp_list);
> pool_init(&pkpcb_pool, sizeof(struct pkpcb), 0,
> IPL_SOFTNET, PR_WAITOK, "pkpcb", NULL);
> pool_init(&ipsec_policy_pool, sizeof(struct ipsec_policy), 0,
> @@ -281,7 +262,6 @@ pfkeyv2_attach(struct socket *so, int pr
> if (kp == NULL)
> return (ENOBUFS);
> so->so_pcb = kp;
> - refcnt_init(&kp->kcb_refcnt);
> kp->kcb_socket = so;
> kp->kcb_pid = curproc->p_p->ps_pid;
> kp->kcb_rdomain = rtable_l2(curproc->p_p->ps_rtableid);
> @@ -290,7 +270,7 @@ pfkeyv2_attach(struct socket *so, int pr
> soisconnected(so);
>
> rw_enter(&pkptable.pkp_lk, RW_WRITE);
> - SRPL_INSERT_HEAD_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, kcb_list);
> + TAILQ_INSERT_TAIL(&pkptable.pkp_list, kp, kcb_list);
> rw_exit(&pkptable.pkp_lk);
>
> return (0);
> @@ -322,15 +302,9 @@ pfkeyv2_detach(struct socket *so)
> }
>
> rw_enter(&pkptable.pkp_lk, RW_WRITE);
> - SRPL_REMOVE_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, pkpcb,
> - kcb_list);
> + TAILQ_REMOVE(&pkptable.pkp_list, kp, kcb_list);
> rw_exit(&pkptable.pkp_lk);
>
> - sounlock(so);
> - /* wait for all references to drop */
> - refcnt_finalize(&kp->kcb_refcnt, "pfkeyrefs");
> - solock(so);
> -
> so->so_pcb = NULL;
> KASSERT((so->so_state & SS_NOFDREF) == 0);
> pool_put(&pkpcb_pool, kp);
> @@ -475,7 +449,6 @@ pfkeyv2_sendmessage(void **headers, int
> struct mbuf *packet;
> struct pkpcb *kp;
> struct sadb_msg *smsg;
> - struct srp_ref sr;
>
> /* Find out how much space we'll need... */
> j = sizeof(struct sadb_msg);
> @@ -538,14 +511,15 @@ pfkeyv2_sendmessage(void **headers, int
> * Search for promiscuous listeners, skipping the
> * original destination.
> */
> - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> + rw_enter_read(&pkptable.pkp_lk);
> + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
> if (kp->kcb_socket == so || kp->kcb_rdomain != rdomain)
> continue;
>
> if (kp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC)
> pfkey_sendup(kp, packet, 1);
> }
> - SRPL_LEAVE(&sr);
> + rw_exit_read(&pkptable.pkp_lk);
> m_freem(packet);
> break;
>
> @@ -554,7 +528,8 @@ pfkeyv2_sendmessage(void **headers, int
> * Send the message to all registered sockets that match
> * the specified satype (e.g., all IPSEC-ESP negotiators)
> */
> - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> + rw_enter_read(&pkptable.pkp_lk);
> + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
> if (kp->kcb_rdomain != rdomain)
> continue;
>
> @@ -571,7 +546,7 @@ pfkeyv2_sendmessage(void **headers, int
> }
> }
> }
> - SRPL_LEAVE(&sr);
> + rw_exit_read(&pkptable.pkp_lk);
> /* Free last/original copy of the packet */
> m_freem(packet);
>
> @@ -590,7 +565,8 @@ pfkeyv2_sendmessage(void **headers, int
> goto ret;
>
> /* Send to all registered promiscuous listeners */
> - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> + rw_enter_read(&pkptable.pkp_lk);
> + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
> int flags = READ_ONCE(kp->kcb_flags);
>
> if (kp->kcb_rdomain != rdomain)
> @@ -600,19 +576,20 @@ pfkeyv2_sendmessage(void **headers, int
> !(flags & PFKEYV2_SOCKETFLAGS_REGISTERED))
> pfkey_sendup(kp, packet, 1);
> }
> - SRPL_LEAVE(&sr);
> + rw_exit_read(&pkptable.pkp_lk);
> m_freem(packet);
> break;
>
> case PFKEYV2_SENDMESSAGE_BROADCAST:
> /* Send message to all sockets */
> - SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> + rw_enter_read(&pkptable.pkp_lk);
> + TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
> if (kp->kcb_rdomain != rdomain)
> continue;
>
> pfkey_sendup(kp, packet, 1);
> }
> - SRPL_LEAVE(&sr);
> + rw_exit_read(&pkptable.pkp_lk);
> m_freem(packet);
> break;
> }
> @@ -1136,7 +1113,6 @@ pfkeyv2_dosend(struct socket *so, void *
> struct sadb_sa *ssa;
> struct sadb_supported *ssup;
> struct sadb_ident *sid, *did;
> - struct srp_ref sr;
> struct sadb_x_rdomain *srdomain;
> u_int rdomain = 0;
> int promisc;
> @@ -1186,14 +1162,15 @@ pfkeyv2_dosend(struct socket *so, void *
> goto ret;
>
> /* Send to all promiscuous listeners */
> - SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) {
> + rw_enter_read(&pkptable.pkp_lk);
> + TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) {
> if (bkp->kcb_rdomain != kp->kcb_rdomain)
> continue;
>
> if (bkp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC)
> pfkey_sendup(bkp, packet, 1);
> }
> - SRPL_LEAVE(&sr);
> + rw_exit_read(&pkptable.pkp_lk);
>
> m_freem(packet);
>
> @@ -2040,7 +2017,8 @@ pfkeyv2_dosend(struct socket *so, void *
> if ((rval = pfdatatopacket(message, len, &packet)) != 0)
> goto ret;
>
> - SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) {
> + rw_enter_read(&pkptable.pkp_lk);
> + TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) {
> if (bkp == kp ||
> bkp->kcb_rdomain != kp->kcb_rdomain)
> continue;
> @@ -2050,7 +2028,7 @@ pfkeyv2_dosend(struct socket *so, void *
> pfkey_sendup(bkp, packet, 1);
> }
> }
> - SRPL_LEAVE(&sr);
> + rw_exit_read(&pkptable.pkp_lk);
>
> m_freem(packet);
> } else {
>
get rid of the SRP code in src/sys/net/pfkeyv2.c