Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: get rid of the SRP code in src/sys/net/pfkeyv2.c
To:
David Gwynne <david@gwynne.id.au>
Cc:
tech@openbsd.org
Date:
Wed, 20 Aug 2025 12:25:37 +0300

Download raw body.

Thread
On Wed, Aug 20, 2025 at 02:17:30PM +1000, David Gwynne wrote:
> this is like the rtsock change i just put in the tree in that it gets
> rid of an SRPL, but it works a bit differently.
> 
> this code uses an SPRL to manage the list of pfkey sockets that
> route messages can be "recved" on. the obvious replacement would
> be an SMR list, but while you shouldn't sleep with an SRP ref, we
> do here and you can't do that inside an SMR read critical section.
> 
> to avoid the sleep while in the SMR crit section, i'd have to add refcnt
> operations, which means a bunch of atomic ops.
> 
> alternatively, we could do what i did with the ethernet sockets and use
> an rwlock to protect the list of sockets, and take a read lock while
> traversing the list. this reduces the overall number of atomic ops and
> keeps the code in the recv loop much the same.
> 
> unlike the rtsock code, the pfkey sockets don't use solock to serialise
> the recv code, they use the rcvbuf mutexes. this means we don't have the
> same lock ordering problems that we had in rtsock, which makes this a
> much more straightforward converson.
> 
> ok? tests?
> 

This diff introduces lock order issue. You hold solock() while you
acquire pkptable.pkp_lk rwlock in pfkeyv2_attach() or pfkeyv2_detach().
Meanwhile you hold pkptable.pkp_lk rwlock while you acquire solock() in
the PFKEYV2_SENDMESSAGE_REGISTERED case of pfkeyv2_sendmessage().

> Index: pfkeyv2.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pfkeyv2.c,v
> diff -u -p -r1.270 pfkeyv2.c
> --- pfkeyv2.c	7 Jul 2025 02:28:50 -0000	1.270
> +++ pfkeyv2.c	8 Aug 2025 05:23:29 -0000
> @@ -135,8 +135,7 @@ const struct domain pfkeydomain;
>  struct pkpcb {
>  	struct socket		*kcb_socket;	/* [I] associated socket */
>  
> -	SRPL_ENTRY(pkpcb)	kcb_list;	/* [l] */
> -	struct refcnt		kcb_refcnt;	/* [a] */
> +	TAILQ_ENTRY(pkpcb)	kcb_list;	/* [l] */
>  	int			kcb_flags;	/* [s] */
>  	uint32_t		kcb_reg;	/* [s] Inc if SATYPE_MAX > 31 */
>  	uint32_t		kcb_pid;	/* [I] */
> @@ -153,8 +152,7 @@ struct dump_state {
>  };
>  
>  struct pkptable {
> -	SRPL_HEAD(, pkpcb)	pkp_list;
> -	struct srpl_rc		pkp_rc;
> +	TAILQ_HEAD(, pkpcb)	pkp_list;
>  	struct rwlock		pkp_lk;
>  };
>  
> @@ -229,28 +227,11 @@ const struct domain pfkeydomain = {
>  };
>  
>  void
> -keycb_ref(void *null, void *v)
> -{
> -	struct pkpcb *kp = v;
> -
> -	refcnt_take(&kp->kcb_refcnt);
> -}
> -
> -void
> -keycb_unref(void *null, void *v)
> -{
> -	struct pkpcb *kp = v;
> -
> -	refcnt_rele_wake(&kp->kcb_refcnt);
> -}
> -
> -void
>  pfkey_init(void)
>  {
>  	rn_init(sizeof(struct sockaddr_encap));
> -	srpl_rc_init(&pkptable.pkp_rc, keycb_ref, keycb_unref, NULL);
>  	rw_init(&pkptable.pkp_lk, "pfkey");
> -	SRPL_INIT(&pkptable.pkp_list);
> +	TAILQ_INIT(&pkptable.pkp_list);
>  	pool_init(&pkpcb_pool, sizeof(struct pkpcb), 0,
>  	    IPL_SOFTNET, PR_WAITOK, "pkpcb", NULL);
>  	pool_init(&ipsec_policy_pool, sizeof(struct ipsec_policy), 0,
> @@ -281,7 +262,6 @@ pfkeyv2_attach(struct socket *so, int pr
>  	if (kp == NULL)
>  		return (ENOBUFS);
>  	so->so_pcb = kp;
> -	refcnt_init(&kp->kcb_refcnt);
>  	kp->kcb_socket = so;
>  	kp->kcb_pid = curproc->p_p->ps_pid;
>  	kp->kcb_rdomain = rtable_l2(curproc->p_p->ps_rtableid);
> @@ -290,7 +270,7 @@ pfkeyv2_attach(struct socket *so, int pr
>  	soisconnected(so);
>  
>  	rw_enter(&pkptable.pkp_lk, RW_WRITE);
> -	SRPL_INSERT_HEAD_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, kcb_list);
> +	TAILQ_INSERT_TAIL(&pkptable.pkp_list, kp, kcb_list);
>  	rw_exit(&pkptable.pkp_lk);
>  
>  	return (0);
> @@ -322,15 +302,9 @@ pfkeyv2_detach(struct socket *so)
>  	}
>  
>  	rw_enter(&pkptable.pkp_lk, RW_WRITE);
> -	SRPL_REMOVE_LOCKED(&pkptable.pkp_rc, &pkptable.pkp_list, kp, pkpcb,
> -	    kcb_list);
> +	TAILQ_REMOVE(&pkptable.pkp_list, kp, kcb_list);
>  	rw_exit(&pkptable.pkp_lk);
>  
> -	sounlock(so);
> -	/* wait for all references to drop */
> -	refcnt_finalize(&kp->kcb_refcnt, "pfkeyrefs");
> -	solock(so);
> -
>  	so->so_pcb = NULL;
>  	KASSERT((so->so_state & SS_NOFDREF) == 0);
>  	pool_put(&pkpcb_pool, kp);
> @@ -475,7 +449,6 @@ pfkeyv2_sendmessage(void **headers, int 
>  	struct mbuf *packet;
>  	struct pkpcb *kp;
>  	struct sadb_msg *smsg;
> -	struct srp_ref sr;
>  
>  	/* Find out how much space we'll need... */
>  	j = sizeof(struct sadb_msg);
> @@ -538,14 +511,15 @@ pfkeyv2_sendmessage(void **headers, int 
>  		 * Search for promiscuous listeners, skipping the
>  		 * original destination.
>  		 */
> -		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> +		rw_enter_read(&pkptable.pkp_lk);
> +		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
>  			if (kp->kcb_socket == so || kp->kcb_rdomain != rdomain)
>  				continue;
>  
>  			if (kp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC)
>  				pfkey_sendup(kp, packet, 1);
>  		}
> -		SRPL_LEAVE(&sr);
> +		rw_exit_read(&pkptable.pkp_lk);
>  		m_freem(packet);
>  		break;
>  
> @@ -554,7 +528,8 @@ pfkeyv2_sendmessage(void **headers, int 
>  		 * Send the message to all registered sockets that match
>  		 * the specified satype (e.g., all IPSEC-ESP negotiators)
>  		 */
> -		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> +		rw_enter_read(&pkptable.pkp_lk);
> +		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
>  			if (kp->kcb_rdomain != rdomain)
>  				continue;
>  
> @@ -571,7 +546,7 @@ pfkeyv2_sendmessage(void **headers, int 
>  				}
>  			}
>  		}
> -		SRPL_LEAVE(&sr);
> +		rw_exit_read(&pkptable.pkp_lk);
>  		/* Free last/original copy of the packet */
>  		m_freem(packet);
>  
> @@ -590,7 +565,8 @@ pfkeyv2_sendmessage(void **headers, int 
>  			goto ret;
>  
>  		/* Send to all registered promiscuous listeners */
> -		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> +		rw_enter_read(&pkptable.pkp_lk);
> +		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
>  			int flags = READ_ONCE(kp->kcb_flags);
>  
>  			if (kp->kcb_rdomain != rdomain)
> @@ -600,19 +576,20 @@ pfkeyv2_sendmessage(void **headers, int 
>  			    !(flags & PFKEYV2_SOCKETFLAGS_REGISTERED))
>  				pfkey_sendup(kp, packet, 1);
>  		}
> -		SRPL_LEAVE(&sr);
> +		rw_exit_read(&pkptable.pkp_lk);
>  		m_freem(packet);
>  		break;
>  
>  	case PFKEYV2_SENDMESSAGE_BROADCAST:
>  		/* Send message to all sockets */
> -		SRPL_FOREACH(kp, &sr, &pkptable.pkp_list, kcb_list) {
> +		rw_enter_read(&pkptable.pkp_lk);
> +		TAILQ_FOREACH(kp, &pkptable.pkp_list, kcb_list) {
>  			if (kp->kcb_rdomain != rdomain)
>  				continue;
>  
>  			pfkey_sendup(kp, packet, 1);
>  		}
> -		SRPL_LEAVE(&sr);
> +		rw_exit_read(&pkptable.pkp_lk);
>  		m_freem(packet);
>  		break;
>  	}
> @@ -1136,7 +1113,6 @@ pfkeyv2_dosend(struct socket *so, void *
>  	struct sadb_sa *ssa;
>  	struct sadb_supported *ssup;
>  	struct sadb_ident *sid, *did;
> -	struct srp_ref sr;
>  	struct sadb_x_rdomain *srdomain;
>  	u_int rdomain = 0;
>  	int promisc;
> @@ -1186,14 +1162,15 @@ pfkeyv2_dosend(struct socket *so, void *
>  			goto ret;
>  
>  		/* Send to all promiscuous listeners */
> -		SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) {
> +		rw_enter_read(&pkptable.pkp_lk);
> +		TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) {
>  			if (bkp->kcb_rdomain != kp->kcb_rdomain)
>  				continue;
>  
>  			if (bkp->kcb_flags & PFKEYV2_SOCKETFLAGS_PROMISC)
>  				pfkey_sendup(bkp, packet, 1);
>  		}
> -		SRPL_LEAVE(&sr);
> +		rw_exit_read(&pkptable.pkp_lk);
>  
>  		m_freem(packet);
>  
> @@ -2040,7 +2017,8 @@ pfkeyv2_dosend(struct socket *so, void *
>  			if ((rval = pfdatatopacket(message, len, &packet)) != 0)
>  				goto ret;
>  
> -			SRPL_FOREACH(bkp, &sr, &pkptable.pkp_list, kcb_list) {
> +			rw_enter_read(&pkptable.pkp_lk);
> +			TAILQ_FOREACH(bkp, &pkptable.pkp_list, kcb_list) {
>  				if (bkp == kp ||
>  				    bkp->kcb_rdomain != kp->kcb_rdomain)
>  					continue;
> @@ -2050,7 +2028,7 @@ pfkeyv2_dosend(struct socket *so, void *
>  					pfkey_sendup(bkp, packet, 1);
>  				}
>  			}
> -			SRPL_LEAVE(&sr);
> +			rw_exit_read(&pkptable.pkp_lk);
>  
>  			m_freem(packet);
>  		} else {
>