Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
incpb socket ref count
To:
tech@openbsd.org
Date:
Wed, 21 May 2025 17:44:45 +0900

Download raw body.

Thread
  • Alexander Bluhm:

    incpb socket ref count

Hi,

Instead of protecting inp_socket with special mutex when being freed
and reference counting in in_pcbsolock_ref(), better reference count
inp_socket.

When an inpcb is created, inp_socket is initialized with a socket
pointer that is reference counted.  When the inpcb is freed, release
the socket reference.  As incpb is reference counted itself, we can
always access the socket memory when we have a valid inpcb counter.

in_pcbsolock() can just lock the socket, no reference counting
needed.  This reduces contention a bit.  As in_pcbdetach() is
protected by socket lock, in_pcbsolock() has to check so_pcb while
holding the lock to skip sockets that are being closed.

ok?

bluhm

Index: kern/kern_sysctl.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
diff -u -p -r1.468 kern_sysctl.c
--- kern/kern_sysctl.c	9 May 2025 14:53:22 -0000	1.468
+++ kern/kern_sysctl.c	21 May 2025 04:50:15 -0000
@@ -1722,17 +1722,15 @@ do {									\
 	mtx_enter(&(table)->inpt_mtx);					\
 	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {	\
 		if (buflen >= elem_size && elem_count > 0) {		\
-			mtx_enter(&inp->inp_sofree_mtx);		\
-			so = soref(inp->inp_socket);			\
-			mtx_leave(&inp->inp_sofree_mtx);		\
+			mtx_leave(&(table)->inpt_mtx);			\
+			NET_LOCK_SHARED();				\
+			so = in_pcbsolock(inp);				\
 			if (so == NULL)					\
 				continue;				\
-			mtx_leave(&(table)->inpt_mtx);			\
-			solock_shared(so);				\
 			fill_file(kf, NULL, NULL, 0, NULL, NULL, p,	\
 			    so, show_pointers);				\
-			sounlock_shared(so);				\
-			sorele(so);					\
+			in_pcbsounlock(inp, so);			\
+			NET_UNLOCK_SHARED();				\
 			error = copyout(kf, dp, outsize);		\
 			mtx_enter(&(table)->inpt_mtx);			\
 			if (error) {					\
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.314 in_pcb.c
--- netinet/in_pcb.c	20 May 2025 05:51:43 -0000	1.314
+++ netinet/in_pcb.c	21 May 2025 04:50:15 -0000
@@ -236,8 +236,7 @@ in_pcballoc(struct socket *so, struct in
 	if (inp == NULL)
 		return (ENOBUFS);
 	inp->inp_table = table;
-	inp->inp_socket = so;
-	mtx_init(&inp->inp_sofree_mtx, IPL_SOFTNET);
+	inp->inp_socket = soref(so);
 	refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
 	inp->inp_seclevel.sl_auth = IPSEC_AUTH_LEVEL_DEFAULT;
 	inp->inp_seclevel.sl_esp_trans = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
@@ -584,10 +583,9 @@ in_pcbdetach(struct inpcb *inp)
 	struct socket *so = inp->inp_socket;
 	struct inpcbtable *table = inp->inp_table;
 
+	soassertlocked(so);
+
 	so->so_pcb = NULL;
-	mtx_enter(&inp->inp_sofree_mtx);
-	inp->inp_socket = NULL;
-	mtx_leave(&inp->inp_sofree_mtx);
 	/*
 	 * As long as the NET_LOCK() is the default lock for Internet
 	 * sockets, do not release it to not introduce new sleeping
@@ -623,22 +621,17 @@ in_pcbdetach(struct inpcb *inp)
 }
 
 struct socket *
-in_pcbsolock_ref(struct inpcb *inp)
+in_pcbsolock(struct inpcb *inp)
 {
-	struct socket *so;
+	struct socket *so = inp->inp_socket;
 
 	NET_ASSERT_LOCKED();
 
-	mtx_enter(&inp->inp_sofree_mtx);
-	so = soref(inp->inp_socket);
-	mtx_leave(&inp->inp_sofree_mtx);
 	if (so == NULL)
 		return NULL;
 	rw_enter_write(&so->so_lock);
-	/* between mutex and rwlock inpcb could be detached */
 	if (so->so_pcb == NULL) {
 		rw_exit_write(&so->so_lock);
-		sorele(so);
 		return NULL;
 	}
 	KASSERT(inp->inp_socket == so && sotoinpcb(so) == inp);
@@ -646,12 +639,13 @@ in_pcbsolock_ref(struct inpcb *inp)
 }
 
 void
-in_pcbsounlock_rele(struct inpcb *inp, struct socket *so)
+in_pcbsounlock(struct inpcb *inp, struct socket *so)
 {
 	if (so == NULL)
 		return;
+	if (inp != NULL && so->so_pcb != NULL)
+		KASSERT(inp->inp_socket == so && sotoinpcb(so) == inp);
 	rw_exit_write(&so->so_lock);
-	sorele(so);
 }
 
 struct inpcb *
@@ -670,6 +664,7 @@ in_pcbunref(struct inpcb *inp)
 		return;
 	if (refcnt_rele(&inp->inp_refcnt) == 0)
 		return;
+	sorele(inp->inp_socket);
 	KASSERT((LIST_NEXT(inp, inp_hash) == NULL) ||
 	    (LIST_NEXT(inp, inp_hash) == _Q_INVALID));
 	KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) ||
@@ -819,10 +814,10 @@ in_pcbnotifyall(struct inpcbtable *table
 			continue;
 		}
 		mtx_leave(&table->inpt_mtx);
-		so = in_pcbsolock_ref(inp);
+		so = in_pcbsolock(inp);
 		if (so != NULL)
 			(*notify)(inp, errno);
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		mtx_enter(&table->inpt_mtx);
 	}
 	mtx_leave(&table->inpt_mtx);
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.168 in_pcb.h
--- netinet/in_pcb.h	20 May 2025 05:51:43 -0000	1.168
+++ netinet/in_pcb.h	21 May 2025 04:51:43 -0000
@@ -81,7 +81,6 @@
  *	t	inpt_mtx		pcb table mutex
  *	L	pf_inp_mtx		link pf to inp mutex
  *	s	so_lock			socket rwlock
- *	f	inp_sofree_mtx		socket detach and lock
  */
 
 /*
@@ -138,8 +137,7 @@ struct inpcb {
 #define	inp_laddr6	inp_laddru.iau_addr6
 	u_int16_t inp_fport;		/* [t] foreign port */
 	u_int16_t inp_lport;		/* [t] local port */
-	struct	  socket *inp_socket;	/* [f] back pointer to socket */
-	struct	  mutex inp_sofree_mtx;	/* protect socket free */
+	struct	  socket *inp_socket;	/* [I] back pointer to socket */
 	caddr_t	  inp_ppcb;		/* pointer to per-protocol pcb */
 	struct    route inp_route;	/* [s] cached route */
 	struct    refcnt inp_refcnt;	/* refcount PCB, delay memory free */
@@ -311,8 +309,8 @@ int	 in_pcbaddrisavail(const struct inpc
 int	 in_pcbconnect(struct inpcb *, struct mbuf *);
 void	 in_pcbdetach(struct inpcb *);
 struct socket *
-	 in_pcbsolock_ref(struct inpcb *);
-void	 in_pcbsounlock_rele(struct inpcb *, struct socket *);
+	 in_pcbsolock(struct inpcb *);
+void	 in_pcbsounlock(struct inpcb *, struct socket *);
 struct inpcb *
 	 in_pcbref(struct inpcb *);
 void	 in_pcbunref(struct inpcb *);
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.447 tcp_input.c
--- netinet/tcp_input.c	19 May 2025 02:27:57 -0000	1.447
+++ netinet/tcp_input.c	21 May 2025 04:50:15 -0000
@@ -385,7 +385,7 @@ tcp_input_mlist(struct mbuf_list *ml, in
 		KASSERT(nxt == IPPROTO_DONE);
 	}
 
-	in_pcbsounlock_rele(NULL, so);
+	in_pcbsounlock(NULL, so);
 }
 
 /*
@@ -655,10 +655,10 @@ findpcb:
 		*solocked = NULL;
 	} else {
 		if (solocked != NULL && *solocked != NULL) {
-			in_pcbsounlock_rele(NULL, *solocked);
+			in_pcbsounlock(NULL, *solocked);
 			*solocked = NULL;
 		}
-		so = in_pcbsolock_ref(inp);
+		so = in_pcbsolock(inp);
 	}
 	if (so == NULL) {
 		tcpstat_inc(tcps_noport);
@@ -905,7 +905,7 @@ findpcb:
 				if (solocked != NULL)
 					*solocked = so;
 				else
-					in_pcbsounlock_rele(inp, so);
+					in_pcbsounlock(inp, so);
 				in_pcbunref(inp);
 				return IPPROTO_DONE;
 			}
@@ -1084,7 +1084,7 @@ findpcb:
 				if (solocked != NULL)
 					*solocked = so;
 				else
-					in_pcbsounlock_rele(inp, so);
+					in_pcbsounlock(inp, so);
 				in_pcbunref(inp);
 				return IPPROTO_DONE;
 			}
@@ -1138,7 +1138,7 @@ findpcb:
 			if (solocked != NULL)
 				*solocked = so;
 			else
-				in_pcbsounlock_rele(inp, so);
+				in_pcbsounlock(inp, so);
 			in_pcbunref(inp);
 			return IPPROTO_DONE;
 		}
@@ -1332,7 +1332,7 @@ trimthenstep6:
 			    ((arc4random() & 0x7fffffff) | 0x8000);
 			reuse = &iss;
 			tp = tcp_close(tp);
-			in_pcbsounlock_rele(inp, so);
+			in_pcbsounlock(inp, so);
 			so = NULL;
 			in_pcbunref(inp);
 			inp = NULL;
@@ -2141,7 +2141,7 @@ dodata:							/* XXX */
 	if (solocked != NULL)
 		*solocked = so;
 	else
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 	in_pcbunref(inp);
 	return IPPROTO_DONE;
 
@@ -2174,7 +2174,7 @@ dropafterack:
 	if (solocked != NULL)
 		*solocked = so;
 	else
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 	in_pcbunref(inp);
 	return IPPROTO_DONE;
 
@@ -2210,7 +2210,7 @@ dropwithreset:
 		    (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now);
 	}
 	m_freem(m);
-	in_pcbsounlock_rele(inp, so);
+	in_pcbsounlock(inp, so);
 	in_pcbunref(inp);
 	return IPPROTO_DONE;
 
@@ -2222,7 +2222,7 @@ drop:
 		tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen);
 
 	m_freem(m);
-	in_pcbsounlock_rele(inp, so);
+	in_pcbsounlock(inp, so);
 	in_pcbunref(inp);
 	return IPPROTO_DONE;
 }
@@ -3490,7 +3490,7 @@ syn_cache_timer(void *arg)
 	mtx_leave(&syn_cache_mtx);
 
 	NET_LOCK_SHARED();
-	so = in_pcbsolock_ref(inp);
+	so = in_pcbsolock(inp);
 	if (so != NULL) {
 		now = tcp_now();
 #ifdef TCP_ECN
@@ -3499,7 +3499,7 @@ syn_cache_timer(void *arg)
 		(void) syn_cache_respond(sc, NULL, now, do_ecn);
 		tcpstat_inc(tcps_sc_retransmitted);
 	}
-	in_pcbsounlock_rele(inp, so);
+	in_pcbsounlock(inp, so);
 	NET_UNLOCK_SHARED();
 
 	in_pcbunref(inp);
@@ -3622,7 +3622,7 @@ syn_cache_get(struct sockaddr *src, stru
 	sc = syn_cache_lookup(src, dst, &scp, inp->inp_rtableid);
 	if (sc == NULL) {
 		mtx_leave(&syn_cache_mtx);
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		return (NULL);
 	}
 
@@ -3636,7 +3636,7 @@ syn_cache_get(struct sockaddr *src, stru
 		refcnt_take(&sc->sc_refcnt);
 		mtx_leave(&syn_cache_mtx);
 		(void) syn_cache_respond(sc, m, now, do_ecn);
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		syn_cache_put(sc);
 		return ((struct socket *)(-1));
 	}
@@ -3767,7 +3767,7 @@ syn_cache_get(struct sockaddr *src, stru
 		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 	tp->last_ack_sent = tp->rcv_nxt;
 
-	in_pcbsounlock_rele(listeninp, listenso);
+	in_pcbsounlock(listeninp, listenso);
 	tcpstat_inc(tcps_sc_completed);
 	syn_cache_put(sc);
 	return (so);
@@ -3779,8 +3779,8 @@ abort:
 	if (tp != NULL)
 		tp = tcp_drop(tp, ECONNABORTED);	/* destroys socket */
 	m_freem(m);
-	in_pcbsounlock_rele(inp, so);
-	in_pcbsounlock_rele(listeninp, listenso);
+	in_pcbsounlock(inp, so);
+	in_pcbsounlock(listeninp, listenso);
 	syn_cache_put(sc);
 	tcpstat_inc(tcps_sc_aborted);
 	return ((struct socket *)(-1));
Index: netinet/tcp_subr.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v
diff -u -p -r1.209 tcp_subr.c
--- netinet/tcp_subr.c	2 Mar 2025 21:28:32 -0000	1.209
+++ netinet/tcp_subr.c	21 May 2025 04:50:15 -0000
@@ -692,7 +692,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *
 			return;
 		}
 		if (inp != NULL)
-			so = in_pcbsolock_ref(inp);
+			so = in_pcbsolock(inp);
 		if (so != NULL)
 			tp = intotcpcb(inp);
 		if (tp != NULL) {
@@ -702,7 +702,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *
 			    SEQ_LT(seq, tp->snd_max))
 				notify(inp, inet6ctlerrmap[cmd]);
 		}
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		in_pcbunref(inp);
 
 		if (tp == NULL &&
@@ -762,7 +762,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 		    ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport,
 		    rdomain);
 		if (inp != NULL)
-			so = in_pcbsolock_ref(inp);
+			so = in_pcbsolock(inp);
 		if (so != NULL)
 			tp = intotcpcb(inp);
 		if (tp != NULL &&
@@ -779,7 +779,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 			 */
 			mtu = (u_int)ntohs(icp->icmp_nextmtu);
 			if (mtu >= tp->t_pmtud_mtu_sent) {
-				in_pcbsounlock_rele(inp, so);
+				in_pcbsounlock(inp, so);
 				in_pcbunref(inp);
 				return;
 			}
@@ -800,7 +800,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 				 */
 				if (tp->t_flags & TF_PMTUD_PEND) {
 					if (SEQ_LT(tp->t_pmtud_th_seq, seq)) {
-						in_pcbsounlock_rele(inp, so);
+						in_pcbsounlock(inp, so);
 						in_pcbunref(inp);
 						return;
 					}
@@ -810,17 +810,17 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 				tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
 				tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
 				tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
-				in_pcbsounlock_rele(inp, so);
+				in_pcbsounlock(inp, so);
 				in_pcbunref(inp);
 				return;
 			}
 		} else {
 			/* ignore if we don't have a matching connection */
-			in_pcbsounlock_rele(inp, so);
+			in_pcbsounlock(inp, so);
 			in_pcbunref(inp);
 			return;
 		}
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		in_pcbunref(inp);
 		notify = tcp_mtudisc, ip = NULL;
 	} else if (cmd == PRC_MTUINC)
@@ -840,7 +840,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 		    ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport,
 		    rdomain);
 		if (inp != NULL)
-			so = in_pcbsolock_ref(inp);
+			so = in_pcbsolock(inp);
 		if (so != NULL)
 			tp = intotcpcb(inp);
 		if (tp != NULL) {
@@ -849,7 +849,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 			    SEQ_LT(seq, tp->snd_max))
 				notify(inp, errno);
 		}
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		in_pcbunref(inp);
 
 		if (tp == NULL &&
Index: netinet/tcp_timer.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
diff -u -p -r1.83 tcp_timer.c
--- netinet/tcp_timer.c	12 Feb 2025 21:28:11 -0000	1.83
+++ netinet/tcp_timer.c	21 May 2025 04:50:15 -0000
@@ -91,7 +91,7 @@ tcp_timer_enter(struct inpcb *inp, struc
 	KASSERT(timer < TCPT_NTIMERS);
 
 	NET_LOCK_SHARED();
-	*so = in_pcbsolock_ref(inp);
+	*so = in_pcbsolock(inp);
 	if (*so == NULL) {
 		*tp = NULL;
 		return -1;
@@ -109,7 +109,7 @@ tcp_timer_enter(struct inpcb *inp, struc
 static inline void
 tcp_timer_leave(struct inpcb *inp, struct socket *so)
 {
-	in_pcbsounlock_rele(inp, so);
+	in_pcbsounlock(inp, so);
 	NET_UNLOCK_SHARED();
 	in_pcbunref(inp);
 }
@@ -237,7 +237,7 @@ tcp_timer_rexmt(void *arg)
 		sin.sin_family = AF_INET;
 		sin.sin_addr = inp->inp_faddr;
 
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		in_pcbunref(inp);
 
 		icmp_mtudisc(&icmp, rtableid);
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
diff -u -p -r1.247 tcp_usrreq.c
--- netinet/tcp_usrreq.c	20 May 2025 18:41:06 -0000	1.247
+++ netinet/tcp_usrreq.c	21 May 2025 04:50:15 -0000
@@ -1214,7 +1214,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
 		struct tcpcb *tp = NULL;
 
 		if (inp != NULL) {
-			so = in_pcbsolock_ref(inp);
+			so = in_pcbsolock(inp);
 			if (so != NULL)
 				tp = intotcpcb(inp);
 		}
@@ -1223,7 +1223,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
 		else
 			error = ESRCH;
 
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		NET_UNLOCK_SHARED();
 		in_pcbunref(inp);
 		return (error);
@@ -1246,7 +1246,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
 	}
 
 	if (inp != NULL)
-		so = in_pcbsolock_ref(inp);
+		so = in_pcbsolock(inp);
 
 	if (so != NULL && ISSET(so->so_state, SS_CONNECTOUT)) {
 		tir.ruid = so->so_ruid;
@@ -1256,7 +1256,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
 		tir.euid = -1;
 	}
 
-	in_pcbsounlock_rele(inp, so);
+	in_pcbsounlock(inp, so);
 	NET_UNLOCK_SHARED();
 	in_pcbunref(inp);
 
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
diff -u -p -r1.337 udp_usrreq.c
--- netinet/udp_usrreq.c	12 May 2025 17:21:21 -0000	1.337
+++ netinet/udp_usrreq.c	21 May 2025 04:50:15 -0000
@@ -921,10 +921,10 @@ udp_ctlinput(int cmd, struct sockaddr *s
 		    ip->ip_dst, uhp->uh_dport, ip->ip_src, uhp->uh_sport,
 		    rdomain);
 		if (inp != NULL)
-			so = in_pcbsolock_ref(inp);
+			so = in_pcbsolock(inp);
 		if (so != NULL)
 			notify(inp, errno);
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		in_pcbunref(inp);
 	} else
 		in_pcbnotifyall(&udbtable, satosin(sa), rdomain, errno, notify);
Index: netinet6/in6_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
diff -u -p -r1.148 in6_pcb.c
--- netinet6/in6_pcb.c	4 May 2025 23:05:17 -0000	1.148
+++ netinet6/in6_pcb.c	21 May 2025 04:50:15 -0000
@@ -546,10 +546,10 @@ in6_pcbnotify(struct inpcbtable *table, 
 		}
 	  do_notify:
 		mtx_leave(&table->inpt_mtx);
-		so = in_pcbsolock_ref(inp);
+		so = in_pcbsolock(inp);
 		if (so != NULL)
 			(*notify)(inp, errno);
-		in_pcbsounlock_rele(inp, so);
+		in_pcbsounlock(inp, so);
 		mtx_enter(&table->inpt_mtx);
 	}
 	mtx_leave(&table->inpt_mtx);