Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: shared netlock for soclose()
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Wed, 4 Jun 2025 23:45:14 +0300

Download raw body.

Thread
On Wed, Jun 04, 2025 at 05:09:25PM +0200, Alexander Bluhm wrote:
> On Tue, Jun 03, 2025 at 02:33:03PM +0200, Alexander Bluhm wrote:
> > Hi,
> > 
> > Currently soclose() needs exclusive netlock.  I see deadlocks as
> > tcp_newtcpcb() calls pool_get(&tcpcb_pool) with PR_WAITOK while
> > holding shared netlock.  Running memory allocation in parallel
> > although free needs an exclusive lock is a bad idea.
> > 
> > Solution is to run soclose() in parallel.
> > 
> > Diff consists of four parts:
> > - always hold reference count of socket in inp_socket
> > - inp_socket is not longer protected by special mutex
> > - account when other CPU sets so_pcb to NULL
> > - run soclose() and sofree() with shared netlock
> > - tcp timeout reaper can go away
> > 
> > Please test.  I will split the diff for review.
> 
> Some parts have been commited.  New diff that applies to current.
> 
> bluhm
>

This could be not the destination socket. Do we need to update
`ips_closing' if so?

> @@ -169,6 +170,12 @@ rip_input(struct mbuf **mp, int *offp, i
>  	while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
>  		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
>  
> +		pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +		if (pcb == NULL) {
> +			ipstat_inc(ips_closing);
> +			continue;
> +		}
> +		KASSERT(pcb == inp);


> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
> diff -u -p -r1.378 uipc_socket.c
> --- kern/uipc_socket.c	23 May 2025 23:41:46 -0000	1.378
> +++ kern/uipc_socket.c	4 Jun 2025 14:18:03 -0000
> @@ -213,10 +213,7 @@ socreate(int dom, struct socket **aso, i
>  	if (error) {
>  		so->so_state |= SS_NOFDREF;
>  		/* sofree() calls sounlock(). */
> -		soref(so);
> -		sofree(so, 1);
> -		sounlock_shared(so);
> -		sorele(so);
> +		sofree(so, 0);
>  		return (error);
>  	}
>  	sounlock_shared(so);
> @@ -304,7 +301,7 @@ sofree(struct socket *so, int keep_lock)
>  
>  	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
>  		if (!keep_lock)
> -			sounlock(so);
> +			sounlock_shared(so);
>  		return;
>  	}
>  	if (so->so_head) {
> @@ -317,7 +314,7 @@ sofree(struct socket *so, int keep_lock)
>  		 */
>  		if (so->so_onq == &head->so_q) {
>  			if (!keep_lock)
> -				sounlock(so);
> +				sounlock_shared(so);
>  			return;
>  		}
>  
> @@ -344,7 +341,7 @@ sofree(struct socket *so, int keep_lock)
>  	}
>  
>  	if (!keep_lock)
> -		sounlock(so);
> +		sounlock_shared(so);
>  	sorele(so);
>  }
>  
> @@ -368,7 +365,7 @@ soclose(struct socket *so, int flags)
>  	struct socket *so2;
>  	int error = 0;
>  
> -	solock(so);
> +	solock_shared(so);
>  	/* Revoke async IO early. There is a final revocation in sofree(). */
>  	sigio_free(&so->so_sigio);
>  	if (so->so_state & SS_ISCONNECTED) {
> @@ -430,7 +427,7 @@ discard:
>  	if (so->so_sp) {
>  		struct socket *soback;
>  
> -		sounlock(so);
> +		sounlock_shared(so);
>  		/*
>  		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
>  		 * both `so_snd' and `so_rcv' before unsplice sockets.
> @@ -477,7 +474,7 @@ notsplicedback:
>  		task_del(sosplice_taskq, &so->so_sp->ssp_task);
>  		taskq_barrier(sosplice_taskq);
>  
> -		solock(so);
> +		solock_shared(so);
>  	}
>  #endif /* SOCKET_SPLICE */
>  
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> diff -u -p -r1.169 in_pcb.h
> --- netinet/in_pcb.h	3 Jun 2025 16:51:26 -0000	1.169
> +++ netinet/in_pcb.h	4 Jun 2025 14:18:03 -0000
> @@ -138,7 +138,7 @@ struct inpcb {
>  	u_int16_t inp_fport;		/* [t] foreign port */
>  	u_int16_t inp_lport;		/* [t] local port */
>  	struct	  socket *inp_socket;	/* [I] back pointer to socket */
> -	caddr_t	  inp_ppcb;		/* pointer to per-protocol pcb */
> +	caddr_t	  inp_ppcb;		/* [s] pointer to per-protocol pcb */
>  	struct    route inp_route;	/* [s] cached route */
>  	struct    refcnt inp_refcnt;	/* refcount PCB, delay memory free */
>  	int	  inp_flags;		/* generic IP/datagram flags */
> Index: netinet/ip_divert.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
> diff -u -p -r1.104 ip_divert.c
> --- netinet/ip_divert.c	4 Jun 2025 12:37:00 -0000	1.104
> +++ netinet/ip_divert.c	4 Jun 2025 13:31:07 -0000
> @@ -190,6 +190,7 @@ void
>  divert_packet(struct mbuf *m, int dir, u_int16_t divert_port)
>  {
>  	struct inpcb *inp = NULL;
> +	void *pcb;
>  	struct socket *so;
>  	struct sockaddr_in sin;
>  
> @@ -213,6 +214,12 @@ divert_packet(struct mbuf *m, int dir, u
>  		divstat_inc(divs_noport);
>  		goto bad;
>  	}
> +	pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +	if (pcb == NULL) {
> +		divstat_inc(divs_closing);
> +		goto bad;
> +	}
> +	KASSERT(pcb == inp);
>  
>  	memset(&sin, 0, sizeof(sin));
>  	sin.sin_family = AF_INET;
> Index: netinet/ip_divert.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v
> diff -u -p -r1.27 ip_divert.h
> --- netinet/ip_divert.h	4 Jun 2025 12:37:00 -0000	1.27
> +++ netinet/ip_divert.h	4 Jun 2025 13:31:07 -0000
> @@ -22,6 +22,7 @@
>  struct divstat {
>  	u_long	divs_ipackets;	/* total input packets */
>  	u_long	divs_noport;	/* no socket on port */
> +	u_long	divs_closing;	/* inpcb exists, socket is closing */
>  	u_long	divs_fullsock;	/* not delivered, input socket full */
>  	u_long	divs_opackets;	/* total output packets */
>  	u_long	divs_errors;	/* generic errors */
> @@ -49,6 +50,7 @@ struct divstat {
>  enum divstat_counters {
>  	divs_ipackets,
>  	divs_noport,
> +	divs_closing,
>  	divs_fullsock,
>  	divs_opackets,
>  	divs_errors,
> Index: netinet/ip_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> diff -u -p -r1.121 ip_var.h
> --- netinet/ip_var.h	2 Mar 2025 21:28:32 -0000	1.121
> +++ netinet/ip_var.h	4 Jun 2025 13:31:07 -0000
> @@ -68,6 +68,7 @@ struct	ipstat {
>  	u_long	ips_cantforward;	/* packets rcvd for unreachable dest */
>  	u_long	ips_redirectsent;	/* packets forwarded on same net */
>  	u_long	ips_noproto;		/* unknown or unsupported protocol */
> +	u_long	ips_closing;		/* inpcb exists, socket is closing */
>  	u_long	ips_delivered;		/* datagrams delivered to upper level*/
>  	u_long	ips_localout;		/* total ip packets generated here */
>  	u_long	ips_odropped;		/* lost output due to nobufs, etc. */
> @@ -116,6 +117,7 @@ enum ipstat_counters {
>  	ips_cantforward,	/* packets rcvd for unreachable dest */
>  	ips_redirectsent,	/* packets forwarded on same net */
>  	ips_noproto,		/* unknown or unsupported protocol */
> +	ips_closing,		/* inpcb exists, socket is closing */
>  	ips_delivered,		/* datagrams delivered to upper level*/
>  	ips_localout,		/* total ip packets generated here */
>  	ips_odropped,		/* lost output packets due to nobufs, etc. */
> Index: netinet/raw_ip.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> diff -u -p -r1.166 raw_ip.c
> --- netinet/raw_ip.c	11 Mar 2025 15:31:03 -0000	1.166
> +++ netinet/raw_ip.c	4 Jun 2025 13:31:07 -0000
> @@ -135,6 +135,7 @@ rip_input(struct mbuf **mp, int *offp, i
>  	struct ip *ip = mtod(m, struct ip *);
>  	struct inpcb_iterator iter = { .inp_table = NULL };
>  	struct inpcb *inp, *last;
> +	void *pcb;
>  	struct in_addr *key;
>  	struct sockaddr_in ripsrc;
>  
> @@ -169,6 +170,12 @@ rip_input(struct mbuf **mp, int *offp, i
>  	while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
>  		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
>  
> +		pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +		if (pcb == NULL) {
> +			ipstat_inc(ips_closing);
> +			continue;
> +		}
> +		KASSERT(pcb == inp);
>  		/*
>  		 * Packet must not be inserted after disconnected wakeup
>  		 * call.  To avoid race, check again when holding receive
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.450 tcp_input.c
> --- netinet/tcp_input.c	3 Jun 2025 16:51:26 -0000	1.450
> +++ netinet/tcp_input.c	4 Jun 2025 13:31:07 -0000
> @@ -661,10 +661,9 @@ findpcb:
>  		so = in_pcbsolock(inp);
>  	}
>  	if (so == NULL) {
> -		tcpstat_inc(tcps_noport);
> +		tcpstat_inc(tcps_closing);
>  		goto dropwithreset_ratelim;
>  	}
> -
>  	KASSERT(sotoinpcb(inp->inp_socket) == inp);
>  	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
>  	soassertlocked(inp->inp_socket);
> Index: netinet/tcp_subr.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v
> diff -u -p -r1.211 tcp_subr.c
> --- netinet/tcp_subr.c	3 Jun 2025 16:51:26 -0000	1.211
> +++ netinet/tcp_subr.c	4 Jun 2025 14:18:03 -0000
> @@ -440,8 +440,6 @@ tcp_newtcpcb(struct inpcb *inp, int wait
>  	tp->t_inpcb = inp;
>  	for (i = 0; i < TCPT_NTIMERS; i++)
>  		TCP_TIMER_INIT(tp, i);
> -	timeout_set_flags(&tp->t_timer_reaper, tcp_timer_reaper, tp,
> -	    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
>  
>  	tp->sack_enable = atomic_load_int(&tcp_do_sack);
>  	tp->t_flags = atomic_load_int(&tcp_do_rfc1323) ?
> @@ -528,9 +526,8 @@ tcp_close(struct tcpcb *tp)
>  	}
>  
>  	m_free(tp->t_template);
> -	/* Free tcpcb after all pending timers have been run. */
> -	timeout_add(&tp->t_timer_reaper, 0);
>  	inp->inp_ppcb = NULL;
> +	pool_put(&tcpcb_pool, tp);
>  	soisdisconnected(so);
>  	in_pcbdetach(inp);
>  	tcpstat_inc(tcps_closed);
> Index: netinet/tcp_timer.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
> diff -u -p -r1.84 tcp_timer.c
> --- netinet/tcp_timer.c	3 Jun 2025 16:51:26 -0000	1.84
> +++ netinet/tcp_timer.c	4 Jun 2025 14:18:03 -0000
> @@ -541,19 +541,3 @@ tcp_timer_2msl(void *arg)
>   out:
>  	tcp_timer_leave(inp, so);
>  }
> -
> -void
> -tcp_timer_reaper(void *arg)
> -{
> -	struct tcpcb *tp = arg;
> -
> -	/*
> -	 * This timer is necessary to delay the pool_put() after all timers
> -	 * have finished, even if they were sleeping to grab the net lock.
> -	 * Putting the pool_put() in a timer is sufficient as all timers run
> -	 * from the same timeout thread.  Note that neither softnet thread nor
> -	 * user process may access the tcpcb after arming the reaper timer.
> -	 * Freeing may run in parallel as it does not grab the net lock.
> -	 */
> -	pool_put(&tcpcb_pool, tp);
> -}
> Index: netinet/tcp_timer.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
> diff -u -p -r1.26 tcp_timer.h
> --- netinet/tcp_timer.h	16 Jan 2025 11:59:20 -0000	1.26
> +++ netinet/tcp_timer.h	4 Jun 2025 14:18:03 -0000
> @@ -164,8 +164,5 @@ extern int tcp_keepidle_sec;	/* [a] copy
>  extern int tcp_keepintvl_sec;	/* [a] copy of above in seconds for sysctl */
>  extern int tcp_ttl;		/* time to live for TCP segs */
>  extern const int tcp_backoff[];
> -
> -void	tcp_timer_reaper(void *);
> -
>  #endif /* _KERNEL */
>  #endif /* _NETINET_TCP_TIMER_H_ */
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.191 tcp_var.h
> --- netinet/tcp_var.h	7 May 2025 14:10:19 -0000	1.191
> +++ netinet/tcp_var.h	4 Jun 2025 14:18:03 -0000
> @@ -70,7 +70,6 @@ struct tcpqent {
>  struct tcpcb {
>  	struct tcpqehead t_segq;		/* sequencing queue */
>  	struct timeout t_timer[TCPT_NTIMERS];	/* tcp timers */
> -	struct timeout t_timer_reaper;	/* reaper is special, no refcnt */
>  	short	t_state;		/* state of this connection */
>  	short	t_rxtshift;		/* log(2) of rexmt exp. backoff */
>  	int	t_rxtcur;		/* current retransmit value */
> @@ -393,6 +392,7 @@ struct	tcpstat {
>  
>  	u_int32_t tcps_pcbhashmiss;	/* input packets missing pcb hash */
>  	u_int32_t tcps_noport;		/* no socket on port */
> +	u_int32_t tcps_closing;		/* inpcb exists, socket is closing */
>  	u_int32_t tcps_badsyn;		/* SYN packet with src==dst rcv'ed */
>  	u_int32_t tcps_dropsyn;		/* SYN packet dropped */
>  
> @@ -583,6 +583,7 @@ enum tcpstat_counters {
>  	tcps_preddat,
>  	tcps_pcbhashmiss,
>  	tcps_noport,
> +	tcps_closing,
>  	tcps_badsyn,
>  	tcps_dropsyn,
>  	tcps_rcvbadsig,
> Index: netinet/udp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
> diff -u -p -r1.341 udp_usrreq.c
> --- netinet/udp_usrreq.c	3 Jun 2025 16:51:26 -0000	1.341
> +++ netinet/udp_usrreq.c	4 Jun 2025 13:31:07 -0000
> @@ -198,6 +198,7 @@ udp_input(struct mbuf **mp, int *offp, i
>  	struct ip *ip = NULL;
>  	struct udphdr *uh;
>  	struct inpcb *inp = NULL;
> +	void *pcb;
>  	struct ip save_ip;
>  	int len;
>  	u_int16_t savesum;
> @@ -419,6 +420,12 @@ udp_input(struct mbuf **mp, int *offp, i
>  			else
>  				KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
>  
> +			pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +			if (pcb == NULL) {
> +				udpstat_inc(udps_closing);
> +				continue;
> +			}
> +			KASSERT(pcb == inp);
>  			if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE)
>  				continue;
>  			if (rtable_l2(inp->inp_rtableid) !=
> @@ -596,7 +603,12 @@ udp_input(struct mbuf **mp, int *offp, i
>  		return IPPROTO_DONE;
>  	}
>  
> -	KASSERT(sotoinpcb(inp->inp_socket) == inp);
> +	pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +	if (pcb == NULL) {
> +		udpstat_inc(udps_closing);
> +		goto bad;
> +	}
> +	KASSERT(pcb == inp);
>  	soassertlocked_readonly(inp->inp_socket);
>  
>  #ifdef INET6
> Index: netinet/udp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v
> diff -u -p -r1.53 udp_var.h
> --- netinet/udp_var.h	2 Mar 2025 21:28:32 -0000	1.53
> +++ netinet/udp_var.h	4 Jun 2025 13:31:07 -0000
> @@ -61,6 +61,7 @@ struct	udpstat {
>  	u_long	udps_badlen;		/* data length larger than packet */
>  	u_long	udps_noport;		/* no socket on port */
>  	u_long	udps_noportbcast;	/* of above, arrived as broadcast */
> +	u_long	udps_closing;		/* inpcb exists, socket is closing */
>  	u_long	udps_nosec;		/* dropped for lack of ipsec */
>  	u_long	udps_fullsock;		/* not delivered, input socket full */
>  	u_long	udps_pcbhashmiss;	/* input packets missing pcb hash */
> @@ -104,6 +105,7 @@ enum udpstat_counters {
>  	udps_badlen,		/* data length larger than packet */
>  	udps_noport,		/* no socket on port */
>  	udps_noportbcast,	/* of above, arrived as broadcast */
> +	udps_closing,		/* inpcb exists, socket is closing */
>  	udps_nosec,		/* dropped for lack of ipsec */
>  	udps_fullsock,		/* not delivered, input socket full */
>  	udps_pcbhashmiss,	/* input packets missing pcb hash */
> Index: netinet6/ip6_divert.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v
> diff -u -p -r1.103 ip6_divert.c
> --- netinet6/ip6_divert.c	4 Jun 2025 12:37:00 -0000	1.103
> +++ netinet6/ip6_divert.c	4 Jun 2025 13:31:07 -0000
> @@ -199,6 +199,7 @@ void
>  divert6_packet(struct mbuf *m, int dir, u_int16_t divert_port)
>  {
>  	struct inpcb *inp = NULL;
> +	void *pcb;
>  	struct socket *so;
>  	struct sockaddr_in6 sin6;
>  
> @@ -222,6 +223,12 @@ divert6_packet(struct mbuf *m, int dir, 
>  		div6stat_inc(divs_noport);
>  		goto bad;
>  	}
> +	pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +	if (pcb == NULL) {
> +		div6stat_inc(divs_closing);
> +		goto bad;
> +	}
> +	KASSERT(pcb == inp);
>  
>  	memset(&sin6, 0, sizeof(sin6));
>  	sin6.sin6_family = AF_INET6;
> Index: netinet6/raw_ip6.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
> diff -u -p -r1.192 raw_ip6.c
> --- netinet6/raw_ip6.c	27 May 2025 07:52:49 -0000	1.192
> +++ netinet6/raw_ip6.c	4 Jun 2025 13:31:07 -0000
> @@ -138,6 +138,7 @@ rip6_input(struct mbuf **mp, int *offp, 
>  	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
>  	struct inpcb_iterator iter = { .inp_table = NULL };
>  	struct inpcb *inp, *last;
> +	void *pcb;
>  	struct in6_addr *key;
>  	struct sockaddr_in6 rip6src;
>  	uint8_t type;
> @@ -184,6 +185,12 @@ rip6_input(struct mbuf **mp, int *offp, 
>  	while ((inp = in_pcb_iterator(&rawin6pcbtable, inp, &iter)) != NULL) {
>  		KASSERT(ISSET(inp->inp_flags, INP_IPV6));
>  
> +		pcb = READ_ONCE(inp->inp_socket->so_pcb);
> +		if (pcb == NULL) {
> +			rip6stat_inc(rip6s_closing);
> +			continue;
> +		}
> +		KASSERT(pcb == inp);
>  		/*
>  		 * Packet must not be inserted after disconnected wakeup
>  		 * call.  To avoid race, check again when holding receive
> Index: netinet6/raw_ip6.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.h,v
> diff -u -p -r1.4 raw_ip6.h
> --- netinet6/raw_ip6.h	9 Feb 2017 15:23:35 -0000	1.4
> +++ netinet6/raw_ip6.h	4 Jun 2025 13:31:07 -0000
> @@ -42,6 +42,7 @@ struct rip6stat {
>  	u_int64_t rip6s_badsum;		/* of above, checksum error */
>  	u_int64_t rip6s_nosock;		/* no matching socket */
>  	u_int64_t rip6s_nosockmcast;	/* of above, arrived as multicast */
> +	u_int64_t rip6s_closing;	/* inpcb exists, socket is closing */
>  	u_int64_t rip6s_fullsock;	/* not delivered, input socket full */
>  
>  	u_int64_t rip6s_opackets;	/* total output packets */
> @@ -68,6 +69,7 @@ enum rip6stat_counters {
>  	rip6s_badsum,
>  	rip6s_nosock,
>  	rip6s_nosockmcast,
> +	rip6s_closing,
>  	rip6s_fullsock,
>  	rip6s_opackets,
>  	rip6s_ncounters,
>