Download raw body.
shared netlock for soclose()
On Wed, Jun 04, 2025 at 11:45:14PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Jun 04, 2025 at 05:09:25PM +0200, Alexander Bluhm wrote:
> > On Tue, Jun 03, 2025 at 02:33:03PM +0200, Alexander Bluhm wrote:
> > > Hi,
> > >
> > > Currently soclose() needs exclusive netlock. I see deadlocks as
> > > tcp_newtcpcb() calls pool_get(&tcpcb_pool) with PR_WAITOK while
> > > holding shared netlock. Running memory allocation in parallel
> > > although free needs an exclusive lock is a bad idea.
> > >
> > > Solution is to run soclose() in parallel.
> > >
> > > Diff consists of four parts:
> > > - always hold reference count of socket in inp_socket
> > > - inp_socket is not longer protected by special mutex
> > > - account when other CPU sets so_pcb to NULL
> > > - run soclose() and sofree() with shared netlock
> > > - tcp timeout reaper can go away
> > >
> > > Please test. I will split the diff for review.
> >
> > Some parts have been commited. New diff that applies to current.
> >
> > bluhm
> >
>
> This could be not the destination socket. Do we need to update
> `ips_closing' if so?
Updated diff with all parts for unlocked soclose().
bluhm
Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.378 uipc_socket.c
--- kern/uipc_socket.c 23 May 2025 23:41:46 -0000 1.378
+++ kern/uipc_socket.c 11 Jun 2025 10:01:13 -0000
@@ -213,10 +213,7 @@ socreate(int dom, struct socket **aso, i
if (error) {
so->so_state |= SS_NOFDREF;
/* sofree() calls sounlock(). */
- soref(so);
- sofree(so, 1);
- sounlock_shared(so);
- sorele(so);
+ sofree(so, 0);
return (error);
}
sounlock_shared(so);
@@ -304,7 +301,7 @@ sofree(struct socket *so, int keep_lock)
if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
if (!keep_lock)
- sounlock(so);
+ sounlock_shared(so);
return;
}
if (so->so_head) {
@@ -317,7 +314,7 @@ sofree(struct socket *so, int keep_lock)
*/
if (so->so_onq == &head->so_q) {
if (!keep_lock)
- sounlock(so);
+ sounlock_shared(so);
return;
}
@@ -344,7 +341,7 @@ sofree(struct socket *so, int keep_lock)
}
if (!keep_lock)
- sounlock(so);
+ sounlock_shared(so);
sorele(so);
}
@@ -368,7 +365,7 @@ soclose(struct socket *so, int flags)
struct socket *so2;
int error = 0;
- solock(so);
+ solock_shared(so);
/* Revoke async IO early. There is a final revocation in sofree(). */
sigio_free(&so->so_sigio);
if (so->so_state & SS_ISCONNECTED) {
@@ -430,7 +427,7 @@ discard:
if (so->so_sp) {
struct socket *soback;
- sounlock(so);
+ sounlock_shared(so);
/*
* Concurrent sounsplice() locks `sb_mtx' mutexes on
* both `so_snd' and `so_rcv' before unsplice sockets.
@@ -477,7 +474,7 @@ notsplicedback:
task_del(sosplice_taskq, &so->so_sp->ssp_task);
taskq_barrier(sosplice_taskq);
- solock(so);
+ solock_shared(so);
}
#endif /* SOCKET_SPLICE */
Index: netinet/ip_divert.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
diff -u -p -r1.105 ip_divert.c
--- netinet/ip_divert.c 6 Jun 2025 13:13:37 -0000 1.105
+++ netinet/ip_divert.c 11 Jun 2025 10:01:13 -0000
@@ -178,6 +178,7 @@ void
divert_packet(struct mbuf *m, int dir, u_int16_t divert_port)
{
struct inpcb *inp = NULL;
+ void *pcb;
struct socket *so;
struct sockaddr_in sin;
@@ -201,6 +202,12 @@ divert_packet(struct mbuf *m, int dir, u
divstat_inc(divs_noport);
goto bad;
}
+ pcb = READ_ONCE(inp->inp_socket->so_pcb);
+ if (pcb == NULL) {
+ divstat_inc(divs_closing);
+ goto bad;
+ }
+ KASSERT(pcb == inp);
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
Index: netinet/ip_divert.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v
diff -u -p -r1.28 ip_divert.h
--- netinet/ip_divert.h 6 Jun 2025 13:13:37 -0000 1.28
+++ netinet/ip_divert.h 11 Jun 2025 10:01:13 -0000
@@ -22,6 +22,7 @@
struct divstat {
u_long divs_ipackets; /* total input packets */
u_long divs_noport; /* no socket on port */
+ u_long divs_closing; /* inpcb exists, socket is closing */
u_long divs_fullsock; /* not delivered, input socket full */
u_long divs_opackets; /* total output packets */
u_long divs_errors; /* generic errors */
@@ -53,6 +54,7 @@ struct divstat {
enum divstat_counters {
divs_ipackets,
divs_noport,
+ divs_closing,
divs_fullsock,
divs_opackets,
divs_errors,
Index: netinet/raw_ip.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
diff -u -p -r1.166 raw_ip.c
--- netinet/raw_ip.c 11 Mar 2025 15:31:03 -0000 1.166
+++ netinet/raw_ip.c 11 Jun 2025 10:01:13 -0000
@@ -135,6 +135,7 @@ rip_input(struct mbuf **mp, int *offp, i
struct ip *ip = mtod(m, struct ip *);
struct inpcb_iterator iter = { .inp_table = NULL };
struct inpcb *inp, *last;
+ void *pcb;
struct in_addr *key;
struct sockaddr_in ripsrc;
@@ -169,6 +170,10 @@ rip_input(struct mbuf **mp, int *offp, i
while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
+ pcb = READ_ONCE(inp->inp_socket->so_pcb);
+ if (pcb == NULL)
+ continue;
+ KASSERT(pcb == inp);
/*
* Packet must not be inserted after disconnected wakeup
* call. To avoid race, check again when holding receive
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.450 tcp_input.c
--- netinet/tcp_input.c 3 Jun 2025 16:51:26 -0000 1.450
+++ netinet/tcp_input.c 11 Jun 2025 10:01:13 -0000
@@ -661,10 +661,9 @@ findpcb:
so = in_pcbsolock(inp);
}
if (so == NULL) {
- tcpstat_inc(tcps_noport);
+ tcpstat_inc(tcps_closing);
goto dropwithreset_ratelim;
}
-
KASSERT(sotoinpcb(inp->inp_socket) == inp);
KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
soassertlocked(inp->inp_socket);
@@ -793,7 +792,8 @@ findpcb:
* full-blown connection.
*/
in_pcbunref(inp);
- inp = in_pcbref(sotoinpcb(so));
+ /* syn_cache_get() has refcounted inp */
+ inp = sotoinpcb(so);
tp = intotcpcb(inp);
if (tp == NULL)
goto badsyn; /*XXX*/
@@ -3657,8 +3657,8 @@ syn_cache_get(struct sockaddr *src, stru
if (so == NULL)
goto resetandabort;
soassertlocked(so);
- soref(so);
- inp = sotoinpcb(so);
+ /* inpcb does refcount socket, both so and inp cannot go away */
+ inp = in_pcbref(sotoinpcb(so));
tp = intotcpcb(inp);
#ifdef IPSEC
@@ -3778,9 +3778,10 @@ resetandabort:
abort:
if (tp != NULL)
tp = tcp_drop(tp, ECONNABORTED); /* destroys socket */
- m_freem(m);
in_pcbsounlock(inp, so);
in_pcbsounlock(listeninp, listenso);
+ in_pcbunref(inp);
+ m_freem(m);
syn_cache_put(sc);
tcpstat_inc(tcps_sc_aborted);
return ((struct socket *)(-1));
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.192 tcp_var.h
--- netinet/tcp_var.h 8 Jun 2025 17:06:19 -0000 1.192
+++ netinet/tcp_var.h 11 Jun 2025 10:01:13 -0000
@@ -392,6 +392,7 @@ struct tcpstat {
u_int32_t tcps_pcbhashmiss; /* input packets missing pcb hash */
u_int32_t tcps_noport; /* no socket on port */
+ u_int32_t tcps_closing; /* inpcb exists, socket is closing */
u_int32_t tcps_badsyn; /* SYN packet with src==dst rcv'ed */
u_int32_t tcps_dropsyn; /* SYN packet dropped */
@@ -582,6 +583,7 @@ enum tcpstat_counters {
tcps_preddat,
tcps_pcbhashmiss,
tcps_noport,
+ tcps_closing,
tcps_badsyn,
tcps_dropsyn,
tcps_rcvbadsig,
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
diff -u -p -r1.341 udp_usrreq.c
--- netinet/udp_usrreq.c 3 Jun 2025 16:51:26 -0000 1.341
+++ netinet/udp_usrreq.c 11 Jun 2025 10:01:13 -0000
@@ -198,6 +198,7 @@ udp_input(struct mbuf **mp, int *offp, i
struct ip *ip = NULL;
struct udphdr *uh;
struct inpcb *inp = NULL;
+ void *pcb;
struct ip save_ip;
int len;
u_int16_t savesum;
@@ -419,6 +420,10 @@ udp_input(struct mbuf **mp, int *offp, i
else
KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
+ pcb = READ_ONCE(inp->inp_socket->so_pcb);
+ if (pcb == NULL)
+ continue;
+ KASSERT(pcb == inp);
if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE)
continue;
if (rtable_l2(inp->inp_rtableid) !=
@@ -596,7 +601,12 @@ udp_input(struct mbuf **mp, int *offp, i
return IPPROTO_DONE;
}
- KASSERT(sotoinpcb(inp->inp_socket) == inp);
+ pcb = READ_ONCE(inp->inp_socket->so_pcb);
+ if (pcb == NULL) {
+ udpstat_inc(udps_closing);
+ goto bad;
+ }
+ KASSERT(pcb == inp);
soassertlocked_readonly(inp->inp_socket);
#ifdef INET6
Index: netinet/udp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v
diff -u -p -r1.53 udp_var.h
--- netinet/udp_var.h 2 Mar 2025 21:28:32 -0000 1.53
+++ netinet/udp_var.h 11 Jun 2025 10:01:13 -0000
@@ -61,6 +61,7 @@ struct udpstat {
u_long udps_badlen; /* data length larger than packet */
u_long udps_noport; /* no socket on port */
u_long udps_noportbcast; /* of above, arrived as broadcast */
+ u_long udps_closing; /* inpcb exists, socket is closing */
u_long udps_nosec; /* dropped for lack of ipsec */
u_long udps_fullsock; /* not delivered, input socket full */
u_long udps_pcbhashmiss; /* input packets missing pcb hash */
@@ -104,6 +105,7 @@ enum udpstat_counters {
udps_badlen, /* data length larger than packet */
udps_noport, /* no socket on port */
udps_noportbcast, /* of above, arrived as broadcast */
+ udps_closing, /* inpcb exists, socket is closing */
udps_nosec, /* dropped for lack of ipsec */
udps_fullsock, /* not delivered, input socket full */
udps_pcbhashmiss, /* input packets missing pcb hash */
Index: netinet6/ip6_divert.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v
diff -u -p -r1.104 ip6_divert.c
--- netinet6/ip6_divert.c 6 Jun 2025 13:13:37 -0000 1.104
+++ netinet6/ip6_divert.c 11 Jun 2025 10:01:13 -0000
@@ -187,6 +187,7 @@ void
divert6_packet(struct mbuf *m, int dir, u_int16_t divert_port)
{
struct inpcb *inp = NULL;
+ void *pcb;
struct socket *so;
struct sockaddr_in6 sin6;
@@ -210,6 +211,12 @@ divert6_packet(struct mbuf *m, int dir,
div6stat_inc(divs_noport);
goto bad;
}
+ pcb = READ_ONCE(inp->inp_socket->so_pcb);
+ if (pcb == NULL) {
+ div6stat_inc(divs_closing);
+ goto bad;
+ }
+ KASSERT(pcb == inp);
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
Index: netinet6/raw_ip6.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
diff -u -p -r1.192 raw_ip6.c
--- netinet6/raw_ip6.c 27 May 2025 07:52:49 -0000 1.192
+++ netinet6/raw_ip6.c 11 Jun 2025 10:01:13 -0000
@@ -138,6 +138,7 @@ rip6_input(struct mbuf **mp, int *offp,
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb_iterator iter = { .inp_table = NULL };
struct inpcb *inp, *last;
+ void *pcb;
struct in6_addr *key;
struct sockaddr_in6 rip6src;
uint8_t type;
@@ -184,6 +185,10 @@ rip6_input(struct mbuf **mp, int *offp,
while ((inp = in_pcb_iterator(&rawin6pcbtable, inp, &iter)) != NULL) {
KASSERT(ISSET(inp->inp_flags, INP_IPV6));
+ pcb = READ_ONCE(inp->inp_socket->so_pcb);
+ if (pcb == NULL)
+ continue;
+ KASSERT(pcb == inp);
/*
* Packet must not be inserted after disconnected wakeup
* call. To avoid race, check again when holding receive
shared netlock for soclose()