From: Alexander Bluhm Subject: Re: shared netlock for soclose() To: tech@openbsd.org Date: Thu, 12 Jun 2025 00:41:07 +0200 On Wed, Jun 11, 2025 at 12:04:01PM +0200, Alexander Bluhm wrote: > On Wed, Jun 04, 2025 at 11:45:14PM +0300, Vitaliy Makkoveev wrote: > > On Wed, Jun 04, 2025 at 05:09:25PM +0200, Alexander Bluhm wrote: > > > On Tue, Jun 03, 2025 at 02:33:03PM +0200, Alexander Bluhm wrote: > > > > Hi, > > > > > > > > Currently soclose() needs exclusive netlock. I see deadlocks as > > > > tcp_newtcpcb() calls pool_get(&tcpcb_pool) with PR_WAITOK while > > > > holding shared netlock. Running memory allocation in parallel > > > > although free needs an exclusive lock is a bad idea. > > > > > > > > Solution is to run soclose() in parallel. > > > > > > > > Diff consists of four parts: > > > > - always hold reference count of socket in inp_socket > > > > - inp_socket is not longer protected by special mutex > > > > - account when other CPU sets so_pcb to NULL > > > > - run soclose() and sofree() with shared netlock > > > > - tcp timeout reaper can go away > > > > > > > > Please test. I will split the diff for review. Next chunk has been commited, updated diff. bluhm Index: kern/uipc_socket.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.378 uipc_socket.c --- kern/uipc_socket.c 23 May 2025 23:41:46 -0000 1.378 +++ kern/uipc_socket.c 11 Jun 2025 22:16:15 -0000 @@ -213,10 +213,7 @@ socreate(int dom, struct socket **aso, i if (error) { so->so_state |= SS_NOFDREF; /* sofree() calls sounlock(). */ - soref(so); - sofree(so, 1); - sounlock_shared(so); - sorele(so); + sofree(so, 0); return (error); } sounlock_shared(so); @@ -304,7 +301,7 @@ sofree(struct socket *so, int keep_lock) if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { if (!keep_lock) - sounlock(so); + sounlock_shared(so); return; } if (so->so_head) { @@ -317,7 +314,7 @@ sofree(struct socket *so, int keep_lock) */ if (so->so_onq == &head->so_q) { if (!keep_lock) - sounlock(so); + sounlock_shared(so); return; } @@ -344,7 +341,7 @@ sofree(struct socket *so, int keep_lock) } if (!keep_lock) - sounlock(so); + sounlock_shared(so); sorele(so); } @@ -368,7 +365,7 @@ soclose(struct socket *so, int flags) struct socket *so2; int error = 0; - solock(so); + solock_shared(so); /* Revoke async IO early. There is a final revocation in sofree(). */ sigio_free(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { @@ -430,7 +427,7 @@ discard: if (so->so_sp) { struct socket *soback; - sounlock(so); + sounlock_shared(so); /* * Concurrent sounsplice() locks `sb_mtx' mutexes on * both `so_snd' and `so_rcv' before unsplice sockets. @@ -477,7 +474,7 @@ notsplicedback: task_del(sosplice_taskq, &so->so_sp->ssp_task); taskq_barrier(sosplice_taskq); - solock(so); + solock_shared(so); } #endif /* SOCKET_SPLICE */ Index: netinet/ip_divert.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v diff -u -p -r1.105 ip_divert.c --- netinet/ip_divert.c 6 Jun 2025 13:13:37 -0000 1.105 +++ netinet/ip_divert.c 11 Jun 2025 22:16:15 -0000 @@ -178,6 +178,7 @@ void divert_packet(struct mbuf *m, int dir, u_int16_t divert_port) { struct inpcb *inp = NULL; + void *pcb; struct socket *so; struct sockaddr_in sin; @@ -201,6 +202,12 @@ divert_packet(struct mbuf *m, int dir, u divstat_inc(divs_noport); goto bad; } + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + divstat_inc(divs_closing); + goto bad; + } + KASSERT(pcb == inp); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; Index: netinet/ip_divert.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v diff -u -p -r1.28 ip_divert.h --- netinet/ip_divert.h 6 Jun 2025 13:13:37 -0000 1.28 +++ netinet/ip_divert.h 11 Jun 2025 22:16:15 -0000 @@ -22,6 +22,7 @@ struct divstat { u_long divs_ipackets; /* total input packets */ u_long divs_noport; /* no socket on port */ + u_long divs_closing; /* inpcb exists, socket is closing */ u_long divs_fullsock; /* not delivered, input socket full */ u_long divs_opackets; /* total output packets */ u_long divs_errors; /* generic errors */ @@ -53,6 +54,7 @@ struct divstat { enum divstat_counters { divs_ipackets, divs_noport, + divs_closing, divs_fullsock, divs_opackets, divs_errors, Index: netinet/raw_ip.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v diff -u -p -r1.166 raw_ip.c --- netinet/raw_ip.c 11 Mar 2025 15:31:03 -0000 1.166 +++ netinet/raw_ip.c 11 Jun 2025 22:16:15 -0000 @@ -135,6 +135,7 @@ rip_input(struct mbuf **mp, int *offp, i struct ip *ip = mtod(m, struct ip *); struct inpcb_iterator iter = { .inp_table = NULL }; struct inpcb *inp, *last; + void *pcb; struct in_addr *key; struct sockaddr_in ripsrc; @@ -169,6 +170,10 @@ rip_input(struct mbuf **mp, int *offp, i while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) { KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) + continue; + KASSERT(pcb == inp); /* * Packet must not be inserted after disconnected wakeup * call. To avoid race, check again when holding receive Index: netinet/tcp_input.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v diff -u -p -r1.451 tcp_input.c --- netinet/tcp_input.c 11 Jun 2025 14:30:07 -0000 1.451 +++ netinet/tcp_input.c 11 Jun 2025 22:16:15 -0000 @@ -661,10 +661,9 @@ findpcb: so = in_pcbsolock(inp); } if (so == NULL) { - tcpstat_inc(tcps_noport); + tcpstat_inc(tcps_closing); goto dropwithreset_ratelim; } - KASSERT(sotoinpcb(inp->inp_socket) == inp); KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp); soassertlocked(inp->inp_socket); Index: netinet/tcp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v diff -u -p -r1.192 tcp_var.h --- netinet/tcp_var.h 8 Jun 2025 17:06:19 -0000 1.192 +++ netinet/tcp_var.h 11 Jun 2025 22:16:15 -0000 @@ -392,6 +392,7 @@ struct tcpstat { u_int32_t tcps_pcbhashmiss; /* input packets missing pcb hash */ u_int32_t tcps_noport; /* no socket on port */ + u_int32_t tcps_closing; /* inpcb exists, socket is closing */ u_int32_t tcps_badsyn; /* SYN packet with src==dst rcv'ed */ u_int32_t tcps_dropsyn; /* SYN packet dropped */ @@ -582,6 +583,7 @@ enum tcpstat_counters { tcps_preddat, tcps_pcbhashmiss, tcps_noport, + tcps_closing, tcps_badsyn, tcps_dropsyn, tcps_rcvbadsig, Index: netinet/udp_usrreq.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v diff -u -p -r1.341 udp_usrreq.c --- netinet/udp_usrreq.c 3 Jun 2025 16:51:26 -0000 1.341 +++ netinet/udp_usrreq.c 11 Jun 2025 22:16:15 -0000 @@ -198,6 +198,7 @@ udp_input(struct mbuf **mp, int *offp, i struct ip *ip = NULL; struct udphdr *uh; struct inpcb *inp = NULL; + void *pcb; struct ip save_ip; int len; u_int16_t savesum; @@ -419,6 +420,10 @@ udp_input(struct mbuf **mp, int *offp, i else KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) + continue; + KASSERT(pcb == inp); if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE) continue; if (rtable_l2(inp->inp_rtableid) != @@ -596,7 +601,12 @@ udp_input(struct mbuf **mp, int *offp, i return IPPROTO_DONE; } - KASSERT(sotoinpcb(inp->inp_socket) == inp); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + udpstat_inc(udps_closing); + goto bad; + } + KASSERT(pcb == inp); soassertlocked_readonly(inp->inp_socket); #ifdef INET6 Index: netinet/udp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v diff -u -p -r1.53 udp_var.h --- netinet/udp_var.h 2 Mar 2025 21:28:32 -0000 1.53 +++ netinet/udp_var.h 11 Jun 2025 22:16:15 -0000 @@ -61,6 +61,7 @@ struct udpstat { u_long udps_badlen; /* data length larger than packet */ u_long udps_noport; /* no socket on port */ u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_closing; /* inpcb exists, socket is closing */ u_long udps_nosec; /* dropped for lack of ipsec */ u_long udps_fullsock; /* not delivered, input socket full */ u_long udps_pcbhashmiss; /* input packets missing pcb hash */ @@ -104,6 +105,7 @@ enum udpstat_counters { udps_badlen, /* data length larger than packet */ udps_noport, /* no socket on port */ udps_noportbcast, /* of above, arrived as broadcast */ + udps_closing, /* inpcb exists, socket is closing */ udps_nosec, /* dropped for lack of ipsec */ udps_fullsock, /* not delivered, input socket full */ udps_pcbhashmiss, /* input packets missing pcb hash */ Index: netinet6/ip6_divert.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v diff -u -p -r1.104 ip6_divert.c --- netinet6/ip6_divert.c 6 Jun 2025 13:13:37 -0000 1.104 +++ netinet6/ip6_divert.c 11 Jun 2025 22:16:15 -0000 @@ -187,6 +187,7 @@ void divert6_packet(struct mbuf *m, int dir, u_int16_t divert_port) { struct inpcb *inp = NULL; + void *pcb; struct socket *so; struct sockaddr_in6 sin6; @@ -210,6 +211,12 @@ divert6_packet(struct mbuf *m, int dir, div6stat_inc(divs_noport); goto bad; } + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + div6stat_inc(divs_closing); + goto bad; + } + KASSERT(pcb == inp); memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; Index: netinet6/raw_ip6.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v diff -u -p -r1.192 raw_ip6.c --- netinet6/raw_ip6.c 27 May 2025 07:52:49 -0000 1.192 +++ netinet6/raw_ip6.c 11 Jun 2025 22:16:15 -0000 @@ -138,6 +138,7 @@ rip6_input(struct mbuf **mp, int *offp, struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb_iterator iter = { .inp_table = NULL }; struct inpcb *inp, *last; + void *pcb; struct in6_addr *key; struct sockaddr_in6 rip6src; uint8_t type; @@ -184,6 +185,10 @@ rip6_input(struct mbuf **mp, int *offp, while ((inp = in_pcb_iterator(&rawin6pcbtable, inp, &iter)) != NULL) { KASSERT(ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) + continue; + KASSERT(pcb == inp); /* * Packet must not be inserted after disconnected wakeup * call. To avoid race, check again when holding receive