From: Vitaliy Makkoveev Subject: Re: shared netlock for soclose() To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 4 Jun 2025 23:45:14 +0300 On Wed, Jun 04, 2025 at 05:09:25PM +0200, Alexander Bluhm wrote: > On Tue, Jun 03, 2025 at 02:33:03PM +0200, Alexander Bluhm wrote: > > Hi, > > > > Currently soclose() needs exclusive netlock. I see deadlocks as > > tcp_newtcpcb() calls pool_get(&tcpcb_pool) with PR_WAITOK while > > holding shared netlock. Running memory allocation in parallel > > although free needs an exclusive lock is a bad idea. > > > > Solution is to run soclose() in parallel. > > > > Diff consists of four parts: > > - always hold reference count of socket in inp_socket > > - inp_socket is not longer protected by special mutex > > - account when other CPU sets so_pcb to NULL > > - run soclose() and sofree() with shared netlock > > - tcp timeout reaper can go away > > > > Please test. I will split the diff for review. > > Some parts have been commited. New diff that applies to current. > > bluhm > This could be not the destination socket. Do we need to update `ips_closing' if so? > @@ -169,6 +170,12 @@ rip_input(struct mbuf **mp, int *offp, i > while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) { > KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + ipstat_inc(ips_closing); > + continue; > + } > + KASSERT(pcb == inp); > Index: kern/uipc_socket.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v > diff -u -p -r1.378 uipc_socket.c > --- kern/uipc_socket.c 23 May 2025 23:41:46 -0000 1.378 > +++ kern/uipc_socket.c 4 Jun 2025 14:18:03 -0000 > @@ -213,10 +213,7 @@ socreate(int dom, struct socket **aso, i > if (error) { > so->so_state |= SS_NOFDREF; > /* sofree() calls sounlock(). */ > - soref(so); > - sofree(so, 1); > - sounlock_shared(so); > - sorele(so); > + sofree(so, 0); > return (error); > } > sounlock_shared(so); > @@ -304,7 +301,7 @@ sofree(struct socket *so, int keep_lock) > > if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { > if (!keep_lock) > - sounlock(so); > + sounlock_shared(so); > return; > } > if (so->so_head) { > @@ -317,7 +314,7 @@ sofree(struct socket *so, int keep_lock) > */ > if (so->so_onq == &head->so_q) { > if (!keep_lock) > - sounlock(so); > + sounlock_shared(so); > return; > } > > @@ -344,7 +341,7 @@ sofree(struct socket *so, int keep_lock) > } > > if (!keep_lock) > - sounlock(so); > + sounlock_shared(so); > sorele(so); > } > > @@ -368,7 +365,7 @@ soclose(struct socket *so, int flags) > struct socket *so2; > int error = 0; > > - solock(so); > + solock_shared(so); > /* Revoke async IO early. There is a final revocation in sofree(). */ > sigio_free(&so->so_sigio); > if (so->so_state & SS_ISCONNECTED) { > @@ -430,7 +427,7 @@ discard: > if (so->so_sp) { > struct socket *soback; > > - sounlock(so); > + sounlock_shared(so); > /* > * Concurrent sounsplice() locks `sb_mtx' mutexes on > * both `so_snd' and `so_rcv' before unsplice sockets. > @@ -477,7 +474,7 @@ notsplicedback: > task_del(sosplice_taskq, &so->so_sp->ssp_task); > taskq_barrier(sosplice_taskq); > > - solock(so); > + solock_shared(so); > } > #endif /* SOCKET_SPLICE */ > > Index: netinet/in_pcb.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v > diff -u -p -r1.169 in_pcb.h > --- netinet/in_pcb.h 3 Jun 2025 16:51:26 -0000 1.169 > +++ netinet/in_pcb.h 4 Jun 2025 14:18:03 -0000 > @@ -138,7 +138,7 @@ struct inpcb { > u_int16_t inp_fport; /* [t] foreign port */ > u_int16_t inp_lport; /* [t] local port */ > struct socket *inp_socket; /* [I] back pointer to socket */ > - caddr_t inp_ppcb; /* pointer to per-protocol pcb */ > + caddr_t inp_ppcb; /* [s] pointer to per-protocol pcb */ > struct route inp_route; /* [s] cached route */ > struct refcnt inp_refcnt; /* refcount PCB, delay memory free */ > int inp_flags; /* generic IP/datagram flags */ > Index: netinet/ip_divert.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v > diff -u -p -r1.104 ip_divert.c > --- netinet/ip_divert.c 4 Jun 2025 12:37:00 -0000 1.104 > +++ netinet/ip_divert.c 4 Jun 2025 13:31:07 -0000 > @@ -190,6 +190,7 @@ void > divert_packet(struct mbuf *m, int dir, u_int16_t divert_port) > { > struct inpcb *inp = NULL; > + void *pcb; > struct socket *so; > struct sockaddr_in sin; > > @@ -213,6 +214,12 @@ divert_packet(struct mbuf *m, int dir, u > divstat_inc(divs_noport); > goto bad; > } > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + divstat_inc(divs_closing); > + goto bad; > + } > + KASSERT(pcb == inp); > > memset(&sin, 0, sizeof(sin)); > sin.sin_family = AF_INET; > Index: netinet/ip_divert.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v > diff -u -p -r1.27 ip_divert.h > --- netinet/ip_divert.h 4 Jun 2025 12:37:00 -0000 1.27 > +++ netinet/ip_divert.h 4 Jun 2025 13:31:07 -0000 > @@ -22,6 +22,7 @@ > struct divstat { > u_long divs_ipackets; /* total input packets */ > u_long divs_noport; /* no socket on port */ > + u_long divs_closing; /* inpcb exists, socket is closing */ > u_long divs_fullsock; /* not delivered, input socket full */ > u_long divs_opackets; /* total output packets */ > u_long divs_errors; /* generic errors */ > @@ -49,6 +50,7 @@ struct divstat { > enum divstat_counters { > divs_ipackets, > divs_noport, > + divs_closing, > divs_fullsock, > divs_opackets, > divs_errors, > Index: netinet/ip_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v > diff -u -p -r1.121 ip_var.h > --- netinet/ip_var.h 2 Mar 2025 21:28:32 -0000 1.121 > +++ netinet/ip_var.h 4 Jun 2025 13:31:07 -0000 > @@ -68,6 +68,7 @@ struct ipstat { > u_long ips_cantforward; /* packets rcvd for unreachable dest */ > u_long ips_redirectsent; /* packets forwarded on same net */ > u_long ips_noproto; /* unknown or unsupported protocol */ > + u_long ips_closing; /* inpcb exists, socket is closing */ > u_long ips_delivered; /* datagrams delivered to upper level*/ > u_long ips_localout; /* total ip packets generated here */ > u_long ips_odropped; /* lost output due to nobufs, etc. */ > @@ -116,6 +117,7 @@ enum ipstat_counters { > ips_cantforward, /* packets rcvd for unreachable dest */ > ips_redirectsent, /* packets forwarded on same net */ > ips_noproto, /* unknown or unsupported protocol */ > + ips_closing, /* inpcb exists, socket is closing */ > ips_delivered, /* datagrams delivered to upper level*/ > ips_localout, /* total ip packets generated here */ > ips_odropped, /* lost output packets due to nobufs, etc. */ > Index: netinet/raw_ip.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v > diff -u -p -r1.166 raw_ip.c > --- netinet/raw_ip.c 11 Mar 2025 15:31:03 -0000 1.166 > +++ netinet/raw_ip.c 4 Jun 2025 13:31:07 -0000 > @@ -135,6 +135,7 @@ rip_input(struct mbuf **mp, int *offp, i > struct ip *ip = mtod(m, struct ip *); > struct inpcb_iterator iter = { .inp_table = NULL }; > struct inpcb *inp, *last; > + void *pcb; > struct in_addr *key; > struct sockaddr_in ripsrc; > > @@ -169,6 +170,12 @@ rip_input(struct mbuf **mp, int *offp, i > while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) { > KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + ipstat_inc(ips_closing); > + continue; > + } > + KASSERT(pcb == inp); > /* > * Packet must not be inserted after disconnected wakeup > * call. To avoid race, check again when holding receive > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.450 tcp_input.c > --- netinet/tcp_input.c 3 Jun 2025 16:51:26 -0000 1.450 > +++ netinet/tcp_input.c 4 Jun 2025 13:31:07 -0000 > @@ -661,10 +661,9 @@ findpcb: > so = in_pcbsolock(inp); > } > if (so == NULL) { > - tcpstat_inc(tcps_noport); > + tcpstat_inc(tcps_closing); > goto dropwithreset_ratelim; > } > - > KASSERT(sotoinpcb(inp->inp_socket) == inp); > KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp); > soassertlocked(inp->inp_socket); > Index: netinet/tcp_subr.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v > diff -u -p -r1.211 tcp_subr.c > --- netinet/tcp_subr.c 3 Jun 2025 16:51:26 -0000 1.211 > +++ netinet/tcp_subr.c 4 Jun 2025 14:18:03 -0000 > @@ -440,8 +440,6 @@ tcp_newtcpcb(struct inpcb *inp, int wait > tp->t_inpcb = inp; > for (i = 0; i < TCPT_NTIMERS; i++) > TCP_TIMER_INIT(tp, i); > - timeout_set_flags(&tp->t_timer_reaper, tcp_timer_reaper, tp, > - KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); > > tp->sack_enable = atomic_load_int(&tcp_do_sack); > tp->t_flags = atomic_load_int(&tcp_do_rfc1323) ? > @@ -528,9 +526,8 @@ tcp_close(struct tcpcb *tp) > } > > m_free(tp->t_template); > - /* Free tcpcb after all pending timers have been run. */ > - timeout_add(&tp->t_timer_reaper, 0); > inp->inp_ppcb = NULL; > + pool_put(&tcpcb_pool, tp); > soisdisconnected(so); > in_pcbdetach(inp); > tcpstat_inc(tcps_closed); > Index: netinet/tcp_timer.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v > diff -u -p -r1.84 tcp_timer.c > --- netinet/tcp_timer.c 3 Jun 2025 16:51:26 -0000 1.84 > +++ netinet/tcp_timer.c 4 Jun 2025 14:18:03 -0000 > @@ -541,19 +541,3 @@ tcp_timer_2msl(void *arg) > out: > tcp_timer_leave(inp, so); > } > - > -void > -tcp_timer_reaper(void *arg) > -{ > - struct tcpcb *tp = arg; > - > - /* > - * This timer is necessary to delay the pool_put() after all timers > - * have finished, even if they were sleeping to grab the net lock. > - * Putting the pool_put() in a timer is sufficient as all timers run > - * from the same timeout thread. Note that neither softnet thread nor > - * user process may access the tcpcb after arming the reaper timer. > - * Freeing may run in parallel as it does not grab the net lock. > - */ > - pool_put(&tcpcb_pool, tp); > -} > Index: netinet/tcp_timer.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v > diff -u -p -r1.26 tcp_timer.h > --- netinet/tcp_timer.h 16 Jan 2025 11:59:20 -0000 1.26 > +++ netinet/tcp_timer.h 4 Jun 2025 14:18:03 -0000 > @@ -164,8 +164,5 @@ extern int tcp_keepidle_sec; /* [a] copy > extern int tcp_keepintvl_sec; /* [a] copy of above in seconds for sysctl */ > extern int tcp_ttl; /* time to live for TCP segs */ > extern const int tcp_backoff[]; > - > -void tcp_timer_reaper(void *); > - > #endif /* _KERNEL */ > #endif /* _NETINET_TCP_TIMER_H_ */ > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.191 tcp_var.h > --- netinet/tcp_var.h 7 May 2025 14:10:19 -0000 1.191 > +++ netinet/tcp_var.h 4 Jun 2025 14:18:03 -0000 > @@ -70,7 +70,6 @@ struct tcpqent { > struct tcpcb { > struct tcpqehead t_segq; /* sequencing queue */ > struct timeout t_timer[TCPT_NTIMERS]; /* tcp timers */ > - struct timeout t_timer_reaper; /* reaper is special, no refcnt */ > short t_state; /* state of this connection */ > short t_rxtshift; /* log(2) of rexmt exp. backoff */ > int t_rxtcur; /* current retransmit value */ > @@ -393,6 +392,7 @@ struct tcpstat { > > u_int32_t tcps_pcbhashmiss; /* input packets missing pcb hash */ > u_int32_t tcps_noport; /* no socket on port */ > + u_int32_t tcps_closing; /* inpcb exists, socket is closing */ > u_int32_t tcps_badsyn; /* SYN packet with src==dst rcv'ed */ > u_int32_t tcps_dropsyn; /* SYN packet dropped */ > > @@ -583,6 +583,7 @@ enum tcpstat_counters { > tcps_preddat, > tcps_pcbhashmiss, > tcps_noport, > + tcps_closing, > tcps_badsyn, > tcps_dropsyn, > tcps_rcvbadsig, > Index: netinet/udp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v > diff -u -p -r1.341 udp_usrreq.c > --- netinet/udp_usrreq.c 3 Jun 2025 16:51:26 -0000 1.341 > +++ netinet/udp_usrreq.c 4 Jun 2025 13:31:07 -0000 > @@ -198,6 +198,7 @@ udp_input(struct mbuf **mp, int *offp, i > struct ip *ip = NULL; > struct udphdr *uh; > struct inpcb *inp = NULL; > + void *pcb; > struct ip save_ip; > int len; > u_int16_t savesum; > @@ -419,6 +420,12 @@ udp_input(struct mbuf **mp, int *offp, i > else > KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + udpstat_inc(udps_closing); > + continue; > + } > + KASSERT(pcb == inp); > if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE) > continue; > if (rtable_l2(inp->inp_rtableid) != > @@ -596,7 +603,12 @@ udp_input(struct mbuf **mp, int *offp, i > return IPPROTO_DONE; > } > > - KASSERT(sotoinpcb(inp->inp_socket) == inp); > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + udpstat_inc(udps_closing); > + goto bad; > + } > + KASSERT(pcb == inp); > soassertlocked_readonly(inp->inp_socket); > > #ifdef INET6 > Index: netinet/udp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v > diff -u -p -r1.53 udp_var.h > --- netinet/udp_var.h 2 Mar 2025 21:28:32 -0000 1.53 > +++ netinet/udp_var.h 4 Jun 2025 13:31:07 -0000 > @@ -61,6 +61,7 @@ struct udpstat { > u_long udps_badlen; /* data length larger than packet */ > u_long udps_noport; /* no socket on port */ > u_long udps_noportbcast; /* of above, arrived as broadcast */ > + u_long udps_closing; /* inpcb exists, socket is closing */ > u_long udps_nosec; /* dropped for lack of ipsec */ > u_long udps_fullsock; /* not delivered, input socket full */ > u_long udps_pcbhashmiss; /* input packets missing pcb hash */ > @@ -104,6 +105,7 @@ enum udpstat_counters { > udps_badlen, /* data length larger than packet */ > udps_noport, /* no socket on port */ > udps_noportbcast, /* of above, arrived as broadcast */ > + udps_closing, /* inpcb exists, socket is closing */ > udps_nosec, /* dropped for lack of ipsec */ > udps_fullsock, /* not delivered, input socket full */ > udps_pcbhashmiss, /* input packets missing pcb hash */ > Index: netinet6/ip6_divert.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v > diff -u -p -r1.103 ip6_divert.c > --- netinet6/ip6_divert.c 4 Jun 2025 12:37:00 -0000 1.103 > +++ netinet6/ip6_divert.c 4 Jun 2025 13:31:07 -0000 > @@ -199,6 +199,7 @@ void > divert6_packet(struct mbuf *m, int dir, u_int16_t divert_port) > { > struct inpcb *inp = NULL; > + void *pcb; > struct socket *so; > struct sockaddr_in6 sin6; > > @@ -222,6 +223,12 @@ divert6_packet(struct mbuf *m, int dir, > div6stat_inc(divs_noport); > goto bad; > } > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + div6stat_inc(divs_closing); > + goto bad; > + } > + KASSERT(pcb == inp); > > memset(&sin6, 0, sizeof(sin6)); > sin6.sin6_family = AF_INET6; > Index: netinet6/raw_ip6.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v > diff -u -p -r1.192 raw_ip6.c > --- netinet6/raw_ip6.c 27 May 2025 07:52:49 -0000 1.192 > +++ netinet6/raw_ip6.c 4 Jun 2025 13:31:07 -0000 > @@ -138,6 +138,7 @@ rip6_input(struct mbuf **mp, int *offp, > struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); > struct inpcb_iterator iter = { .inp_table = NULL }; > struct inpcb *inp, *last; > + void *pcb; > struct in6_addr *key; > struct sockaddr_in6 rip6src; > uint8_t type; > @@ -184,6 +185,12 @@ rip6_input(struct mbuf **mp, int *offp, > while ((inp = in_pcb_iterator(&rawin6pcbtable, inp, &iter)) != NULL) { > KASSERT(ISSET(inp->inp_flags, INP_IPV6)); > > + pcb = READ_ONCE(inp->inp_socket->so_pcb); > + if (pcb == NULL) { > + rip6stat_inc(rip6s_closing); > + continue; > + } > + KASSERT(pcb == inp); > /* > * Packet must not be inserted after disconnected wakeup > * call. To avoid race, check again when holding receive > Index: netinet6/raw_ip6.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.h,v > diff -u -p -r1.4 raw_ip6.h > --- netinet6/raw_ip6.h 9 Feb 2017 15:23:35 -0000 1.4 > +++ netinet6/raw_ip6.h 4 Jun 2025 13:31:07 -0000 > @@ -42,6 +42,7 @@ struct rip6stat { > u_int64_t rip6s_badsum; /* of above, checksum error */ > u_int64_t rip6s_nosock; /* no matching socket */ > u_int64_t rip6s_nosockmcast; /* of above, arrived as multicast */ > + u_int64_t rip6s_closing; /* inpcb exists, socket is closing */ > u_int64_t rip6s_fullsock; /* not delivered, input socket full */ > > u_int64_t rip6s_opackets; /* total output packets */ > @@ -68,6 +69,7 @@ enum rip6stat_counters { > rip6s_badsum, > rip6s_nosock, > rip6s_nosockmcast, > + rip6s_closing, > rip6s_fullsock, > rip6s_opackets, > rip6s_ncounters, >