From: Alexander Bluhm Subject: Re: netstat counter for async socket close To: tech@openbsd.org Date: Fri, 6 Jun 2025 20:27:17 +0200 On Wed, Jun 04, 2025 at 04:17:39PM +0200, Alexander Bluhm wrote: > Hi, > > Currently soclose() is protected by exclusive netlock. Except for > TCP that means an inpcb reference always contains a socket pointer > in inp_socket that points back to the incpb. > > When we start unlocking soclose(), this is no longer the case as > in_pcbdetach() can set so_pcb to NULL running on another CPU. For > TCP input we take the socket lock for every packet with in_pcbsolock(). > If that returns NULL another CPU has called in_pcbdetach() due to > a reset packet or timeout. Instead of reusing the tcps_noport > counter, I would like to introduce tcps_closing for such packets. > > For datagram and raw sockets I want to avoid a per socket lock as > this would kill the packet input performance. The main problem > here is that KASSERT(sotoinpcb(inp->inp_socket) == inp) could be > false as so_pcb is NULL. Only a weak check with READ_ONCE() is > possible without lock. I think it is still worthy to drop the > packet early and check the KASSERT(). If the socket goes away > later, this is not a problem. Packets will be purged in sorele() > after inpcb has been released. > > This diff adds the closing counter to all protocols. Also the > KASSERT(pcb == inp) is added to all socket lookups. It is aware > that other CPU can set so_pcb to NULL and then returns early. mvs@ noticed that counting closing sockets in the broadcast loops does not make sense. So here is a new diff, that checks so_pcb is valid, but only counts it for unicast udp, tcp and divert sockets. Closing raw IP sockets are just ignored. Maybe we should not add the (pcb == NULL) check and (pcb == inp) asserts in the broadcast and multicast loops. Don't know. ok? bluhm Index: sys/netinet/ip_divert.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v diff -u -p -r1.105 ip_divert.c --- sys/netinet/ip_divert.c 6 Jun 2025 13:13:37 -0000 1.105 +++ sys/netinet/ip_divert.c 6 Jun 2025 17:32:31 -0000 @@ -178,6 +178,7 @@ void divert_packet(struct mbuf *m, int dir, u_int16_t divert_port) { struct inpcb *inp = NULL; + void *pcb; struct socket *so; struct sockaddr_in sin; @@ -201,6 +202,12 @@ divert_packet(struct mbuf *m, int dir, u divstat_inc(divs_noport); goto bad; } + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + divstat_inc(divs_closing); + goto bad; + } + KASSERT(pcb == inp); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; Index: sys/netinet/ip_divert.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v diff -u -p -r1.28 ip_divert.h --- sys/netinet/ip_divert.h 6 Jun 2025 13:13:37 -0000 1.28 +++ sys/netinet/ip_divert.h 6 Jun 2025 17:32:31 -0000 @@ -22,6 +22,7 @@ struct divstat { u_long divs_ipackets; /* total input packets */ u_long divs_noport; /* no socket on port */ + u_long divs_closing; /* inpcb exists, socket is closing */ u_long divs_fullsock; /* not delivered, input socket full */ u_long divs_opackets; /* total output packets */ u_long divs_errors; /* generic errors */ @@ -53,6 +54,7 @@ struct divstat { enum divstat_counters { divs_ipackets, divs_noport, + divs_closing, divs_fullsock, divs_opackets, divs_errors, Index: sys/netinet/raw_ip.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v diff -u -p -r1.166 raw_ip.c --- sys/netinet/raw_ip.c 11 Mar 2025 15:31:03 -0000 1.166 +++ sys/netinet/raw_ip.c 6 Jun 2025 17:32:31 -0000 @@ -135,6 +135,7 @@ rip_input(struct mbuf **mp, int *offp, i struct ip *ip = mtod(m, struct ip *); struct inpcb_iterator iter = { .inp_table = NULL }; struct inpcb *inp, *last; + void *pcb; struct in_addr *key; struct sockaddr_in ripsrc; @@ -169,6 +170,10 @@ rip_input(struct mbuf **mp, int *offp, i while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) { KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) + continue; + KASSERT(pcb == inp); /* * Packet must not be inserted after disconnected wakeup * call. To avoid race, check again when holding receive Index: sys/netinet/tcp_input.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v diff -u -p -r1.450 tcp_input.c --- sys/netinet/tcp_input.c 3 Jun 2025 16:51:26 -0000 1.450 +++ sys/netinet/tcp_input.c 6 Jun 2025 17:32:31 -0000 @@ -661,10 +661,9 @@ findpcb: so = in_pcbsolock(inp); } if (so == NULL) { - tcpstat_inc(tcps_noport); + tcpstat_inc(tcps_closing); goto dropwithreset_ratelim; } - KASSERT(sotoinpcb(inp->inp_socket) == inp); KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp); soassertlocked(inp->inp_socket); Index: sys/netinet/tcp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v diff -u -p -r1.191 tcp_var.h --- sys/netinet/tcp_var.h 7 May 2025 14:10:19 -0000 1.191 +++ sys/netinet/tcp_var.h 6 Jun 2025 17:32:31 -0000 @@ -393,6 +393,7 @@ struct tcpstat { u_int32_t tcps_pcbhashmiss; /* input packets missing pcb hash */ u_int32_t tcps_noport; /* no socket on port */ + u_int32_t tcps_closing; /* inpcb exists, socket is closing */ u_int32_t tcps_badsyn; /* SYN packet with src==dst rcv'ed */ u_int32_t tcps_dropsyn; /* SYN packet dropped */ @@ -583,6 +584,7 @@ enum tcpstat_counters { tcps_preddat, tcps_pcbhashmiss, tcps_noport, + tcps_closing, tcps_badsyn, tcps_dropsyn, tcps_rcvbadsig, Index: sys/netinet/udp_usrreq.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v diff -u -p -r1.341 udp_usrreq.c --- sys/netinet/udp_usrreq.c 3 Jun 2025 16:51:26 -0000 1.341 +++ sys/netinet/udp_usrreq.c 6 Jun 2025 17:32:31 -0000 @@ -198,6 +198,7 @@ udp_input(struct mbuf **mp, int *offp, i struct ip *ip = NULL; struct udphdr *uh; struct inpcb *inp = NULL; + void *pcb; struct ip save_ip; int len; u_int16_t savesum; @@ -419,6 +420,10 @@ udp_input(struct mbuf **mp, int *offp, i else KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) + continue; + KASSERT(pcb == inp); if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE) continue; if (rtable_l2(inp->inp_rtableid) != @@ -596,7 +601,12 @@ udp_input(struct mbuf **mp, int *offp, i return IPPROTO_DONE; } - KASSERT(sotoinpcb(inp->inp_socket) == inp); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + udpstat_inc(udps_closing); + goto bad; + } + KASSERT(pcb == inp); soassertlocked_readonly(inp->inp_socket); #ifdef INET6 Index: sys/netinet/udp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v diff -u -p -r1.53 udp_var.h --- sys/netinet/udp_var.h 2 Mar 2025 21:28:32 -0000 1.53 +++ sys/netinet/udp_var.h 6 Jun 2025 17:32:31 -0000 @@ -61,6 +61,7 @@ struct udpstat { u_long udps_badlen; /* data length larger than packet */ u_long udps_noport; /* no socket on port */ u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_closing; /* inpcb exists, socket is closing */ u_long udps_nosec; /* dropped for lack of ipsec */ u_long udps_fullsock; /* not delivered, input socket full */ u_long udps_pcbhashmiss; /* input packets missing pcb hash */ @@ -104,6 +105,7 @@ enum udpstat_counters { udps_badlen, /* data length larger than packet */ udps_noport, /* no socket on port */ udps_noportbcast, /* of above, arrived as broadcast */ + udps_closing, /* inpcb exists, socket is closing */ udps_nosec, /* dropped for lack of ipsec */ udps_fullsock, /* not delivered, input socket full */ udps_pcbhashmiss, /* input packets missing pcb hash */ Index: sys/netinet6/ip6_divert.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v diff -u -p -r1.104 ip6_divert.c --- sys/netinet6/ip6_divert.c 6 Jun 2025 13:13:37 -0000 1.104 +++ sys/netinet6/ip6_divert.c 6 Jun 2025 17:32:31 -0000 @@ -187,6 +187,7 @@ void divert6_packet(struct mbuf *m, int dir, u_int16_t divert_port) { struct inpcb *inp = NULL; + void *pcb; struct socket *so; struct sockaddr_in6 sin6; @@ -210,6 +211,12 @@ divert6_packet(struct mbuf *m, int dir, div6stat_inc(divs_noport); goto bad; } + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + div6stat_inc(divs_closing); + goto bad; + } + KASSERT(pcb == inp); memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; Index: sys/netinet6/raw_ip6.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v diff -u -p -r1.192 raw_ip6.c --- sys/netinet6/raw_ip6.c 27 May 2025 07:52:49 -0000 1.192 +++ sys/netinet6/raw_ip6.c 6 Jun 2025 17:32:31 -0000 @@ -138,6 +138,7 @@ rip6_input(struct mbuf **mp, int *offp, struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb_iterator iter = { .inp_table = NULL }; struct inpcb *inp, *last; + void *pcb; struct in6_addr *key; struct sockaddr_in6 rip6src; uint8_t type; @@ -184,6 +185,10 @@ rip6_input(struct mbuf **mp, int *offp, while ((inp = in_pcb_iterator(&rawin6pcbtable, inp, &iter)) != NULL) { KASSERT(ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) + continue; + KASSERT(pcb == inp); /* * Packet must not be inserted after disconnected wakeup * call. To avoid race, check again when holding receive Index: usr.bin/netstat/inet.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v diff -u -p -r1.184 inet.c --- usr.bin/netstat/inet.c 17 Apr 2025 17:23:17 -0000 1.184 +++ usr.bin/netstat/inet.c 5 Jun 2025 13:03:15 -0000 @@ -468,6 +468,7 @@ tcp_stats(char *name) p(tcps_preddat, "\t%u correct data packet header prediction%s\n"); pes(tcps_pcbhashmiss, "\t%u PCB cache miss%s\n"); p1(tcps_noport, "\t%u dropped due to no socket\n"); + p1(tcps_closing, "\t%u dropped as socket is closing\n"); p(tcps_ecn_accepts, "\t%u ECN connection%s accepted\n"); p(tcps_ecn_rcvece, "\t\t%u ECE packet%s received\n"); @@ -556,6 +557,7 @@ udp_stats(char *name) p(udps_outswcsum, "\t%lu output packet%s software-checksummed\n"); p1(udps_noport, "\t%lu dropped due to no socket\n"); p(udps_noportbcast, "\t%lu broadcast/multicast datagram%s dropped due to no socket\n"); + p1(udps_closing, "\t%lu dropped as socket is closing\n"); p1(udps_nosec, "\t%lu dropped due to missing IPsec protection\n"); p1(udps_fullsock, "\t%lu dropped due to full socket buffers\n"); delivered = udpstat.udps_ipackets - udpstat.udps_hdrops - @@ -657,6 +659,7 @@ div_stats(char *name) printf(m, divstat.f) p(divs_ipackets, "\t%lu total packet%s received\n"); p1(divs_noport, "\t%lu dropped due to no socket\n"); + p1(divs_closing, "\t%lu dropped as socket is closing\n"); p1(divs_fullsock, "\t%lu dropped due to full socket buffers\n"); p(divs_opackets, "\t%lu packet%s output\n"); p1(divs_errors, "\t%lu errors\n"); Index: usr.bin/netstat/inet6.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet6.c,v diff -u -p -r1.58 inet6.c --- usr.bin/netstat/inet6.c 4 Jun 2025 12:37:00 -0000 1.58 +++ usr.bin/netstat/inet6.c 5 Jun 2025 13:03:23 -0000 @@ -863,6 +863,7 @@ rip6_stats(char *name) printf("\t%llu delivered\n", (unsigned long long)delivered); p(rip6s_opackets, "\t%llu datagram%s output\n"); #undef p +#undef p1 } /* @@ -889,6 +890,7 @@ div6_stats(char *name) printf(m, div6stat.f) p(divs_ipackets, "\t%lu total packet%s received\n"); p1(divs_noport, "\t%lu dropped due to no socket\n"); + p1(divs_closing, "\t%lu dropped as socket is closing\n"); p1(divs_fullsock, "\t%lu dropped due to full socket buffers\n"); p(divs_opackets, "\t%lu packet%s output\n"); p1(divs_errors, "\t%lu errors\n");