From: Alexander Bluhm Subject: netstat counter for async socket close To: tech@openbsd.org Date: Wed, 4 Jun 2025 16:17:39 +0200 Hi, Currently soclose() is protected by exclusive netlock. Except for TCP that means an inpcb reference always contains a socket pointer in inp_socket that points back to the inpcb. When we start unlocking soclose(), this is no longer the case as in_pcbdetach() running on another CPU can set so_pcb to NULL. For TCP input we take the socket lock for every packet with in_pcbsolock(). If that returns NULL another CPU has called in_pcbdetach() due to a reset packet or timeout. Instead of reusing the tcps_noport counter, I would like to introduce tcps_closing for such packets. For datagram and raw sockets I want to avoid a per-socket lock as this would kill the packet input performance. The main problem here is that KASSERT(sotoinpcb(inp->inp_socket) == inp) could be false as so_pcb is NULL. Only a weak check with READ_ONCE() is possible without lock. I think it is still worthwhile to drop the packet early and check the KASSERT(). If the socket goes away later, this is not a problem. Packets will be purged in sorele() after inpcb has been released. This diff adds the closing counter to all protocols. Also the KASSERT(pcb == inp) is added to all socket lookups. It is aware that another CPU can set so_pcb to NULL and then returns early. ok? 
bluhm Index: sys/netinet/ip_divert.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v diff -u -p -r1.104 ip_divert.c --- sys/netinet/ip_divert.c 4 Jun 2025 12:37:00 -0000 1.104 +++ sys/netinet/ip_divert.c 4 Jun 2025 13:12:49 -0000 @@ -190,6 +190,7 @@ void divert_packet(struct mbuf *m, int dir, u_int16_t divert_port) { struct inpcb *inp = NULL; + void *pcb; struct socket *so; struct sockaddr_in sin; @@ -213,6 +214,12 @@ divert_packet(struct mbuf *m, int dir, u divstat_inc(divs_noport); goto bad; } + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + divstat_inc(divs_closing); + goto bad; + } + KASSERT(pcb == inp); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; Index: sys/netinet/ip_divert.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v diff -u -p -r1.27 ip_divert.h --- sys/netinet/ip_divert.h 4 Jun 2025 12:37:00 -0000 1.27 +++ sys/netinet/ip_divert.h 4 Jun 2025 13:12:49 -0000 @@ -22,6 +22,7 @@ struct divstat { u_long divs_ipackets; /* total input packets */ u_long divs_noport; /* no socket on port */ + u_long divs_closing; /* inpcb exists, socket is closing */ u_long divs_fullsock; /* not delivered, input socket full */ u_long divs_opackets; /* total output packets */ u_long divs_errors; /* generic errors */ @@ -49,6 +50,7 @@ struct divstat { enum divstat_counters { divs_ipackets, divs_noport, + divs_closing, divs_fullsock, divs_opackets, divs_errors, Index: sys/netinet/ip_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v diff -u -p -r1.121 ip_var.h --- sys/netinet/ip_var.h 2 Mar 2025 21:28:32 -0000 1.121 +++ sys/netinet/ip_var.h 4 Jun 2025 13:12:49 -0000 @@ -68,6 +68,7 @@ struct ipstat { u_long ips_cantforward; /* packets rcvd for unreachable dest */ u_long ips_redirectsent; /* packets 
forwarded on same net */ u_long ips_noproto; /* unknown or unsupported protocol */ + u_long ips_closing; /* inpcb exists, socket is closing */ u_long ips_delivered; /* datagrams delivered to upper level*/ u_long ips_localout; /* total ip packets generated here */ u_long ips_odropped; /* lost output due to nobufs, etc. */ @@ -116,6 +117,7 @@ enum ipstat_counters { ips_cantforward, /* packets rcvd for unreachable dest */ ips_redirectsent, /* packets forwarded on same net */ ips_noproto, /* unknown or unsupported protocol */ + ips_closing, /* inpcb exists, socket is closing */ ips_delivered, /* datagrams delivered to upper level*/ ips_localout, /* total ip packets generated here */ ips_odropped, /* lost output packets due to nobufs, etc. */ Index: sys/netinet/raw_ip.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v diff -u -p -r1.166 raw_ip.c --- sys/netinet/raw_ip.c 11 Mar 2025 15:31:03 -0000 1.166 +++ sys/netinet/raw_ip.c 4 Jun 2025 13:12:49 -0000 @@ -135,6 +135,7 @@ rip_input(struct mbuf **mp, int *offp, i struct ip *ip = mtod(m, struct ip *); struct inpcb_iterator iter = { .inp_table = NULL }; struct inpcb *inp, *last; + void *pcb; struct in_addr *key; struct sockaddr_in ripsrc; @@ -169,6 +170,12 @@ rip_input(struct mbuf **mp, int *offp, i while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) { KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + ipstat_inc(ips_closing); + continue; + } + KASSERT(pcb == inp); /* * Packet must not be inserted after disconnected wakeup * call. 
To avoid race, check again when holding receive Index: sys/netinet/tcp_input.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v diff -u -p -r1.450 tcp_input.c --- sys/netinet/tcp_input.c 3 Jun 2025 16:51:26 -0000 1.450 +++ sys/netinet/tcp_input.c 4 Jun 2025 13:12:49 -0000 @@ -661,10 +661,9 @@ findpcb: so = in_pcbsolock(inp); } if (so == NULL) { - tcpstat_inc(tcps_noport); + tcpstat_inc(tcps_closing); goto dropwithreset_ratelim; } - KASSERT(sotoinpcb(inp->inp_socket) == inp); KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp); soassertlocked(inp->inp_socket); Index: sys/netinet/tcp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v diff -u -p -r1.191 tcp_var.h --- sys/netinet/tcp_var.h 7 May 2025 14:10:19 -0000 1.191 +++ sys/netinet/tcp_var.h 4 Jun 2025 13:22:54 -0000 @@ -393,6 +393,7 @@ struct tcpstat { u_int32_t tcps_pcbhashmiss; /* input packets missing pcb hash */ u_int32_t tcps_noport; /* no socket on port */ + u_int32_t tcps_closing; /* inpcb exists, socket is closing */ u_int32_t tcps_badsyn; /* SYN packet with src==dst rcv'ed */ u_int32_t tcps_dropsyn; /* SYN packet dropped */ @@ -583,6 +584,7 @@ enum tcpstat_counters { tcps_preddat, tcps_pcbhashmiss, tcps_noport, + tcps_closing, tcps_badsyn, tcps_dropsyn, tcps_rcvbadsig, Index: sys/netinet/udp_usrreq.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v diff -u -p -r1.341 udp_usrreq.c --- sys/netinet/udp_usrreq.c 3 Jun 2025 16:51:26 -0000 1.341 +++ sys/netinet/udp_usrreq.c 4 Jun 2025 13:12:49 -0000 @@ -198,6 +198,7 @@ udp_input(struct mbuf **mp, int *offp, i struct ip *ip = NULL; struct udphdr *uh; struct inpcb *inp = NULL; + void *pcb; struct ip save_ip; int len; u_int16_t savesum; @@ -419,6 +420,12 @@ udp_input(struct mbuf **mp, int *offp, i 
else KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + udpstat_inc(udps_closing); + continue; + } + KASSERT(pcb == inp); if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE) continue; if (rtable_l2(inp->inp_rtableid) != @@ -596,7 +603,12 @@ udp_input(struct mbuf **mp, int *offp, i return IPPROTO_DONE; } - KASSERT(sotoinpcb(inp->inp_socket) == inp); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + udpstat_inc(udps_closing); + goto bad; + } + KASSERT(pcb == inp); soassertlocked_readonly(inp->inp_socket); #ifdef INET6 Index: sys/netinet/udp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v diff -u -p -r1.53 udp_var.h --- sys/netinet/udp_var.h 2 Mar 2025 21:28:32 -0000 1.53 +++ sys/netinet/udp_var.h 4 Jun 2025 13:12:49 -0000 @@ -61,6 +61,7 @@ struct udpstat { u_long udps_badlen; /* data length larger than packet */ u_long udps_noport; /* no socket on port */ u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_closing; /* inpcb exists, socket is closing */ u_long udps_nosec; /* dropped for lack of ipsec */ u_long udps_fullsock; /* not delivered, input socket full */ u_long udps_pcbhashmiss; /* input packets missing pcb hash */ @@ -104,6 +105,7 @@ enum udpstat_counters { udps_badlen, /* data length larger than packet */ udps_noport, /* no socket on port */ udps_noportbcast, /* of above, arrived as broadcast */ + udps_closing, /* inpcb exists, socket is closing */ udps_nosec, /* dropped for lack of ipsec */ udps_fullsock, /* not delivered, input socket full */ udps_pcbhashmiss, /* input packets missing pcb hash */ Index: sys/netinet6/ip6_divert.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v diff -u -p -r1.103 ip6_divert.c --- sys/netinet6/ip6_divert.c 4 Jun 2025 12:37:00 -0000 1.103 +++ 
sys/netinet6/ip6_divert.c 4 Jun 2025 13:21:35 -0000 @@ -199,6 +199,7 @@ void divert6_packet(struct mbuf *m, int dir, u_int16_t divert_port) { struct inpcb *inp = NULL; + void *pcb; struct socket *so; struct sockaddr_in6 sin6; @@ -222,6 +223,12 @@ divert6_packet(struct mbuf *m, int dir, div6stat_inc(divs_noport); goto bad; } + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + div6stat_inc(divs_closing); + goto bad; + } + KASSERT(pcb == inp); memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; Index: sys/netinet6/raw_ip6.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v diff -u -p -r1.192 raw_ip6.c --- sys/netinet6/raw_ip6.c 27 May 2025 07:52:49 -0000 1.192 +++ sys/netinet6/raw_ip6.c 4 Jun 2025 13:12:49 -0000 @@ -138,6 +138,7 @@ rip6_input(struct mbuf **mp, int *offp, struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb_iterator iter = { .inp_table = NULL }; struct inpcb *inp, *last; + void *pcb; struct in6_addr *key; struct sockaddr_in6 rip6src; uint8_t type; @@ -184,6 +185,12 @@ rip6_input(struct mbuf **mp, int *offp, while ((inp = in_pcb_iterator(&rawin6pcbtable, inp, &iter)) != NULL) { KASSERT(ISSET(inp->inp_flags, INP_IPV6)); + pcb = READ_ONCE(inp->inp_socket->so_pcb); + if (pcb == NULL) { + rip6stat_inc(rip6s_closing); + continue; + } + KASSERT(pcb == inp); /* * Packet must not be inserted after disconnected wakeup * call. 
To avoid race, check again when holding receive Index: sys/netinet6/raw_ip6.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.h,v diff -u -p -r1.4 raw_ip6.h --- sys/netinet6/raw_ip6.h 9 Feb 2017 15:23:35 -0000 1.4 +++ sys/netinet6/raw_ip6.h 4 Jun 2025 13:12:49 -0000 @@ -42,6 +42,7 @@ struct rip6stat { u_int64_t rip6s_badsum; /* of above, checksum error */ u_int64_t rip6s_nosock; /* no matching socket */ u_int64_t rip6s_nosockmcast; /* of above, arrived as multicast */ + u_int64_t rip6s_closing; /* inpcb exists, socket is closing */ u_int64_t rip6s_fullsock; /* not delivered, input socket full */ u_int64_t rip6s_opackets; /* total output packets */ @@ -68,6 +69,7 @@ enum rip6stat_counters { rip6s_badsum, rip6s_nosock, rip6s_nosockmcast, + rip6s_closing, rip6s_fullsock, rip6s_opackets, rip6s_ncounters, Index: usr.bin/netstat/inet.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v diff -u -p -r1.184 inet.c --- usr.bin/netstat/inet.c 17 Apr 2025 17:23:17 -0000 1.184 +++ usr.bin/netstat/inet.c 4 Jun 2025 12:41:21 -0000 @@ -468,6 +468,7 @@ tcp_stats(char *name) p(tcps_preddat, "\t%u correct data packet header prediction%s\n"); pes(tcps_pcbhashmiss, "\t%u PCB cache miss%s\n"); p1(tcps_noport, "\t%u dropped due to no socket\n"); + p1(tcps_closing, "\t%u dropped as socket is closing\n"); p(tcps_ecn_accepts, "\t%u ECN connection%s accepted\n"); p(tcps_ecn_rcvece, "\t\t%u ECE packet%s received\n"); @@ -556,6 +557,7 @@ udp_stats(char *name) p(udps_outswcsum, "\t%lu output packet%s software-checksummed\n"); p1(udps_noport, "\t%lu dropped due to no socket\n"); p(udps_noportbcast, "\t%lu broadcast/multicast datagram%s dropped due to no socket\n"); + p1(udps_closing, "\t%lu dropped as socket is closing\n"); p1(udps_nosec, "\t%lu dropped due to missing IPsec protection\n"); p1(udps_fullsock, "\t%lu dropped due to full 
socket buffers\n"); delivered = udpstat.udps_ipackets - udpstat.udps_hdrops - @@ -608,6 +610,7 @@ ip_stats(char *name) p(ips_reassembled, "\t%lu packet%s reassembled ok\n"); p(ips_delivered, "\t%lu packet%s for this host\n"); p(ips_noproto, "\t%lu packet%s for unknown/unsupported protocol\n"); + p1(ips_closing, "\t%lu dropped as socket is closing\n"); p(ips_forward, "\t%lu packet%s forwarded\n"); p(ips_cantforward, "\t%lu packet%s not forwardable\n"); p(ips_redirectsent, "\t%lu redirect%s sent\n"); @@ -657,6 +660,7 @@ div_stats(char *name) printf(m, divstat.f) p(divs_ipackets, "\t%lu total packet%s received\n"); p1(divs_noport, "\t%lu dropped due to no socket\n"); + p1(divs_closing, "\t%lu dropped as socket is closing\n"); p1(divs_fullsock, "\t%lu dropped due to full socket buffers\n"); p(divs_opackets, "\t%lu packet%s output\n"); p1(divs_errors, "\t%lu errors\n"); Index: usr.bin/netstat/inet6.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet6.c,v diff -u -p -r1.58 inet6.c --- usr.bin/netstat/inet6.c 4 Jun 2025 12:37:00 -0000 1.58 +++ usr.bin/netstat/inet6.c 4 Jun 2025 12:41:21 -0000 @@ -847,10 +847,13 @@ rip6_stats(char *name) #define p(f, m) if (rip6stat.f || sflag <= 1) \ printf(m, (unsigned long long)rip6stat.f, plural(rip6stat.f)) +#define p1(f, m) if (rip6stat.f || sflag <= 1) \ + printf(m, (unsigned long long)rip6stat.f) p(rip6s_ipackets, "\t%llu message%s received\n"); p(rip6s_isum, "\t%llu checksum calculation%s on inbound\n"); p(rip6s_badsum, "\t%llu message%s with bad checksum\n"); p(rip6s_nosock, "\t%llu message%s dropped due to no socket\n"); + p1(rip6s_closing, "\t%llu dropped as socket is closing\n"); p(rip6s_nosockmcast, "\t%llu multicast message%s dropped due to no socket\n"); p(rip6s_fullsock, @@ -863,6 +866,7 @@ rip6_stats(char *name) printf("\t%llu delivered\n", (unsigned long long)delivered); p(rip6s_opackets, "\t%llu datagram%s output\n"); #undef p +#undef p1 } /* 
@@ -889,6 +893,7 @@ div6_stats(char *name) printf(m, div6stat.f) p(divs_ipackets, "\t%lu total packet%s received\n"); p1(divs_noport, "\t%lu dropped due to no socket\n"); + p1(divs_closing, "\t%lu dropped as socket is closing\n"); p1(divs_fullsock, "\t%lu dropped due to full socket buffers\n"); p(divs_opackets, "\t%lu packet%s output\n"); p1(divs_errors, "\t%lu errors\n");