From: Alexander Bluhm Subject: Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Thu, 11 Apr 2024 15:07:55 +0200 On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote: > Really, this big diff will be the hellish one to commit and receive > possible fallout due to very huge area. Better small progress than huge commits. > I see raw sockets the best way to start, because they don't need to call Advantage to proceed with raw sockets is that they are simple and have no socket splicing. But the test coverage is low. > The tcp and udp sockets require > to start sosplice() and somove() reworking, so I want to do this the > last. To get rid of MP races we need parallel stress tests. With UDP this is easier than raw IP. I while ago I used the diff below to run UDP input with shared netlock. But it does not work correctly with somove(). To put parallel pressure onto the IP protocol and socket layer we eihter have to make somove() MP safe for UDP, or adapt the diff below for raw IP and write a bunch of tests. With parallel UDP we also get a lot of testing by snapshot users. bluhm Index: net/if_bridge.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_bridge.c,v diff -u -p -r1.369 if_bridge.c --- net/if_bridge.c 13 Feb 2024 12:22:09 -0000 1.369 +++ net/if_bridge.c 11 Apr 2024 12:58:32 -0000 @@ -1600,7 +1600,7 @@ bridge_ipsec(struct ifnet *ifp, struct e off); tdb_unref(tdb); if (prot != IPPROTO_DONE) - ip_deliver(&m, &hlen, prot, af); + ip_deliver(&m, &hlen, prot, af, 0); return (1); } else { tdb_unref(tdb); Index: netinet/in_proto.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v diff -u -p -r1.103 in_proto.c --- netinet/in_proto.c 11 Jan 2024 14:15:12 -0000 1.103 +++ netinet/in_proto.c 11 Apr 2024 12:58:32 -0000 @@ -185,7 +185,7 @@ const struct protosw inetsw[] = { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, - .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE|PR_MPINPUT, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = ip_ctloutput, Index: netinet/ip_input.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v diff -u -p -r1.391 ip_input.c --- netinet/ip_input.c 28 Feb 2024 10:57:20 -0000 1.391 +++ netinet/ip_input.c 11 Apr 2024 12:58:32 -0000 @@ -245,6 +245,30 @@ ip_ours(struct mbuf **mp, int *offp, int if (af != AF_UNSPEC) return nxt; + nxt = ip_deliver(mp, offp, nxt, AF_INET, 1); + if (nxt == IPPROTO_DONE) + return IPPROTO_DONE; + + /* save values for later, use after dequeue */ + if (*offp != sizeof(struct ip)) { + struct m_tag *mtag; + struct ipoffnxt *ion; + + /* mbuf tags are expensive, but only used for header options */ + mtag = m_tag_get(PACKET_TAG_IP_OFFNXT, sizeof(*ion), + M_NOWAIT); + if (mtag == NULL) { + ipstat_inc(ips_idropped); + m_freemp(mp); + return IPPROTO_DONE; + } + ion = (struct ipoffnxt *)(mtag + 1); + ion->ion_off = *offp; + ion->ion_nxt = nxt; + + m_tag_prepend(*mp, mtag); + } + niq_enqueue(&ipintrq, *mp); *mp = NULL; return IPPROTO_DONE; @@ -260,18 +284,31 @@ ipintr(void) struct mbuf *m; while ((m = niq_dequeue(&ipintrq)) != NULL) { - struct ip *ip; + struct m_tag *mtag; int off, nxt; #ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("ipintr no HDR"); #endif - ip = mtod(m, struct ip *); - off = ip->ip_hl << 2; - nxt = ip->ip_p; + mtag = m_tag_find(m, PACKET_TAG_IP_OFFNXT, NULL); + if (mtag != NULL) { + struct ipoffnxt *ion; + + ion = (struct ipoffnxt *)(mtag + 1); + off = ion->ion_off; + nxt = ion->ion_nxt; + + m_tag_delete(m, mtag); + } else { + struct ip *ip; + + ip = mtod(m, struct ip *); + off = ip->ip_hl << 2; + nxt = ip->ip_p; + } - nxt = ip_deliver(&m, &off, nxt, AF_INET); + nxt = ip_deliver(&m, &off, nxt, AF_INET, 0); KASSERT(nxt == IPPROTO_DONE); } } @@ -675,7 +712,7 @@ ip_fragcheck(struct mbuf **mp, int *offp #endif int -ip_deliver(struct mbuf **mp, int *offp, int nxt, int af) +ip_deliver(struct mbuf **mp, int *offp, int nxt, int af, int shared) { const struct protosw *psw; int naf = af; @@ -683,14 +720,24 @@ ip_deliver(struct mbuf **mp, int *offp, int nest = 0; #endif /* INET6 */ - NET_ASSERT_LOCKED_EXCLUSIVE(); - /* * Tell launch routine the next header */ IPSTAT_INC(delivered); while (nxt != IPPROTO_DONE) { + switch (af) { + case AF_INET: + psw = &inetsw[ip_protox[nxt]]; + break; +#ifdef INET6 + case AF_INET6: + psw = &inet6sw[ip6_protox[nxt]]; + break; +#endif /* INET6 */ + } + if (shared && !ISSET(psw->pr_flags, PR_MPINPUT)) + break; #ifdef INET6 if (af == AF_INET6 && ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { @@ -727,16 +774,6 @@ ip_deliver(struct mbuf **mp, int *offp, case IPPROTO_IPV6: naf = AF_INET6; ip6stat_inc(ip6s_delivered); - break; -#endif /* INET6 */ - } - switch (af) { - case AF_INET: - psw = &inetsw[ip_protox[nxt]]; - break; -#ifdef INET6 - case AF_INET6: - psw = &inet6sw[ip6_protox[nxt]]; break; #endif /* INET6 */ } Index: netinet/ip_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v diff -u -p -r1.114 ip_var.h --- netinet/ip_var.h 5 Mar 2024 09:45:13 -0000 1.114 +++ netinet/ip_var.h 11 Apr 2024 12:58:32 -0000 @@ -198,6 +198,11 @@ struct ipq { struct in_addr ipq_src, ipq_dst; }; +struct ipoffnxt { + int ion_off; + int ion_nxt; +}; + /* flags passed to ip_output */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ @@ -254,7 +259,7 @@ int ip_sysctl(int *, u_int, void *, siz void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); int ip_input_if(struct mbuf **, int *, int, int, struct ifnet *); -int ip_deliver(struct mbuf **, int *, int, int); +int ip_deliver(struct mbuf **, int *, int, int, int); void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int); int rip_ctloutput(int, struct socket *, int, int, struct mbuf *); void rip_init(void); Index: netinet6/in6_proto.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v diff -u -p -r1.113 in6_proto.c --- netinet6/in6_proto.c 11 Jan 2024 14:15:12 -0000 1.113 +++ netinet6/in6_proto.c 11 Apr 2024 12:58:32 -0000 @@ -136,7 +136,7 @@ const struct protosw inet6sw[] = { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, - .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE|PR_MPINPUT, .pr_input = udp_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, Index: netinet6/ip6_input.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v diff -u -p -r1.259 ip6_input.c --- netinet6/ip6_input.c 28 Feb 2024 10:57:20 -0000 1.259 +++ netinet6/ip6_input.c 11 Apr 2024 12:58:32 -0000 @@ -166,11 +166,6 @@ ip6_init(void) #endif } -struct ip6_offnxt { - int ion_off; - int ion_nxt; -}; - /* * Enqueue packet for local delivery. Queuing is used as a boundary * between the network layer (input/forward path) running with @@ -190,10 +185,14 @@ ip6_ours(struct mbuf **mp, int *offp, in if (af != AF_UNSPEC) return nxt; + nxt = ip_deliver(mp, offp, nxt, AF_INET6, 1); + if (nxt == IPPROTO_DONE) + return IPPROTO_DONE; + /* save values for later, use after dequeue */ if (*offp != sizeof(struct ip6_hdr)) { struct m_tag *mtag; - struct ip6_offnxt *ion; + struct ipoffnxt *ion; /* mbuf tags are expensive, but only used for header options */ mtag = m_tag_get(PACKET_TAG_IP6_OFFNXT, sizeof(*ion), @@ -203,7 +202,7 @@ ip6_ours(struct mbuf **mp, int *offp, in m_freemp(mp); return IPPROTO_DONE; } - ion = (struct ip6_offnxt *)(mtag + 1); + ion = (struct ipoffnxt *)(mtag + 1); ion->ion_off = *offp; ion->ion_nxt = nxt; @@ -234,9 +233,9 @@ ip6intr(void) #endif mtag = m_tag_find(m, PACKET_TAG_IP6_OFFNXT, NULL); if (mtag != NULL) { - struct ip6_offnxt *ion; + struct ipoffnxt *ion; - ion = (struct ip6_offnxt *)(mtag + 1); + ion = (struct ipoffnxt *)(mtag + 1); off = ion->ion_off; nxt = ion->ion_nxt; @@ -248,7 +247,7 @@ ip6intr(void) off = sizeof(struct ip6_hdr); nxt = ip6->ip6_nxt; } - nxt = ip_deliver(&m, &off, nxt, AF_INET6); + nxt = ip_deliver(&m, &off, nxt, AF_INET6, 0); KASSERT(nxt == IPPROTO_DONE); } } Index: sys/mbuf.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v diff -u -p -r1.262 mbuf.h --- sys/mbuf.h 21 Feb 2024 13:42:06 -0000 1.262 +++ sys/mbuf.h 11 Apr 2024 12:58:32 -0000 @@ -471,6 +471,8 @@ struct m_tag *m_tag_next(struct mbuf *, #define PACKET_TAG_IPSEC_IN_DONE 0x0001 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 0x0002 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_FLOWINFO 0x0004 /* IPsec flowinfo */ +#define PACKET_TAG_IP_OFFNXT 0x0010 /* IPv4 offset and next proto */ +#define PACKET_TAG_IP6_OFFNXT 0x0020 /* IPv6 offset and next proto */ #define PACKET_TAG_WIREGUARD 0x0040 /* WireGuard data */ #define PACKET_TAG_GRE 0x0080 /* GRE processing done */ #define PACKET_TAG_DLT 0x0100 /* data link layer type */ @@ -479,7 +481,6 @@ struct m_tag *m_tag_next(struct mbuf *, #define PACKET_TAG_SRCROUTE 0x1000 /* IPv4 source routing options */ #define PACKET_TAG_TUNNEL 0x2000 /* Tunnel endpoint address */ #define PACKET_TAG_CARP_BAL_IP 0x4000 /* carp(4) ip balanced marker */ -#define PACKET_TAG_IP6_OFFNXT 0x8000 /* IPv6 offset and next proto */ #define MTAG_BITS \ ("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_FLOWINFO" \ Index: sys/protosw.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v diff -u -p -r1.65 protosw.h --- sys/protosw.h 3 Feb 2024 22:50:09 -0000 1.65 +++ sys/protosw.h 11 Apr 2024 12:58:32 -0000 @@ -132,6 +132,7 @@ struct protosw { #define PR_ABRTACPTDIS 0x0020 /* abort on accept(2) to disconnected socket */ #define PR_SPLICE 0x0040 /* socket splicing is possible */ +#define PR_MPINPUT 0x0080 /* input runs with shared netlock */ /* * The arguments to usrreq are: