From: Vitaliy Makkoveev Subject: Re: TCP input socket lock caching To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 7 May 2025 11:28:56 +0300 On Tue, May 06, 2025 at 10:00:30PM +0200, Alexander Bluhm wrote: > Hi, > > Parallel TCP input is running for a few days now and looks quite > stable. > > Final step is to implement caching of the socket lock. Without > large receive offloading (LRO) in the driver layer, it is very > likely that consecutive TCP segments are in the input queue. This > leads to contention of the socket lock between TCP input and socket > receive syscall from userland. > > http://bluhm.genua.de/perform/results/2025-05-05T00%3A52%3A28Z/2025-05-05T00%3A00%3A00Z/btrace/iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0345%3A%3A35_-w1m_-t10_-R-btrace-kstack.0.svg?s=rw_do_enter > > With the diff below, ip_deliver() moves all TCP packets that are > in the softnet queue temporarily to a TCP queue. This queue is per > softnet thread so no locking is needed. Finally in the same shared > netlock context, tcp_input_mlist() processes all TCP packets. It > keeps a pointer to the socket lock. tcp_input_solocked() switches > the lock only when the TCP stream changes. A bunch of packets are > processed and placed into the socket receive buffer under the same > lock. Then soreceive() can copy huge chunks to userland. The > contention of the socket lock is gone. > > http://bluhm.genua.de/perform/results/2025-05-05T00%3A52%3A28Z/patch-sys-tcp-input-parallel.0/btrace/iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0345%3A%3A35_-w1m_-t10_-R-btrace-kstack.0.svg?s=rw_do_enter > > On a 4 core machine I see between 12% to 22% improvement with 10 > parallel TCP streams. When testing only with a single TCP stream, > throughput increases between 38% to 100%. > > I have sent out this diff before as part of larger unlocking. > > ok? > ok mvs > bluhm > > Index: net/if.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v > diff -u -p -r1.731 if.c > --- net/if.c 1 May 2025 11:19:46 -0000 1.731 > +++ net/if.c 6 May 2025 19:21:02 -0000 > @@ -999,10 +999,21 @@ if_input_process(struct ifnet *ifp, stru > */ > > sn = net_sn(idx); > + ml_init(&sn->sn_netstack.ns_tcp_ml); > +#ifdef INET6 > + ml_init(&sn->sn_netstack.ns_tcp6_ml); > +#endif > > NET_LOCK_SHARED(); > + > while ((m = ml_dequeue(ml)) != NULL) > (*ifp->if_input)(ifp, m, &sn->sn_netstack); > + > + tcp_input_mlist(&sn->sn_netstack.ns_tcp_ml, AF_INET); > +#ifdef INET6 > + tcp_input_mlist(&sn->sn_netstack.ns_tcp6_ml, AF_INET6); > +#endif > + > NET_UNLOCK_SHARED(); > } > > Index: net/if_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v > diff -u -p -r1.136 if_var.h > --- net/if_var.h 2 Mar 2025 21:28:32 -0000 1.136 > +++ net/if_var.h 6 May 2025 19:21:02 -0000 > @@ -92,7 +92,9 @@ struct task; > struct cpumem; > > struct netstack { > - struct route ns_route; > + struct route ns_route; > + struct mbuf_list ns_tcp_ml; > + struct mbuf_list ns_tcp6_ml; > }; > > /* > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.443 tcp_input.c > --- netinet/tcp_input.c 4 May 2025 23:05:17 -0000 1.443 > +++ netinet/tcp_input.c 6 May 2025 19:21:02 -0000 > @@ -101,9 +101,6 @@ > #include > #endif > > -int tcp_mss_adv(struct rtentry *, int); > -int tcp_flush_queue(struct tcpcb *); > - > #ifdef INET6 > #include > #include > @@ -178,6 +175,9 @@ do { \ > if_put(ifp); \ > } while (0) > > +int tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **); > +int tcp_mss_adv(struct rtentry *, int); > +int tcp_flush_queue(struct tcpcb *); > void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); > void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *); > > @@ -348,12 +348,53 @@ tcp_flush_queue(struct tcpcb *tp) > return (flags); > } > > +int > +tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns) > +{ > + if (ns == NULL) > + return tcp_input_solocked(mp, offp, proto, af, NULL); > + (*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp); > + switch (af) { > + case AF_INET: > + ml_enqueue(&ns->ns_tcp_ml, *mp); > + break; > +#ifdef INET6 > + case AF_INET6: > + ml_enqueue(&ns->ns_tcp6_ml, *mp); > + break; > +#endif > + default: > + m_freemp(mp); > + } > + *mp = NULL; > + return IPPROTO_DONE; > +} > + > +void > +tcp_input_mlist(struct mbuf_list *ml, int af) > +{ > + struct socket *so = NULL; > + struct mbuf *m; > + > + while ((m = ml_dequeue(ml)) != NULL) { > + int off, nxt; > + > + off = (long)m->m_pkthdr.ph_cookie; > + m->m_pkthdr.ph_cookie = NULL; > + nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so); > + KASSERT(nxt == IPPROTO_DONE); > + } > + > + in_pcbsounlock_rele(NULL, so); > +} > + > /* > * TCP input routine, follows pages 65-76 of the > * protocol specification dated September, 1981 very closely. > */ > int > -tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns) > +tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af, > + struct socket **solocked) > { > struct mbuf *m = *mp; > int iphlen = *offp; > @@ -604,7 +645,21 @@ findpcb: > tcpstat_inc(tcps_noport); > goto dropwithreset_ratelim; > } > - so = in_pcbsolock_ref(inp); > + /* > + * Avoid needless lock and unlock operation when handling multiple > + * TCP packets from the same stream consecutively. > + */ > + if (solocked != NULL && *solocked != NULL && > + sotoinpcb(*solocked) == inp) { > + so = *solocked; > + *solocked = NULL; > + } else { > + if (solocked != NULL && *solocked != NULL) { > + in_pcbsounlock_rele(NULL, *solocked); > + *solocked = NULL; > + } > + so = in_pcbsolock_ref(inp); > + } > if (so == NULL) { > tcpstat_inc(tcps_noport); > goto dropwithreset_ratelim; > @@ -847,7 +902,10 @@ findpcb: > tcpstat_inc(tcps_dropsyn); > goto drop; > } > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1023,7 +1081,10 @@ findpcb: > if (so->so_snd.sb_cc || > tp->t_flags & TF_NEEDOUTPUT) > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1074,7 +1135,10 @@ findpcb: > tp->t_flags &= ~TF_BLOCKOUTPUT; > if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -2074,7 +2138,10 @@ dodata: /* XXX */ > */ > if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > @@ -2104,7 +2171,10 @@ dropafterack: > m_freem(m); > tp->t_flags |= TF_ACKNOW; > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.190 tcp_var.h > --- netinet/tcp_var.h 29 Apr 2025 20:31:42 -0000 1.190 > +++ netinet/tcp_var.h 6 May 2025 19:21:02 -0000 > @@ -718,6 +718,7 @@ int tcp_dooptions(struct tcpcb *, u_cha > struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t); > void tcp_init(void); > int tcp_input(struct mbuf **, int *, int, int, struct netstack *); > +void tcp_input_mlist(struct mbuf_list *, int); > int tcp_mss(struct tcpcb *, int); > void tcp_mss_update(struct tcpcb *); > void tcp_softlro_glue(struct mbuf_list *, struct mbuf *, struct ifnet *); >