From: Vitaliy Makkoveev Subject: Re: running TCP input in parallel To: Alexander Bluhm Cc: tech@openbsd.org Date: Fri, 18 Apr 2025 16:49:17 +0300 On Thu, Apr 17, 2025 at 06:53:19PM +0200, Alexander Bluhm wrote: > Hi, > > To run tcp_input() in parallel efficently, we have to lock the > socket in a smart way. I have measured multiple variants. > > http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/perform.html > > The relevant TCP graph is here. > > http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/gnuplot/tcp.html > http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/gnuplot/tcp6.html > > First column (left) is no locking at all, just exclusive net lock. > > Third column is socket lock in addition to exclusive net lock. You > see 6% degradation due to locking overhead. This has already been > commited. > > Fourth column (right) is simple switch from exclusive net lock to > shared net lock and relying on socket lock. Grabbing the socket > lock for each packet is expensive. Especially tcp_input() and > soreceive() are fighting for the lock. Single stream performance > goes down by 25%, but multi stream goes up by 140%. > > The second column contains the diff below. The idea is that > tcp_input() moves all TCP packets from softnet input queue into TCP > input queue. This queue has storage per softnet thread and can be > accessed without lock. After running all protocol input functions, > but in the same shared netlock context, process the TCP input queue. > tcp_input_mlist() keeps a pointer to the current socket. > tcp_input_solocked() tries to keep the lock on the socket. If > consecutive TCP packets belong to the same socket, the lock is not > released. Only when the TCP stream changes, we unlock the old and > lock a new socket. This batch processing of locked sockets gives > 5% increase in single stream and 160% for multi stream throughput. > > I have several positive test reports. > > ok? > Yes please. > bluhm > > Index: net/if.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v > diff -u -p -r1.729 if.c > --- net/if.c 19 Mar 2025 23:29:49 -0000 1.729 > +++ net/if.c 16 Apr 2025 14:49:20 -0000 > @@ -1001,10 +1001,21 @@ if_input_process(struct ifnet *ifp, stru > */ > > sn = net_sn(idx); > + ml_init(&sn->sn_netstack.ns_tcp_ml); > +#ifdef INET6 > + ml_init(&sn->sn_netstack.ns_tcp6_ml); > +#endif > > NET_LOCK_SHARED(); > + > while ((m = ml_dequeue(ml)) != NULL) > (*ifp->if_input)(ifp, m, &sn->sn_netstack); > + > + tcp_input_mlist(&sn->sn_netstack.ns_tcp_ml, AF_INET); > +#ifdef INET6 > + tcp_input_mlist(&sn->sn_netstack.ns_tcp6_ml, AF_INET6); > +#endif > + > NET_UNLOCK_SHARED(); > } > > Index: net/if_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v > diff -u -p -r1.136 if_var.h > --- net/if_var.h 2 Mar 2025 21:28:32 -0000 1.136 > +++ net/if_var.h 16 Apr 2025 14:49:20 -0000 > @@ -92,7 +92,9 @@ struct task; > struct cpumem; > > struct netstack { > - struct route ns_route; > + struct route ns_route; > + struct mbuf_list ns_tcp_ml; > + struct mbuf_list ns_tcp6_ml; > }; > > /* > Index: netinet/in_proto.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v > diff -u -p -r1.121 in_proto.c > --- netinet/in_proto.c 5 Jan 2025 12:36:48 -0000 1.121 > +++ netinet/in_proto.c 16 Apr 2025 14:49:20 -0000 > @@ -197,7 +197,8 @@ const struct protosw inetsw[] = { > .pr_type = SOCK_STREAM, > .pr_domain = &inetdomain, > .pr_protocol = IPPROTO_TCP, > - .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE, > + .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE| > + PR_MPINPUT, > .pr_input = tcp_input, > .pr_ctlinput = tcp_ctlinput, > .pr_ctloutput = tcp_ctloutput, > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.435 tcp_input.c > --- netinet/tcp_input.c 16 Apr 2025 12:51:11 -0000 1.435 > +++ netinet/tcp_input.c 16 Apr 2025 14:49:20 -0000 > @@ -100,9 +100,6 @@ > #include > #endif > > -int tcp_mss_adv(struct rtentry *, int); > -int tcp_flush_queue(struct tcpcb *); > - > #ifdef INET6 > #include > #include > @@ -177,6 +174,9 @@ do { \ > if_put(ifp); \ > } while (0) > > +int tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **); > +int tcp_mss_adv(struct rtentry *, int); > +int tcp_flush_queue(struct tcpcb *); > void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); > void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *); > > @@ -347,12 +347,53 @@ tcp_flush_queue(struct tcpcb *tp) > return (flags); > } > > +int > +tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns) > +{ > + if (ns == NULL) > + return tcp_input_solocked(mp, offp, proto, af, NULL); > + (*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp); > + switch (af) { > + case AF_INET: > + ml_enqueue(&ns->ns_tcp_ml, *mp); > + break; > +#ifdef INET6 > + case AF_INET6: > + ml_enqueue(&ns->ns_tcp6_ml, *mp); > + break; > +#endif > + default: > + m_freemp(mp); > + } > + *mp = NULL; > + return IPPROTO_DONE; > +} > + > +void > +tcp_input_mlist(struct mbuf_list *ml, int af) > +{ > + struct socket *so = NULL; > + struct mbuf *m; > + > + while ((m = ml_dequeue(ml)) != NULL) { > + int off, nxt; > + > + off = (long)m->m_pkthdr.ph_cookie; > + m->m_pkthdr.ph_cookie = NULL; > + nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so); > + KASSERT(nxt == IPPROTO_DONE); > + } > + > + in_pcbsounlock_rele(NULL, so); > +} > + > /* > * TCP input routine, follows pages 65-76 of the > * protocol specification dated September, 1981 very closely. > */ > int > -tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns) > +tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af, > + struct socket ** solocked) > { > struct mbuf *m = *mp; > int iphlen = *offp; > @@ -603,7 +644,21 @@ findpcb: > tcpstat_inc(tcps_noport); > goto dropwithreset_ratelim; > } > - so = in_pcbsolock_ref(inp); > + /* > + * Avoid needless lock and unlock operation when handling multiple > + * TCP packets from the same stream consecutively. > + */ > + if (solocked != NULL && *solocked != NULL && > + sotoinpcb(*solocked) == inp) { > + so = *solocked; > + *solocked = NULL; > + } else { > + if (solocked != NULL && *solocked != NULL) { > + in_pcbsounlock_rele(NULL, *solocked); > + *solocked = NULL; > + } > + so = in_pcbsolock_ref(inp); > + } > if (so == NULL) { > tcpstat_inc(tcps_noport); > goto dropwithreset_ratelim; > @@ -846,7 +901,10 @@ findpcb: > tcpstat_inc(tcps_dropsyn); > goto drop; > } > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1022,7 +1080,10 @@ findpcb: > if (so->so_snd.sb_cc || > tp->t_flags & TF_NEEDOUTPUT) > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1073,7 +1134,10 @@ findpcb: > tp->t_flags &= ~TF_BLOCKOUTPUT; > if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -2073,7 +2137,10 @@ dodata: /* XXX */ > */ > if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > @@ -2103,7 +2170,10 @@ dropafterack: > m_freem(m); > tp->t_flags |= TF_ACKNOW; > (void) tcp_output(tp); > - in_pcbsounlock_rele(inp, so); > + if (solocked != NULL) > + *solocked = so; > + else > + in_pcbsounlock_rele(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.186 tcp_var.h > --- netinet/tcp_var.h 2 Mar 2025 21:28:32 -0000 1.186 > +++ netinet/tcp_var.h 16 Apr 2025 14:49:20 -0000 > @@ -718,6 +718,7 @@ int tcp_dooptions(struct tcpcb *, u_cha > struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t); > void tcp_init(void); > int tcp_input(struct mbuf **, int *, int, int, struct netstack *); > +void tcp_input_mlist(struct mbuf_list *, int); > int tcp_mss(struct tcpcb *, int); > void tcp_mss_update(struct tcpcb *); > u_int tcp_hdrsz(struct tcpcb *); > Index: netinet6/in6_proto.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v > diff -u -p -r1.124 in6_proto.c > --- netinet6/in6_proto.c 5 Jan 2025 12:36:48 -0000 1.124 > +++ netinet6/in6_proto.c 16 Apr 2025 14:49:20 -0000 > @@ -147,7 +147,8 @@ const struct protosw inet6sw[] = { > .pr_type = SOCK_STREAM, > .pr_domain = &inet6domain, > .pr_protocol = IPPROTO_TCP, > - .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE, > + .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE| > + PR_MPINPUT, > .pr_input = tcp_input, > .pr_ctlinput = tcp6_ctlinput, > .pr_ctloutput = tcp_ctloutput, >