Download raw body.
TCP input socket lock caching
On Tue, May 06, 2025 at 10:00:30PM +0200, Alexander Bluhm wrote:
> Hi,
>
> Parallel TCP input is running for a few days now and looks quite
> stable.
>
> Final step is to implement caching of the socket lock. Without
> large receive offloading (LRO) in the driver layer, it is very
> likely that consecutive TCP segments are in the input queue. This
> leads to contention of the socket lock between TCP input and socket
> receive syscall from userland.
>
> http://bluhm.genua.de/perform/results/2025-05-05T00%3A52%3A28Z/2025-05-05T00%3A00%3A00Z/btrace/iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0345%3A%3A35_-w1m_-t10_-R-btrace-kstack.0.svg?s=rw_do_enter
>
> With the diff below, ip_deliver() moves all TCP packets that are
> in the softnet queue temporarily to a TCP queue. This queue is per
> softnet thread so no locking is needed. Finally in the same shared
> netlock context, tcp_input_mlist() processes all TCP packets. It
> keeps a pointer to the socket lock. tcp_input_solocked() switches
> the lock only when the TCP stream changes. A bunch of packets are
> processed and placed into the socket receive buffer under the same
> lock. Then soreceive() can copy huge chunks to userland. The
> contention of the socket lock is gone.
>
> http://bluhm.genua.de/perform/results/2025-05-05T00%3A52%3A28Z/patch-sys-tcp-input-parallel.0/btrace/iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0345%3A%3A35_-w1m_-t10_-R-btrace-kstack.0.svg?s=rw_do_enter
>
> On a 4 core machine I see between 12% to 22% improvement with 10
> parallel TCP streams. When testing only with a single TCP stream,
> throughput increases between 38% to 100%.
>
> I have sent out this diff before as part of larger unlocking.
>
> ok?
>
ok mvs
> bluhm
>
> Index: net/if.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> diff -u -p -r1.731 if.c
> --- net/if.c 1 May 2025 11:19:46 -0000 1.731
> +++ net/if.c 6 May 2025 19:21:02 -0000
> @@ -999,10 +999,21 @@ if_input_process(struct ifnet *ifp, stru
> */
>
> sn = net_sn(idx);
> + ml_init(&sn->sn_netstack.ns_tcp_ml);
> +#ifdef INET6
> + ml_init(&sn->sn_netstack.ns_tcp6_ml);
> +#endif
>
> NET_LOCK_SHARED();
> +
> while ((m = ml_dequeue(ml)) != NULL)
> (*ifp->if_input)(ifp, m, &sn->sn_netstack);
> +
> + tcp_input_mlist(&sn->sn_netstack.ns_tcp_ml, AF_INET);
> +#ifdef INET6
> + tcp_input_mlist(&sn->sn_netstack.ns_tcp6_ml, AF_INET6);
> +#endif
> +
> NET_UNLOCK_SHARED();
> }
>
> Index: net/if_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
> diff -u -p -r1.136 if_var.h
> --- net/if_var.h 2 Mar 2025 21:28:32 -0000 1.136
> +++ net/if_var.h 6 May 2025 19:21:02 -0000
> @@ -92,7 +92,9 @@ struct task;
> struct cpumem;
>
> struct netstack {
> - struct route ns_route;
> + struct route ns_route;
> + struct mbuf_list ns_tcp_ml;
> + struct mbuf_list ns_tcp6_ml;
> };
>
> /*
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.443 tcp_input.c
> --- netinet/tcp_input.c 4 May 2025 23:05:17 -0000 1.443
> +++ netinet/tcp_input.c 6 May 2025 19:21:02 -0000
> @@ -101,9 +101,6 @@
> #include <net/pfvar.h>
> #endif
>
> -int tcp_mss_adv(struct rtentry *, int);
> -int tcp_flush_queue(struct tcpcb *);
> -
> #ifdef INET6
> #include <netinet6/in6_var.h>
> #include <netinet6/nd6.h>
> @@ -178,6 +175,9 @@ do { \
> if_put(ifp); \
> } while (0)
>
> +int tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **);
> +int tcp_mss_adv(struct rtentry *, int);
> +int tcp_flush_queue(struct tcpcb *);
> void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
> void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
>
> @@ -348,12 +348,53 @@ tcp_flush_queue(struct tcpcb *tp)
> return (flags);
> }
>
> +int
> +tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
> +{
> + if (ns == NULL)
> + return tcp_input_solocked(mp, offp, proto, af, NULL);
> + (*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp);
> + switch (af) {
> + case AF_INET:
> + ml_enqueue(&ns->ns_tcp_ml, *mp);
> + break;
> +#ifdef INET6
> + case AF_INET6:
> + ml_enqueue(&ns->ns_tcp6_ml, *mp);
> + break;
> +#endif
> + default:
> + m_freemp(mp);
> + }
> + *mp = NULL;
> + return IPPROTO_DONE;
> +}
> +
> +void
> +tcp_input_mlist(struct mbuf_list *ml, int af)
> +{
> + struct socket *so = NULL;
> + struct mbuf *m;
> +
> + while ((m = ml_dequeue(ml)) != NULL) {
> + int off, nxt;
> +
> + off = (long)m->m_pkthdr.ph_cookie;
> + m->m_pkthdr.ph_cookie = NULL;
> + nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so);
> + KASSERT(nxt == IPPROTO_DONE);
> + }
> +
> + in_pcbsounlock_rele(NULL, so);
> +}
> +
> /*
> * TCP input routine, follows pages 65-76 of the
> * protocol specification dated September, 1981 very closely.
> */
> int
> -tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
> +tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af,
> + struct socket **solocked)
> {
> struct mbuf *m = *mp;
> int iphlen = *offp;
> @@ -604,7 +645,21 @@ findpcb:
> tcpstat_inc(tcps_noport);
> goto dropwithreset_ratelim;
> }
> - so = in_pcbsolock_ref(inp);
> + /*
> + * Avoid needless lock and unlock operation when handling multiple
> + * TCP packets from the same stream consecutively.
> + */
> + if (solocked != NULL && *solocked != NULL &&
> + sotoinpcb(*solocked) == inp) {
> + so = *solocked;
> + *solocked = NULL;
> + } else {
> + if (solocked != NULL && *solocked != NULL) {
> + in_pcbsounlock_rele(NULL, *solocked);
> + *solocked = NULL;
> + }
> + so = in_pcbsolock_ref(inp);
> + }
> if (so == NULL) {
> tcpstat_inc(tcps_noport);
> goto dropwithreset_ratelim;
> @@ -847,7 +902,10 @@ findpcb:
> tcpstat_inc(tcps_dropsyn);
> goto drop;
> }
> - in_pcbsounlock_rele(inp, so);
> + if (solocked != NULL)
> + *solocked = so;
> + else
> + in_pcbsounlock_rele(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -1023,7 +1081,10 @@ findpcb:
> if (so->so_snd.sb_cc ||
> tp->t_flags & TF_NEEDOUTPUT)
> (void) tcp_output(tp);
> - in_pcbsounlock_rele(inp, so);
> + if (solocked != NULL)
> + *solocked = so;
> + else
> + in_pcbsounlock_rele(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -1074,7 +1135,10 @@ findpcb:
> tp->t_flags &= ~TF_BLOCKOUTPUT;
> if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
> (void) tcp_output(tp);
> - in_pcbsounlock_rele(inp, so);
> + if (solocked != NULL)
> + *solocked = so;
> + else
> + in_pcbsounlock_rele(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -2074,7 +2138,10 @@ dodata: /* XXX */
> */
> if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
> (void) tcp_output(tp);
> - in_pcbsounlock_rele(inp, so);
> + if (solocked != NULL)
> + *solocked = so;
> + else
> + in_pcbsounlock_rele(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
>
> @@ -2104,7 +2171,10 @@ dropafterack:
> m_freem(m);
> tp->t_flags |= TF_ACKNOW;
> (void) tcp_output(tp);
> - in_pcbsounlock_rele(inp, so);
> + if (solocked != NULL)
> + *solocked = so;
> + else
> + in_pcbsounlock_rele(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
>
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.190 tcp_var.h
> --- netinet/tcp_var.h 29 Apr 2025 20:31:42 -0000 1.190
> +++ netinet/tcp_var.h 6 May 2025 19:21:02 -0000
> @@ -718,6 +718,7 @@ int tcp_dooptions(struct tcpcb *, u_cha
> struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t);
> void tcp_init(void);
> int tcp_input(struct mbuf **, int *, int, int, struct netstack *);
> +void tcp_input_mlist(struct mbuf_list *, int);
> int tcp_mss(struct tcpcb *, int);
> void tcp_mss_update(struct tcpcb *);
> void tcp_softlro_glue(struct mbuf_list *, struct mbuf *, struct ifnet *);
>
TCP input socket lock caching