Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: running TCP input in parallel
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Fri, 18 Apr 2025 16:49:17 +0300

Download raw body.

Thread
On Thu, Apr 17, 2025 at 06:53:19PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> To run tcp_input() in parallel efficently, we have to lock the
> socket in a smart way.  I have measured multiple variants.
> 
> http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/perform.html
> 
> The relevant TCP graph is here.
> 
> http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/gnuplot/tcp.html
> http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/gnuplot/tcp6.html
> 
> First column (left) is no locking at all, just exclusive net lock.
> 
> Third column is socket lock in addition to exclusive net lock.  You
> see 6% degradation due to locking overhead.  This has already been
> commited.
> 
> Fourth column (right) is simple switch from exclusive net lock to
> shared net lock and relying on socket lock.  Grabbing the socket
> lock for each packet is expensive.  Especially tcp_input() and
> soreceive() are fighting for the lock.  Single stream performance
> goes down by 25%, but multi stream goes up by 140%.
> 
> The second column contains the diff below.  The idea is that
> tcp_input() moves all TCP packets from softnet input queue into TCP
> input queue.  This queue has storage per softnet thread and can be
> accessed without lock.  After running all protocol input functions,
> but in the same shared netlock context, process the TCP input queue.
> tcp_input_mlist() keeps a pointer to the current socket.
> tcp_input_solocked() tries to keep the lock on the socket.  If
> consecutive TCP packets belong to the same socket, the lock is not
> released.  Only when the TCP stream changes, we unlock the old and
> lock a new socket.  This batch processing of locked sockets gives
> 5% increase in single stream and 160% for multi stream throughput.
> 
> I have several positive test reports.
> 
> ok?
> 

Yes please.

> bluhm
> 
> Index: net/if.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> diff -u -p -r1.729 if.c
> --- net/if.c	19 Mar 2025 23:29:49 -0000	1.729
> +++ net/if.c	16 Apr 2025 14:49:20 -0000
> @@ -1001,10 +1001,21 @@ if_input_process(struct ifnet *ifp, stru
>  	 */
>  
>  	sn = net_sn(idx);
> +	ml_init(&sn->sn_netstack.ns_tcp_ml);
> +#ifdef INET6
> +	ml_init(&sn->sn_netstack.ns_tcp6_ml);
> +#endif
>  
>  	NET_LOCK_SHARED();
> +
>  	while ((m = ml_dequeue(ml)) != NULL)
>  		(*ifp->if_input)(ifp, m, &sn->sn_netstack);
> +
> +	tcp_input_mlist(&sn->sn_netstack.ns_tcp_ml, AF_INET);
> +#ifdef INET6
> +	tcp_input_mlist(&sn->sn_netstack.ns_tcp6_ml, AF_INET6);
> +#endif
> +
>  	NET_UNLOCK_SHARED();
>  }
>  
> Index: net/if_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
> diff -u -p -r1.136 if_var.h
> --- net/if_var.h	2 Mar 2025 21:28:32 -0000	1.136
> +++ net/if_var.h	16 Apr 2025 14:49:20 -0000
> @@ -92,7 +92,9 @@ struct task;
>  struct cpumem;
>  
>  struct netstack {
> -	struct route	ns_route;
> +	struct route		ns_route;
> +	struct mbuf_list	ns_tcp_ml;
> +	struct mbuf_list	ns_tcp6_ml;
>  };
>  
>  /*
> Index: netinet/in_proto.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
> diff -u -p -r1.121 in_proto.c
> --- netinet/in_proto.c	5 Jan 2025 12:36:48 -0000	1.121
> +++ netinet/in_proto.c	16 Apr 2025 14:49:20 -0000
> @@ -197,7 +197,8 @@ const struct protosw inetsw[] = {
>    .pr_type	= SOCK_STREAM,
>    .pr_domain	= &inetdomain,
>    .pr_protocol	= IPPROTO_TCP,
> -  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
> +  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|
> +		    PR_MPINPUT,
>    .pr_input	= tcp_input,
>    .pr_ctlinput	= tcp_ctlinput,
>    .pr_ctloutput	= tcp_ctloutput,
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.435 tcp_input.c
> --- netinet/tcp_input.c	16 Apr 2025 12:51:11 -0000	1.435
> +++ netinet/tcp_input.c	16 Apr 2025 14:49:20 -0000
> @@ -100,9 +100,6 @@
>  #include <net/pfvar.h>
>  #endif
>  
> -int tcp_mss_adv(struct rtentry *, int);
> -int tcp_flush_queue(struct tcpcb *);
> -
>  #ifdef INET6
>  #include <netinet6/in6_var.h>
>  #include <netinet6/nd6.h>
> @@ -177,6 +174,9 @@ do { \
>  	if_put(ifp); \
>  } while (0)
>  
> +int	 tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **);
> +int	 tcp_mss_adv(struct rtentry *, int);
> +int	 tcp_flush_queue(struct tcpcb *);
>  void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
>  void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
>  
> @@ -347,12 +347,53 @@ tcp_flush_queue(struct tcpcb *tp)
>  	return (flags);
>  }
>  
> +int
> +tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
> +{
> +	if (ns == NULL)
> +		return tcp_input_solocked(mp, offp, proto, af, NULL);
> +	(*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp);
> +	switch (af) {
> +	case AF_INET:
> +		ml_enqueue(&ns->ns_tcp_ml, *mp);
> +		break;
> +#ifdef INET6
> +	case AF_INET6:
> +		ml_enqueue(&ns->ns_tcp6_ml, *mp);
> +		break;
> +#endif
> +	default:
> +		m_freemp(mp);
> +	}
> +	*mp = NULL;
> +	return IPPROTO_DONE;
> +}
> +
> +void
> +tcp_input_mlist(struct mbuf_list *ml, int af)
> +{
> +	struct socket *so = NULL;
> +	struct mbuf *m;
> +
> +	while ((m = ml_dequeue(ml)) != NULL) {
> +		int off, nxt;
> +
> +		off = (long)m->m_pkthdr.ph_cookie;
> +		m->m_pkthdr.ph_cookie = NULL;
> +		nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so);
> +		KASSERT(nxt == IPPROTO_DONE);
> +	}
> +
> +	in_pcbsounlock_rele(NULL, so);
> +}
> +
>  /*
>   * TCP input routine, follows pages 65-76 of the
>   * protocol specification dated September, 1981 very closely.
>   */
>  int
> -tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
> +tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af,
> +    struct socket ** solocked)
>  {
>  	struct mbuf *m = *mp;
>  	int iphlen = *offp;
> @@ -603,7 +644,21 @@ findpcb:
>  		tcpstat_inc(tcps_noport);
>  		goto dropwithreset_ratelim;
>  	}
> -	so = in_pcbsolock_ref(inp);
> +	/*
> +	 * Avoid needless lock and unlock operation when handling multiple
> +	 * TCP packets from the same stream consecutively.
> +	 */
> +	if (solocked != NULL && *solocked != NULL &&
> +	    sotoinpcb(*solocked) == inp) {
> +		so = *solocked;
> +		*solocked = NULL;
> +	} else {
> +		if (solocked != NULL && *solocked != NULL) {
> +			in_pcbsounlock_rele(NULL, *solocked);
> +			*solocked = NULL;
> +		}
> +		so = in_pcbsolock_ref(inp);
> +	}
>  	if (so == NULL) {
>  		tcpstat_inc(tcps_noport);
>  		goto dropwithreset_ratelim;
> @@ -846,7 +901,10 @@ findpcb:
>  					tcpstat_inc(tcps_dropsyn);
>  					goto drop;
>  				}
> -				in_pcbsounlock_rele(inp, so);
> +				if (solocked != NULL)
> +					*solocked = so;
> +				else
> +					in_pcbsounlock_rele(inp, so);
>  				in_pcbunref(inp);
>  				return IPPROTO_DONE;
>  			}
> @@ -1022,7 +1080,10 @@ findpcb:
>  				if (so->so_snd.sb_cc ||
>  				    tp->t_flags & TF_NEEDOUTPUT)
>  					(void) tcp_output(tp);
> -				in_pcbsounlock_rele(inp, so);
> +				if (solocked != NULL)
> +					*solocked = so;
> +				else
> +					in_pcbsounlock_rele(inp, so);
>  				in_pcbunref(inp);
>  				return IPPROTO_DONE;
>  			}
> @@ -1073,7 +1134,10 @@ findpcb:
>  			tp->t_flags &= ~TF_BLOCKOUTPUT;
>  			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
>  				(void) tcp_output(tp);
> -			in_pcbsounlock_rele(inp, so);
> +			if (solocked != NULL)
> +				*solocked = so;
> +			else
> +				in_pcbsounlock_rele(inp, so);
>  			in_pcbunref(inp);
>  			return IPPROTO_DONE;
>  		}
> @@ -2073,7 +2137,10 @@ dodata:							/* XXX */
>  	 */
>  	if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
>  		(void) tcp_output(tp);
> -	in_pcbsounlock_rele(inp, so);
> +	if (solocked != NULL)
> +		*solocked = so;
> +	else
> +		in_pcbsounlock_rele(inp, so);
>  	in_pcbunref(inp);
>  	return IPPROTO_DONE;
>  
> @@ -2103,7 +2170,10 @@ dropafterack:
>  	m_freem(m);
>  	tp->t_flags |= TF_ACKNOW;
>  	(void) tcp_output(tp);
> -	in_pcbsounlock_rele(inp, so);
> +	if (solocked != NULL)
> +		*solocked = so;
> +	else
> +		in_pcbsounlock_rele(inp, so);
>  	in_pcbunref(inp);
>  	return IPPROTO_DONE;
>  
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.186 tcp_var.h
> --- netinet/tcp_var.h	2 Mar 2025 21:28:32 -0000	1.186
> +++ netinet/tcp_var.h	16 Apr 2025 14:49:20 -0000
> @@ -718,6 +718,7 @@ int	 tcp_dooptions(struct tcpcb *, u_cha
>  		struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t);
>  void	 tcp_init(void);
>  int	 tcp_input(struct mbuf **, int *, int, int, struct netstack *);
> +void	 tcp_input_mlist(struct mbuf_list *, int);
>  int	 tcp_mss(struct tcpcb *, int);
>  void	 tcp_mss_update(struct tcpcb *);
>  u_int	 tcp_hdrsz(struct tcpcb *);
> Index: netinet6/in6_proto.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v
> diff -u -p -r1.124 in6_proto.c
> --- netinet6/in6_proto.c	5 Jan 2025 12:36:48 -0000	1.124
> +++ netinet6/in6_proto.c	16 Apr 2025 14:49:20 -0000
> @@ -147,7 +147,8 @@ const struct protosw inet6sw[] = {
>    .pr_type	= SOCK_STREAM,
>    .pr_domain	= &inet6domain,
>    .pr_protocol	= IPPROTO_TCP,
> -  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
> +  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|
> +		    PR_MPINPUT,
>    .pr_input	= tcp_input,
>    .pr_ctlinput	= tcp6_ctlinput,
>    .pr_ctloutput	= tcp_ctloutput,
>