Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: SoftLRO for ixl(4), bnxt(4) and em(4)
To:
Jan Klemkow <jan@openbsd.org>
Cc:
tech@openbsd.org
Date:
Thu, 7 Nov 2024 11:30:26 +1000

Download raw body.

Thread
On Thu, Nov 07, 2024 at 01:10:10AM +0100, Jan Klemkow wrote:
> Hi,
> 
> This diff introduces a software solution for TCP Large Receive Offload
> (SoftLRO) for network interfaces don't hat hardware support for it.
> This is needes at least for newer Intel interfaces as their
> documentation said that LRO a.k.a. Receive Side Coalescing (RSC) has to
> be done by software.
> This diff coalesces TCP segments during the receive interrupt before
> queueing them.  Thus, our TCP/IP stack has to process less packet
> headers per amount of received data.
> 
> I measured receiving performance with Intel XXV710 25 GbE interfaces.
> It increased from 6 Gbit/s to 23 Gbit/s.
> 
> Even if we saturate em(4) without any of these technique its also part
> this diff.  I'm interested if this diff helps to reach 1 Gbit/s on old
> or slow hardware.
> 
> I also add bnxt(4) to this diff to increase test coverage.  If you want
> to tests this implementation with your favorite interface, just replace
> the ml_enqueue() call with the new tcp_softlro_enqueue() (as seen
> below).  It should work with all kind network interfaces.
> 
> Any comments and tests reports are welcome.

nice.

i would argue this should be ether_softlro_enqueue and put in
if_ethersubr.c because it's specific to ethernet interfaces. we don't
really have any other type of interface that bundles reception of
packets that we can take advantage of like this, and internally it
assumes it's pulling ethernet packets apart.

aside from that, just a few comments on the code.

> Bye,
> Jan
> 
> Index: dev/pci/if_bnxt.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v
> diff -u -p -r1.52 if_bnxt.c
> --- dev/pci/if_bnxt.c	6 Oct 2024 23:43:18 -0000	1.52
> +++ dev/pci/if_bnxt.c	6 Nov 2024 23:32:00 -0000
> @@ -646,6 +646,8 @@ bnxt_attach(struct device *parent, struc
>  	    IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv6 |
>  	    IFCAP_CSUM_TCPv6;
>  	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
> +	ifp->if_xflags |= IFXF_LRO;
> +	ifp->if_capabilities |= IFCAP_LRO;
>  #if NVLAN > 0
>  	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
>  #endif
> @@ -2283,6 +2285,7 @@ bnxt_rx(struct bnxt_softc *sc, struct bn
>      struct bnxt_cp_ring *cpr, struct mbuf_list *ml, int *slots, int *agslots,
>      struct cmpl_base *cmpl)
>  {
> +	struct ifnet *ifp = &sc->sc_ac.ac_if;
>  	struct mbuf *m, *am;
>  	struct bnxt_slot *bs;
>  	struct rx_pkt_cmpl *rxlo = (struct rx_pkt_cmpl *)cmpl;
> @@ -2355,7 +2358,10 @@ bnxt_rx(struct bnxt_softc *sc, struct bn
>  		(*agslots)++;
>  	}
>  
> -	ml_enqueue(ml, m);
> +	if (ISSET(ifp->if_xflags, IFXF_LRO))
> +		tcp_softlro_enqueue(ml, m);
> +	else
> +		ml_enqueue(ml, m);
>  	return (0);
>  }
>  
> Index: dev/pci/if_em.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_em.c,v
> diff -u -p -r1.378 if_em.c
> --- dev/pci/if_em.c	31 Aug 2024 16:23:09 -0000	1.378
> +++ dev/pci/if_em.c	6 Nov 2024 23:32:00 -0000
> @@ -2013,6 +2013,9 @@ em_setup_interface(struct em_softc *sc)
>  		ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>  	}
>  
> +	ifp->if_xflags |= IFXF_LRO;
> +	ifp->if_capabilities |= IFCAP_LRO;
> +
>  	/* 
>  	 * Specify the media types supported by this adapter and register
>  	 * callbacks to update media and link information
> @@ -3185,7 +3188,11 @@ em_rxeof(struct em_queue *que)
>  					m->m_flags |= M_VLANTAG;
>  				}
>  #endif
> -				ml_enqueue(&ml, m);
> +
> +				if (ISSET(ifp->if_xflags, IFXF_LRO))
> +					tcp_softlro_enqueue(&ml, m);
> +				else
> +					ml_enqueue(&ml, m);
>  
>  				que->rx.fmp = NULL;
>  				que->rx.lmp = NULL;
> Index: dev/pci/if_ixl.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
> diff -u -p -r1.102 if_ixl.c
> --- dev/pci/if_ixl.c	30 Oct 2024 18:02:45 -0000	1.102
> +++ dev/pci/if_ixl.c	6 Nov 2024 23:33:07 -0000
> @@ -1976,6 +1976,9 @@ ixl_attach(struct device *parent, struct
>  	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
>  	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>  
> +	ifp->if_xflags |= IFXF_LRO;
> +	ifp->if_capabilities |= IFCAP_LRO;
> +
>  	ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
>  
>  	ixl_media_add(sc, phy_types);
> @@ -3324,7 +3327,10 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>  #endif
>  
>  				ixl_rx_checksum(m, word);
> -				ml_enqueue(&ml, m);
> +				if (ISSET(ifp->if_xflags, IFXF_LRO))
> +					tcp_softlro_enqueue(&ml, m);
> +				else
> +					ml_enqueue(&ml, m);
>  			} else {
>  				ifp->if_ierrors++; /* XXX */
>  				m_freem(m);
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.407 tcp_input.c
> --- netinet/tcp_input.c	26 Aug 2024 13:55:14 -0000	1.407
> +++ netinet/tcp_input.c	6 Nov 2024 23:32:00 -0000
> @@ -84,6 +84,7 @@
>  #include <net/if_var.h>
>  #include <net/route.h>
>  
> +#include <netinet/if_ether.h>
>  #include <netinet/in.h>
>  #include <netinet/ip.h>
>  #include <netinet/in_pcb.h>
> @@ -4161,4 +4162,234 @@ syn_cache_respond(struct syn_cache *sc, 
>  #endif
>  	}
>  	return (error);
> +}
> +
> +int
> +tcp_softlro(struct mbuf *mhead, struct mbuf *mtail)
> +{
> +	struct ether_extracted	 head;
> +	struct ether_extracted	 tail;
> +	struct mbuf		*m;
> +	unsigned int		 hdrlen;
> +
> +	/*
> +	 * Check if head and tail are mergeable
> +	 */
> +
> +	ether_extract_headers(mhead, &head);
> +	ether_extract_headers(mtail, &tail);
> +
> +	/* Don't merge packets of different VLANs */
> +	if (head.evh && tail.evh) {
> +		if (head.evh->evl_tag != tail.evh->evl_tag)

vlan tags contain more than just the vlan id, there's at least prio
bits as well. you should mask these values before comparing them.

> +			return 0;
> +	} else if (head.evh || tail.evh)
> +		return 0;

technically vlan tag 0 can be used to encode priority, but is
otherwise equivalent to a packet without a vlan tag. i dont think
we care enough to complicate the code for this edge case though.

> +
> +	/* Check IP header. */
> +	if (head.ip4 && tail.ip4) {
> +		/* Don't merge packets with invalid header checksum. */
> +		if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
> +		    !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
> +		    ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD) ||
> +		    ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD))
> +			return 0;
> +
> +		/* Check IPv4 addresses. */
> +		if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr ||
> +		    head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr)
> +			return 0;
> +
> +		/* Don't merge IPv4 fragments. */
> +		if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) ||
> +		    ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF)))
> +			return 0;
> +
> +		/* Check max. IPv4 length. */
> +		if (head.iplen + tail.iplen > IP_MAXPACKET)
> +			return 0;
> +
> +		/* Don't merge IPv4 packets with option headers. */
> +		if (head.iphlen != sizeof(struct ip) ||
> +		    tail.iphlen != sizeof(struct ip))
> +			return 0;
> +
> +		/* Don't non-TCP packets. */
> +		if (head.ip4->ip_p != IPPROTO_TCP ||
> +		    tail.ip4->ip_p != IPPROTO_TCP)
> +			return 0;
> +	} else if (head.ip6 && tail.ip6) {
> +		/* Check IPv6 addresses. */
> +		if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) ||
> +		    !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst))
> +			return 0;
> +
> +		/* Check max. IPv6 length. */
> +		if ((head.iplen - head.iphlen) +
> +		    (tail.iplen - tail.iphlen) > IPV6_MAXPACKET)
> +			return 0;
> +
> +		/* Don't merge IPv6 packets with option headers nor non-TCP. */
> +		if (head.ip6->ip6_nxt != IPPROTO_TCP ||
> +		    tail.ip6->ip6_nxt != IPPROTO_TCP)
> +			return 0;
> +	} else {
> +		return 0;
> +	}
> +
> +	/* Check TCP header. */
> +	if (!head.tcp || !tail.tcp)
> +		return 0;
> +
> +	/* Check TCP ports. */
> +	if (head.tcp->th_sport != tail.tcp->th_sport ||
> +	    head.tcp->th_dport != tail.tcp->th_dport)
> +		return 0;
> +
> +	/* Don't merge empty segments. */
> +	if (head.paylen == 0 || tail.paylen == 0)
> +		return 0;
> +
> +	/* Check for continues segments. */
> +	if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq))
> +		return 0;
> +
> +	/* Just ACK and PUSH TCP flags are allowed. */
> +	if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) ||
> +	    ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH)))
> +		return 0;
> +
> +	/* TCP ACK flag has to be set. */
> +	if (!ISSET(head.tcp->th_flags, TH_ACK) ||
> +	    !ISSET(tail.tcp->th_flags, TH_ACK))
> +		return 0;
> +
> +	/* Ignore segments with different TCP options. */
> +	if (head.tcphlen - sizeof(struct tcphdr) !=
> +	    tail.tcphlen - sizeof(struct tcphdr))
> +		return 0;
> +
> +	/* Check for TCP options */
> +	if (head.tcphlen > sizeof(struct tcphdr)) {
> +		char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr);
> +		char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr);
> +		int optsize = head.tcphlen - sizeof(struct tcphdr);
> +		int optlen;
> +
> +		for (; optsize > 0; optsize -= optlen) {
> +			/* Ignore segments with different TCP options. */
> +			if (hopt[0] != topt[0] || hopt[1] != topt[1])
> +				return 0;
> +
> +			/* Get option length */
> +			optlen = hopt[1];
> +			if (hopt[0] == TCPOPT_NOP)
> +				optlen = 1;
> +			else if (optlen < 2 || optlen > optsize)
> +				return 0;	/* Illegal length */
> +
> +			if (hopt[0] != TCPOPT_NOP &&
> +			    hopt[0] != TCPOPT_TIMESTAMP)
> +				return 0;	/* Unsupported TCP option */
> +
> +			hopt += optlen;
> +			topt += optlen;
> +		}
> +	}
> +
> +	/*
> +	 * Prepare concatenation of head and tail.
> +	 */
> +
> +	/* Adjust IP header. */
> +	if (head.ip4) {
> +		head.ip4->ip_len = htons(head.iplen + tail.paylen);
> +	} else if (head.ip6) {
> +		head.ip6->ip6_plen =
> +		    htons(head.iplen - head.iphlen + tail.paylen);
> +	}
> +
> +	/* Combine TCP flags from head and tail. */
> +	if (ISSET(tail.tcp->th_flags, TH_PUSH))
> +		SET(head.tcp->th_flags, TH_PUSH);
> +
> +	/* Adjust TCP header. */
> +	head.tcp->th_win = tail.tcp->th_win;
> +	head.tcp->th_ack = tail.tcp->th_ack;
> +
> +	/* Calculate header length of tail packet. */
> +	hdrlen = sizeof(*tail.eh);
> +	if (tail.evh)
> +		hdrlen = sizeof(*tail.evh);
> +	hdrlen += tail.iphlen;
> +	hdrlen += tail.tcphlen;
> +
> +	/* Skip protocol headers in tail. */
> +	m_adj(mtail, hdrlen);
> +	CLR(mtail->m_flags, M_PKTHDR);
> +
> +	/* Concatenate */
> +	for (m = mhead; m->m_next;)
> +		m = m->m_next;
> +	m->m_next = mtail;
> +	mhead->m_pkthdr.len += tail.paylen;
> +
> +	/* Flag mbuf as TSO packet with MSS. */
> +	if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) {
> +		/* Set CSUM_OUT flags in case of forwarding. */
> +		SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> +		head.tcp->th_sum = 0;
> +		if (head.ip4) {
> +			SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
> +			head.ip4->ip_sum = 0;
> +		}
> +
> +		SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO);
> +		mhead->m_pkthdr.ph_mss = head.paylen;
> +		tcpstat_inc(tcps_inhwlro);
> +		tcpstat_inc(tcps_inpktlro);	/* count head */
> +	}
> +	mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen);
> +	tcpstat_inc(tcps_inpktlro);	/* count tail */
> +
> +	return 1;
> +}
> +
> +void
> +tcp_softlro_enqueue(struct mbuf_list *ml, struct mbuf *mtail)
> +{
> +	struct mbuf *mhead;
> +
> +	/* Don't merge packets with invalid header checksum. */
> +	if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK) ||
> +	    ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_BAD))

what's the point of checking the BAD flag here?

> +		goto out;
> +
> +	for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) {

not a code thing, but it is crazy to me that we can brute force our
way through a whole list of mbufs and do these comparisons without
suffering too much.

looking at my busy firewall, we don't actually do a lot of packets
in each interrupt, so maybe it's ok.

> +		/* Don't merge packets with invalid header checksum. */
> +		if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK) ||
> +		    ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_BAD))

again, what's the point of checking the BAD flag here?

> +			continue;
> +
> +		/* Use RSS hash to skip packets of different connections. */
> +		if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) &&
> +		    ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) &&
> +		    mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid)
> +			continue;
> +
> +		/* Don't merge packets of different VLANs */
> +		if (ISSET(mhead->m_flags, M_VLANTAG) !=
> +		    ISSET(mtail->m_flags, M_VLANTAG))
> +			continue;
> +
> +		if (ISSET(mhead->m_flags, M_VLANTAG) &&
> +		    ISSET(mtail->m_flags, M_VLANTAG) &&

you dont have to check that both flags are set here cos you just did that
above.

> +		    mhead->m_pkthdr.ether_vtag != mtail->m_pkthdr.ether_vtag)

like the evh check in tcp_softlro above, you need to mask the vlan
id bits before doing this comparison because there's other fields in
there.

> +			continue;
> +
> +		if (tcp_softlro(mhead, mtail))
> +			return;
> +	}
> + out:
> +	ml_enqueue(ml, mtail);
>  }
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.178 tcp_var.h
> --- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
> +++ netinet/tcp_var.h	6 Nov 2024 23:32:00 -0000
> @@ -720,6 +720,7 @@ void	 tcp_init(void);
>  int	 tcp_input(struct mbuf **, int *, int, int);
>  int	 tcp_mss(struct tcpcb *, int);
>  void	 tcp_mss_update(struct tcpcb *);
> +void	 tcp_softlro_enqueue(struct mbuf_list *, struct mbuf *);
>  u_int	 tcp_hdrsz(struct tcpcb *);
>  void	 tcp_mtudisc(struct inpcb *, int);
>  void	 tcp_mtudisc_increase(struct inpcb *, int);
>