Index | Thread | Search

From:
Jan Klemkow <jan@openbsd.org>
Subject:
SoftLRO for ixl(4), bnxt(4) and em(4)
To:
tech@openbsd.org
Date:
Thu, 7 Nov 2024 01:10:10 +0100

Download raw body.

Thread
Hi,

This diff introduces a software solution for TCP Large Receive Offload
(SoftLRO) for network interfaces that don't have hardware support for it.
This is needed at least for newer Intel interfaces, as their
documentation says that LRO, a.k.a. Receive Side Coalescing (RSC), has
to be done in software.
This diff coalesces TCP segments during the receive interrupt before
queueing them.  Thus, our TCP/IP stack has to process less packet
headers per amount of received data.

I measured receiving performance with Intel XXV710 25 GbE interfaces.
It increased from 6 Gbit/s to 23 Gbit/s.

Even though we can saturate em(4) without any of these techniques, it is
also part of this diff.  I'm interested in whether this diff helps to
reach 1 Gbit/s on old or slow hardware.

I also add bnxt(4) to this diff to increase test coverage.  If you want
to test this implementation with your favorite interface, just replace
the ml_enqueue() call with the new tcp_softlro_enqueue() (as seen
below).  It should work with all kinds of network interfaces.

Any comments and test reports are welcome.

Bye,
Jan

Index: dev/pci/if_bnxt.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v
diff -u -p -r1.52 if_bnxt.c
--- dev/pci/if_bnxt.c	6 Oct 2024 23:43:18 -0000	1.52
+++ dev/pci/if_bnxt.c	6 Nov 2024 23:32:00 -0000
@@ -646,6 +646,8 @@ bnxt_attach(struct device *parent, struc
 	    IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv6 |
 	    IFCAP_CSUM_TCPv6;
 	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
+	ifp->if_xflags |= IFXF_LRO;
+	ifp->if_capabilities |= IFCAP_LRO;
 #if NVLAN > 0
 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
 #endif
@@ -2283,6 +2285,7 @@ bnxt_rx(struct bnxt_softc *sc, struct bn
     struct bnxt_cp_ring *cpr, struct mbuf_list *ml, int *slots, int *agslots,
     struct cmpl_base *cmpl)
 {
+	struct ifnet *ifp = &sc->sc_ac.ac_if;
 	struct mbuf *m, *am;
 	struct bnxt_slot *bs;
 	struct rx_pkt_cmpl *rxlo = (struct rx_pkt_cmpl *)cmpl;
@@ -2355,7 +2358,10 @@ bnxt_rx(struct bnxt_softc *sc, struct bn
 		(*agslots)++;
 	}
 
-	ml_enqueue(ml, m);
+	if (ISSET(ifp->if_xflags, IFXF_LRO))
+		tcp_softlro_enqueue(ml, m);
+	else
+		ml_enqueue(ml, m);
 	return (0);
 }
 
Index: dev/pci/if_em.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
diff -u -p -r1.378 if_em.c
--- dev/pci/if_em.c	31 Aug 2024 16:23:09 -0000	1.378
+++ dev/pci/if_em.c	6 Nov 2024 23:32:00 -0000
@@ -2013,6 +2013,9 @@ em_setup_interface(struct em_softc *sc)
 		ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 	}
 
+	ifp->if_xflags |= IFXF_LRO;
+	ifp->if_capabilities |= IFCAP_LRO;
+
 	/* 
 	 * Specify the media types supported by this adapter and register
 	 * callbacks to update media and link information
@@ -3185,7 +3188,11 @@ em_rxeof(struct em_queue *que)
 					m->m_flags |= M_VLANTAG;
 				}
 #endif
-				ml_enqueue(&ml, m);
+
+				if (ISSET(ifp->if_xflags, IFXF_LRO))
+					tcp_softlro_enqueue(&ml, m);
+				else
+					ml_enqueue(&ml, m);
 
 				que->rx.fmp = NULL;
 				que->rx.lmp = NULL;
Index: dev/pci/if_ixl.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
diff -u -p -r1.102 if_ixl.c
--- dev/pci/if_ixl.c	30 Oct 2024 18:02:45 -0000	1.102
+++ dev/pci/if_ixl.c	6 Nov 2024 23:33:07 -0000
@@ -1976,6 +1976,9 @@ ixl_attach(struct device *parent, struct
 	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
 	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 
+	ifp->if_xflags |= IFXF_LRO;
+	ifp->if_capabilities |= IFCAP_LRO;
+
 	ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
 
 	ixl_media_add(sc, phy_types);
@@ -3324,7 +3327,10 @@ ixl_rxeof(struct ixl_softc *sc, struct i
 #endif
 
 				ixl_rx_checksum(m, word);
-				ml_enqueue(&ml, m);
+				if (ISSET(ifp->if_xflags, IFXF_LRO))
+					tcp_softlro_enqueue(&ml, m);
+				else
+					ml_enqueue(&ml, m);
 			} else {
 				ifp->if_ierrors++; /* XXX */
 				m_freem(m);
Index: netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.407 tcp_input.c
--- netinet/tcp_input.c	26 Aug 2024 13:55:14 -0000	1.407
+++ netinet/tcp_input.c	6 Nov 2024 23:32:00 -0000
@@ -84,6 +84,7 @@
 #include <net/if_var.h>
 #include <net/route.h>
 
+#include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
@@ -4161,4 +4162,234 @@ syn_cache_respond(struct syn_cache *sc, 
 #endif
 	}
 	return (error);
+}
+
+int
+tcp_softlro(struct mbuf *mhead, struct mbuf *mtail)
+{
+	struct ether_extracted	 head;
+	struct ether_extracted	 tail;
+	struct mbuf		*m;
+	unsigned int		 hdrlen;
+
+	/*
+	 * Check if head and tail are mergeable
+	 */
+
+	ether_extract_headers(mhead, &head);
+	ether_extract_headers(mtail, &tail);
+
+	/* Don't merge packets of different VLANs */
+	if (head.evh && tail.evh) {
+		if (head.evh->evl_tag != tail.evh->evl_tag)
+			return 0;
+	} else if (head.evh || tail.evh)
+		return 0;
+
+	/* Check IP header. */
+	if (head.ip4 && tail.ip4) {
+		/* Don't merge packets with invalid header checksum. */
+		if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
+		    !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
+		    ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD) ||
+		    ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD))
+			return 0;
+
+		/* Check IPv4 addresses. */
+		if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr ||
+		    head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr)
+			return 0;
+
+		/* Don't merge IPv4 fragments. */
+		if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) ||
+		    ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF)))
+			return 0;
+
+		/* Check max. IPv4 length. */
+		if (head.iplen + tail.iplen > IP_MAXPACKET)
+			return 0;
+
+		/* Don't merge IPv4 packets with option headers. */
+		if (head.iphlen != sizeof(struct ip) ||
+		    tail.iphlen != sizeof(struct ip))
+			return 0;
+
+	/* Don't merge non-TCP packets. */
+		if (head.ip4->ip_p != IPPROTO_TCP ||
+		    tail.ip4->ip_p != IPPROTO_TCP)
+			return 0;
+	} else if (head.ip6 && tail.ip6) {
+		/* Check IPv6 addresses. */
+		if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) ||
+		    !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst))
+			return 0;
+
+		/* Check max. IPv6 length. */
+		if ((head.iplen - head.iphlen) +
+		    (tail.iplen - tail.iphlen) > IPV6_MAXPACKET)
+			return 0;
+
+		/* Don't merge IPv6 packets with option headers nor non-TCP. */
+		if (head.ip6->ip6_nxt != IPPROTO_TCP ||
+		    tail.ip6->ip6_nxt != IPPROTO_TCP)
+			return 0;
+	} else {
+		return 0;
+	}
+
+	/* Check TCP header. */
+	if (!head.tcp || !tail.tcp)
+		return 0;
+
+	/* Check TCP ports. */
+	if (head.tcp->th_sport != tail.tcp->th_sport ||
+	    head.tcp->th_dport != tail.tcp->th_dport)
+		return 0;
+
+	/* Don't merge empty segments. */
+	if (head.paylen == 0 || tail.paylen == 0)
+		return 0;
+
+	/* Check for contiguous segments. */
+	if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq))
+		return 0;
+
+	/* Just ACK and PUSH TCP flags are allowed. */
+	if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) ||
+	    ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH)))
+		return 0;
+
+	/* TCP ACK flag has to be set. */
+	if (!ISSET(head.tcp->th_flags, TH_ACK) ||
+	    !ISSET(tail.tcp->th_flags, TH_ACK))
+		return 0;
+
+	/* Ignore segments with different TCP options. */
+	if (head.tcphlen - sizeof(struct tcphdr) !=
+	    tail.tcphlen - sizeof(struct tcphdr))
+		return 0;
+
+	/* Check for TCP options */
+	if (head.tcphlen > sizeof(struct tcphdr)) {
+		char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr);
+		char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr);
+		int optsize = head.tcphlen - sizeof(struct tcphdr);
+		int optlen;
+
+		for (; optsize > 0; optsize -= optlen) {
+			/* Ignore segments with different TCP options. */
+			if (hopt[0] != topt[0] || hopt[1] != topt[1])
+				return 0;
+
+			/* Get option length */
+			optlen = hopt[1];
+			if (hopt[0] == TCPOPT_NOP)
+				optlen = 1;
+			else if (optlen < 2 || optlen > optsize)
+				return 0;	/* Illegal length */
+
+			if (hopt[0] != TCPOPT_NOP &&
+			    hopt[0] != TCPOPT_TIMESTAMP)
+				return 0;	/* Unsupported TCP option */
+
+			hopt += optlen;
+			topt += optlen;
+		}
+	}
+
+	/*
+	 * Prepare concatenation of head and tail.
+	 */
+
+	/* Adjust IP header. */
+	if (head.ip4) {
+		head.ip4->ip_len = htons(head.iplen + tail.paylen);
+	} else if (head.ip6) {
+		head.ip6->ip6_plen =
+		    htons(head.iplen - head.iphlen + tail.paylen);
+	}
+
+	/* Combine TCP flags from head and tail. */
+	if (ISSET(tail.tcp->th_flags, TH_PUSH))
+		SET(head.tcp->th_flags, TH_PUSH);
+
+	/* Adjust TCP header. */
+	head.tcp->th_win = tail.tcp->th_win;
+	head.tcp->th_ack = tail.tcp->th_ack;
+
+	/* Calculate header length of tail packet. */
+	hdrlen = sizeof(*tail.eh);
+	if (tail.evh)
+		hdrlen = sizeof(*tail.evh);
+	hdrlen += tail.iphlen;
+	hdrlen += tail.tcphlen;
+
+	/* Skip protocol headers in tail. */
+	m_adj(mtail, hdrlen);
+	CLR(mtail->m_flags, M_PKTHDR);
+
+	/* Concatenate */
+	for (m = mhead; m->m_next;)
+		m = m->m_next;
+	m->m_next = mtail;
+	mhead->m_pkthdr.len += tail.paylen;
+
+	/* Flag mbuf as TSO packet with MSS. */
+	if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) {
+		/* Set CSUM_OUT flags in case of forwarding. */
+		SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+		head.tcp->th_sum = 0;
+		if (head.ip4) {
+			SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
+			head.ip4->ip_sum = 0;
+		}
+
+		SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO);
+		mhead->m_pkthdr.ph_mss = head.paylen;
+		tcpstat_inc(tcps_inhwlro);
+		tcpstat_inc(tcps_inpktlro);	/* count head */
+	}
+	mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen);
+	tcpstat_inc(tcps_inpktlro);	/* count tail */
+
+	return 1;
+}
+
+void
+tcp_softlro_enqueue(struct mbuf_list *ml, struct mbuf *mtail)
+{
+	struct mbuf *mhead;
+
+	/* Don't merge packets with invalid header checksum. */
+	if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK) ||
+	    ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_BAD))
+		goto out;
+
+	for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) {
+		/* Don't merge packets with invalid header checksum. */
+		if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK) ||
+		    ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_BAD))
+			continue;
+
+		/* Use RSS hash to skip packets of different connections. */
+		if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) &&
+		    ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) &&
+		    mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid)
+			continue;
+
+		/* Don't merge packets of different VLANs */
+		if (ISSET(mhead->m_flags, M_VLANTAG) !=
+		    ISSET(mtail->m_flags, M_VLANTAG))
+			continue;
+
+		if (ISSET(mhead->m_flags, M_VLANTAG) &&
+		    ISSET(mtail->m_flags, M_VLANTAG) &&
+		    mhead->m_pkthdr.ether_vtag != mtail->m_pkthdr.ether_vtag)
+			continue;
+
+		if (tcp_softlro(mhead, mtail))
+			return;
+	}
+ out:
+	ml_enqueue(ml, mtail);
 }
Index: netinet/tcp_var.h
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
+++ netinet/tcp_var.h	6 Nov 2024 23:32:00 -0000
@@ -720,6 +720,7 @@ void	 tcp_init(void);
 int	 tcp_input(struct mbuf **, int *, int, int);
 int	 tcp_mss(struct tcpcb *, int);
 void	 tcp_mss_update(struct tcpcb *);
+void	 tcp_softlro_enqueue(struct mbuf_list *, struct mbuf *);
 u_int	 tcp_hdrsz(struct tcpcb *);
 void	 tcp_mtudisc(struct inpcb *, int);
 void	 tcp_mtudisc_increase(struct inpcb *, int);