Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: SoftLRO for ixl(4), bnxt(4) and em(4)
To:
Jan Klemkow <jan@openbsd.org>
Cc:
tech@openbsd.org, Janne Johansson <icepic.dz@gmail.com>, Yuichiro NAITO <naito.yuichiro@gmail.com>
Date:
Thu, 20 Mar 2025 15:20:25 +0100

Download raw body.

Thread
On Tue, Mar 04, 2025 at 08:02:31PM +0100, Jan Klemkow wrote:
> On Fri, Nov 15, 2024 at 11:30:08AM GMT, Jan Klemkow wrote:
> > On Thu, Nov 07, 2024 at 11:30:26AM GMT, David Gwynne wrote:
> > > On Thu, Nov 07, 2024 at 01:10:10AM +0100, Jan Klemkow wrote:
> > > > This diff introduces a software solution for TCP Large Receive Offload
> > > > (SoftLRO) for network interfaces that don't have hardware support for it.
> > > > This is needed at least for newer Intel interfaces as their
> > > > documentation said that LRO a.k.a. Receive Side Coalescing (RSC) has to
> > > > be done by software.
> > > > This diff coalesces TCP segments during the receive interrupt before
> > > > queueing them.  Thus, our TCP/IP stack has to process less packet
> > > > headers per amount of received data.
> > > > 
> > > > I measured receiving performance with Intel XXV710 25 GbE interfaces.
> > > > It increased from 6 Gbit/s to 23 Gbit/s.
> > > > 
> > > > Even if we saturate em(4) without any of these techniques, it's also part
> > > > of this diff.  I'm interested if this diff helps to reach 1 Gbit/s on old
> > > > or slow hardware.
> > > > 
> > > > I also add bnxt(4) to this diff to increase test coverage.  If you want
> > > > to tests this implementation with your favorite interface, just replace
> > > > the ml_enqueue() call with the new tcp_softlro_enqueue() (as seen
> > > > below).  It should work with all kinds of network interfaces.
> > > > 
> > > > Any comments and tests reports are welcome.
> > > 
> > > nice.
> > > 
> > > i would argue this should be ether_softlro_enqueue and put in
> > > if_ethersubr.c because it's specific to ethernet interfaces. we don't
> > > really have any other type of interface that bundles reception of
> > > packets that we can take advantage of like this, and internally it
> > > assumes it's pulling ethernet packets apart.
> > > 
> > > aside from that, just a few comments on the code.
> > 
> > I adapted your comments in the diff below.
> 
> I refactored the SoftLRO diff.  You just need to add the flags IFXF_LRO
> / IFCAP_LRO, and replace ml_enqueue() with tcp_softlro_enqueue() to
> enable this on your favorite network device.
> 
> Janne: I adjusted your diff with correct headers.  But, I'm unable to
> test this part of the diff below, due to lack of hardware.  Could you
> test it again?
> 
> Yuichiro: Could you also retest your UDP/TCP forwarding test?  I added a
> short path for non-TCP packets in the ixl(4) driver.  Maybe it's better
> now.
> 
> Further tests and comments are welcome.

As release lock will come soon, we should be careful.  I think we
can add the code if we ensure that default behavior does not change
anything.

We should start with ixl(4), ifconfig tcplro flag disabled per
default.  Then we can improve the TCP code in tree and add other
drivers later.  I found some issues like ethernet padding and
TCP option parsing that should be improved.  But doing this
on top of an initial commit is easier.

With some minor changes I am OK with committing the diff below.

- remove all drivers except ixl(4) from the diff
- turn off ifconfig tcplro per default
- make sure that ixl(4) logic does not change, if tcplro is off
- I have fixed the livelocked logic in ixl(4)
- I renamed the function to tcp_enqueue_lro(), it is more consistent
  with the TSO functions.

As said before, tcp_softlro() needs more love, but I want to
do this in tree.

With my comments, jan@'s diff looks like this and would be OK bluhm@

Index: dev/pci/if_ixl.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
diff -u -p -r1.102 if_ixl.c
--- dev/pci/if_ixl.c	30 Oct 2024 18:02:45 -0000	1.102
+++ dev/pci/if_ixl.c	20 Mar 2025 10:29:12 -0000
@@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 {
 
 #define IXL_RX_DESC_PTYPE_SHIFT		30
 #define IXL_RX_DESC_PTYPE_MASK		(0xffULL << IXL_RX_DESC_PTYPE_SHIFT)
+#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP	26
+#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP	92
 
 #define IXL_RX_DESC_PLEN_SHIFT		38
 #define IXL_RX_DESC_PLEN_MASK		(0x3fffULL << IXL_RX_DESC_PLEN_SHIFT)
@@ -1975,6 +1977,11 @@ ixl_attach(struct device *parent, struct
 	    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
 	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
 	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
+	ifp->if_capabilities |= IFCAP_LRO;
+#if 0
+	/* for now tcplro at ixl(4) is default off */
+	ifp->if_xflags |= IFXF_LRO;
+#endif
 
 	ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
 
@@ -3255,9 +3262,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i
 	struct ixl_rx_map *rxm;
 	bus_dmamap_t map;
 	unsigned int cons, prod;
+	struct mbuf_list mltcp = MBUF_LIST_INITIALIZER();
 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
 	struct mbuf *m;
 	uint64_t word;
+	unsigned int ptype;
 	unsigned int len;
 	unsigned int mask;
 	int done = 0;
@@ -3294,6 +3303,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i
 		m = rxm->rxm_m;
 		rxm->rxm_m = NULL;
 
+		ptype = (word & IXL_RX_DESC_PTYPE_MASK)
+		    >> IXL_RX_DESC_PTYPE_SHIFT;
 		len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT;
 		m->m_len = len;
 		m->m_pkthdr.len = 0;
@@ -3314,7 +3325,6 @@ ixl_rxeof(struct ixl_softc *sc, struct i
 					    lemtoh32(&rxd->filter_status);
 					m->m_pkthdr.csum_flags |= M_FLOWID;
 				}
-
 #if NVLAN > 0
 				if (ISSET(word, IXL_RX_DESC_L2TAG1P)) {
 					m->m_pkthdr.ether_vtag =
@@ -3322,9 +3332,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
 					SET(m->m_flags, M_VLANTAG);
 				}
 #endif
-
 				ixl_rx_checksum(m, word);
-				ml_enqueue(&ml, m);
+
+				if (ISSET(ifp->if_xflags, IFXF_LRO) &&
+				    (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP ||
+				    ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP))
+					tcp_enqueue_lro(&mltcp, m);
+				else
+					ml_enqueue(&ml, m);
 			} else {
 				ifp->if_ierrors++; /* XXX */
 				m_freem(m);
@@ -3341,8 +3356,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
 	} while (cons != prod);
 
 	if (done) {
+		int livelocked = 0;
+
 		rxr->rxr_cons = cons;
+		if (ifiq_input(ifiq, &mltcp))
+			livelocked = 1;
 		if (ifiq_input(ifiq, &ml))
+			livelocked = 1;
+		if (livelocked)
 			if_rxr_livelocked(&rxr->rxr_acct);
 		ixl_rxfill(sc, rxr);
 	}
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.434 tcp_input.c
--- netinet/tcp_input.c	10 Mar 2025 15:11:46 -0000	1.434
+++ netinet/tcp_input.c	20 Mar 2025 10:29:12 -0000
@@ -84,6 +84,7 @@
 #include <net/if_var.h>
 #include <net/route.h>
 
+#include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
@@ -4229,4 +4230,230 @@ syn_cache_respond(struct syn_cache *sc, 
 	}
 	in_pcbunref(inp);
 	return (error);
+}
+
+int
+tcp_softlro(struct mbuf *mhead, struct mbuf *mtail)
+{
+	struct ether_extracted	 head;
+	struct ether_extracted	 tail;
+	struct mbuf		*m;
+	unsigned int		 hdrlen;
+
+	/*
+	 * Check if head and tail are mergeable
+	 */
+
+	ether_extract_headers(mhead, &head);
+	ether_extract_headers(mtail, &tail);
+
+	/* Don't merge packets of different VLANs */
+	if (head.evh && tail.evh) {
+		if (head.evh->evl_tag != tail.evh->evl_tag)
+			return 0;
+	} else if (head.evh || tail.evh)
+		return 0;
+
+	/* Check IP header. */
+	if (head.ip4 && tail.ip4) {
+		/* Don't merge packets with invalid header checksum. */
+		if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
+		    !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK))
+			return 0;
+
+		/* Check IPv4 addresses. */
+		if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr ||
+		    head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr)
+			return 0;
+
+		/* Don't merge IPv4 fragments. */
+		if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) ||
+		    ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF)))
+			return 0;
+
+		/* Check max. IPv4 length. */
+		if (head.iplen + tail.iplen > IP_MAXPACKET)
+			return 0;
+
+		/* Don't merge IPv4 packets with option headers. */
+		if (head.iphlen != sizeof(struct ip) ||
+		    tail.iphlen != sizeof(struct ip))
+			return 0;
+
+		/* Don't merge non-TCP packets. */
+		if (head.ip4->ip_p != IPPROTO_TCP ||
+		    tail.ip4->ip_p != IPPROTO_TCP)
+			return 0;
+	} else if (head.ip6 && tail.ip6) {
+		/* Check IPv6 addresses. */
+		if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) ||
+		    !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst))
+			return 0;
+
+		/* Check max. IPv6 length. */
+		if ((head.iplen - head.iphlen) +
+		    (tail.iplen - tail.iphlen) > IPV6_MAXPACKET)
+			return 0;
+
+		/* Don't merge IPv6 packets with option headers nor non-TCP. */
+		if (head.ip6->ip6_nxt != IPPROTO_TCP ||
+		    tail.ip6->ip6_nxt != IPPROTO_TCP)
+			return 0;
+	} else {
+		return 0;
+	}
+
+	/* Check TCP header. */
+	if (!head.tcp || !tail.tcp)
+		return 0;
+
+	/* Check TCP ports. */
+	if (head.tcp->th_sport != tail.tcp->th_sport ||
+	    head.tcp->th_dport != tail.tcp->th_dport)
+		return 0;
+
+	/* Don't merge empty segments. */
+	if (head.paylen == 0 || tail.paylen == 0)
+		return 0;
+
+	/* Check for contiguous segments. */
+	if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq))
+		return 0;
+
+	/* Just ACK and PUSH TCP flags are allowed. */
+	if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) ||
+	    ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH)))
+		return 0;
+
+	/* TCP ACK flag has to be set. */
+	if (!ISSET(head.tcp->th_flags, TH_ACK) ||
+	    !ISSET(tail.tcp->th_flags, TH_ACK))
+		return 0;
+
+	/* Ignore segments with different TCP options. */
+	if (head.tcphlen - sizeof(struct tcphdr) !=
+	    tail.tcphlen - sizeof(struct tcphdr))
+		return 0;
+
+	/* Check for TCP options */
+	if (head.tcphlen > sizeof(struct tcphdr)) {
+		char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr);
+		char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr);
+		int optsize = head.tcphlen - sizeof(struct tcphdr);
+		int optlen;
+
+		for (; optsize > 0; optsize -= optlen) {
+			/* Ignore segments with different TCP options. */
+			if (hopt[0] != topt[0] || hopt[1] != topt[1])
+				return 0;
+
+			/* Get option length */
+			optlen = hopt[1];
+			if (hopt[0] == TCPOPT_NOP)
+				optlen = 1;
+			else if (optlen < 2 || optlen > optsize)
+				return 0;	/* Illegal length */
+
+			if (hopt[0] != TCPOPT_NOP &&
+			    hopt[0] != TCPOPT_TIMESTAMP)
+				return 0;	/* Unsupported TCP option */
+
+			hopt += optlen;
+			topt += optlen;
+		}
+	}
+
+	/*
+	 * Prepare concatenation of head and tail.
+	 */
+
+	/* Adjust IP header. */
+	if (head.ip4) {
+		head.ip4->ip_len = htons(head.iplen + tail.paylen);
+	} else if (head.ip6) {
+		head.ip6->ip6_plen =
+		    htons(head.iplen - head.iphlen + tail.paylen);
+	}
+
+	/* Combine TCP flags from head and tail. */
+	if (ISSET(tail.tcp->th_flags, TH_PUSH))
+		SET(head.tcp->th_flags, TH_PUSH);
+
+	/* Adjust TCP header. */
+	head.tcp->th_win = tail.tcp->th_win;
+	head.tcp->th_ack = tail.tcp->th_ack;
+
+	/* Calculate header length of tail packet. */
+	hdrlen = sizeof(*tail.eh);
+	if (tail.evh)
+		hdrlen = sizeof(*tail.evh);
+	hdrlen += tail.iphlen;
+	hdrlen += tail.tcphlen;
+
+	/* Skip protocol headers in tail. */
+	m_adj(mtail, hdrlen);
+	CLR(mtail->m_flags, M_PKTHDR);
+
+	/* Concatenate */
+	for (m = mhead; m->m_next;)
+		m = m->m_next;
+	m->m_next = mtail;
+	mhead->m_pkthdr.len += tail.paylen;
+
+	/* Flag mbuf as TSO packet with MSS. */
+	if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) {
+		/* Set CSUM_OUT flags in case of forwarding. */
+		SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+		head.tcp->th_sum = 0;
+		if (head.ip4) {
+			SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
+			head.ip4->ip_sum = 0;
+		}
+
+		SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO);
+		mhead->m_pkthdr.ph_mss = head.paylen;
+		tcpstat_inc(tcps_inhwlro);
+		tcpstat_inc(tcps_inpktlro);	/* count head */
+	}
+	mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen);
+	tcpstat_inc(tcps_inpktlro);	/* count tail */
+
+	return 1;
+}
+
+void
+tcp_enqueue_lro(struct mbuf_list *ml, struct mbuf *mtail)
+{
+	struct mbuf *mhead;
+
+	/* Don't merge packets with invalid header checksum. */
+	if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
+		goto out;
+
+	for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) {
+		/* Don't merge packets with invalid header checksum. */
+		if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
+			continue;
+
+		/* Use RSS hash to skip packets of different connections. */
+		if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) &&
+		    ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) &&
+		    mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid)
+			continue;
+
+		/* Don't merge packets of different VLANs */
+		if (ISSET(mhead->m_flags, M_VLANTAG) !=
+		    ISSET(mtail->m_flags, M_VLANTAG))
+			continue;
+
+		if (ISSET(mhead->m_flags, M_VLANTAG) &&
+		    EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) !=
+		    EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag))
+			continue;
+
+		if (tcp_softlro(mhead, mtail))
+			return;
+	}
+ out:
+	ml_enqueue(ml, mtail);
 }
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.186 tcp_var.h
--- netinet/tcp_var.h	2 Mar 2025 21:28:32 -0000	1.186
+++ netinet/tcp_var.h	20 Mar 2025 10:29:12 -0000
@@ -720,6 +720,7 @@ void	 tcp_init(void);
 int	 tcp_input(struct mbuf **, int *, int, int, struct netstack *);
 int	 tcp_mss(struct tcpcb *, int);
 void	 tcp_mss_update(struct tcpcb *);
+void	 tcp_enqueue_lro(struct mbuf_list *, struct mbuf *);
 u_int	 tcp_hdrsz(struct tcpcb *);
 void	 tcp_mtudisc(struct inpcb *, int);
 void	 tcp_mtudisc_increase(struct inpcb *, int);