Index | Thread | Search

From:
Mark Patruck <mark@wrapped.cx>
Subject:
Re: SoftLRO for ixl(4), bnxt(4) and em(4)
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
Jan Klemkow <jan@openbsd.org>, tech@openbsd.org, Janne Johansson <icepic.dz@gmail.com>, Yuichiro NAITO <naito.yuichiro@gmail.com>
Date:
Wed, 26 Mar 2025 11:33:56 +0100

Download raw body.

Thread
  • Mark Patruck:

    SoftLRO for ixl(4), bnxt(4) and em(4)

  • On 20.03.2025 15:20, Alexander Bluhm wrote:
    >As release lock will come soon, we should be careful.  I think we
    >can add the code if we ensure that default behavior does not change
    >anything.
    >
    >We should start with ixl(4), ifconfig tcplro flag disabled per
    >default.  Then we can improve the TCP code in tree and add other
    >drivers later.  I found some issues like ethernet padding and
    >TCP option parsing that should be improved.  But doing this
    >on top of an initial commit is easier.
    >
    >With some minor changes I am OK with commiting the diff below.
    >
    >- remove all drivers except ixl(4) from the diff
    >- turn off ifconfig tcplro per default
    >- make sure that ixl(4) logic does not change, if tcplro is off
    >- I have fixed the livelocked logic in ixl(4)
    >- I renamed the function to tcp_enqueue_lro(), it is more consistent
    >  with the TSO functions.
    >
    >As said before, tcp_softlro() needs more love, but it want to
    >do this in tree.
    >
    >With my comments, jan@'s diff looks like this and would be OK bluhm@
    
    No real issues spotted on a busy ixl(4)/carp(4) setup for the last four days.
    
    I did a few rounds of tcpbench between ixl(4)->vio(4) and the throughput
    is more or less constantly good (around 9Gb/s) and also mbufs are released
    directly after stopping and go back to the previous state.
    
    I still see some very low values, although never during a "good" run, that
    means, if the first 2,3 values are ~ 9Gb/s it never passes <8Gb/s, whereas
    sometimes it starts with 5,6 Gb/s and then never reaches the top, f.e
    
    # good run (most of the time)
       elapsed_ms          bytes         mbps   bwidth 
             1000      992793140     7942.345  100.00% 
    Conn:   1 Mbps:     7942.345 Peak Mbps:     7942.345 Avg Mbps:     7942.345
             2000     1174009396     9392.075  100.00% 
    Conn:   1 Mbps:     9392.075 Peak Mbps:     9392.075 Avg Mbps:     9392.075
             3000     1173654672     9389.237  100.00% 
    Conn:   1 Mbps:     9389.237 Peak Mbps:     9392.075 Avg Mbps:     9389.237
             4000     1161408036     9300.565  100.00% 
    Conn:   1 Mbps:     9300.565 Peak Mbps:     9392.075 Avg Mbps:     9300.565
             5000     1173661004     9389.288  100.00% 
    Conn:   1 Mbps:     9389.288 Peak Mbps:     9392.075 Avg Mbps:     9389.288
             6000     1173847200     9390.778  100.00% 
    Conn:   1 Mbps:     9390.778 Peak Mbps:     9392.075 Avg Mbps:     9390.778
             7000     1173698228     9389.586  100.00% 
    Conn:   1 Mbps:     9389.586 Peak Mbps:     9392.075 Avg Mbps:     9389.586
    ....continues like this....
    
    
    # bad run (every now and then)
       elapsed_ms          bytes         mbps   bwidth                                              
             1000      236474968     1891.800  100.00%                                              
    Conn:   1 Mbps:     1891.800 Peak Mbps:     1891.800 Avg Mbps:     1891.800                    
             2000      252852080     2022.817  100.00%                                              
    Conn:   1 Mbps:     2022.817 Peak Mbps:     2022.817 Avg Mbps:     2022.817                    
             3000      544229216     4353.834  100.00%                                              
    Conn:   1 Mbps:     4353.834 Peak Mbps:     4353.834 Avg Mbps:     4353.834
             4000      565089816     4520.719  100.00%                          
    Conn:   1 Mbps:     4520.719 Peak Mbps:     4520.719 Avg Mbps:     4520.719
             5004      413703444     3296.442  100.00%                                              
    Conn:   1 Mbps:     3296.442 Peak Mbps:     4520.719 Avg Mbps:     3296.442                    
             6005      330198224     2641.586  100.00%                                              
    Conn:   1 Mbps:     2641.586 Peak Mbps:     4520.719 Avg Mbps:     2641.586
    
    
    >Index: dev/pci/if_ixl.c
    >===================================================================
    >RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
    >diff -u -p -r1.102 if_ixl.c
    >--- dev/pci/if_ixl.c	30 Oct 2024 18:02:45 -0000	1.102
    >+++ dev/pci/if_ixl.c	20 Mar 2025 10:29:12 -0000
    >@@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 {
    >
    > #define IXL_RX_DESC_PTYPE_SHIFT		30
    > #define IXL_RX_DESC_PTYPE_MASK		(0xffULL << IXL_RX_DESC_PTYPE_SHIFT)
    >+#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP	26
    >+#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP	92
    >
    > #define IXL_RX_DESC_PLEN_SHIFT		38
    > #define IXL_RX_DESC_PLEN_MASK		(0x3fffULL << IXL_RX_DESC_PLEN_SHIFT)
    >@@ -1975,6 +1977,11 @@ ixl_attach(struct device *parent, struct
    > 	    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
    > 	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
    > 	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
    >+	ifp->if_capabilities |= IFCAP_LRO;
    >+#if 0
    >+	/* for now tcplro at ixl(4) is default off */
    >+	ifp->if_xflags |= IFXF_LRO;
    >+#endif
    >
    > 	ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
    >
    >@@ -3255,9 +3262,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i
    > 	struct ixl_rx_map *rxm;
    > 	bus_dmamap_t map;
    > 	unsigned int cons, prod;
    >+	struct mbuf_list mltcp = MBUF_LIST_INITIALIZER();
    > 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
    > 	struct mbuf *m;
    > 	uint64_t word;
    >+	unsigned int ptype;
    > 	unsigned int len;
    > 	unsigned int mask;
    > 	int done = 0;
    >@@ -3294,6 +3303,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i
    > 		m = rxm->rxm_m;
    > 		rxm->rxm_m = NULL;
    >
    >+		ptype = (word & IXL_RX_DESC_PTYPE_MASK)
    >+		    >> IXL_RX_DESC_PTYPE_SHIFT;
    > 		len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT;
    > 		m->m_len = len;
    > 		m->m_pkthdr.len = 0;
    >@@ -3314,7 +3325,6 @@ ixl_rxeof(struct ixl_softc *sc, struct i
    > 					    lemtoh32(&rxd->filter_status);
    > 					m->m_pkthdr.csum_flags |= M_FLOWID;
    > 				}
    >-
    > #if NVLAN > 0
    > 				if (ISSET(word, IXL_RX_DESC_L2TAG1P)) {
    > 					m->m_pkthdr.ether_vtag =
    >@@ -3322,9 +3332,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
    > 					SET(m->m_flags, M_VLANTAG);
    > 				}
    > #endif
    >-
    > 				ixl_rx_checksum(m, word);
    >-				ml_enqueue(&ml, m);
    >+
    >+				if (ISSET(ifp->if_xflags, IFXF_LRO) &&
    >+				    (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP ||
    >+				    ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP))
    >+					tcp_enqueue_lro(&mltcp, m);
    >+				else
    >+					ml_enqueue(&ml, m);
    > 			} else {
    > 				ifp->if_ierrors++; /* XXX */
    > 				m_freem(m);
    >@@ -3341,8 +3356,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
    > 	} while (cons != prod);
    >
    > 	if (done) {
    >+		int livelocked = 0;
    >+
    > 		rxr->rxr_cons = cons;
    >+		if (ifiq_input(ifiq, &mltcp))
    >+			livelocked = 1;
    > 		if (ifiq_input(ifiq, &ml))
    >+			livelocked = 1;
    >+		if (livelocked)
    > 			if_rxr_livelocked(&rxr->rxr_acct);
    > 		ixl_rxfill(sc, rxr);
    > 	}
    >Index: netinet/tcp_input.c
    >===================================================================
    >RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
    >diff -u -p -r1.434 tcp_input.c
    >--- netinet/tcp_input.c	10 Mar 2025 15:11:46 -0000	1.434
    >+++ netinet/tcp_input.c	20 Mar 2025 10:29:12 -0000
    >@@ -84,6 +84,7 @@
    > #include <net/if_var.h>
    > #include <net/route.h>
    >
    >+#include <netinet/if_ether.h>
    > #include <netinet/in.h>
    > #include <netinet/ip.h>
    > #include <netinet/in_pcb.h>
    >@@ -4229,4 +4230,230 @@ syn_cache_respond(struct syn_cache *sc,
    > 	}
    > 	in_pcbunref(inp);
    > 	return (error);
    >+}
    >+
    >+int
    >+tcp_softlro(struct mbuf *mhead, struct mbuf *mtail)
    >+{
    >+	struct ether_extracted	 head;
    >+	struct ether_extracted	 tail;
    >+	struct mbuf		*m;
    >+	unsigned int		 hdrlen;
    >+
    >+	/*
    >+	 * Check if head and tail are mergeable
    >+	 */
    >+
    >+	ether_extract_headers(mhead, &head);
    >+	ether_extract_headers(mtail, &tail);
    >+
    >+	/* Don't merge packets of different VLANs */
    >+	if (head.evh && tail.evh) {
    >+		if (head.evh->evl_tag != tail.evh->evl_tag)
    >+			return 0;
    >+	} else if (head.evh || tail.evh)
    >+		return 0;
    >+
    >+	/* Check IP header. */
    >+	if (head.ip4 && tail.ip4) {
    >+		/* Don't merge packets with invalid header checksum. */
    >+		if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
    >+		    !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK))
    >+			return 0;
    >+
    >+		/* Check IPv4 addresses. */
    >+		if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr ||
    >+		    head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr)
    >+			return 0;
    >+
    >+		/* Don't merge IPv4 fragments. */
    >+		if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) ||
    >+		    ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF)))
    >+			return 0;
    >+
    >+		/* Check max. IPv4 length. */
    >+		if (head.iplen + tail.iplen > IP_MAXPACKET)
    >+			return 0;
    >+
    >+		/* Don't merge IPv4 packets with option headers. */
    >+		if (head.iphlen != sizeof(struct ip) ||
    >+		    tail.iphlen != sizeof(struct ip))
    >+			return 0;
    >+
    >+		/* Don't non-TCP packets. */
    >+		if (head.ip4->ip_p != IPPROTO_TCP ||
    >+		    tail.ip4->ip_p != IPPROTO_TCP)
    >+			return 0;
    >+	} else if (head.ip6 && tail.ip6) {
    >+		/* Check IPv6 addresses. */
    >+		if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) ||
    >+		    !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst))
    >+			return 0;
    >+
    >+		/* Check max. IPv6 length. */
    >+		if ((head.iplen - head.iphlen) +
    >+		    (tail.iplen - tail.iphlen) > IPV6_MAXPACKET)
    >+			return 0;
    >+
    >+		/* Don't merge IPv6 packets with option headers nor non-TCP. */
    >+		if (head.ip6->ip6_nxt != IPPROTO_TCP ||
    >+		    tail.ip6->ip6_nxt != IPPROTO_TCP)
    >+			return 0;
    >+	} else {
    >+		return 0;
    >+	}
    >+
    >+	/* Check TCP header. */
    >+	if (!head.tcp || !tail.tcp)
    >+		return 0;
    >+
    >+	/* Check TCP ports. */
    >+	if (head.tcp->th_sport != tail.tcp->th_sport ||
    >+	    head.tcp->th_dport != tail.tcp->th_dport)
    >+		return 0;
    >+
    >+	/* Don't merge empty segments. */
    >+	if (head.paylen == 0 || tail.paylen == 0)
    >+		return 0;
    >+
    >+	/* Check for continues segments. */
    >+	if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq))
    >+		return 0;
    >+
    >+	/* Just ACK and PUSH TCP flags are allowed. */
    >+	if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) ||
    >+	    ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH)))
    >+		return 0;
    >+
    >+	/* TCP ACK flag has to be set. */
    >+	if (!ISSET(head.tcp->th_flags, TH_ACK) ||
    >+	    !ISSET(tail.tcp->th_flags, TH_ACK))
    >+		return 0;
    >+
    >+	/* Ignore segments with different TCP options. */
    >+	if (head.tcphlen - sizeof(struct tcphdr) !=
    >+	    tail.tcphlen - sizeof(struct tcphdr))
    >+		return 0;
    >+
    >+	/* Check for TCP options */
    >+	if (head.tcphlen > sizeof(struct tcphdr)) {
    >+		char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr);
    >+		char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr);
    >+		int optsize = head.tcphlen - sizeof(struct tcphdr);
    >+		int optlen;
    >+
    >+		for (; optsize > 0; optsize -= optlen) {
    >+			/* Ignore segments with different TCP options. */
    >+			if (hopt[0] != topt[0] || hopt[1] != topt[1])
    >+				return 0;
    >+
    >+			/* Get option length */
    >+			optlen = hopt[1];
    >+			if (hopt[0] == TCPOPT_NOP)
    >+				optlen = 1;
    >+			else if (optlen < 2 || optlen > optsize)
    >+				return 0;	/* Illegal length */
    >+
    >+			if (hopt[0] != TCPOPT_NOP &&
    >+			    hopt[0] != TCPOPT_TIMESTAMP)
    >+				return 0;	/* Unsupported TCP option */
    >+
    >+			hopt += optlen;
    >+			topt += optlen;
    >+		}
    >+	}
    >+
    >+	/*
    >+	 * Prepare concatenation of head and tail.
    >+	 */
    >+
    >+	/* Adjust IP header. */
    >+	if (head.ip4) {
    >+		head.ip4->ip_len = htons(head.iplen + tail.paylen);
    >+	} else if (head.ip6) {
    >+		head.ip6->ip6_plen =
    >+		    htons(head.iplen - head.iphlen + tail.paylen);
    >+	}
    >+
    >+	/* Combine TCP flags from head and tail. */
    >+	if (ISSET(tail.tcp->th_flags, TH_PUSH))
    >+		SET(head.tcp->th_flags, TH_PUSH);
    >+
    >+	/* Adjust TCP header. */
    >+	head.tcp->th_win = tail.tcp->th_win;
    >+	head.tcp->th_ack = tail.tcp->th_ack;
    >+
    >+	/* Calculate header length of tail packet. */
    >+	hdrlen = sizeof(*tail.eh);
    >+	if (tail.evh)
    >+		hdrlen = sizeof(*tail.evh);
    >+	hdrlen += tail.iphlen;
    >+	hdrlen += tail.tcphlen;
    >+
    >+	/* Skip protocol headers in tail. */
    >+	m_adj(mtail, hdrlen);
    >+	CLR(mtail->m_flags, M_PKTHDR);
    >+
    >+	/* Concatenate */
    >+	for (m = mhead; m->m_next;)
    >+		m = m->m_next;
    >+	m->m_next = mtail;
    >+	mhead->m_pkthdr.len += tail.paylen;
    >+
    >+	/* Flag mbuf as TSO packet with MSS. */
    >+	if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) {
    >+		/* Set CSUM_OUT flags in case of forwarding. */
    >+		SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
    >+		head.tcp->th_sum = 0;
    >+		if (head.ip4) {
    >+			SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
    >+			head.ip4->ip_sum = 0;
    >+		}
    >+
    >+		SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO);
    >+		mhead->m_pkthdr.ph_mss = head.paylen;
    >+		tcpstat_inc(tcps_inhwlro);
    >+		tcpstat_inc(tcps_inpktlro);	/* count head */
    >+	}
    >+	mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen);
    >+	tcpstat_inc(tcps_inpktlro);	/* count tail */
    >+
    >+	return 1;
    >+}
    >+
    >+void
    >+tcp_enqueue_lro(struct mbuf_list *ml, struct mbuf *mtail)
    >+{
    >+	struct mbuf *mhead;
    >+
    >+	/* Don't merge packets with invalid header checksum. */
    >+	if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
    >+		goto out;
    >+
    >+	for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) {
    >+		/* Don't merge packets with invalid header checksum. */
    >+		if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
    >+			continue;
    >+
    >+		/* Use RSS hash to skip packets of different connections. */
    >+		if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) &&
    >+		    ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) &&
    >+		    mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid)
    >+			continue;
    >+
    >+		/* Don't merge packets of different VLANs */
    >+		if (ISSET(mhead->m_flags, M_VLANTAG) !=
    >+		    ISSET(mtail->m_flags, M_VLANTAG))
    >+			continue;
    >+
    >+		if (ISSET(mhead->m_flags, M_VLANTAG) &&
    >+		    EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) !=
    >+		    EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag))
    >+			continue;
    >+
    >+		if (tcp_softlro(mhead, mtail))
    >+			return;
    >+	}
    >+ out:
    >+	ml_enqueue(ml, mtail);
    > }
    >Index: netinet/tcp_var.h
    >===================================================================
    >RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
    >diff -u -p -r1.186 tcp_var.h
    >--- netinet/tcp_var.h	2 Mar 2025 21:28:32 -0000	1.186
    >+++ netinet/tcp_var.h	20 Mar 2025 10:29:12 -0000
    >@@ -720,6 +720,7 @@ void	 tcp_init(void);
    > int	 tcp_input(struct mbuf **, int *, int, int, struct netstack *);
    > int	 tcp_mss(struct tcpcb *, int);
    > void	 tcp_mss_update(struct tcpcb *);
    >+void	 tcp_enqueue_lro(struct mbuf_list *, struct mbuf *);
    > u_int	 tcp_hdrsz(struct tcpcb *);
    > void	 tcp_mtudisc(struct inpcb *, int);
    > void	 tcp_mtudisc_increase(struct inpcb *, int);
    >
    
    --
    Mark Patruck ( mark at wrapped.cx )
    GPG key 0xF2865E51 / 187F F6D3 EE04 1DCE 1C74  F644 0D3C F66F F286 5E51
      
    https://www.wrapped.cx
    
    
  • Mark Patruck:

    SoftLRO for ixl(4), bnxt(4) and em(4)