Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: SoftLRO for ixl(4), bnxt(4) and em(4)
To:
Jan Klemkow <jan@openbsd.org>
Cc:
tech@openbsd.org, Janne Johansson <icepic.dz@gmail.com>, Yuichiro NAITO <naito.yuichiro@gmail.com>
Date:
Wed, 16 Apr 2025 18:05:53 +0200

Download raw body.

Thread
On Sat, Apr 05, 2025 at 02:22:13PM +0200, Jan Klemkow wrote:
> This follwing diff adds checks to VLAN priorities.

My tests did not find problems anymore.  I would suggest some
improvement, but they can be better addressed in tree.  We should
commit what we have for ixl(4) only, and tcplro disabled per default.
Then fixing and testing will be easier.

OK bluhm@

> Index: dev/pci/if_ixl.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
> diff -u -p -r1.102 if_ixl.c
> --- dev/pci/if_ixl.c	30 Oct 2024 18:02:45 -0000	1.102
> +++ dev/pci/if_ixl.c	3 Apr 2025 15:45:43 -0000
> @@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 {
>  
>  #define IXL_RX_DESC_PTYPE_SHIFT		30
>  #define IXL_RX_DESC_PTYPE_MASK		(0xffULL << IXL_RX_DESC_PTYPE_SHIFT)
> +#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP	26
> +#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP	92
>  
>  #define IXL_RX_DESC_PLEN_SHIFT		38
>  #define IXL_RX_DESC_PLEN_MASK		(0x3fffULL << IXL_RX_DESC_PLEN_SHIFT)
> @@ -1976,6 +1978,12 @@ ixl_attach(struct device *parent, struct
>  	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
>  	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>  
> +	ifp->if_capabilities |= IFCAP_LRO;
> +#if notyet
> +	/* for now tcplro at ixl(4) is default off */
> +	ifp->if_xflags |= IFXF_LRO;
> +#endif
> +
>  	ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
>  
>  	ixl_media_add(sc, phy_types);
> @@ -3255,9 +3263,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>  	struct ixl_rx_map *rxm;
>  	bus_dmamap_t map;
>  	unsigned int cons, prod;
> +	struct mbuf_list mltcp = MBUF_LIST_INITIALIZER();
>  	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
>  	struct mbuf *m;
>  	uint64_t word;
> +	unsigned int ptype;
>  	unsigned int len;
>  	unsigned int mask;
>  	int done = 0;
> @@ -3294,6 +3304,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>  		m = rxm->rxm_m;
>  		rxm->rxm_m = NULL;
>  
> +		ptype = (word & IXL_RX_DESC_PTYPE_MASK)
> +		    >> IXL_RX_DESC_PTYPE_SHIFT;
>  		len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT;
>  		m->m_len = len;
>  		m->m_pkthdr.len = 0;
> @@ -3324,7 +3336,13 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>  #endif
>  
>  				ixl_rx_checksum(m, word);
> -				ml_enqueue(&ml, m);
> +
> +				if (ISSET(ifp->if_xflags, IFXF_LRO) &&
> +				    (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP ||
> +				     ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP))
> +					tcp_softlro_enqueue(ifp, &mltcp, m);
> +				else
> +					ml_enqueue(&ml, m);
>  			} else {
>  				ifp->if_ierrors++; /* XXX */
>  				m_freem(m);
> @@ -3341,8 +3359,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>  	} while (cons != prod);
>  
>  	if (done) {
> +		int livelocked = 0;
> +
>  		rxr->rxr_cons = cons;
> +		if (ifiq_input(ifiq, &mltcp))
> +			livelocked = 1;
>  		if (ifiq_input(ifiq, &ml))
> +			livelocked = 1;
> +		if (livelocked)
>  			if_rxr_livelocked(&rxr->rxr_acct);
>  		ixl_rxfill(sc, rxr);
>  	}
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.434 tcp_input.c
> --- netinet/tcp_input.c	10 Mar 2025 15:11:46 -0000	1.434
> +++ netinet/tcp_input.c	5 Apr 2025 11:46:19 -0000
> @@ -84,6 +84,7 @@
>  #include <net/if_var.h>
>  #include <net/route.h>
>  
> +#include <netinet/if_ether.h>
>  #include <netinet/in.h>
>  #include <netinet/ip.h>
>  #include <netinet/in_pcb.h>
> @@ -4229,4 +4230,256 @@ syn_cache_respond(struct syn_cache *sc, 
>  	}
>  	in_pcbunref(inp);
>  	return (error);
> +}
> +
> +int
> +tcp_softlro(struct mbuf *mhead, struct mbuf *mtail)
> +{
> +	struct ether_extracted	 head;
> +	struct ether_extracted	 tail;
> +	struct mbuf		*m;
> +	unsigned int		 hdrlen;
> +	unsigned int		 cnt = 0;
> +
> +	/*
> +	 * Check if head and tail are mergeable
> +	 */
> +
> +	ether_extract_headers(mhead, &head);
> +	ether_extract_headers(mtail, &tail);
> +
> +	/* Don't merge packets inside and outside of VLANs */
> +	if (head.evh && tail.evh) {
> +		/* Don't merge packets of different VLANs */
> +		if (EVL_VLANOFTAG(head.evh->evl_tag) !=
> +		    EVL_VLANOFTAG(tail.evh->evl_tag))
> +			return 0;
> +
> +		/* Don't merge packets of different priorities */
> +		if (EVL_PRIOFTAG(head.evh->evl_tag) !=
> +		    EVL_PRIOFTAG(tail.evh->evl_tag))
> +			return 0;
> +	} else if (head.evh || tail.evh)
> +		return 0;
> +
> +	/* Check IP header. */
> +	if (head.ip4 && tail.ip4) {
> +		/* Don't merge packets with invalid header checksum. */
> +		if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
> +		    !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK))
> +			return 0;
> +
> +		/* Check IPv4 addresses. */
> +		if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr ||
> +		    head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr)
> +			return 0;
> +
> +		/* Don't merge IPv4 fragments. */
> +		if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) ||
> +		    ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF)))
> +			return 0;
> +
> +		/* Check max. IPv4 length. */
> +		if (head.iplen + tail.iplen > IP_MAXPACKET)
> +			return 0;
> +
> +		/* Don't merge IPv4 packets with option headers. */
> +		if (head.iphlen != sizeof(struct ip) ||
> +		    tail.iphlen != sizeof(struct ip))
> +			return 0;
> +
> +		/* Don't non-TCP packets. */
> +		if (head.ip4->ip_p != IPPROTO_TCP ||
> +		    tail.ip4->ip_p != IPPROTO_TCP)
> +			return 0;
> +	} else if (head.ip6 && tail.ip6) {
> +		/* Check IPv6 addresses. */
> +		if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) ||
> +		    !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst))
> +			return 0;
> +
> +		/* Check max. IPv6 length. */
> +		if ((head.iplen - head.iphlen) +
> +		    (tail.iplen - tail.iphlen) > IPV6_MAXPACKET)
> +			return 0;
> +
> +		/* Don't merge IPv6 packets with option headers nor non-TCP. */
> +		if (head.ip6->ip6_nxt != IPPROTO_TCP ||
> +		    tail.ip6->ip6_nxt != IPPROTO_TCP)
> +			return 0;
> +	} else {
> +		return 0;
> +	}
> +
> +	/* Check TCP header. */
> +	if (!head.tcp || !tail.tcp)
> +		return 0;
> +
> +	/* Check TCP ports. */
> +	if (head.tcp->th_sport != tail.tcp->th_sport ||
> +	    head.tcp->th_dport != tail.tcp->th_dport)
> +		return 0;
> +
> +	/* Don't merge empty segments. */
> +	if (head.paylen == 0 || tail.paylen == 0)
> +		return 0;
> +
> +	/* Check for continues segments. */
> +	if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq))
> +		return 0;
> +
> +	/* Just ACK and PUSH TCP flags are allowed. */
> +	if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) ||
> +	    ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH)))
> +		return 0;
> +
> +	/* TCP ACK flag has to be set. */
> +	if (!ISSET(head.tcp->th_flags, TH_ACK) ||
> +	    !ISSET(tail.tcp->th_flags, TH_ACK))
> +		return 0;
> +
> +	/* Ignore segments with different TCP options. */
> +	if (head.tcphlen - sizeof(struct tcphdr) !=
> +	    tail.tcphlen - sizeof(struct tcphdr))
> +		return 0;
> +
> +	/* Check for TCP options */
> +	if (head.tcphlen > sizeof(struct tcphdr)) {
> +		char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr);
> +		char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr);
> +		int optsize = head.tcphlen - sizeof(struct tcphdr);
> +		int optlen;
> +
> +		for (; optsize > 0; optsize -= optlen) {
> +			/* Ignore segments with different TCP options. */
> +			if (hopt[0] != topt[0] || hopt[1] != topt[1])
> +				return 0;
> +
> +			/* Get option length */
> +			optlen = hopt[1];
> +			if (hopt[0] == TCPOPT_NOP)
> +				optlen = 1;
> +			else if (optlen < 2 || optlen > optsize)
> +				return 0;	/* Illegal length */
> +
> +			if (hopt[0] != TCPOPT_NOP &&
> +			    hopt[0] != TCPOPT_TIMESTAMP)
> +				return 0;	/* Unsupported TCP option */
> +
> +			hopt += optlen;
> +			topt += optlen;
> +		}
> +	}
> +
> +	/* Limit mbuf chain len to avoid m_defrag calls on forwarding. */
> +	for (m = mhead; m != NULL; m = m->m_next)
> +		if (cnt++ >= 8)
> +			return 0;
> +	for (m = mtail; m != NULL; m = m->m_next)
> +		if (cnt++ >= 8)
> +			return 0;
> +
> +	/*
> +	 * Prepare concatenation of head and tail.
> +	 */
> +
> +	/* Adjust IP header. */
> +	if (head.ip4) {
> +		head.ip4->ip_len = htons(head.iplen + tail.paylen);
> +	} else if (head.ip6) {
> +		head.ip6->ip6_plen =
> +		    htons(head.iplen - head.iphlen + tail.paylen);
> +	}
> +
> +	/* Combine TCP flags from head and tail. */
> +	if (ISSET(tail.tcp->th_flags, TH_PUSH))
> +		SET(head.tcp->th_flags, TH_PUSH);
> +
> +	/* Adjust TCP header. */
> +	head.tcp->th_win = tail.tcp->th_win;
> +	head.tcp->th_ack = tail.tcp->th_ack;
> +
> +	/* Calculate header length of tail packet. */
> +	hdrlen = sizeof(*tail.eh);
> +	if (tail.evh)
> +		hdrlen = sizeof(*tail.evh);
> +	hdrlen += tail.iphlen;
> +	hdrlen += tail.tcphlen;
> +
> +	/* Skip protocol headers in tail. */
> +	m_adj(mtail, hdrlen);
> +	CLR(mtail->m_flags, M_PKTHDR);
> +
> +	/* Concatenate */
> +	for (m = mhead; m->m_next;)
> +		m = m->m_next;
> +	m->m_next = mtail;
> +	mhead->m_pkthdr.len += tail.paylen;
> +
> +	/* Flag mbuf as TSO packet with MSS. */
> +	if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) {
> +		/* Set CSUM_OUT flags in case of forwarding. */
> +		SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> +		head.tcp->th_sum = 0;
> +		if (head.ip4) {
> +			SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
> +			head.ip4->ip_sum = 0;
> +		}
> +
> +		SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO);
> +		mhead->m_pkthdr.ph_mss = head.paylen;
> +		tcpstat_inc(tcps_inswlro);
> +		tcpstat_inc(tcps_inpktlro);	/* count head */
> +	}
> +	mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen);
> +	tcpstat_inc(tcps_inpktlro);	/* count tail */
> +
> +	return 1;
> +}
> +
> +void
> +tcp_softlro_enqueue(struct ifnet *ifp, struct mbuf_list *ml, struct mbuf *mtail)
> +{
> +	struct mbuf *mhead;
> +
> +	if (!ISSET(ifp->if_xflags, IFXF_LRO))
> +		goto out;
> +
> +	/* Don't merge packets with invalid header checksum. */
> +	if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
> +		goto out;
> +
> +	for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) {
> +		/* Don't merge packets with invalid header checksum. */
> +		if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
> +			continue;
> +
> +		/* Use RSS hash to skip packets of different connections. */
> +		if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) &&
> +		    ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) &&
> +		    mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid)
> +			continue;
> +
> +		/* Don't merge packets inside and outside of VLANs */
> +		if (ISSET(mhead->m_flags, M_VLANTAG) !=
> +		    ISSET(mtail->m_flags, M_VLANTAG))
> +			continue;
> +
> +		if (ISSET(mhead->m_flags, M_VLANTAG)) {
> +			/* Don't merge packets of different VLANs */
> +			if (EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) !=
> +			    EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag))
> +				continue;
> +
> +			/* Don't merge packets of different priorities */
> +			if (EVL_PRIOFTAG(mhead->m_pkthdr.ether_vtag) !=
> +			    EVL_PRIOFTAG(mtail->m_pkthdr.ether_vtag))
> +				continue;
> +		}
> +
> +		if (tcp_softlro(mhead, mtail))
> +			return;
> +	}
> + out:
> +	ml_enqueue(ml, mtail);
>  }
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.186 tcp_var.h
> --- netinet/tcp_var.h	2 Mar 2025 21:28:32 -0000	1.186
> +++ netinet/tcp_var.h	3 Apr 2025 15:43:55 -0000
> @@ -720,6 +720,7 @@ void	 tcp_init(void);
>  int	 tcp_input(struct mbuf **, int *, int, int, struct netstack *);
>  int	 tcp_mss(struct tcpcb *, int);
>  void	 tcp_mss_update(struct tcpcb *);
> +void	 tcp_softlro_enqueue(struct ifnet *, struct mbuf_list *, struct mbuf *);
>  u_int	 tcp_hdrsz(struct tcpcb *);
>  void	 tcp_mtudisc(struct inpcb *, int);
>  void	 tcp_mtudisc_increase(struct inpcb *, int);