From: Alexander Bluhm Subject: Re: SoftLRO for ixl(4), bnxt(4) and em(4) To: Jan Klemkow Cc: tech@openbsd.org, Janne Johansson , Yuichiro NAITO Date: Wed, 16 Apr 2025 18:05:53 +0200 On Sat, Apr 05, 2025 at 02:22:13PM +0200, Jan Klemkow wrote: > This follwing diff adds checks to VLAN priorities. My tests did not find problems anymore. I would suggest some improvement, but they can be better addressed in tree. We should commit what we have for ixl(4) only, and tcplro disabled per default. Then fixing and testing will be easier. OK bluhm@ > Index: dev/pci/if_ixl.c > =================================================================== > RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v > diff -u -p -r1.102 if_ixl.c > --- dev/pci/if_ixl.c 30 Oct 2024 18:02:45 -0000 1.102 > +++ dev/pci/if_ixl.c 3 Apr 2025 15:45:43 -0000 > @@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 { > > #define IXL_RX_DESC_PTYPE_SHIFT 30 > #define IXL_RX_DESC_PTYPE_MASK (0xffULL << IXL_RX_DESC_PTYPE_SHIFT) > +#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP 26 > +#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP 92 > > #define IXL_RX_DESC_PLEN_SHIFT 38 > #define IXL_RX_DESC_PLEN_MASK (0x3fffULL << IXL_RX_DESC_PLEN_SHIFT) > @@ -1976,6 +1978,12 @@ ixl_attach(struct device *parent, struct > IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; > ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; > > + ifp->if_capabilities |= IFCAP_LRO; > +#if notyet > + /* for now tcplro at ixl(4) is default off */ > + ifp->if_xflags |= IFXF_LRO; > +#endif > + > ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); > > ixl_media_add(sc, phy_types); > @@ -3255,9 +3263,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i > struct ixl_rx_map *rxm; > bus_dmamap_t map; > unsigned int cons, prod; > + struct mbuf_list mltcp = MBUF_LIST_INITIALIZER(); > struct mbuf_list ml = MBUF_LIST_INITIALIZER(); > struct mbuf *m; > uint64_t word; > + unsigned int ptype; > unsigned int len; > unsigned int mask; > int done = 0; > @@ -3294,6 +3304,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i > m = rxm->rxm_m; > rxm->rxm_m = NULL; > > + ptype = (word & IXL_RX_DESC_PTYPE_MASK) > + >> IXL_RX_DESC_PTYPE_SHIFT; > len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT; > m->m_len = len; > m->m_pkthdr.len = 0; > @@ -3324,7 +3336,13 @@ ixl_rxeof(struct ixl_softc *sc, struct i > #endif > > ixl_rx_checksum(m, word); > - ml_enqueue(&ml, m); > + > + if (ISSET(ifp->if_xflags, IFXF_LRO) && > + (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP || > + ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP)) > + tcp_softlro_enqueue(ifp, &mltcp, m); > + else > + ml_enqueue(&ml, m); > } else { > ifp->if_ierrors++; /* XXX */ > m_freem(m); > @@ -3341,8 +3359,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i > } while (cons != prod); > > if (done) { > + int livelocked = 0; > + > rxr->rxr_cons = cons; > + if (ifiq_input(ifiq, &mltcp)) > + livelocked = 1; > if (ifiq_input(ifiq, &ml)) > + livelocked = 1; > + if (livelocked) > if_rxr_livelocked(&rxr->rxr_acct); > ixl_rxfill(sc, rxr); > } > Index: netinet/tcp_input.c > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.434 tcp_input.c > --- netinet/tcp_input.c 10 Mar 2025 15:11:46 -0000 1.434 > +++ netinet/tcp_input.c 5 Apr 2025 11:46:19 -0000 > @@ -84,6 +84,7 @@ > #include > #include > > +#include > #include > #include > #include > @@ -4229,4 +4230,256 @@ syn_cache_respond(struct syn_cache *sc, > } > in_pcbunref(inp); > return (error); > +} > + > +int > +tcp_softlro(struct mbuf *mhead, struct mbuf *mtail) > +{ > + struct ether_extracted head; > + struct ether_extracted tail; > + struct mbuf *m; > + unsigned int hdrlen; > + unsigned int cnt = 0; > + > + /* > + * Check if head and tail are mergeable > + */ > + > + ether_extract_headers(mhead, &head); > + ether_extract_headers(mtail, &tail); > + > + /* Don't merge packets inside and outside of VLANs */ > + if (head.evh && tail.evh) { > + /* Don't merge packets of different VLANs */ > + if (EVL_VLANOFTAG(head.evh->evl_tag) != > + EVL_VLANOFTAG(tail.evh->evl_tag)) > + return 0; > + > + /* Don't merge packets of different priorities */ > + if (EVL_PRIOFTAG(head.evh->evl_tag) != > + EVL_PRIOFTAG(tail.evh->evl_tag)) > + return 0; > + } else if (head.evh || tail.evh) > + return 0; > + > + /* Check IP header. */ > + if (head.ip4 && tail.ip4) { > + /* Don't merge packets with invalid header checksum. */ > + if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) || > + !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) > + return 0; > + > + /* Check IPv4 addresses. */ > + if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr || > + head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr) > + return 0; > + > + /* Don't merge IPv4 fragments. */ > + if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) || > + ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF))) > + return 0; > + > + /* Check max. IPv4 length. */ > + if (head.iplen + tail.iplen > IP_MAXPACKET) > + return 0; > + > + /* Don't merge IPv4 packets with option headers. */ > + if (head.iphlen != sizeof(struct ip) || > + tail.iphlen != sizeof(struct ip)) > + return 0; > + > + /* Don't non-TCP packets. */ > + if (head.ip4->ip_p != IPPROTO_TCP || > + tail.ip4->ip_p != IPPROTO_TCP) > + return 0; > + } else if (head.ip6 && tail.ip6) { > + /* Check IPv6 addresses. */ > + if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) || > + !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst)) > + return 0; > + > + /* Check max. IPv6 length. */ > + if ((head.iplen - head.iphlen) + > + (tail.iplen - tail.iphlen) > IPV6_MAXPACKET) > + return 0; > + > + /* Don't merge IPv6 packets with option headers nor non-TCP. */ > + if (head.ip6->ip6_nxt != IPPROTO_TCP || > + tail.ip6->ip6_nxt != IPPROTO_TCP) > + return 0; > + } else { > + return 0; > + } > + > + /* Check TCP header. */ > + if (!head.tcp || !tail.tcp) > + return 0; > + > + /* Check TCP ports. */ > + if (head.tcp->th_sport != tail.tcp->th_sport || > + head.tcp->th_dport != tail.tcp->th_dport) > + return 0; > + > + /* Don't merge empty segments. */ > + if (head.paylen == 0 || tail.paylen == 0) > + return 0; > + > + /* Check for continues segments. */ > + if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq)) > + return 0; > + > + /* Just ACK and PUSH TCP flags are allowed. */ > + if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) || > + ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH))) > + return 0; > + > + /* TCP ACK flag has to be set. */ > + if (!ISSET(head.tcp->th_flags, TH_ACK) || > + !ISSET(tail.tcp->th_flags, TH_ACK)) > + return 0; > + > + /* Ignore segments with different TCP options. */ > + if (head.tcphlen - sizeof(struct tcphdr) != > + tail.tcphlen - sizeof(struct tcphdr)) > + return 0; > + > + /* Check for TCP options */ > + if (head.tcphlen > sizeof(struct tcphdr)) { > + char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr); > + char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr); > + int optsize = head.tcphlen - sizeof(struct tcphdr); > + int optlen; > + > + for (; optsize > 0; optsize -= optlen) { > + /* Ignore segments with different TCP options. */ > + if (hopt[0] != topt[0] || hopt[1] != topt[1]) > + return 0; > + > + /* Get option length */ > + optlen = hopt[1]; > + if (hopt[0] == TCPOPT_NOP) > + optlen = 1; > + else if (optlen < 2 || optlen > optsize) > + return 0; /* Illegal length */ > + > + if (hopt[0] != TCPOPT_NOP && > + hopt[0] != TCPOPT_TIMESTAMP) > + return 0; /* Unsupported TCP option */ > + > + hopt += optlen; > + topt += optlen; > + } > + } > + > + /* Limit mbuf chain len to avoid m_defrag calls on forwarding. */ > + for (m = mhead; m != NULL; m = m->m_next) > + if (cnt++ >= 8) > + return 0; > + for (m = mtail; m != NULL; m = m->m_next) > + if (cnt++ >= 8) > + return 0; > + > + /* > + * Prepare concatenation of head and tail. > + */ > + > + /* Adjust IP header. */ > + if (head.ip4) { > + head.ip4->ip_len = htons(head.iplen + tail.paylen); > + } else if (head.ip6) { > + head.ip6->ip6_plen = > + htons(head.iplen - head.iphlen + tail.paylen); > + } > + > + /* Combine TCP flags from head and tail. */ > + if (ISSET(tail.tcp->th_flags, TH_PUSH)) > + SET(head.tcp->th_flags, TH_PUSH); > + > + /* Adjust TCP header. */ > + head.tcp->th_win = tail.tcp->th_win; > + head.tcp->th_ack = tail.tcp->th_ack; > + > + /* Calculate header length of tail packet. */ > + hdrlen = sizeof(*tail.eh); > + if (tail.evh) > + hdrlen = sizeof(*tail.evh); > + hdrlen += tail.iphlen; > + hdrlen += tail.tcphlen; > + > + /* Skip protocol headers in tail. */ > + m_adj(mtail, hdrlen); > + CLR(mtail->m_flags, M_PKTHDR); > + > + /* Concatenate */ > + for (m = mhead; m->m_next;) > + m = m->m_next; > + m->m_next = mtail; > + mhead->m_pkthdr.len += tail.paylen; > + > + /* Flag mbuf as TSO packet with MSS. */ > + if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) { > + /* Set CSUM_OUT flags in case of forwarding. */ > + SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); > + head.tcp->th_sum = 0; > + if (head.ip4) { > + SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); > + head.ip4->ip_sum = 0; > + } > + > + SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO); > + mhead->m_pkthdr.ph_mss = head.paylen; > + tcpstat_inc(tcps_inswlro); > + tcpstat_inc(tcps_inpktlro); /* count head */ > + } > + mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen); > + tcpstat_inc(tcps_inpktlro); /* count tail */ > + > + return 1; > +} > + > +void > +tcp_softlro_enqueue(struct ifnet *ifp, struct mbuf_list *ml, struct mbuf *mtail) > +{ > + struct mbuf *mhead; > + > + if (!ISSET(ifp->if_xflags, IFXF_LRO)) > + goto out; > + > + /* Don't merge packets with invalid header checksum. */ > + if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) > + goto out; > + > + for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) { > + /* Don't merge packets with invalid header checksum. */ > + if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) > + continue; > + > + /* Use RSS hash to skip packets of different connections. */ > + if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) && > + ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) && > + mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid) > + continue; > + > + /* Don't merge packets inside and outside of VLANs */ > + if (ISSET(mhead->m_flags, M_VLANTAG) != > + ISSET(mtail->m_flags, M_VLANTAG)) > + continue; > + > + if (ISSET(mhead->m_flags, M_VLANTAG)) { > + /* Don't merge packets of different VLANs */ > + if (EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) != > + EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag)) > + continue; > + > + /* Don't merge packets of different priorities */ > + if (EVL_PRIOFTAG(mhead->m_pkthdr.ether_vtag) != > + EVL_PRIOFTAG(mtail->m_pkthdr.ether_vtag)) > + continue; > + } > + > + if (tcp_softlro(mhead, mtail)) > + return; > + } > + out: > + ml_enqueue(ml, mtail); > } > Index: netinet/tcp_var.h > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.186 tcp_var.h > --- netinet/tcp_var.h 2 Mar 2025 21:28:32 -0000 1.186 > +++ netinet/tcp_var.h 3 Apr 2025 15:43:55 -0000 > @@ -720,6 +720,7 @@ void tcp_init(void); > int tcp_input(struct mbuf **, int *, int, int, struct netstack *); > int tcp_mss(struct tcpcb *, int); > void tcp_mss_update(struct tcpcb *); > +void tcp_softlro_enqueue(struct ifnet *, struct mbuf_list *, struct mbuf *); > u_int tcp_hdrsz(struct tcpcb *); > void tcp_mtudisc(struct inpcb *, int); > void tcp_mtudisc_increase(struct inpcb *, int);