From: Mark Patruck Subject: Re: SoftLRO for ixl(4), bnxt(4) and em(4) To: Alexander Bluhm Cc: Jan Klemkow , tech@openbsd.org, Janne Johansson , Yuichiro NAITO Date: Wed, 26 Mar 2025 11:33:56 +0100 On 20.03.2025 15:20, Alexander Bluhm wrote: >As release lock will come soon, we should be careful. I think we >can add the code if we ensure that default behavior does not change >anything. > >We should start with ixl(4), ifconfig tcplro flag disabled per >default. Then we can improve the TCP code in tree and add other >drivers later. I found some issues like ethernet padding and >TCP option parsing that should be improved. But doing this >on top of an initial commit is easier. > >With some minor changes I am OK with commiting the diff below. > >- remove all drivers except ixl(4) from the diff >- turn off ifconfig tcplro per default >- make sure that ixl(4) logic does not change, if tcplro is off >- I have fixed the livelocked logic in ixl(4) >- I renamed the function to tcp_enqueue_lro(), it is more consistent > with the TSO functions. > >As said before, tcp_softlro() needs more love, but it want to >do this in tree. > >With my comments, jan@'s diff looks like this and would be OK bluhm@ No real issues spotted on a busy ixl(4)/carp(4) setup for the last four days. I did a few rounds of tcpbench between ixl(4)->vio(4) and the throughput is more or less constantly good (around 9Gb/s) and also mbufs are released directly after stopping and go back to the previous state. I still see some very low values, although never during a "good" run, that means, if the first 2,3 values are ~ 9Gb/s it never passes <8Gb/s, whereas sometimes it starts with 5,6 Gb/s and then never reaches the top, f.e # good run (most of the time) elapsed_ms bytes mbps bwidth 1000 992793140 7942.345 100.00% Conn: 1 Mbps: 7942.345 Peak Mbps: 7942.345 Avg Mbps: 7942.345 2000 1174009396 9392.075 100.00% Conn: 1 Mbps: 9392.075 Peak Mbps: 9392.075 Avg Mbps: 9392.075 3000 1173654672 9389.237 100.00% Conn: 1 Mbps: 9389.237 Peak Mbps: 9392.075 Avg Mbps: 9389.237 4000 1161408036 9300.565 100.00% Conn: 1 Mbps: 9300.565 Peak Mbps: 9392.075 Avg Mbps: 9300.565 5000 1173661004 9389.288 100.00% Conn: 1 Mbps: 9389.288 Peak Mbps: 9392.075 Avg Mbps: 9389.288 6000 1173847200 9390.778 100.00% Conn: 1 Mbps: 9390.778 Peak Mbps: 9392.075 Avg Mbps: 9390.778 7000 1173698228 9389.586 100.00% Conn: 1 Mbps: 9389.586 Peak Mbps: 9392.075 Avg Mbps: 9389.586 ....continues like this.... # bad run (every now and then) elapsed_ms bytes mbps bwidth 1000 236474968 1891.800 100.00% Conn: 1 Mbps: 1891.800 Peak Mbps: 1891.800 Avg Mbps: 1891.800 2000 252852080 2022.817 100.00% Conn: 1 Mbps: 2022.817 Peak Mbps: 2022.817 Avg Mbps: 2022.817 3000 544229216 4353.834 100.00% Conn: 1 Mbps: 4353.834 Peak Mbps: 4353.834 Avg Mbps: 4353.834 4000 565089816 4520.719 100.00% Conn: 1 Mbps: 4520.719 Peak Mbps: 4520.719 Avg Mbps: 4520.719 5004 413703444 3296.442 100.00% Conn: 1 Mbps: 3296.442 Peak Mbps: 4520.719 Avg Mbps: 3296.442 6005 330198224 2641.586 100.00% Conn: 1 Mbps: 2641.586 Peak Mbps: 4520.719 Avg Mbps: 2641.586 >Index: dev/pci/if_ixl.c >=================================================================== >RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v >diff -u -p -r1.102 if_ixl.c >--- dev/pci/if_ixl.c 30 Oct 2024 18:02:45 -0000 1.102 >+++ dev/pci/if_ixl.c 20 Mar 2025 10:29:12 -0000 >@@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 { > > #define IXL_RX_DESC_PTYPE_SHIFT 30 > #define IXL_RX_DESC_PTYPE_MASK (0xffULL << IXL_RX_DESC_PTYPE_SHIFT) >+#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP 26 >+#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP 92 > > #define IXL_RX_DESC_PLEN_SHIFT 38 > #define IXL_RX_DESC_PLEN_MASK (0x3fffULL << IXL_RX_DESC_PLEN_SHIFT) >@@ -1975,6 +1977,11 @@ ixl_attach(struct device *parent, struct > IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | > IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; > ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; >+ ifp->if_capabilities |= IFCAP_LRO; >+#if 0 >+ /* for now tcplro at ixl(4) is default off */ >+ ifp->if_xflags |= IFXF_LRO; >+#endif > > ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); > >@@ -3255,9 +3262,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i > struct ixl_rx_map *rxm; > bus_dmamap_t map; > unsigned int cons, prod; >+ struct mbuf_list mltcp = MBUF_LIST_INITIALIZER(); > struct mbuf_list ml = MBUF_LIST_INITIALIZER(); > struct mbuf *m; > uint64_t word; >+ unsigned int ptype; > unsigned int len; > unsigned int mask; > int done = 0; >@@ -3294,6 +3303,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i > m = rxm->rxm_m; > rxm->rxm_m = NULL; > >+ ptype = (word & IXL_RX_DESC_PTYPE_MASK) >+ >> IXL_RX_DESC_PTYPE_SHIFT; > len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT; > m->m_len = len; > m->m_pkthdr.len = 0; >@@ -3314,7 +3325,6 @@ ixl_rxeof(struct ixl_softc *sc, struct i > lemtoh32(&rxd->filter_status); > m->m_pkthdr.csum_flags |= M_FLOWID; > } >- > #if NVLAN > 0 > if (ISSET(word, IXL_RX_DESC_L2TAG1P)) { > m->m_pkthdr.ether_vtag = >@@ -3322,9 +3332,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i > SET(m->m_flags, M_VLANTAG); > } > #endif >- > ixl_rx_checksum(m, word); >- ml_enqueue(&ml, m); >+ >+ if (ISSET(ifp->if_xflags, IFXF_LRO) && >+ (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP || >+ ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP)) >+ tcp_enqueue_lro(&mltcp, m); >+ else >+ ml_enqueue(&ml, m); > } else { > ifp->if_ierrors++; /* XXX */ > m_freem(m); >@@ -3341,8 +3356,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i > } while (cons != prod); > > if (done) { >+ int livelocked = 0; >+ > rxr->rxr_cons = cons; >+ if (ifiq_input(ifiq, &mltcp)) >+ livelocked = 1; > if (ifiq_input(ifiq, &ml)) >+ livelocked = 1; >+ if (livelocked) > if_rxr_livelocked(&rxr->rxr_acct); > ixl_rxfill(sc, rxr); > } >Index: netinet/tcp_input.c >=================================================================== >RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v >diff -u -p -r1.434 tcp_input.c >--- netinet/tcp_input.c 10 Mar 2025 15:11:46 -0000 1.434 >+++ netinet/tcp_input.c 20 Mar 2025 10:29:12 -0000 >@@ -84,6 +84,7 @@ > #include > #include > >+#include > #include > #include > #include >@@ -4229,4 +4230,230 @@ syn_cache_respond(struct syn_cache *sc, > } > in_pcbunref(inp); > return (error); >+} >+ >+int >+tcp_softlro(struct mbuf *mhead, struct mbuf *mtail) >+{ >+ struct ether_extracted head; >+ struct ether_extracted tail; >+ struct mbuf *m; >+ unsigned int hdrlen; >+ >+ /* >+ * Check if head and tail are mergeable >+ */ >+ >+ ether_extract_headers(mhead, &head); >+ ether_extract_headers(mtail, &tail); >+ >+ /* Don't merge packets of different VLANs */ >+ if (head.evh && tail.evh) { >+ if (head.evh->evl_tag != tail.evh->evl_tag) >+ return 0; >+ } else if (head.evh || tail.evh) >+ return 0; >+ >+ /* Check IP header. */ >+ if (head.ip4 && tail.ip4) { >+ /* Don't merge packets with invalid header checksum. */ >+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) || >+ !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) >+ return 0; >+ >+ /* Check IPv4 addresses. */ >+ if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr || >+ head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr) >+ return 0; >+ >+ /* Don't merge IPv4 fragments. */ >+ if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) || >+ ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF))) >+ return 0; >+ >+ /* Check max. IPv4 length. */ >+ if (head.iplen + tail.iplen > IP_MAXPACKET) >+ return 0; >+ >+ /* Don't merge IPv4 packets with option headers. */ >+ if (head.iphlen != sizeof(struct ip) || >+ tail.iphlen != sizeof(struct ip)) >+ return 0; >+ >+ /* Don't non-TCP packets. */ >+ if (head.ip4->ip_p != IPPROTO_TCP || >+ tail.ip4->ip_p != IPPROTO_TCP) >+ return 0; >+ } else if (head.ip6 && tail.ip6) { >+ /* Check IPv6 addresses. */ >+ if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) || >+ !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst)) >+ return 0; >+ >+ /* Check max. IPv6 length. */ >+ if ((head.iplen - head.iphlen) + >+ (tail.iplen - tail.iphlen) > IPV6_MAXPACKET) >+ return 0; >+ >+ /* Don't merge IPv6 packets with option headers nor non-TCP. */ >+ if (head.ip6->ip6_nxt != IPPROTO_TCP || >+ tail.ip6->ip6_nxt != IPPROTO_TCP) >+ return 0; >+ } else { >+ return 0; >+ } >+ >+ /* Check TCP header. */ >+ if (!head.tcp || !tail.tcp) >+ return 0; >+ >+ /* Check TCP ports. */ >+ if (head.tcp->th_sport != tail.tcp->th_sport || >+ head.tcp->th_dport != tail.tcp->th_dport) >+ return 0; >+ >+ /* Don't merge empty segments. */ >+ if (head.paylen == 0 || tail.paylen == 0) >+ return 0; >+ >+ /* Check for continues segments. */ >+ if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq)) >+ return 0; >+ >+ /* Just ACK and PUSH TCP flags are allowed. */ >+ if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) || >+ ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH))) >+ return 0; >+ >+ /* TCP ACK flag has to be set. */ >+ if (!ISSET(head.tcp->th_flags, TH_ACK) || >+ !ISSET(tail.tcp->th_flags, TH_ACK)) >+ return 0; >+ >+ /* Ignore segments with different TCP options. */ >+ if (head.tcphlen - sizeof(struct tcphdr) != >+ tail.tcphlen - sizeof(struct tcphdr)) >+ return 0; >+ >+ /* Check for TCP options */ >+ if (head.tcphlen > sizeof(struct tcphdr)) { >+ char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr); >+ char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr); >+ int optsize = head.tcphlen - sizeof(struct tcphdr); >+ int optlen; >+ >+ for (; optsize > 0; optsize -= optlen) { >+ /* Ignore segments with different TCP options. */ >+ if (hopt[0] != topt[0] || hopt[1] != topt[1]) >+ return 0; >+ >+ /* Get option length */ >+ optlen = hopt[1]; >+ if (hopt[0] == TCPOPT_NOP) >+ optlen = 1; >+ else if (optlen < 2 || optlen > optsize) >+ return 0; /* Illegal length */ >+ >+ if (hopt[0] != TCPOPT_NOP && >+ hopt[0] != TCPOPT_TIMESTAMP) >+ return 0; /* Unsupported TCP option */ >+ >+ hopt += optlen; >+ topt += optlen; >+ } >+ } >+ >+ /* >+ * Prepare concatenation of head and tail. >+ */ >+ >+ /* Adjust IP header. */ >+ if (head.ip4) { >+ head.ip4->ip_len = htons(head.iplen + tail.paylen); >+ } else if (head.ip6) { >+ head.ip6->ip6_plen = >+ htons(head.iplen - head.iphlen + tail.paylen); >+ } >+ >+ /* Combine TCP flags from head and tail. */ >+ if (ISSET(tail.tcp->th_flags, TH_PUSH)) >+ SET(head.tcp->th_flags, TH_PUSH); >+ >+ /* Adjust TCP header. */ >+ head.tcp->th_win = tail.tcp->th_win; >+ head.tcp->th_ack = tail.tcp->th_ack; >+ >+ /* Calculate header length of tail packet. */ >+ hdrlen = sizeof(*tail.eh); >+ if (tail.evh) >+ hdrlen = sizeof(*tail.evh); >+ hdrlen += tail.iphlen; >+ hdrlen += tail.tcphlen; >+ >+ /* Skip protocol headers in tail. */ >+ m_adj(mtail, hdrlen); >+ CLR(mtail->m_flags, M_PKTHDR); >+ >+ /* Concatenate */ >+ for (m = mhead; m->m_next;) >+ m = m->m_next; >+ m->m_next = mtail; >+ mhead->m_pkthdr.len += tail.paylen; >+ >+ /* Flag mbuf as TSO packet with MSS. */ >+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) { >+ /* Set CSUM_OUT flags in case of forwarding. */ >+ SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); >+ head.tcp->th_sum = 0; >+ if (head.ip4) { >+ SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); >+ head.ip4->ip_sum = 0; >+ } >+ >+ SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO); >+ mhead->m_pkthdr.ph_mss = head.paylen; >+ tcpstat_inc(tcps_inhwlro); >+ tcpstat_inc(tcps_inpktlro); /* count head */ >+ } >+ mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen); >+ tcpstat_inc(tcps_inpktlro); /* count tail */ >+ >+ return 1; >+} >+ >+void >+tcp_enqueue_lro(struct mbuf_list *ml, struct mbuf *mtail) >+{ >+ struct mbuf *mhead; >+ >+ /* Don't merge packets with invalid header checksum. */ >+ if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) >+ goto out; >+ >+ for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) { >+ /* Don't merge packets with invalid header checksum. */ >+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) >+ continue; >+ >+ /* Use RSS hash to skip packets of different connections. */ >+ if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) && >+ ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) && >+ mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid) >+ continue; >+ >+ /* Don't merge packets of different VLANs */ >+ if (ISSET(mhead->m_flags, M_VLANTAG) != >+ ISSET(mtail->m_flags, M_VLANTAG)) >+ continue; >+ >+ if (ISSET(mhead->m_flags, M_VLANTAG) && >+ EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) != >+ EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag)) >+ continue; >+ >+ if (tcp_softlro(mhead, mtail)) >+ return; >+ } >+ out: >+ ml_enqueue(ml, mtail); > } >Index: netinet/tcp_var.h >=================================================================== >RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v >diff -u -p -r1.186 tcp_var.h >--- netinet/tcp_var.h 2 Mar 2025 21:28:32 -0000 1.186 >+++ netinet/tcp_var.h 20 Mar 2025 10:29:12 -0000 >@@ -720,6 +720,7 @@ void tcp_init(void); > int tcp_input(struct mbuf **, int *, int, int, struct netstack *); > int tcp_mss(struct tcpcb *, int); > void tcp_mss_update(struct tcpcb *); >+void tcp_enqueue_lro(struct mbuf_list *, struct mbuf *); > u_int tcp_hdrsz(struct tcpcb *); > void tcp_mtudisc(struct inpcb *, int); > void tcp_mtudisc_increase(struct inpcb *, int); > -- Mark Patruck ( mark at wrapped.cx ) GPG key 0xF2865E51 / 187F F6D3 EE04 1DCE 1C74 F644 0D3C F66F F286 5E51 https://www.wrapped.cx