From: Mark Patruck Subject: Re: SoftLRO for ixl(4), bnxt(4) and em(4) To: Alexander Bluhm Cc: Jan Klemkow , tech@openbsd.org, Janne Johansson , Yuichiro NAITO Date: Wed, 26 Mar 2025 15:55:41 +0100 On 26.03.2025 11:33, Mark Patruck wrote: >On 20.03.2025 15:20, Alexander Bluhm wrote: >>As release lock will come soon, we should be careful. I think we >>can add the code if we ensure that default behavior does not change >>anything. >> >>We should start with ixl(4), ifconfig tcplro flag disabled per >>default. Then we can improve the TCP code in tree and add other >>drivers later. I found some issues like ethernet padding and >>TCP option parsing that should be improved. But doing this >>on top of an initial commit is easier. >> >>With some minor changes I am OK with commiting the diff below. >> >>- remove all drivers except ixl(4) from the diff >>- turn off ifconfig tcplro per default >>- make sure that ixl(4) logic does not change, if tcplro is off >>- I have fixed the livelocked logic in ixl(4) >>- I renamed the function to tcp_enqueue_lro(), it is more consistent >> with the TSO functions. >> >>As said before, tcp_softlro() needs more love, but it want to >>do this in tree. >> >>With my comments, jan@'s diff looks like this and would be OK bluhm@ > >No real issues spotted on a busy ixl(4)/carp(4) setup for the last four days. > >I did a few rounds of tcpbench between ixl(4)->vio(4) and the throughput >is more or less constantly good (around 9Gb/s) and also mbufs are released >directly after stopping and go back to the previous state. > >I still see some very low values, although never during a "good" run, that >means, if the first 2,3 values are ~ 9Gb/s it never passes <8Gb/s, whereas >sometimes it starts with 5,6 Gb/s and then never reaches the top, f.e > ># good run (most of the time) > elapsed_ms bytes mbps bwidth 1000 >992793140 7942.345 100.00% Conn: 1 Mbps: 7942.345 Peak >Mbps: 7942.345 Avg Mbps: 7942.345 > 2000 1174009396 9392.075 100.00% Conn: 1 Mbps: >9392.075 Peak Mbps: 9392.075 Avg Mbps: 9392.075 > 3000 1173654672 9389.237 100.00% Conn: 1 Mbps: >9389.237 Peak Mbps: 9392.075 Avg Mbps: 9389.237 > 4000 1161408036 9300.565 100.00% Conn: 1 Mbps: >9300.565 Peak Mbps: 9392.075 Avg Mbps: 9300.565 > 5000 1173661004 9389.288 100.00% Conn: 1 Mbps: >9389.288 Peak Mbps: 9392.075 Avg Mbps: 9389.288 > 6000 1173847200 9390.778 100.00% Conn: 1 Mbps: >9390.778 Peak Mbps: 9392.075 Avg Mbps: 9390.778 > 7000 1173698228 9389.586 100.00% Conn: 1 Mbps: >9389.586 Peak Mbps: 9392.075 Avg Mbps: 9389.586 >....continues like this.... > > ># bad run (every now and then) > elapsed_ms bytes mbps bwidth >1000 236474968 1891.800 100.00% >Conn: 1 Mbps: 1891.800 Peak Mbps: 1891.800 Avg Mbps: >1891.800 2000 252852080 2022.817 >100.00% Conn: 1 Mbps: >2022.817 Peak Mbps: 2022.817 Avg Mbps: 2022.817 >3000 544229216 4353.834 100.00% >Conn: 1 Mbps: 4353.834 Peak Mbps: 4353.834 Avg Mbps: >4353.834 > 4000 565089816 4520.719 100.00% >Conn: 1 Mbps: 4520.719 Peak Mbps: 4520.719 Avg Mbps: >4520.719 > 5004 413703444 3296.442 100.00% >Conn: 1 Mbps: 3296.442 Peak Mbps: 4520.719 Avg Mbps: >3296.442 6005 330198224 2641.586 >100.00% Conn: 1 Mbps: >2641.586 Peak Mbps: 4520.719 Avg Mbps: 2641.586 Something else i've stumbled upon while testing directly from ixl(4) -> ixl(4), both systems running bluhm@'s diff below. Compared to running from ixl(4) -> vio(4), mbufs increase much faster and also higher and almost won't get released anymore. It pretty much stays on the highest value. Freshly booted: - ixl(4) -> vio(4): - mbufs start: 610 - mbufs highest after 3x10 runs: 1007 - mbufs after tcpbench exited: 689 - mbufs highest after 12x10 runs: 1217 - mbufs after tcpbench exited: 767 - ixl(4) -> ixl(4): - mbufs start: 639 - mbufs highest after 3x10 runs: 1628 - mbufs after tcpbench exited: 1588 - mbufs highest after 12x10 runs: 2184 - mbufs after tcpbench exited: 2013 >>Index: dev/pci/if_ixl.c >>=================================================================== >>RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v >>diff -u -p -r1.102 if_ixl.c >>--- dev/pci/if_ixl.c 30 Oct 2024 18:02:45 -0000 1.102 >>+++ dev/pci/if_ixl.c 20 Mar 2025 10:29:12 -0000 >>@@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 { >> >>#define IXL_RX_DESC_PTYPE_SHIFT 30 >>#define IXL_RX_DESC_PTYPE_MASK (0xffULL << IXL_RX_DESC_PTYPE_SHIFT) >>+#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP 26 >>+#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP 92 >> >>#define IXL_RX_DESC_PLEN_SHIFT 38 >>#define IXL_RX_DESC_PLEN_MASK (0x3fffULL << IXL_RX_DESC_PLEN_SHIFT) >>@@ -1975,6 +1977,11 @@ ixl_attach(struct device *parent, struct >> IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | >> IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; >> ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; >>+ ifp->if_capabilities |= IFCAP_LRO; >>+#if 0 >>+ /* for now tcplro at ixl(4) is default off */ >>+ ifp->if_xflags |= IFXF_LRO; >>+#endif >> >> ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); >> >>@@ -3255,9 +3262,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i >> struct ixl_rx_map *rxm; >> bus_dmamap_t map; >> unsigned int cons, prod; >>+ struct mbuf_list mltcp = MBUF_LIST_INITIALIZER(); >> struct mbuf_list ml = MBUF_LIST_INITIALIZER(); >> struct mbuf *m; >> uint64_t word; >>+ unsigned int ptype; >> unsigned int len; >> unsigned int mask; >> int done = 0; >>@@ -3294,6 +3303,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i >> m = rxm->rxm_m; >> rxm->rxm_m = NULL; >> >>+ ptype = (word & IXL_RX_DESC_PTYPE_MASK) >>+ >> IXL_RX_DESC_PTYPE_SHIFT; >> len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT; >> m->m_len = len; >> m->m_pkthdr.len = 0; >>@@ -3314,7 +3325,6 @@ ixl_rxeof(struct ixl_softc *sc, struct i >> lemtoh32(&rxd->filter_status); >> m->m_pkthdr.csum_flags |= M_FLOWID; >> } >>- >>#if NVLAN > 0 >> if (ISSET(word, IXL_RX_DESC_L2TAG1P)) { >> m->m_pkthdr.ether_vtag = >>@@ -3322,9 +3332,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i >> SET(m->m_flags, M_VLANTAG); >> } >>#endif >>- >> ixl_rx_checksum(m, word); >>- ml_enqueue(&ml, m); >>+ >>+ if (ISSET(ifp->if_xflags, IFXF_LRO) && >>+ (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP || >>+ ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP)) >>+ tcp_enqueue_lro(&mltcp, m); >>+ else >>+ ml_enqueue(&ml, m); >> } else { >> ifp->if_ierrors++; /* XXX */ >> m_freem(m); >>@@ -3341,8 +3356,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i >> } while (cons != prod); >> >> if (done) { >>+ int livelocked = 0; >>+ >> rxr->rxr_cons = cons; >>+ if (ifiq_input(ifiq, &mltcp)) >>+ livelocked = 1; >> if (ifiq_input(ifiq, &ml)) >>+ livelocked = 1; >>+ if (livelocked) >> if_rxr_livelocked(&rxr->rxr_acct); >> ixl_rxfill(sc, rxr); >> } >>Index: netinet/tcp_input.c >>=================================================================== >>RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v >>diff -u -p -r1.434 tcp_input.c >>--- netinet/tcp_input.c 10 Mar 2025 15:11:46 -0000 1.434 >>+++ netinet/tcp_input.c 20 Mar 2025 10:29:12 -0000 >>@@ -84,6 +84,7 @@ >>#include >>#include >> >>+#include >>#include >>#include >>#include >>@@ -4229,4 +4230,230 @@ syn_cache_respond(struct syn_cache *sc, >> } >> in_pcbunref(inp); >> return (error); >>+} >>+ >>+int >>+tcp_softlro(struct mbuf *mhead, struct mbuf *mtail) >>+{ >>+ struct ether_extracted head; >>+ struct ether_extracted tail; >>+ struct mbuf *m; >>+ unsigned int hdrlen; >>+ >>+ /* >>+ * Check if head and tail are mergeable >>+ */ >>+ >>+ ether_extract_headers(mhead, &head); >>+ ether_extract_headers(mtail, &tail); >>+ >>+ /* Don't merge packets of different VLANs */ >>+ if (head.evh && tail.evh) { >>+ if (head.evh->evl_tag != tail.evh->evl_tag) >>+ return 0; >>+ } else if (head.evh || tail.evh) >>+ return 0; >>+ >>+ /* Check IP header. */ >>+ if (head.ip4 && tail.ip4) { >>+ /* Don't merge packets with invalid header checksum. */ >>+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) || >>+ !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) >>+ return 0; >>+ >>+ /* Check IPv4 addresses. */ >>+ if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr || >>+ head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr) >>+ return 0; >>+ >>+ /* Don't merge IPv4 fragments. */ >>+ if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) || >>+ ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF))) >>+ return 0; >>+ >>+ /* Check max. IPv4 length. */ >>+ if (head.iplen + tail.iplen > IP_MAXPACKET) >>+ return 0; >>+ >>+ /* Don't merge IPv4 packets with option headers. */ >>+ if (head.iphlen != sizeof(struct ip) || >>+ tail.iphlen != sizeof(struct ip)) >>+ return 0; >>+ >>+ /* Don't non-TCP packets. */ >>+ if (head.ip4->ip_p != IPPROTO_TCP || >>+ tail.ip4->ip_p != IPPROTO_TCP) >>+ return 0; >>+ } else if (head.ip6 && tail.ip6) { >>+ /* Check IPv6 addresses. */ >>+ if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) || >>+ !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst)) >>+ return 0; >>+ >>+ /* Check max. IPv6 length. */ >>+ if ((head.iplen - head.iphlen) + >>+ (tail.iplen - tail.iphlen) > IPV6_MAXPACKET) >>+ return 0; >>+ >>+ /* Don't merge IPv6 packets with option headers nor non-TCP. */ >>+ if (head.ip6->ip6_nxt != IPPROTO_TCP || >>+ tail.ip6->ip6_nxt != IPPROTO_TCP) >>+ return 0; >>+ } else { >>+ return 0; >>+ } >>+ >>+ /* Check TCP header. */ >>+ if (!head.tcp || !tail.tcp) >>+ return 0; >>+ >>+ /* Check TCP ports. */ >>+ if (head.tcp->th_sport != tail.tcp->th_sport || >>+ head.tcp->th_dport != tail.tcp->th_dport) >>+ return 0; >>+ >>+ /* Don't merge empty segments. */ >>+ if (head.paylen == 0 || tail.paylen == 0) >>+ return 0; >>+ >>+ /* Check for continues segments. */ >>+ if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq)) >>+ return 0; >>+ >>+ /* Just ACK and PUSH TCP flags are allowed. */ >>+ if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) || >>+ ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH))) >>+ return 0; >>+ >>+ /* TCP ACK flag has to be set. */ >>+ if (!ISSET(head.tcp->th_flags, TH_ACK) || >>+ !ISSET(tail.tcp->th_flags, TH_ACK)) >>+ return 0; >>+ >>+ /* Ignore segments with different TCP options. */ >>+ if (head.tcphlen - sizeof(struct tcphdr) != >>+ tail.tcphlen - sizeof(struct tcphdr)) >>+ return 0; >>+ >>+ /* Check for TCP options */ >>+ if (head.tcphlen > sizeof(struct tcphdr)) { >>+ char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr); >>+ char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr); >>+ int optsize = head.tcphlen - sizeof(struct tcphdr); >>+ int optlen; >>+ >>+ for (; optsize > 0; optsize -= optlen) { >>+ /* Ignore segments with different TCP options. */ >>+ if (hopt[0] != topt[0] || hopt[1] != topt[1]) >>+ return 0; >>+ >>+ /* Get option length */ >>+ optlen = hopt[1]; >>+ if (hopt[0] == TCPOPT_NOP) >>+ optlen = 1; >>+ else if (optlen < 2 || optlen > optsize) >>+ return 0; /* Illegal length */ >>+ >>+ if (hopt[0] != TCPOPT_NOP && >>+ hopt[0] != TCPOPT_TIMESTAMP) >>+ return 0; /* Unsupported TCP option */ >>+ >>+ hopt += optlen; >>+ topt += optlen; >>+ } >>+ } >>+ >>+ /* >>+ * Prepare concatenation of head and tail. >>+ */ >>+ >>+ /* Adjust IP header. */ >>+ if (head.ip4) { >>+ head.ip4->ip_len = htons(head.iplen + tail.paylen); >>+ } else if (head.ip6) { >>+ head.ip6->ip6_plen = >>+ htons(head.iplen - head.iphlen + tail.paylen); >>+ } >>+ >>+ /* Combine TCP flags from head and tail. */ >>+ if (ISSET(tail.tcp->th_flags, TH_PUSH)) >>+ SET(head.tcp->th_flags, TH_PUSH); >>+ >>+ /* Adjust TCP header. */ >>+ head.tcp->th_win = tail.tcp->th_win; >>+ head.tcp->th_ack = tail.tcp->th_ack; >>+ >>+ /* Calculate header length of tail packet. */ >>+ hdrlen = sizeof(*tail.eh); >>+ if (tail.evh) >>+ hdrlen = sizeof(*tail.evh); >>+ hdrlen += tail.iphlen; >>+ hdrlen += tail.tcphlen; >>+ >>+ /* Skip protocol headers in tail. */ >>+ m_adj(mtail, hdrlen); >>+ CLR(mtail->m_flags, M_PKTHDR); >>+ >>+ /* Concatenate */ >>+ for (m = mhead; m->m_next;) >>+ m = m->m_next; >>+ m->m_next = mtail; >>+ mhead->m_pkthdr.len += tail.paylen; >>+ >>+ /* Flag mbuf as TSO packet with MSS. */ >>+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) { >>+ /* Set CSUM_OUT flags in case of forwarding. */ >>+ SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); >>+ head.tcp->th_sum = 0; >>+ if (head.ip4) { >>+ SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); >>+ head.ip4->ip_sum = 0; >>+ } >>+ >>+ SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO); >>+ mhead->m_pkthdr.ph_mss = head.paylen; >>+ tcpstat_inc(tcps_inhwlro); >>+ tcpstat_inc(tcps_inpktlro); /* count head */ >>+ } >>+ mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen); >>+ tcpstat_inc(tcps_inpktlro); /* count tail */ >>+ >>+ return 1; >>+} >>+ >>+void >>+tcp_enqueue_lro(struct mbuf_list *ml, struct mbuf *mtail) >>+{ >>+ struct mbuf *mhead; >>+ >>+ /* Don't merge packets with invalid header checksum. */ >>+ if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) >>+ goto out; >>+ >>+ for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) { >>+ /* Don't merge packets with invalid header checksum. */ >>+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) >>+ continue; >>+ >>+ /* Use RSS hash to skip packets of different connections. */ >>+ if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) && >>+ ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) && >>+ mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid) >>+ continue; >>+ >>+ /* Don't merge packets of different VLANs */ >>+ if (ISSET(mhead->m_flags, M_VLANTAG) != >>+ ISSET(mtail->m_flags, M_VLANTAG)) >>+ continue; >>+ >>+ if (ISSET(mhead->m_flags, M_VLANTAG) && >>+ EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) != >>+ EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag)) >>+ continue; >>+ >>+ if (tcp_softlro(mhead, mtail)) >>+ return; >>+ } >>+ out: >>+ ml_enqueue(ml, mtail); >>} >>Index: netinet/tcp_var.h >>=================================================================== >>RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v >>diff -u -p -r1.186 tcp_var.h >>--- netinet/tcp_var.h 2 Mar 2025 21:28:32 -0000 1.186 >>+++ netinet/tcp_var.h 20 Mar 2025 10:29:12 -0000 >>@@ -720,6 +720,7 @@ void tcp_init(void); >>int tcp_input(struct mbuf **, int *, int, int, struct netstack *); >>int tcp_mss(struct tcpcb *, int); >>void tcp_mss_update(struct tcpcb *); >>+void tcp_enqueue_lro(struct mbuf_list *, struct mbuf *); >>u_int tcp_hdrsz(struct tcpcb *); >>void tcp_mtudisc(struct inpcb *, int); >>void tcp_mtudisc_increase(struct inpcb *, int); >> > >-- >Mark Patruck ( mark at wrapped.cx ) >GPG key 0xF2865E51 / 187F F6D3 EE04 1DCE 1C74 F644 0D3C F66F F286 5E51 >https://www.wrapped.cx -- Mark Patruck ( mark at wrapped.cx ) GPG key 0xF2865E51 / 187F F6D3 EE04 1DCE 1C74 F644 0D3C F66F F286 5E51 https://www.wrapped.cx