Download raw body.
SoftLRO for ixl(4), bnxt(4) and em(4)
On 26.03.2025 11:33, Mark Patruck wrote:
>On 20.03.2025 15:20, Alexander Bluhm wrote:
>>As release lock will come soon, we should be careful. I think we
>>can add the code if we ensure that default behavior does not change
>>anything.
>>
>>We should start with ixl(4), ifconfig tcplro flag disabled per
>>default. Then we can improve the TCP code in tree and add other
>>drivers later. I found some issues like ethernet padding and
>>TCP option parsing that should be improved. But doing this
>>on top of an initial commit is easier.
>>
>>With some minor changes I am OK with commiting the diff below.
>>
>>- remove all drivers except ixl(4) from the diff
>>- turn off ifconfig tcplro per default
>>- make sure that ixl(4) logic does not change, if tcplro is off
>>- I have fixed the livelocked logic in ixl(4)
>>- I renamed the function to tcp_enqueue_lro(), it is more consistent
>> with the TSO functions.
>>
>>As said before, tcp_softlro() needs more love, but it want to
>>do this in tree.
>>
>>With my comments, jan@'s diff looks like this and would be OK bluhm@
>
>No real issues spotted on a busy ixl(4)/carp(4) setup for the last four days.
>
>I did a few rounds of tcpbench between ixl(4)->vio(4) and the throughput
>is more or less constantly good (around 9Gb/s) and also mbufs are released
>directly after stopping and go back to the previous state.
>
>I still see some very low values, although never during a "good" run, that
>means, if the first 2,3 values are ~ 9Gb/s it never passes <8Gb/s, whereas
>sometimes it starts with 5,6 Gb/s and then never reaches the top, f.e
>
># good run (most of the time)
> elapsed_ms bytes mbps bwidth 1000
>992793140 7942.345 100.00% Conn: 1 Mbps: 7942.345 Peak
>Mbps: 7942.345 Avg Mbps: 7942.345
> 2000 1174009396 9392.075 100.00% Conn: 1 Mbps:
>9392.075 Peak Mbps: 9392.075 Avg Mbps: 9392.075
> 3000 1173654672 9389.237 100.00% Conn: 1 Mbps:
>9389.237 Peak Mbps: 9392.075 Avg Mbps: 9389.237
> 4000 1161408036 9300.565 100.00% Conn: 1 Mbps:
>9300.565 Peak Mbps: 9392.075 Avg Mbps: 9300.565
> 5000 1173661004 9389.288 100.00% Conn: 1 Mbps:
>9389.288 Peak Mbps: 9392.075 Avg Mbps: 9389.288
> 6000 1173847200 9390.778 100.00% Conn: 1 Mbps:
>9390.778 Peak Mbps: 9392.075 Avg Mbps: 9390.778
> 7000 1173698228 9389.586 100.00% Conn: 1 Mbps:
>9389.586 Peak Mbps: 9392.075 Avg Mbps: 9389.586
>....continues like this....
>
>
># bad run (every now and then)
> elapsed_ms bytes mbps bwidth
>1000 236474968 1891.800 100.00%
>Conn: 1 Mbps: 1891.800 Peak Mbps: 1891.800 Avg Mbps:
>1891.800 2000 252852080 2022.817
>100.00% Conn: 1 Mbps:
>2022.817 Peak Mbps: 2022.817 Avg Mbps: 2022.817
>3000 544229216 4353.834 100.00%
>Conn: 1 Mbps: 4353.834 Peak Mbps: 4353.834 Avg Mbps:
>4353.834
> 4000 565089816 4520.719 100.00%
>Conn: 1 Mbps: 4520.719 Peak Mbps: 4520.719 Avg Mbps:
>4520.719
> 5004 413703444 3296.442 100.00%
>Conn: 1 Mbps: 3296.442 Peak Mbps: 4520.719 Avg Mbps:
>3296.442 6005 330198224 2641.586
>100.00% Conn: 1 Mbps:
>2641.586 Peak Mbps: 4520.719 Avg Mbps: 2641.586
Something else i've stumbled upon while testing directly from
ixl(4) -> ixl(4), both systems running bluhm@'s diff below.
Compared to running from ixl(4) -> vio(4), mbufs increase much
faster and also higher and almost won't get released anymore.
It pretty much stays on the highest value.
Freshly booted:
- ixl(4) -> vio(4):
- mbufs start: 610
- mbufs highest after 3x10 runs: 1007
- mbufs after tcpbench exited: 689
- mbufs highest after 12x10 runs: 1217
- mbufs after tcpbench exited: 767
- ixl(4) -> ixl(4):
- mbufs start: 639
- mbufs highest after 3x10 runs: 1628
- mbufs after tcpbench exited: 1588
- mbufs highest after 12x10 runs: 2184
- mbufs after tcpbench exited: 2013
>>Index: dev/pci/if_ixl.c
>>===================================================================
>>RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
>>diff -u -p -r1.102 if_ixl.c
>>--- dev/pci/if_ixl.c 30 Oct 2024 18:02:45 -0000 1.102
>>+++ dev/pci/if_ixl.c 20 Mar 2025 10:29:12 -0000
>>@@ -883,6 +883,8 @@ struct ixl_rx_wb_desc_16 {
>>
>>#define IXL_RX_DESC_PTYPE_SHIFT 30
>>#define IXL_RX_DESC_PTYPE_MASK (0xffULL << IXL_RX_DESC_PTYPE_SHIFT)
>>+#define IXL_RX_DESC_PTYPE_MAC_IPV4_TCP 26
>>+#define IXL_RX_DESC_PTYPE_MAC_IPV6_TCP 92
>>
>>#define IXL_RX_DESC_PLEN_SHIFT 38
>>#define IXL_RX_DESC_PLEN_MASK (0x3fffULL << IXL_RX_DESC_PLEN_SHIFT)
>>@@ -1975,6 +1977,11 @@ ixl_attach(struct device *parent, struct
>> IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
>> IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
>> ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>>+ ifp->if_capabilities |= IFCAP_LRO;
>>+#if 0
>>+ /* for now tcplro at ixl(4) is default off */
>>+ ifp->if_xflags |= IFXF_LRO;
>>+#endif
>>
>> ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
>>
>>@@ -3255,9 +3262,11 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>> struct ixl_rx_map *rxm;
>> bus_dmamap_t map;
>> unsigned int cons, prod;
>>+ struct mbuf_list mltcp = MBUF_LIST_INITIALIZER();
>> struct mbuf_list ml = MBUF_LIST_INITIALIZER();
>> struct mbuf *m;
>> uint64_t word;
>>+ unsigned int ptype;
>> unsigned int len;
>> unsigned int mask;
>> int done = 0;
>>@@ -3294,6 +3303,8 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>> m = rxm->rxm_m;
>> rxm->rxm_m = NULL;
>>
>>+ ptype = (word & IXL_RX_DESC_PTYPE_MASK)
>>+ >> IXL_RX_DESC_PTYPE_SHIFT;
>> len = (word & IXL_RX_DESC_PLEN_MASK) >> IXL_RX_DESC_PLEN_SHIFT;
>> m->m_len = len;
>> m->m_pkthdr.len = 0;
>>@@ -3314,7 +3325,6 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>> lemtoh32(&rxd->filter_status);
>> m->m_pkthdr.csum_flags |= M_FLOWID;
>> }
>>-
>>#if NVLAN > 0
>> if (ISSET(word, IXL_RX_DESC_L2TAG1P)) {
>> m->m_pkthdr.ether_vtag =
>>@@ -3322,9 +3332,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>> SET(m->m_flags, M_VLANTAG);
>> }
>>#endif
>>-
>> ixl_rx_checksum(m, word);
>>- ml_enqueue(&ml, m);
>>+
>>+ if (ISSET(ifp->if_xflags, IFXF_LRO) &&
>>+ (ptype == IXL_RX_DESC_PTYPE_MAC_IPV4_TCP ||
>>+ ptype == IXL_RX_DESC_PTYPE_MAC_IPV6_TCP))
>>+ tcp_enqueue_lro(&mltcp, m);
>>+ else
>>+ ml_enqueue(&ml, m);
>> } else {
>> ifp->if_ierrors++; /* XXX */
>> m_freem(m);
>>@@ -3341,8 +3356,14 @@ ixl_rxeof(struct ixl_softc *sc, struct i
>> } while (cons != prod);
>>
>> if (done) {
>>+ int livelocked = 0;
>>+
>> rxr->rxr_cons = cons;
>>+ if (ifiq_input(ifiq, &mltcp))
>>+ livelocked = 1;
>> if (ifiq_input(ifiq, &ml))
>>+ livelocked = 1;
>>+ if (livelocked)
>> if_rxr_livelocked(&rxr->rxr_acct);
>> ixl_rxfill(sc, rxr);
>> }
>>Index: netinet/tcp_input.c
>>===================================================================
>>RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
>>diff -u -p -r1.434 tcp_input.c
>>--- netinet/tcp_input.c 10 Mar 2025 15:11:46 -0000 1.434
>>+++ netinet/tcp_input.c 20 Mar 2025 10:29:12 -0000
>>@@ -84,6 +84,7 @@
>>#include <net/if_var.h>
>>#include <net/route.h>
>>
>>+#include <netinet/if_ether.h>
>>#include <netinet/in.h>
>>#include <netinet/ip.h>
>>#include <netinet/in_pcb.h>
>>@@ -4229,4 +4230,230 @@ syn_cache_respond(struct syn_cache *sc,
>> }
>> in_pcbunref(inp);
>> return (error);
>>+}
>>+
>>+int
>>+tcp_softlro(struct mbuf *mhead, struct mbuf *mtail)
>>+{
>>+ struct ether_extracted head;
>>+ struct ether_extracted tail;
>>+ struct mbuf *m;
>>+ unsigned int hdrlen;
>>+
>>+ /*
>>+ * Check if head and tail are mergeable
>>+ */
>>+
>>+ ether_extract_headers(mhead, &head);
>>+ ether_extract_headers(mtail, &tail);
>>+
>>+ /* Don't merge packets of different VLANs */
>>+ if (head.evh && tail.evh) {
>>+ if (head.evh->evl_tag != tail.evh->evl_tag)
>>+ return 0;
>>+ } else if (head.evh || tail.evh)
>>+ return 0;
>>+
>>+ /* Check IP header. */
>>+ if (head.ip4 && tail.ip4) {
>>+ /* Don't merge packets with invalid header checksum. */
>>+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) ||
>>+ !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK))
>>+ return 0;
>>+
>>+ /* Check IPv4 addresses. */
>>+ if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr ||
>>+ head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr)
>>+ return 0;
>>+
>>+ /* Don't merge IPv4 fragments. */
>>+ if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) ||
>>+ ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF)))
>>+ return 0;
>>+
>>+ /* Check max. IPv4 length. */
>>+ if (head.iplen + tail.iplen > IP_MAXPACKET)
>>+ return 0;
>>+
>>+ /* Don't merge IPv4 packets with option headers. */
>>+ if (head.iphlen != sizeof(struct ip) ||
>>+ tail.iphlen != sizeof(struct ip))
>>+ return 0;
>>+
>>+ /* Don't non-TCP packets. */
>>+ if (head.ip4->ip_p != IPPROTO_TCP ||
>>+ tail.ip4->ip_p != IPPROTO_TCP)
>>+ return 0;
>>+ } else if (head.ip6 && tail.ip6) {
>>+ /* Check IPv6 addresses. */
>>+ if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) ||
>>+ !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst))
>>+ return 0;
>>+
>>+ /* Check max. IPv6 length. */
>>+ if ((head.iplen - head.iphlen) +
>>+ (tail.iplen - tail.iphlen) > IPV6_MAXPACKET)
>>+ return 0;
>>+
>>+ /* Don't merge IPv6 packets with option headers nor non-TCP. */
>>+ if (head.ip6->ip6_nxt != IPPROTO_TCP ||
>>+ tail.ip6->ip6_nxt != IPPROTO_TCP)
>>+ return 0;
>>+ } else {
>>+ return 0;
>>+ }
>>+
>>+ /* Check TCP header. */
>>+ if (!head.tcp || !tail.tcp)
>>+ return 0;
>>+
>>+ /* Check TCP ports. */
>>+ if (head.tcp->th_sport != tail.tcp->th_sport ||
>>+ head.tcp->th_dport != tail.tcp->th_dport)
>>+ return 0;
>>+
>>+ /* Don't merge empty segments. */
>>+ if (head.paylen == 0 || tail.paylen == 0)
>>+ return 0;
>>+
>>+ /* Check for continues segments. */
>>+ if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq))
>>+ return 0;
>>+
>>+ /* Just ACK and PUSH TCP flags are allowed. */
>>+ if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) ||
>>+ ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH)))
>>+ return 0;
>>+
>>+ /* TCP ACK flag has to be set. */
>>+ if (!ISSET(head.tcp->th_flags, TH_ACK) ||
>>+ !ISSET(tail.tcp->th_flags, TH_ACK))
>>+ return 0;
>>+
>>+ /* Ignore segments with different TCP options. */
>>+ if (head.tcphlen - sizeof(struct tcphdr) !=
>>+ tail.tcphlen - sizeof(struct tcphdr))
>>+ return 0;
>>+
>>+ /* Check for TCP options */
>>+ if (head.tcphlen > sizeof(struct tcphdr)) {
>>+ char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr);
>>+ char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr);
>>+ int optsize = head.tcphlen - sizeof(struct tcphdr);
>>+ int optlen;
>>+
>>+ for (; optsize > 0; optsize -= optlen) {
>>+ /* Ignore segments with different TCP options. */
>>+ if (hopt[0] != topt[0] || hopt[1] != topt[1])
>>+ return 0;
>>+
>>+ /* Get option length */
>>+ optlen = hopt[1];
>>+ if (hopt[0] == TCPOPT_NOP)
>>+ optlen = 1;
>>+ else if (optlen < 2 || optlen > optsize)
>>+ return 0; /* Illegal length */
>>+
>>+ if (hopt[0] != TCPOPT_NOP &&
>>+ hopt[0] != TCPOPT_TIMESTAMP)
>>+ return 0; /* Unsupported TCP option */
>>+
>>+ hopt += optlen;
>>+ topt += optlen;
>>+ }
>>+ }
>>+
>>+ /*
>>+ * Prepare concatenation of head and tail.
>>+ */
>>+
>>+ /* Adjust IP header. */
>>+ if (head.ip4) {
>>+ head.ip4->ip_len = htons(head.iplen + tail.paylen);
>>+ } else if (head.ip6) {
>>+ head.ip6->ip6_plen =
>>+ htons(head.iplen - head.iphlen + tail.paylen);
>>+ }
>>+
>>+ /* Combine TCP flags from head and tail. */
>>+ if (ISSET(tail.tcp->th_flags, TH_PUSH))
>>+ SET(head.tcp->th_flags, TH_PUSH);
>>+
>>+ /* Adjust TCP header. */
>>+ head.tcp->th_win = tail.tcp->th_win;
>>+ head.tcp->th_ack = tail.tcp->th_ack;
>>+
>>+ /* Calculate header length of tail packet. */
>>+ hdrlen = sizeof(*tail.eh);
>>+ if (tail.evh)
>>+ hdrlen = sizeof(*tail.evh);
>>+ hdrlen += tail.iphlen;
>>+ hdrlen += tail.tcphlen;
>>+
>>+ /* Skip protocol headers in tail. */
>>+ m_adj(mtail, hdrlen);
>>+ CLR(mtail->m_flags, M_PKTHDR);
>>+
>>+ /* Concatenate */
>>+ for (m = mhead; m->m_next;)
>>+ m = m->m_next;
>>+ m->m_next = mtail;
>>+ mhead->m_pkthdr.len += tail.paylen;
>>+
>>+ /* Flag mbuf as TSO packet with MSS. */
>>+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) {
>>+ /* Set CSUM_OUT flags in case of forwarding. */
>>+ SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
>>+ head.tcp->th_sum = 0;
>>+ if (head.ip4) {
>>+ SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
>>+ head.ip4->ip_sum = 0;
>>+ }
>>+
>>+ SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO);
>>+ mhead->m_pkthdr.ph_mss = head.paylen;
>>+ tcpstat_inc(tcps_inhwlro);
>>+ tcpstat_inc(tcps_inpktlro); /* count head */
>>+ }
>>+ mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen);
>>+ tcpstat_inc(tcps_inpktlro); /* count tail */
>>+
>>+ return 1;
>>+}
>>+
>>+void
>>+tcp_enqueue_lro(struct mbuf_list *ml, struct mbuf *mtail)
>>+{
>>+ struct mbuf *mhead;
>>+
>>+ /* Don't merge packets with invalid header checksum. */
>>+ if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
>>+ goto out;
>>+
>>+ for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) {
>>+ /* Don't merge packets with invalid header checksum. */
>>+ if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK))
>>+ continue;
>>+
>>+ /* Use RSS hash to skip packets of different connections. */
>>+ if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) &&
>>+ ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) &&
>>+ mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid)
>>+ continue;
>>+
>>+ /* Don't merge packets of different VLANs */
>>+ if (ISSET(mhead->m_flags, M_VLANTAG) !=
>>+ ISSET(mtail->m_flags, M_VLANTAG))
>>+ continue;
>>+
>>+ if (ISSET(mhead->m_flags, M_VLANTAG) &&
>>+ EVL_VLANOFTAG(mhead->m_pkthdr.ether_vtag) !=
>>+ EVL_VLANOFTAG(mtail->m_pkthdr.ether_vtag))
>>+ continue;
>>+
>>+ if (tcp_softlro(mhead, mtail))
>>+ return;
>>+ }
>>+ out:
>>+ ml_enqueue(ml, mtail);
>>}
>>Index: netinet/tcp_var.h
>>===================================================================
>>RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
>>diff -u -p -r1.186 tcp_var.h
>>--- netinet/tcp_var.h 2 Mar 2025 21:28:32 -0000 1.186
>>+++ netinet/tcp_var.h 20 Mar 2025 10:29:12 -0000
>>@@ -720,6 +720,7 @@ void tcp_init(void);
>>int tcp_input(struct mbuf **, int *, int, int, struct netstack *);
>>int tcp_mss(struct tcpcb *, int);
>>void tcp_mss_update(struct tcpcb *);
>>+void tcp_enqueue_lro(struct mbuf_list *, struct mbuf *);
>>u_int tcp_hdrsz(struct tcpcb *);
>>void tcp_mtudisc(struct inpcb *, int);
>>void tcp_mtudisc_increase(struct inpcb *, int);
>>
>
>--
>Mark Patruck ( mark at wrapped.cx )
>GPG key 0xF2865E51 / 187F F6D3 EE04 1DCE 1C74 F644 0D3C F66F F286 5E51
>https://www.wrapped.cx
--
Mark Patruck ( mark at wrapped.cx )
GPG key 0xF2865E51 / 187F F6D3 EE04 1DCE 1C74 F644 0D3C F66F F286 5E51
https://www.wrapped.cx
SoftLRO for ixl(4), bnxt(4) and em(4)