From: Jan Klemkow Subject: SoftLRO for ixl(4), bnxt(4) and em(4) To: tech@openbsd.org Date: Thu, 7 Nov 2024 01:10:10 +0100 Hi, This diff introduces a software solution for TCP Large Receive Offload (SoftLRO) for network interfaces don't hat hardware support for it. This is needes at least for newer Intel interfaces as their documentation said that LRO a.k.a. Receive Side Coalescing (RSC) has to be done by software. This diff coalesces TCP segments during the receive interrupt before queueing them. Thus, our TCP/IP stack has to process less packet headers per amount of received data. I measured receiving performance with Intel XXV710 25 GbE interfaces. It increased from 6 Gbit/s to 23 Gbit/s. Even if we saturate em(4) without any of these technique its also part this diff. I'm interested if this diff helps to reach 1 Gbit/s on old or slow hardware. I also add bnxt(4) to this diff to increase test coverage. If you want to tests this implementation with your favorite interface, just replace the ml_enqueue() call with the new tcp_softlro_enqueue() (as seen below). It should work with all kind network interfaces. Any comments and tests reports are welcome. Bye, Jan Index: dev/pci/if_bnxt.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v diff -u -p -r1.52 if_bnxt.c --- dev/pci/if_bnxt.c 6 Oct 2024 23:43:18 -0000 1.52 +++ dev/pci/if_bnxt.c 6 Nov 2024 23:32:00 -0000 @@ -646,6 +646,8 @@ bnxt_attach(struct device *parent, struc IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv6; ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; + ifp->if_xflags |= IFXF_LRO; + ifp->if_capabilities |= IFCAP_LRO; #if NVLAN > 0 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; #endif @@ -2283,6 +2285,7 @@ bnxt_rx(struct bnxt_softc *sc, struct bn struct bnxt_cp_ring *cpr, struct mbuf_list *ml, int *slots, int *agslots, struct cmpl_base *cmpl) { + struct ifnet *ifp = &sc->sc_ac.ac_if; struct mbuf *m, *am; struct bnxt_slot *bs; struct rx_pkt_cmpl *rxlo = (struct rx_pkt_cmpl *)cmpl; @@ -2355,7 +2358,10 @@ bnxt_rx(struct bnxt_softc *sc, struct bn (*agslots)++; } - ml_enqueue(ml, m); + if (ISSET(ifp->if_xflags, IFXF_LRO)) + tcp_softlro_enqueue(ml, m); + else + ml_enqueue(ml, m); return (0); } Index: dev/pci/if_em.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.c,v diff -u -p -r1.378 if_em.c --- dev/pci/if_em.c 31 Aug 2024 16:23:09 -0000 1.378 +++ dev/pci/if_em.c 6 Nov 2024 23:32:00 -0000 @@ -2013,6 +2013,9 @@ em_setup_interface(struct em_softc *sc) ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; } + ifp->if_xflags |= IFXF_LRO; + ifp->if_capabilities |= IFCAP_LRO; + /* * Specify the media types supported by this adapter and register * callbacks to update media and link information @@ -3185,7 +3188,11 @@ em_rxeof(struct em_queue *que) m->m_flags |= M_VLANTAG; } #endif - ml_enqueue(&ml, m); + + if (ISSET(ifp->if_xflags, IFXF_LRO)) + tcp_softlro_enqueue(&ml, m); + else + ml_enqueue(&ml, m); que->rx.fmp = NULL; que->rx.lmp = NULL; Index: dev/pci/if_ixl.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v diff -u -p -r1.102 if_ixl.c --- dev/pci/if_ixl.c 30 Oct 2024 18:02:45 -0000 1.102 +++ dev/pci/if_ixl.c 6 Nov 2024 23:33:07 -0000 @@ -1976,6 +1976,9 @@ ixl_attach(struct device *parent, struct IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; + ifp->if_xflags |= IFXF_LRO; + ifp->if_capabilities |= IFCAP_LRO; + ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); ixl_media_add(sc, phy_types); @@ -3324,7 +3327,10 @@ ixl_rxeof(struct ixl_softc *sc, struct i #endif ixl_rx_checksum(m, word); - ml_enqueue(&ml, m); + if (ISSET(ifp->if_xflags, IFXF_LRO)) + tcp_softlro_enqueue(&ml, m); + else + ml_enqueue(&ml, m); } else { ifp->if_ierrors++; /* XXX */ m_freem(m); Index: netinet/tcp_input.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_input.c,v diff -u -p -r1.407 tcp_input.c --- netinet/tcp_input.c 26 Aug 2024 13:55:14 -0000 1.407 +++ netinet/tcp_input.c 6 Nov 2024 23:32:00 -0000 @@ -84,6 +84,7 @@ #include #include +#include #include #include #include @@ -4161,4 +4162,234 @@ syn_cache_respond(struct syn_cache *sc, #endif } return (error); +} + +int +tcp_softlro(struct mbuf *mhead, struct mbuf *mtail) +{ + struct ether_extracted head; + struct ether_extracted tail; + struct mbuf *m; + unsigned int hdrlen; + + /* + * Check if head and tail are mergeable + */ + + ether_extract_headers(mhead, &head); + ether_extract_headers(mtail, &tail); + + /* Don't merge packets of different VLANs */ + if (head.evh && tail.evh) { + if (head.evh->evl_tag != tail.evh->evl_tag) + return 0; + } else if (head.evh || tail.evh) + return 0; + + /* Check IP header. */ + if (head.ip4 && tail.ip4) { + /* Don't merge packets with invalid header checksum. */ + if (!ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) || + !ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK) || + ISSET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD) || + ISSET(mtail->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD)) + return 0; + + /* Check IPv4 addresses. */ + if (head.ip4->ip_src.s_addr != tail.ip4->ip_src.s_addr || + head.ip4->ip_dst.s_addr != tail.ip4->ip_dst.s_addr) + return 0; + + /* Don't merge IPv4 fragments. */ + if (ISSET(head.ip4->ip_off, htons(IP_OFFMASK | IP_MF)) || + ISSET(tail.ip4->ip_off, htons(IP_OFFMASK | IP_MF))) + return 0; + + /* Check max. IPv4 length. */ + if (head.iplen + tail.iplen > IP_MAXPACKET) + return 0; + + /* Don't merge IPv4 packets with option headers. */ + if (head.iphlen != sizeof(struct ip) || + tail.iphlen != sizeof(struct ip)) + return 0; + + /* Don't non-TCP packets. */ + if (head.ip4->ip_p != IPPROTO_TCP || + tail.ip4->ip_p != IPPROTO_TCP) + return 0; + } else if (head.ip6 && tail.ip6) { + /* Check IPv6 addresses. */ + if (!IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_src, &tail.ip6->ip6_src) || + !IN6_ARE_ADDR_EQUAL(&head.ip6->ip6_dst, &tail.ip6->ip6_dst)) + return 0; + + /* Check max. IPv6 length. */ + if ((head.iplen - head.iphlen) + + (tail.iplen - tail.iphlen) > IPV6_MAXPACKET) + return 0; + + /* Don't merge IPv6 packets with option headers nor non-TCP. */ + if (head.ip6->ip6_nxt != IPPROTO_TCP || + tail.ip6->ip6_nxt != IPPROTO_TCP) + return 0; + } else { + return 0; + } + + /* Check TCP header. */ + if (!head.tcp || !tail.tcp) + return 0; + + /* Check TCP ports. */ + if (head.tcp->th_sport != tail.tcp->th_sport || + head.tcp->th_dport != tail.tcp->th_dport) + return 0; + + /* Don't merge empty segments. */ + if (head.paylen == 0 || tail.paylen == 0) + return 0; + + /* Check for continues segments. */ + if (ntohl(head.tcp->th_seq) + head.paylen != ntohl(tail.tcp->th_seq)) + return 0; + + /* Just ACK and PUSH TCP flags are allowed. */ + if (ISSET(head.tcp->th_flags, ~(TH_ACK|TH_PUSH)) || + ISSET(tail.tcp->th_flags, ~(TH_ACK|TH_PUSH))) + return 0; + + /* TCP ACK flag has to be set. */ + if (!ISSET(head.tcp->th_flags, TH_ACK) || + !ISSET(tail.tcp->th_flags, TH_ACK)) + return 0; + + /* Ignore segments with different TCP options. */ + if (head.tcphlen - sizeof(struct tcphdr) != + tail.tcphlen - sizeof(struct tcphdr)) + return 0; + + /* Check for TCP options */ + if (head.tcphlen > sizeof(struct tcphdr)) { + char *hopt = (char *)(head.tcp) + sizeof(struct tcphdr); + char *topt = (char *)(tail.tcp) + sizeof(struct tcphdr); + int optsize = head.tcphlen - sizeof(struct tcphdr); + int optlen; + + for (; optsize > 0; optsize -= optlen) { + /* Ignore segments with different TCP options. */ + if (hopt[0] != topt[0] || hopt[1] != topt[1]) + return 0; + + /* Get option length */ + optlen = hopt[1]; + if (hopt[0] == TCPOPT_NOP) + optlen = 1; + else if (optlen < 2 || optlen > optsize) + return 0; /* Illegal length */ + + if (hopt[0] != TCPOPT_NOP && + hopt[0] != TCPOPT_TIMESTAMP) + return 0; /* Unsupported TCP option */ + + hopt += optlen; + topt += optlen; + } + } + + /* + * Prepare concatenation of head and tail. + */ + + /* Adjust IP header. */ + if (head.ip4) { + head.ip4->ip_len = htons(head.iplen + tail.paylen); + } else if (head.ip6) { + head.ip6->ip6_plen = + htons(head.iplen - head.iphlen + tail.paylen); + } + + /* Combine TCP flags from head and tail. */ + if (ISSET(tail.tcp->th_flags, TH_PUSH)) + SET(head.tcp->th_flags, TH_PUSH); + + /* Adjust TCP header. */ + head.tcp->th_win = tail.tcp->th_win; + head.tcp->th_ack = tail.tcp->th_ack; + + /* Calculate header length of tail packet. */ + hdrlen = sizeof(*tail.eh); + if (tail.evh) + hdrlen = sizeof(*tail.evh); + hdrlen += tail.iphlen; + hdrlen += tail.tcphlen; + + /* Skip protocol headers in tail. */ + m_adj(mtail, hdrlen); + CLR(mtail->m_flags, M_PKTHDR); + + /* Concatenate */ + for (m = mhead; m->m_next;) + m = m->m_next; + m->m_next = mtail; + mhead->m_pkthdr.len += tail.paylen; + + /* Flag mbuf as TSO packet with MSS. */ + if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_TSO)) { + /* Set CSUM_OUT flags in case of forwarding. */ + SET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); + head.tcp->th_sum = 0; + if (head.ip4) { + SET(mhead->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); + head.ip4->ip_sum = 0; + } + + SET(mhead->m_pkthdr.csum_flags, M_TCP_TSO); + mhead->m_pkthdr.ph_mss = head.paylen; + tcpstat_inc(tcps_inhwlro); + tcpstat_inc(tcps_inpktlro); /* count head */ + } + mhead->m_pkthdr.ph_mss = MAX(mhead->m_pkthdr.ph_mss, tail.paylen); + tcpstat_inc(tcps_inpktlro); /* count tail */ + + return 1; +} + +void +tcp_softlro_enqueue(struct mbuf_list *ml, struct mbuf *mtail) +{ + struct mbuf *mhead; + + /* Don't merge packets with invalid header checksum. */ + if (!ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK) || + ISSET(mtail->m_pkthdr.csum_flags, M_TCP_CSUM_IN_BAD)) + goto out; + + for (mhead = ml->ml_head; mhead != NULL; mhead = mhead->m_nextpkt) { + /* Don't merge packets with invalid header checksum. */ + if (!ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK) || + ISSET(mhead->m_pkthdr.csum_flags, M_TCP_CSUM_IN_BAD)) + continue; + + /* Use RSS hash to skip packets of different connections. */ + if (ISSET(mhead->m_pkthdr.csum_flags, M_FLOWID) && + ISSET(mtail->m_pkthdr.csum_flags, M_FLOWID) && + mhead->m_pkthdr.ph_flowid != mtail->m_pkthdr.ph_flowid) + continue; + + /* Don't merge packets of different VLANs */ + if (ISSET(mhead->m_flags, M_VLANTAG) != + ISSET(mtail->m_flags, M_VLANTAG)) + continue; + + if (ISSET(mhead->m_flags, M_VLANTAG) && + ISSET(mtail->m_flags, M_VLANTAG) && + mhead->m_pkthdr.ether_vtag != mtail->m_pkthdr.ether_vtag) + continue; + + if (tcp_softlro(mhead, mtail)) + return; + } + out: + ml_enqueue(ml, mtail); } Index: netinet/tcp_var.h =================================================================== RCS file: /cvs/src/sys/netinet/tcp_var.h,v diff -u -p -r1.178 tcp_var.h --- netinet/tcp_var.h 13 May 2024 01:15:53 -0000 1.178 +++ netinet/tcp_var.h 6 Nov 2024 23:32:00 -0000 @@ -720,6 +720,7 @@ void tcp_init(void); int tcp_input(struct mbuf **, int *, int, int); int tcp_mss(struct tcpcb *, int); void tcp_mss_update(struct tcpcb *); +void tcp_softlro_enqueue(struct mbuf_list *, struct mbuf *); u_int tcp_hdrsz(struct tcpcb *); void tcp_mtudisc(struct inpcb *, int); void tcp_mtudisc_increase(struct inpcb *, int);