From: Alexander Bluhm Subject: Re: bnxt(4) tso To: Jonathan Matthew Cc: Hrvoje Popovski , Jonathan Matthew , tech@openbsd.org Date: Wed, 17 Jan 2024 15:33:49 +0100 On Mon, Jan 15, 2024 at 07:54:05PM +1000, Jonathan Matthew wrote: > On Fri, Jan 12, 2024 at 10:15:02PM +0100, Alexander Bluhm wrote: > > On Fri, Jan 12, 2024 at 11:27:31AM +0100, Hrvoje Popovski wrote: > > > is it maybe because of mtu. I've tested even that and it's working just > > > fine .. > > > > I have identified the test that causes problems. > > > > netbench.pl -v -b1000000 -croot@lt40 -sroot@lt43 -A10.10.21.20 -a10.10.22.40 -t10 tcpsplice > > > > It runs a single TCP connection from Linux to OpenBSD bnxt0, splices > > it to another TCP connection that goes via bnxt1 to a second Linux > > machine. > > Thanks for tracking this down! I worked out that TSO was generating packets > with 32 DMA segments, but after looking at if_bnxtreg.h again I realised > 31 is the most we can do. r1.44 of if_bnxt.c fixes the driver so it only > allows 31 segments, and writes the right value into the tx ring slot when > this happens. > > With that fixed, I can no longer reproduce this problem with socket splicing. > Updated diff below, addressing your comments from elsewhere in the thread. My tests work now. http://bluhm.genua.de/netlink/results/2024-01-16T10%3A37%3A06Z/netlink.html The failures with UDP socket splicing also happen elsewere and seem unrelated. Tested with vlan and jumbo frames. OK bluhm@ > Index: if_bnxt.c > =================================================================== > RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v > retrieving revision 1.44 > diff -u -p -r1.44 if_bnxt.c > --- if_bnxt.c 15 Jan 2024 08:56:45 -0000 1.44 > +++ if_bnxt.c 15 Jan 2024 09:21:25 -0000 > @@ -68,6 +68,7 @@ > > #include > #include > +#include > #include > > #if NBPFILTER > 0 > @@ -75,7 +76,12 @@ > #endif > > #include > +#include > +#include > #include > +#include > +#include > +#include > > #define BNXT_HWRM_BAR 0x10 > #define BNXT_DOORBELL_BAR 0x18 > @@ -642,6 +649,7 @@ bnxt_attach(struct device *parent, struc > ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 | > IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv6 | > IFCAP_CSUM_TCPv6; > + ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; > #if NVLAN > 0 > ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; > #endif > @@ -929,7 +937,7 @@ bnxt_queue_up(struct bnxt_softc *sc, str > > for (i = 0; i < tx->tx_ring.ring_size; i++) { > bs = &tx->tx_slots[i]; > - if (bus_dmamap_create(sc->sc_dmat, BNXT_MAX_MTU, BNXT_MAX_TX_SEGS, > + if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES, BNXT_MAX_TX_SEGS, > BNXT_MAX_MTU, 0, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW, > &bs->bs_map) != 0) { > printf("%s: failed to allocate tx dma maps\n", > @@ -1337,11 +1345,12 @@ bnxt_start(struct ifqueue *ifq) > struct bnxt_tx_queue *tx = ifq->ifq_softc; > struct bnxt_softc *sc = tx->tx_softc; > struct bnxt_slot *bs; > + struct ether_extracted ext; > bus_dmamap_t map; > struct mbuf *m; > u_int idx, free, used, laststart; > - uint16_t txflags; > - int i; > + uint16_t txflags, lflags; > + int i, slen; > > txring = (struct tx_bd_short *)BNXT_DMA_KVA(tx->tx_ring_mem); > > @@ -1385,12 +1394,16 @@ bnxt_start(struct ifqueue *ifq) > txring[idx].len = htole16(map->dm_segs[0].ds_len); > txring[idx].opaque = tx->tx_prod; > txring[idx].addr = htole64(map->dm_segs[0].ds_addr); > + if (m->m_pkthdr.csum_flags & M_TCP_TSO) > + slen = m->m_pkthdr.ph_mss; > + else > + slen = map->dm_mapsize; > > - if (map->dm_mapsize < 512) > + if (slen < 512) > txflags = TX_BD_LONG_FLAGS_LHINT_LT512; > - else if (map->dm_mapsize < 1024) > + else if (slen < 1024) > txflags = TX_BD_LONG_FLAGS_LHINT_LT1K; > - else if (map->dm_mapsize < 2048) > + else if (slen < 2048) > txflags = TX_BD_LONG_FLAGS_LHINT_LT2K; > else > txflags = TX_BD_LONG_FLAGS_LHINT_GTE2K; > @@ -1409,12 +1422,44 @@ bnxt_start(struct ifqueue *ifq) > /* long tx descriptor */ > txhi = (struct tx_bd_long_hi *)&txring[idx]; > memset(txhi, 0, sizeof(*txhi)); > - txflags = 0; > - if (m->m_pkthdr.csum_flags & (M_UDP_CSUM_OUT | M_TCP_CSUM_OUT)) > - txflags |= TX_BD_LONG_LFLAGS_TCP_UDP_CHKSUM; > - if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT) > - txflags |= TX_BD_LONG_LFLAGS_IP_CHKSUM; > - txhi->lflags = htole16(txflags); > + > + lflags = 0; > + if (m->m_pkthdr.csum_flags & M_TCP_TSO) { > + int hdrsize; > + int outlen; > + int paylen; > + > + ether_extract_headers(m, &ext); > + if (ext.tcp) { > + lflags |= TX_BD_LONG_LFLAGS_LSO; > + hdrsize = sizeof(*ext.eh); > + if (ext.ip4) > + hdrsize += ext.ip4->ip_hl << 2; > + else if (ext.ip6) > + hdrsize += sizeof(*ext.ip6); > + else > + tcpstat_inc(tcps_outbadtso); > + > + hdrsize += ext.tcp->th_off << 2; > + txhi->hdr_size = htole16(hdrsize / 2); > + > + outlen = m->m_pkthdr.ph_mss; > + txhi->mss = htole32(outlen); > + > + paylen = m->m_pkthdr.len - hdrsize; > + tcpstat_add(tcps_outpkttso, > + (paylen + outlen + 1) / outlen); > + } else { > + tcpstat_inc(tcps_outbadtso); > + } > + } else { > + if (m->m_pkthdr.csum_flags & (M_UDP_CSUM_OUT | > + M_TCP_CSUM_OUT)) > + lflags |= TX_BD_LONG_LFLAGS_TCP_UDP_CHKSUM; > + if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT) > + lflags |= TX_BD_LONG_LFLAGS_IP_CHKSUM; > + } > + txhi->lflags = htole16(lflags); > > #if NVLAN > 0 > if (m->m_flags & M_VLANTAG) {