Index | Thread | Search

From:
Jonathan Matthew <jonathan@d14n.org>
Subject:
aq(4): add TSO
To:
tech@openbsd.org
Cc:
kettenis@openbsd.org, jan@openbsd.org
Date:
Mon, 10 Feb 2025 20:59:41 +1000

Download raw body.

Thread
  • Jonathan Matthew:

    aq(4): add TSO

This adds TSO (TCP segmentation offload) support to aq(4), loosely
modelled on how it's done in ix(4) etc.

This works about as expected on my AQC113 card, reducing the CPU time
required to make tcpbench/iperf/etc. go at 2.5Gb/s.  On an AQC107 card
it works, but actually seems to make those straightforward benchmark
tools go slower, so in this diff I'm only enabling it for AQC11x/aq2
devices.

I'm not sure if this means there's something wrong elsewhere in the
driver, or that the implementation of TSO in the first generation
devices wasn't great, or something else, but hopefully I can figure
that out at some point.

ok?

Index: if_aq_pci.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_aq_pci.c,v
diff -u -p -r1.30 if_aq_pci.c
--- if_aq_pci.c	2 Feb 2025 08:28:14 -0000	1.30
+++ if_aq_pci.c	10 Feb 2025 10:41:42 -0000
@@ -90,10 +90,14 @@
 
 #include <net/if.h>
 #include <net/if_media.h>
+#include <net/route.h>
 #include <net/toeplitz.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
 
 #ifdef __HAVE_FDT
 #include <dev/ofw/openfirm.h>
@@ -354,6 +358,8 @@
 #define  TPO_HWCSUM_L4CSUM_EN			(1 << 0)
 #define  TPO_HWCSUM_IP4CSUM_EN			(1 << 1)
 
+#define TDM_LSO_EN_REG				0x7810
+
 #define THM_LSO_TCP_FLAG1_REG			0x7820
 #define  THM_LSO_TCP_FLAG1_FIRST		0xFFF
 #define  THM_LSO_TCP_FLAG1_MID			0xFFF0000
@@ -912,12 +918,21 @@ struct aq_tx_desc {
 #define AQ_TXDESC_CTL1_CMD_FCS	(1 << 23)
 #define AQ_TXDESC_CTL1_CMD_IP4CSUM (1 << 24)
 #define AQ_TXDESC_CTL1_CMD_L4CSUM (1 << 25)
+#define AQ_TXDESC_CTL1_CMD_LSO	(1 << 26)
 #define AQ_TXDESC_CTL1_CMD_WB	(1 << 27)
 
+#define AQ_TXDESC_CTL1_CMD_IPV6 (1 << 21)
+#define AQ_TXDESC_CTL1_CMD_TCP	(1 << 22)
+#define AQ_TXDESC_CTL1_L2LEN_SHIFT 24
+#define AQ_TXDESC_CTL1_L3LEN_SHIFT 31
+
 #define AQ_TXDESC_CTL1_VID_SHIFT 4
 	uint32_t		ctl2;
+#define AQ_TXDESC_CTL2_MSS_SHIFT 16
 #define AQ_TXDESC_CTL2_LEN_SHIFT 14
 #define AQ_TXDESC_CTL2_CTX_EN	(1 << 13)
+#define AQ_TXDESC_CTL2_L3LEN_SHIFT 1
+#define AQ_TXDESC_CTL2_L4LEN_SHIFT 8
 } __packed;
 
 struct aq_slot {
@@ -1135,6 +1150,7 @@ int	aq_ioctl(struct ifnet *, u_long, cad
 int	aq_up(struct aq_softc *);
 void	aq_down(struct aq_softc *);
 void	aq_iff(struct aq_softc *);
+int	aq_tx_setup(struct aq_tx_desc *, uint32_t *, uint32_t *, struct mbuf *);
 void	aq_start(struct ifqueue *);
 void	aq_ifmedia_status(struct ifnet *, struct ifmediareq *);
 int	aq_ifmedia_change(struct ifnet *);
@@ -1367,6 +1383,13 @@ aq_attach(struct device *parent, struct 
 	ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
 	    IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
 	    IFCAP_CSUM_TCPv6;
+
+	/*
+	 * only use TSO on AQ2; it works on AQ1, but it's slower than
+	 * not using it.
+	 */
+	if (aqp->aq_hwtype == HWTYPE_AQ2)
+		ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 #if NVLAN > 0
 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
 #endif
@@ -3246,6 +3269,90 @@ aq_txeof(struct aq_softc *sc, struct aq_
 	}
 }
 
+/*
+ * Work out the offload settings for an outgoing packet.
+ *
+ * Bits for a context descriptor (VLAN tag, TSO header lengths and MSS)
+ * are ORed into txd->ctl1 and txd->ctl2, so the caller must hand in a
+ * zeroed descriptor.  Command bits and the length field for the packet's
+ * data descriptors are ORed into *ctl1 and *ctl2.
+ *
+ * Returns 1 if txd was filled in and must be queued ahead of the data
+ * descriptors, 0 if it is unused.
+ */
+int
+aq_tx_setup(struct aq_tx_desc *txd, uint32_t *ctl1, uint32_t *ctl2,
+    struct mbuf *m)
+{
+	int context = 0;
+
+#if NVLAN > 0
+	if (m->m_flags & M_VLANTAG) {
+		txd->ctl1 |= htole32(AQ_TXDESC_CTL1_TYPE_TXC |
+		    (m->m_pkthdr.ether_vtag << AQ_TXDESC_CTL1_VLAN_SHIFT));
+
+		*ctl1 |= AQ_TXDESC_CTL1_CMD_VLAN;
+		*ctl2 |= AQ_TXDESC_CTL2_CTX_EN;
+
+		context = 1;
+	}
+#endif
+
+	if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
+		*ctl1 |= AQ_TXDESC_CTL1_CMD_IP4CSUM;
+	if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
+		*ctl1 |= AQ_TXDESC_CTL1_CMD_L4CSUM;
+
+	if (m->m_pkthdr.csum_flags & M_TCP_TSO) {
+		int paylen, outlen;
+		struct ether_extracted ext;
+
+		ether_extract_headers(m, &ext);
+		if (ext.tcp && m->m_pkthdr.ph_mss > 0 && (ext.ip4 || ext.ip6)) {
+			outlen = m->m_pkthdr.ph_mss;
+			paylen = m->m_pkthdr.len - (ETHER_HDR_LEN +
+			    ext.iphlen + ext.tcphlen);
+
+			/*
+			 * The L3 header length is split across both words:
+			 * its low bit goes in the top bit of ctl1 and the
+			 * remaining bits in the bottom of ctl2.
+			 */
+			txd->ctl1 |= htole32(AQ_TXDESC_CTL1_TYPE_TXC |
+			    AQ_TXDESC_CTL1_CMD_TCP |
+			    (ETHER_HDR_LEN << AQ_TXDESC_CTL1_L2LEN_SHIFT) |
+			    (ext.iphlen << AQ_TXDESC_CTL1_L3LEN_SHIFT));
+
+			txd->ctl2 |= htole32(
+			    (outlen << AQ_TXDESC_CTL2_MSS_SHIFT) |
+			    (ext.tcphlen << AQ_TXDESC_CTL2_L4LEN_SHIFT) |
+			    (ext.iphlen >> AQ_TXDESC_CTL2_L3LEN_SHIFT));
+
+			if (ext.ip6)
+				txd->ctl1 |= htole32(AQ_TXDESC_CTL1_CMD_IPV6);
+
+			context = 1;
+			*ctl1 |= AQ_TXDESC_CTL1_CMD_LSO;
+			*ctl2 |= AQ_TXDESC_CTL2_CTX_EN;
+			/* data descriptors carry the payload length only */
+			*ctl2 |= (paylen << AQ_TXDESC_CTL2_LEN_SHIFT);
+
+			/* segments produced: ceil(paylen / outlen) */
+			tcpstat_add(tcps_outpkttso,
+			    (paylen + outlen - 1) / outlen);
+		} else {
+			/*
+			 * NOTE(review): no length is written to *ctl2 on
+			 * this path; confirm that is intended.
+			 */
+			tcpstat_inc(tcps_outbadtso);
+		}
+	} else
+		*ctl2 |= m->m_pkthdr.len << AQ_TXDESC_CTL2_LEN_SHIFT;
+
+	return context;
+}
+
 void
 aq_start(struct ifqueue *ifq)
 {
@@ -3303,31 +3388,20 @@ aq_start(struct ifqueue *ifq)
 		bus_dmamap_sync(sc->sc_dmat, as->as_map, 0,
 		    as->as_map->dm_mapsize, BUS_DMASYNC_PREWRITE);
 
-		ctl2 = m->m_pkthdr.len << AQ_TXDESC_CTL2_LEN_SHIFT;
 		ctl1 = AQ_TXDESC_CTL1_TYPE_TXD | AQ_TXDESC_CTL1_CMD_FCS;
-#if NVLAN > 0
-		if (m->m_flags & M_VLANTAG) {
-			txd = ring + idx;
-			txd->buf_addr = 0;
-			txd->ctl1 = htole32(AQ_TXDESC_CTL1_TYPE_TXC |
-			    (m->m_pkthdr.ether_vtag << AQ_TXDESC_CTL1_VLAN_SHIFT));
-			txd->ctl2 = 0;
+		ctl2 = 0;
 
-			ctl1 |= AQ_TXDESC_CTL1_CMD_VLAN;
-			ctl2 |= AQ_TXDESC_CTL2_CTX_EN;
+		txd = ring + idx;
+		txd->buf_addr = 0;
+		txd->ctl1 = 0;
+		txd->ctl2 = 0;
 
+		if (aq_tx_setup(txd, &ctl1, &ctl2, m)) {
 			idx++;
 			if (idx == AQ_TXD_NUM)
 				idx = 0;
 			used++;
 		}
-#endif
-
-		if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
-			ctl1 |= AQ_TXDESC_CTL1_CMD_IP4CSUM;
-		if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
-			ctl1 |= AQ_TXDESC_CTL1_CMD_L4CSUM;
-
 		for (i = 0; i < as->as_map->dm_nsegs; i++) {
 
 			if (i == as->as_map->dm_nsegs - 1)
@@ -3453,7 +3527,7 @@ aq_queue_up(struct aq_softc *sc, struct 
 	struct aq_rxring *rx;
 	struct aq_txring *tx;
 	struct aq_slot *as;
-	int i, mtu;
+	int i;
 
 	rx = &aq->q_rx;
 	rx->rx_slots = mallocarray(sizeof(*as), AQ_RXD_NUM, M_DEVBUF,
@@ -3491,11 +3565,11 @@ aq_queue_up(struct aq_softc *sc, struct 
 		goto destroy_rx_ring;
 	}
 
-	mtu = sc->sc_arpcom.ac_if.if_hardmtu;
 	for (i = 0; i < AQ_TXD_NUM; i++) {
 		as = &tx->tx_slots[i];
-		if (bus_dmamap_create(sc->sc_dmat, mtu, AQ_TX_MAX_SEGMENTS,
-		    MCLBYTES, 0, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
+		if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
+		    AQ_TX_MAX_SEGMENTS, MAXMCLBYTES, 0,
+		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
 		    &as->as_map) != 0) {
 			printf("%s: failed to allocated tx dma maps %d\n",
 			    DEVNAME(sc), aq->q_index);
@@ -3602,6 +3676,8 @@ aq_up(struct aq_softc *sc)
 
 	AQ_WRITE_REG_BIT(sc, RPO_HWCSUM_REG, RPO_HWCSUM_IP4CSUM_EN, 1);
 	AQ_WRITE_REG_BIT(sc, RPO_HWCSUM_REG, RPO_HWCSUM_L4CSUM_EN, 1);
+
+	AQ_WRITE_REG(sc, TDM_LSO_EN_REG, 0xffffffff);
 
 	SET(ifp->if_flags, IFF_RUNNING);
 	aq_enable_intr(sc, 1, 1);