Download raw body.
aq(4): add TSO
This adds TSO support to aq(4), loosely modelled on how it's done
in ix(4) and similar drivers.
This works about as expected on my AQC113 card, reducing the CPU time
required to make tcpbench/iperf/etc. go at 2.5Gb/s. On an AQC107 card
it works, but actually seems to make those straightforward benchmark
tools go slower, so in this diff I'm only enabling it for AQC11x/aq2
devices.
I'm not sure if this means there's something wrong elsewhere in the
driver, or that the implementation of TSO in the first generation
devices wasn't great, or something else, but hopefully I can figure
that out at some point.
ok?
Index: if_aq_pci.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_aq_pci.c,v
diff -u -p -r1.30 if_aq_pci.c
--- if_aq_pci.c 2 Feb 2025 08:28:14 -0000 1.30
+++ if_aq_pci.c 10 Feb 2025 10:41:42 -0000
@@ -90,10 +90,14 @@
#include <net/if.h>
#include <net/if_media.h>
+#include <net/route.h>
#include <net/toeplitz.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
#ifdef __HAVE_FDT
#include <dev/ofw/openfirm.h>
@@ -354,6 +358,8 @@
#define TPO_HWCSUM_L4CSUM_EN (1 << 0)
#define TPO_HWCSUM_IP4CSUM_EN (1 << 1)
+#define TDM_LSO_EN_REG 0x7810
+
#define THM_LSO_TCP_FLAG1_REG 0x7820
#define THM_LSO_TCP_FLAG1_FIRST 0xFFF
#define THM_LSO_TCP_FLAG1_MID 0xFFF0000
@@ -912,12 +918,21 @@ struct aq_tx_desc {
#define AQ_TXDESC_CTL1_CMD_FCS (1 << 23)
#define AQ_TXDESC_CTL1_CMD_IP4CSUM (1 << 24)
#define AQ_TXDESC_CTL1_CMD_L4CSUM (1 << 25)
+#define AQ_TXDESC_CTL1_CMD_LSO (1 << 26)
#define AQ_TXDESC_CTL1_CMD_WB (1 << 27)
+#define AQ_TXDESC_CTL1_CMD_IPV6 (1 << 21)
+#define AQ_TXDESC_CTL1_CMD_TCP (1 << 22)
+#define AQ_TXDESC_CTL1_L2LEN_SHIFT 24
+#define AQ_TXDESC_CTL1_L3LEN_SHIFT 31
+
#define AQ_TXDESC_CTL1_VID_SHIFT 4
uint32_t ctl2;
+#define AQ_TXDESC_CTL2_MSS_SHIFT 16
#define AQ_TXDESC_CTL2_LEN_SHIFT 14
#define AQ_TXDESC_CTL2_CTX_EN (1 << 13)
+#define AQ_TXDESC_CTL2_L3LEN_SHIFT 1
+#define AQ_TXDESC_CTL2_L4LEN_SHIFT 8
} __packed;
struct aq_slot {
@@ -1135,6 +1150,7 @@ int aq_ioctl(struct ifnet *, u_long, cad
int aq_up(struct aq_softc *);
void aq_down(struct aq_softc *);
void aq_iff(struct aq_softc *);
+int aq_tx_setup(struct aq_tx_desc *, uint32_t *, uint32_t *, struct mbuf *);
void aq_start(struct ifqueue *);
void aq_ifmedia_status(struct ifnet *, struct ifmediareq *);
int aq_ifmedia_change(struct ifnet *);
@@ -1367,6 +1383,13 @@ aq_attach(struct device *parent, struct
ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
IFCAP_CSUM_TCPv6;
+
+ /*
+ * only use TSO on AQ2; it works on AQ1, but it's slower than
+ * not using it.
+ */
+ if (aqp->aq_hwtype == HWTYPE_AQ2)
+ ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
#if NVLAN > 0
ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
#endif
@@ -3246,6 +3269,68 @@ aq_txeof(struct aq_softc *sc, struct aq_
}
}
+/*
+ * Fill in the context descriptor txd and the *ctl1 / *ctl2 bits of the data
+ * descriptors for m.  Returns 1 if txd was used (VLAN tag or TSO), else 0.
+ */
+int
+aq_tx_setup(struct aq_tx_desc *txd, uint32_t *ctl1, uint32_t *ctl2,
+    struct mbuf *m)
+{
+	int context = 0;
+
+#if NVLAN > 0
+	if (m->m_flags & M_VLANTAG) {
+		txd->ctl1 |= htole32(AQ_TXDESC_CTL1_TYPE_TXC |
+		    (m->m_pkthdr.ether_vtag << AQ_TXDESC_CTL1_VLAN_SHIFT));
+		*ctl1 |= AQ_TXDESC_CTL1_CMD_VLAN;
+		*ctl2 |= AQ_TXDESC_CTL2_CTX_EN;
+		context = 1;
+	}
+#endif
+
+	if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
+		*ctl1 |= AQ_TXDESC_CTL1_CMD_IP4CSUM;
+	if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
+		*ctl1 |= AQ_TXDESC_CTL1_CMD_L4CSUM;
+
+	if (m->m_pkthdr.csum_flags & M_TCP_TSO) {
+		int paylen, outlen;
+		struct ether_extracted ext;
+
+		ether_extract_headers(m, &ext);
+		if (ext.tcp && m->m_pkthdr.ph_mss > 0 && (ext.ip4 || ext.ip6)) {
+			outlen = m->m_pkthdr.ph_mss;
+			paylen = m->m_pkthdr.len - (ETHER_HDR_LEN +
+			    ext.iphlen + ext.tcphlen);
+			/* L3 length: bit 0 goes in ctl1, the rest in ctl2 */
+			txd->ctl1 |= htole32(AQ_TXDESC_CTL1_TYPE_TXC |
+			    AQ_TXDESC_CTL1_CMD_TCP |
+			    (ETHER_HDR_LEN << AQ_TXDESC_CTL1_L2LEN_SHIFT) |
+			    (ext.iphlen << AQ_TXDESC_CTL1_L3LEN_SHIFT));
+			txd->ctl2 |= htole32(
+			    (outlen << AQ_TXDESC_CTL2_MSS_SHIFT) |
+			    (ext.tcphlen << AQ_TXDESC_CTL2_L4LEN_SHIFT) |
+			    (ext.iphlen >> AQ_TXDESC_CTL2_L3LEN_SHIFT));
+			if (ext.ip6)
+				txd->ctl1 |= htole32(AQ_TXDESC_CTL1_CMD_IPV6);
+			context = 1;
+			*ctl1 |= AQ_TXDESC_CTL1_CMD_LSO;
+			*ctl2 |= AQ_TXDESC_CTL2_CTX_EN;
+			*ctl2 |= (paylen << AQ_TXDESC_CTL2_LEN_SHIFT);
+
+			/* count ceil(paylen / outlen) segments */
+			tcpstat_add(tcps_outpkttso,
+			    (paylen + outlen - 1) / outlen);
+		} else {
+			tcpstat_inc(tcps_outbadtso);
+		}
+	} else
+		*ctl2 |= m->m_pkthdr.len << AQ_TXDESC_CTL2_LEN_SHIFT;
+
+	return context;
+}
+
void
aq_start(struct ifqueue *ifq)
{
@@ -3303,31 +3388,20 @@ aq_start(struct ifqueue *ifq)
bus_dmamap_sync(sc->sc_dmat, as->as_map, 0,
as->as_map->dm_mapsize, BUS_DMASYNC_PREWRITE);
- ctl2 = m->m_pkthdr.len << AQ_TXDESC_CTL2_LEN_SHIFT;
ctl1 = AQ_TXDESC_CTL1_TYPE_TXD | AQ_TXDESC_CTL1_CMD_FCS;
-#if NVLAN > 0
- if (m->m_flags & M_VLANTAG) {
- txd = ring + idx;
- txd->buf_addr = 0;
- txd->ctl1 = htole32(AQ_TXDESC_CTL1_TYPE_TXC |
- (m->m_pkthdr.ether_vtag << AQ_TXDESC_CTL1_VLAN_SHIFT));
- txd->ctl2 = 0;
+ ctl2 = 0;
- ctl1 |= AQ_TXDESC_CTL1_CMD_VLAN;
- ctl2 |= AQ_TXDESC_CTL2_CTX_EN;
+ txd = ring + idx;
+ txd->buf_addr = 0;
+ txd->ctl1 = 0;
+ txd->ctl2 = 0;
+ if (aq_tx_setup(txd, &ctl1, &ctl2, m)) {
idx++;
if (idx == AQ_TXD_NUM)
idx = 0;
used++;
}
-#endif
-
- if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
- ctl1 |= AQ_TXDESC_CTL1_CMD_IP4CSUM;
- if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
- ctl1 |= AQ_TXDESC_CTL1_CMD_L4CSUM;
-
for (i = 0; i < as->as_map->dm_nsegs; i++) {
if (i == as->as_map->dm_nsegs - 1)
@@ -3453,7 +3527,7 @@ aq_queue_up(struct aq_softc *sc, struct
struct aq_rxring *rx;
struct aq_txring *tx;
struct aq_slot *as;
- int i, mtu;
+ int i;
rx = &aq->q_rx;
rx->rx_slots = mallocarray(sizeof(*as), AQ_RXD_NUM, M_DEVBUF,
@@ -3491,11 +3565,11 @@ aq_queue_up(struct aq_softc *sc, struct
goto destroy_rx_ring;
}
- mtu = sc->sc_arpcom.ac_if.if_hardmtu;
for (i = 0; i < AQ_TXD_NUM; i++) {
as = &tx->tx_slots[i];
- if (bus_dmamap_create(sc->sc_dmat, mtu, AQ_TX_MAX_SEGMENTS,
- MCLBYTES, 0, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
+ if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
+ AQ_TX_MAX_SEGMENTS, MAXMCLBYTES, 0,
+ BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
&as->as_map) != 0) {
printf("%s: failed to allocated tx dma maps %d\n",
DEVNAME(sc), aq->q_index);
@@ -3602,6 +3676,8 @@ aq_up(struct aq_softc *sc)
AQ_WRITE_REG_BIT(sc, RPO_HWCSUM_REG, RPO_HWCSUM_IP4CSUM_EN, 1);
AQ_WRITE_REG_BIT(sc, RPO_HWCSUM_REG, RPO_HWCSUM_L4CSUM_EN, 1);
+
+ AQ_WRITE_REG(sc, TDM_LSO_EN_REG, 0xffffffff);
SET(ifp->if_flags, IFF_RUNNING);
aq_enable_intr(sc, 1, 1);
aq(4): add TSO