Index | Thread | Search

From:
jan@openbsd.org
Subject:
vmx(4): TCP Large Receive Offload
To:
tech@openbsd.org
Date:
Wed, 22 May 2024 22:47:01 +0200

Download raw body.

Thread
Hi,

This diff introduces TCP Large Receive Offload (LRO) for vmx(4).

The virtual device annotates LRO packets with receive descriptors of
type 4.  We need this additional information to calculate a valid MSS
for such a packet.  With it, we are able to route this kind of packet.
However, we only get type 4 descriptors if we claim to support vmxnet3
revision 2.

I tested it on ESXi 8 with Linux guests and external hosts.  It
increases single-connection TCP performance to up to 20 GBit/s in my
setup.

Tests are welcome, especially with different ESXi versions and IP
forwarding.

bye,
Jan

Index: dev/pci/if_vmx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
diff -u -p -r1.86 if_vmx.c
--- dev/pci/if_vmx.c	21 May 2024 19:49:06 -0000	1.86
+++ dev/pci/if_vmx.c	22 May 2024 20:18:20 -0000
@@ -114,6 +114,7 @@ struct vmxnet3_comp_ring {
 	};
 	u_int next;
 	u_int32_t gen;
+	struct mbuf *m;
 };
 
 struct vmxnet3_txqueue {
@@ -170,7 +171,7 @@ struct vmxnet3_softc {
 #endif
 };
 
-#define JUMBO_LEN (1024 * 9)
+#define JUMBO_LEN ((16 * 1024) - 1)
 #define DMAADDR(map) ((map)->dm_segs[0].ds_addr)
 
 #define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg)
@@ -273,15 +274,20 @@ vmxnet3_attach(struct device *parent, st
 		return;
 	}
 
+	/* Vmxnet3 Revision Report and Selection */
 	ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS);
-	if ((ver & 0x1) == 0) {
+	if (ISSET(ver, 0x2)) {
+		WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 2);
+	} else if (ISSET(ver, 0x1)) {
+		WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
+	} else {
 		printf(": unsupported hardware version 0x%x\n", ver);
 		return;
 	}
-	WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
 
+	/* UPT Version Report and Selection */
 	ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS);
-	if ((ver & 0x1) == 0) {
+	if (!ISSET(ver, 0x1)) {
 		printf(": incompatible UPT version 0x%x\n", ver);
 		return;
 	}
@@ -410,6 +416,9 @@ vmxnet3_attach(struct device *parent, st
 
 	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 
+	ifp->if_xflags |= IFXF_LRO;
+	ifp->if_capabilities |= IFCAP_LRO;
+
 #if NVLAN > 0
 	if (sc->sc_ds->upt_features & UPT1_F_VLAN)
 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
@@ -515,6 +524,7 @@ vmxnet3_dma_init(struct vmxnet3_softc *s
 #if NVLAN > 0
 	ds->upt_features |= UPT1_F_VLAN;
 #endif
+	ds->nrxsg_max = 0;
 	ds->driver_data = ~0ULL;
 	ds->driver_data_len = 0;
 	ds->queue_shared = qs_pa;
@@ -668,6 +678,7 @@ vmxnet3_txinit(struct vmxnet3_softc *sc,
 	ring->gen = VMX_TX_GEN;
 	comp_ring->next = 0;
 	comp_ring->gen = VMX_TXC_GEN;
+	comp_ring->m = NULL;
 	memset(VMX_DMA_KVA(&ring->dmamem), 0,
 	    VMX_DMA_LEN(&ring->dmamem));
 	bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
@@ -704,6 +715,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ri
 	uint32_t rgen;
 	uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S);
 
+	/* Second ring just contains packet bodies. */
+	if (ring->rid == 1)
+		type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S);
+
 	MUTEX_ASSERT_LOCKED(&ring->mtx);
 
 	slots = if_rxr_get(&ring->rxr, NRXDESC);
@@ -781,17 +796,16 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc,
 		    VMX_DMA_LEN(&ring->dmamem));
 		bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
 		    0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE);
-	}
 
-	/* XXX only fill ring 0 */
-	ring = &rq->cmd_ring[0];
-	mtx_enter(&ring->mtx);
-	vmxnet3_rxfill(ring);
-	mtx_leave(&ring->mtx);
+		mtx_enter(&ring->mtx);
+		vmxnet3_rxfill(ring);
+		mtx_leave(&ring->mtx);
+	}
 
 	comp_ring = &rq->comp_ring;
 	comp_ring->next = 0;
 	comp_ring->gen = VMX_RXC_GEN;
+	comp_ring->m = NULL;
 
 	memset(VMX_DMA_KVA(&comp_ring->dmamem), 0,
 	    VMX_DMA_LEN(&comp_ring->dmamem));
@@ -1072,11 +1086,11 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 	struct vmxnet3_rxring *ring;
 	struct vmxnet3_rxcompdesc *rxcd;
 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
-	struct mbuf *m;
+	struct mbuf *m, *sendmp = NULL;
 	bus_dmamap_t map;
-	unsigned int idx, len;
+	unsigned int idx;
 	unsigned int next, rgen;
-	unsigned int done = 0;
+	unsigned int rid, done[2] = {0, 0};
 
 	next = comp_ring->next;
 	rgen = comp_ring->gen;
@@ -1096,11 +1110,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 
 		idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) &
 		    VMXNET3_RXC_IDX_M);
+
 		if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) &
 		    VMXNET3_RXC_QID_M) < sc->sc_nqueues)
-			ring = &rq->cmd_ring[0];
+			rid = 0;
 		else
-			ring = &rq->cmd_ring[1];
+			rid = 1;
+
+		ring = &rq->cmd_ring[rid];
 
 		m = ring->m[idx];
 		KASSERT(m != NULL);
@@ -1111,31 +1128,65 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 		    BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(sc->sc_dmat, map);
 
-		done++;
+		done[rid]++;
+
+		/*
+		 * A receive descriptor of type 4 which is flagged as start of
+		 * packet contains the number of TCP segments of an LRO packet.
+		 */
+		if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >>
+		    VMXNET3_RXC_TYPE_S) == 4 &&
+		    ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) {
+			m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 &
+			    VMXNET3_RXC_SEG_CNT_M);
+		}
+
+		m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
+		    VMXNET3_RXC_LEN_M);
+
+		sendmp = comp_ring->m;
+		comp_ring->m = NULL;
+
+		if (sendmp == NULL) {
+			sendmp = m;
+			sendmp->m_pkthdr.len = m->m_len;
+		} else {
+			struct mbuf *mp;
+
+			sendmp->m_pkthdr.len += m->m_len;
+			for (mp = sendmp; mp->m_next != NULL; mp = mp->m_next);
+			mp->m_next = m;
+		}
+
+		if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP)) {
+			comp_ring->m = sendmp;
+			continue;
+		}
+
+		/*
+		 * End of Packet
+		 */
 
 		if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) {
 			ifp->if_ierrors++;
-			m_freem(m);
+			m_freem(sendmp);
 			continue;
 		}
 
-		len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
-		    VMXNET3_RXC_LEN_M);
-		if (len < VMXNET3_MIN_MTU) {
+		if (sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) {
 			m_freem(m);
 			continue;
 		}
-		m->m_pkthdr.len = m->m_len = len;
 
-		vmxnet3_rx_offload(rxcd, m);
+		vmxnet3_rx_offload(rxcd, sendmp);
 
 		if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) &
 		    VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) {
-			m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
-			SET(m->m_pkthdr.csum_flags, M_FLOWID);
+			sendmp->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
+			SET(sendmp->m_pkthdr.csum_flags, M_FLOWID);
 		}
 
-		ml_enqueue(&ml, m);
+		ml_enqueue(&ml, sendmp);
 	}
 
 	bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem),
@@ -1144,19 +1195,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 	comp_ring->next = next;
 	comp_ring->gen = rgen;
 
-	if (done == 0)
-		return;
+	for (int i = 0; i < 2; i++) {
+		if (done[i] == 0)
+			continue;
 
-	ring = &rq->cmd_ring[0];
+		ring = &rq->cmd_ring[i];
 
-	if (ifiq_input(rq->ifiq, &ml))
-		if_rxr_livelocked(&ring->rxr);
+		if (ifiq_input(rq->ifiq, &ml))
+			if_rxr_livelocked(&ring->rxr);
 
-	/* XXX Should we (try to) allocate buffers for ring 2 too? */
-	mtx_enter(&ring->mtx);
-	if_rxr_put(&ring->rxr, done);
-	vmxnet3_rxfill(ring);
-	mtx_leave(&ring->mtx);
+		mtx_enter(&ring->mtx);
+		if_rxr_put(&ring->rxr, done[i]);
+		vmxnet3_rxfill(ring);
+		mtx_leave(&ring->mtx);
+	}
 }
 
 void
@@ -1207,10 +1259,11 @@ vmxnet3_iff(struct vmxnet3_softc *sc)
 	WRITE_CMD(sc, VMXNET3_CMD_SET_RXMODE);
 }
 
-
 void
 vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
 {
+	uint32_t pkts;
+
 	/*
 	 * VLAN Offload
 	 */
@@ -1243,6 +1296,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcomp
 		else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP))
 			SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
 	}
+
+	/*
+	 * TCP Large Receive Offload
+	 */
+
+	pkts = m->m_pkthdr.ph_mss;
+	m->m_pkthdr.ph_mss = 0;
+
+	if (pkts > 1) {
+		struct ether_extracted ext;
+		uint32_t paylen;
+
+		ether_extract_headers(m, &ext);
+
+		paylen = ext.iplen;
+		if (ext.ip4 || ext.ip6)
+			paylen -= ext.iphlen;
+
+		if (ext.tcp) {
+			paylen -= ext.tcphlen;
+			tcpstat_inc(tcps_inhwlro);
+			tcpstat_add(tcps_inpktlro, pkts);
+		} else {
+			tcpstat_inc(tcps_inbadlro);
+		}
+
+		/*
+		 * If we are going to forward this packet, we have to mark it
+		 * as TSO, set a correct mss, and recalculate the TCP checksum.
+		 */
+		if (ext.tcp && paylen >= pkts) {
+			SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
+			m->m_pkthdr.ph_mss = paylen / pkts;
+		}
+		if (ext.tcp &&
+		    ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) {
+			SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+		}
+	}
 }
 
 void
@@ -1308,6 +1400,13 @@ vmxnet3_init(struct vmxnet3_softc *sc)
 		vmxnet3_stop(ifp);
 		return EIO;
 	}
+
+	/* TCP Large Receive Offload */
+	if (ISSET(ifp->if_xflags, IFXF_LRO))
+		SET(sc->sc_ds->upt_features, UPT1_F_LRO);
+	else
+		CLR(sc->sc_ds->upt_features, UPT1_F_LRO);
+	WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE);
 
 	/* Program promiscuous mode and multicast filters. */
 	vmxnet3_iff(sc);
Index: dev/pci/if_vmxreg.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmxreg.h,v
diff -u -p -r1.9 if_vmxreg.h
--- dev/pci/if_vmxreg.h	7 Jul 2020 01:36:49 -0000	1.9
+++ dev/pci/if_vmxreg.h	22 May 2024 19:45:29 -0000
@@ -76,6 +76,7 @@ enum UPT1_RxStats {
 #define VMXNET3_CMD_RESET	0xcafe0002	/* reset device */
 #define VMXNET3_CMD_SET_RXMODE	0xcafe0003	/* set interface flags */
 #define VMXNET3_CMD_SET_FILTER	0xcafe0004	/* set address filter */
+#define VMXNET3_CMD_SET_FEATURE	0xcafe0009	/* set features */
 #define VMXNET3_CMD_GET_STATUS	0xf00d0000	/* get queue errors */
 #define VMXNET3_CMD_GET_STATS	0xf00d0001
 #define VMXNET3_CMD_GET_LINK	0xf00d0002	/* get link status */
@@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc {
 	u_int32_t		rxc_word1;
 #define VMXNET3_RXC_RSSHASH_M	0xffffffff	/* RSS hash value */
 #define VMXNET3_RXC_RSSHASH_S	0
+#define VMXNET3_RXC_SEG_CNT_M	0x000000ff	/* No. of seg. in LRO pkt */
 
 	u_int32_t		rxc_word2;
 #define VMXNET3_RXC_LEN_M	0x00003fff
@@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc {
 #define VMXNET3_RXC_FRAGMENT	0x00400000	/* IP fragment */
 #define VMXNET3_RXC_FCS		0x00800000	/* frame CRC correct */
 #define VMXNET3_RXC_TYPE_M	0x7f000000
+#define VMXNET3_RXC_TYPE_S	24
 #define VMXNET3_RXC_GEN_M	0x00000001U
 #define VMXNET3_RXC_GEN_S	31
 } __packed;