From: Jan Klemkow <jan@openbsd.org>
Subject: Re: vmx(4): TCP Large Receive Offload
To: Hrvoje Popovski <hrvoje@srce.hr>
Cc: tech@openbsd.org
Date: Fri, 31 May 2024 21:07:53 +0200

On Wed, May 29, 2024 at 12:34:24AM +0200, jan@openbsd.org wrote:
> On Thu, May 23, 2024 at 12:10:57PM GMT, Hrvoje Popovski wrote:
> > On 22.5.2024. 22:47, jan@openbsd.org wrote:
> > > This diff introduces TCP Large Receive Offload (LRO) for vmx(4).
> > > 
> > > The virtual device annotates LRO packets with receive descriptors of
> > > type 4.  We need this additional information to calculate a valid MSS
> > > for such a packet, which lets us route packets of this kind.  But we
> > > only get type 4 descriptors if we advertise support for vmxnet3
> > > revision 2.
> > > 
> > > I tested it on ESXi 8 with Linux guests and external hosts.  It
> > > increases single-stream TCP performance to up to 20 Gbit/s in my setup.
> > > 
> > > Tests are welcome, especially with different ESXi versions and IP
> > > forwarding.
> > 
> > I'm running this diff with forwarding and an iperf setup at the same
> > time and the box seems fine.  The VM is on ESXi 8.
> 
> Thanks for testing!
> 
> Here is an updated version of the diff with suggestions from bluhm.
> 
>  - We negotiate LRO only on vmxnet3 rev 2 devices
>  - Remove the mbuf for loop
>  - Track last mbuf via lastmp variable
>  - Remove some useless lines
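
To make the quoted point about type 4 descriptors concrete: the LRO handling
that the diff adds to vmxnet3_rx_offload() boils down to roughly the
following (condensed for readability, not a drop-in excerpt of the patch):

	struct ether_extracted ext;
	uint32_t pkts, paylen;

	/* segment count stored from the type 4 start-of-packet descriptor */
	pkts = m->m_pkthdr.ph_mss;
	m->m_pkthdr.ph_mss = 0;

	if (pkts > 1) {
		ether_extract_headers(m, &ext);
		/* TCP payload length = IP length minus IP and TCP headers */
		paylen = ext.iplen - ext.iphlen - ext.tcphlen;

		if (ext.tcp && paylen >= pkts) {
			/* mark as TSO so the stack can re-segment on forwarding */
			SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
			m->m_pkthdr.ph_mss = paylen / pkts;
		}
	}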

Hrvoje found some panics in my last diff.  I just forgot to set
sendmp/lastmp to NULL after m_freem().
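
The fix follows the usual pattern for assembling one packet from several
receive descriptors (condensed from the diff below, not a drop-in excerpt);
the final reset of both pointers is the part that was missing:

	if (comp_ring->sendmp == NULL) {
		/* first descriptor of a packet becomes the header mbuf */
		comp_ring->sendmp = comp_ring->lastmp = m;
		comp_ring->sendmp->m_pkthdr.len = 0;
	} else {
		/* follow-up descriptors are appended to the chain */
		CLR(m->m_flags, M_PKTHDR);
		comp_ring->lastmp->m_next = m;
		comp_ring->lastmp = m;
	}
	comp_ring->sendmp->m_pkthdr.len += m->m_len;

	/* on errors or runt packets, drop the whole chain ... */
	m_freem(comp_ring->sendmp);
	/* ... and reset both pointers so the next completion starts fresh */
	comp_ring->sendmp = comp_ring->lastmp = NULL;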

Fixed diff below.

Thanks,
Jan

Index: dev/pci/if_vmx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
diff -u -p -r1.86 if_vmx.c
--- dev/pci/if_vmx.c	21 May 2024 19:49:06 -0000	1.86
+++ dev/pci/if_vmx.c	31 May 2024 18:57:56 -0000
@@ -114,6 +114,8 @@ struct vmxnet3_comp_ring {
 	};
 	u_int next;
 	u_int32_t gen;
+	struct mbuf *sendmp;
+	struct mbuf *lastmp;
 };
 
 struct vmxnet3_txqueue {
@@ -160,6 +162,7 @@ struct vmxnet3_softc {
 	struct vmxnet3_queue *sc_q;
 	struct intrmap *sc_intrmap;
 
+	u_int sc_vrrs;
 	struct vmxnet3_driver_shared *sc_ds;
 	u_int8_t *sc_mcast;
 	struct vmxnet3_upt1_rss_conf *sc_rss;
@@ -170,7 +173,7 @@ struct vmxnet3_softc {
 #endif
 };
 
-#define JUMBO_LEN (1024 * 9)
+#define JUMBO_LEN ((16 * 1024) - 1)
 #define DMAADDR(map) ((map)->dm_segs[0].ds_addr)
 
 #define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg)
@@ -273,15 +276,21 @@ vmxnet3_attach(struct device *parent, st
 		return;
 	}
 
+	/* Vmxnet3 Revision Report and Selection */
 	ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS);
-	if ((ver & 0x1) == 0) {
+	if (ISSET(ver, 0x2)) {
+		sc->sc_vrrs = 2;
+	} else if (ISSET(ver, 0x1)) {
+		sc->sc_vrrs = 1;
+	} else {
 		printf(": unsupported hardware version 0x%x\n", ver);
 		return;
 	}
-	WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
+	WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, sc->sc_vrrs);
 
+	/* UPT Version Report and Selection */
 	ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS);
-	if ((ver & 0x1) == 0) {
+	if (!ISSET(ver, 0x1)) {
 		printf(": incompatible UPT version 0x%x\n", ver);
 		return;
 	}
@@ -410,6 +419,11 @@ vmxnet3_attach(struct device *parent, st
 
 	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 
+	if (sc->sc_vrrs == 2) {
+		ifp->if_xflags |= IFXF_LRO;
+		ifp->if_capabilities |= IFCAP_LRO;
+	}
+
 #if NVLAN > 0
 	if (sc->sc_ds->upt_features & UPT1_F_VLAN)
 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
@@ -704,6 +718,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ri
 	uint32_t rgen;
 	uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S);
 
+	/* Second ring just contains packet bodies. */
+	if (ring->rid == 1)
+		type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S);
+
 	MUTEX_ASSERT_LOCKED(&ring->mtx);
 
 	slots = if_rxr_get(&ring->rxr, NRXDESC);
@@ -781,17 +799,17 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc,
 		    VMX_DMA_LEN(&ring->dmamem));
 		bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
 		    0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE);
-	}
 
-	/* XXX only fill ring 0 */
-	ring = &rq->cmd_ring[0];
-	mtx_enter(&ring->mtx);
-	vmxnet3_rxfill(ring);
-	mtx_leave(&ring->mtx);
+		mtx_enter(&ring->mtx);
+		vmxnet3_rxfill(ring);
+		mtx_leave(&ring->mtx);
+	}
 
 	comp_ring = &rq->comp_ring;
 	comp_ring->next = 0;
 	comp_ring->gen = VMX_RXC_GEN;
+	comp_ring->sendmp = NULL;
+	comp_ring->lastmp = NULL;
 
 	memset(VMX_DMA_KVA(&comp_ring->dmamem), 0,
 	    VMX_DMA_LEN(&comp_ring->dmamem));
@@ -1074,9 +1092,9 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
 	struct mbuf *m;
 	bus_dmamap_t map;
-	unsigned int idx, len;
+	unsigned int idx;
 	unsigned int next, rgen;
-	unsigned int done = 0;
+	unsigned int rid, done[2] = {0, 0};
 
 	next = comp_ring->next;
 	rgen = comp_ring->gen;
@@ -1096,11 +1114,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 
 		idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) &
 		    VMXNET3_RXC_IDX_M);
+
 		if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) &
 		    VMXNET3_RXC_QID_M) < sc->sc_nqueues)
-			ring = &rq->cmd_ring[0];
+			rid = 0;
 		else
-			ring = &rq->cmd_ring[1];
+			rid = 1;
+
+		ring = &rq->cmd_ring[rid];
 
 		m = ring->m[idx];
 		KASSERT(m != NULL);
@@ -1111,31 +1132,62 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 		    BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(sc->sc_dmat, map);
 
-		done++;
+		done[rid]++;
+
+		/*
+		 * A receive descriptor of type 4 that is flagged as start of
+		 * packet contains the number of TCP segments of an LRO packet.
+		 */
+		if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >>
+		    VMXNET3_RXC_TYPE_S) == 4 &&
+		    ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) {
+			m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 &
+			    VMXNET3_RXC_SEG_CNT_M);
+		}
+
+		m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
+		    VMXNET3_RXC_LEN_M);
+
+		if (comp_ring->sendmp == NULL) {
+			comp_ring->sendmp = comp_ring->lastmp = m;
+			comp_ring->sendmp->m_pkthdr.len = 0;
+		} else {
+			CLR(m->m_flags, M_PKTHDR);
+			comp_ring->lastmp->m_next = m;
+			comp_ring->lastmp = m;
+		}
+		comp_ring->sendmp->m_pkthdr.len += m->m_len;
+
+		if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP))
+			continue;
+
+		/*
+		 * End of Packet
+		 */
 
 		if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) {
 			ifp->if_ierrors++;
-			m_freem(m);
+			m_freem(comp_ring->sendmp);
+			comp_ring->sendmp = comp_ring->lastmp = NULL;
 			continue;
 		}
 
-		len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
-		    VMXNET3_RXC_LEN_M);
-		if (len < VMXNET3_MIN_MTU) {
-			m_freem(m);
+		if (comp_ring->sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) {
+			m_freem(comp_ring->sendmp);
+			comp_ring->sendmp = comp_ring->lastmp = NULL;
 			continue;
 		}
-		m->m_pkthdr.len = m->m_len = len;
-
-		vmxnet3_rx_offload(rxcd, m);
 
 		if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) &
 		    VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) {
-			m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
-			SET(m->m_pkthdr.csum_flags, M_FLOWID);
+			comp_ring->sendmp->m_pkthdr.ph_flowid =
+			    letoh32(rxcd->rxc_word1);
+			SET(comp_ring->sendmp->m_pkthdr.csum_flags, M_FLOWID);
 		}
 
-		ml_enqueue(&ml, m);
+		vmxnet3_rx_offload(rxcd, comp_ring->sendmp);
+		ml_enqueue(&ml, comp_ring->sendmp);
+		comp_ring->sendmp = comp_ring->lastmp = NULL;
 	}
 
 	bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem),
@@ -1144,19 +1196,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
 	comp_ring->next = next;
 	comp_ring->gen = rgen;
 
-	if (done == 0)
-		return;
+	for (int i = 0; i < 2; i++) {
+		if (done[i] == 0)
+			continue;
 
-	ring = &rq->cmd_ring[0];
+		ring = &rq->cmd_ring[i];
 
-	if (ifiq_input(rq->ifiq, &ml))
-		if_rxr_livelocked(&ring->rxr);
+		if (ifiq_input(rq->ifiq, &ml))
+			if_rxr_livelocked(&ring->rxr);
 
-	/* XXX Should we (try to) allocate buffers for ring 2 too? */
-	mtx_enter(&ring->mtx);
-	if_rxr_put(&ring->rxr, done);
-	vmxnet3_rxfill(ring);
-	mtx_leave(&ring->mtx);
+		mtx_enter(&ring->mtx);
+		if_rxr_put(&ring->rxr, done[i]);
+		vmxnet3_rxfill(ring);
+		mtx_leave(&ring->mtx);
+	}
 }
 
 void
@@ -1211,6 +1264,8 @@ vmxnet3_iff(struct vmxnet3_softc *sc)
 void
 vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
 {
+	uint32_t pkts;
+
 	/*
 	 * VLAN Offload
 	 */
@@ -1243,6 +1298,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcomp
 		else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP))
 			SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
 	}
+
+	/*
+	 * TCP Large Receive Offload
+	 */
+
+	pkts = m->m_pkthdr.ph_mss;
+	m->m_pkthdr.ph_mss = 0;
+
+	if (pkts > 1) {
+		struct ether_extracted ext;
+		uint32_t paylen;
+
+		ether_extract_headers(m, &ext);
+
+		paylen = ext.iplen;
+		if (ext.ip4 || ext.ip6)
+			paylen -= ext.iphlen;
+
+		if (ext.tcp) {
+			paylen -= ext.tcphlen;
+			tcpstat_inc(tcps_inhwlro);
+			tcpstat_add(tcps_inpktlro, pkts);
+		} else {
+			tcpstat_inc(tcps_inbadlro);
+		}
+
+		/*
+	 * If we are going to forward this packet, we have to mark it as
+	 * TSO, set a correct MSS, and recalculate the TCP checksum.
+		 */
+		if (ext.tcp && paylen >= pkts) {
+			SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
+			m->m_pkthdr.ph_mss = paylen / pkts;
+		}
+		if (ext.tcp &&
+		    ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) {
+			SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+		}
+	}
 }
 
 void
@@ -1308,6 +1402,13 @@ vmxnet3_init(struct vmxnet3_softc *sc)
 		vmxnet3_stop(ifp);
 		return EIO;
 	}
+
+	/* TCP Large Receive Offload */
+	if (ISSET(ifp->if_xflags, IFXF_LRO))
+		SET(sc->sc_ds->upt_features, UPT1_F_LRO);
+	else
+		CLR(sc->sc_ds->upt_features, UPT1_F_LRO);
+	WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE);
 
 	/* Program promiscuous mode and multicast filters. */
 	vmxnet3_iff(sc);
Index: dev/pci/if_vmxreg.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmxreg.h,v
diff -u -p -r1.9 if_vmxreg.h
--- dev/pci/if_vmxreg.h	7 Jul 2020 01:36:49 -0000	1.9
+++ dev/pci/if_vmxreg.h	28 May 2024 20:10:33 -0000
@@ -76,6 +76,7 @@ enum UPT1_RxStats {
 #define VMXNET3_CMD_RESET	0xcafe0002	/* reset device */
 #define VMXNET3_CMD_SET_RXMODE	0xcafe0003	/* set interface flags */
 #define VMXNET3_CMD_SET_FILTER	0xcafe0004	/* set address filter */
+#define VMXNET3_CMD_SET_FEATURE	0xcafe0009	/* set features */
 #define VMXNET3_CMD_GET_STATUS	0xf00d0000	/* get queue errors */
 #define VMXNET3_CMD_GET_STATS	0xf00d0001
 #define VMXNET3_CMD_GET_LINK	0xf00d0002	/* get link status */
@@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc {
 	u_int32_t		rxc_word1;
 #define VMXNET3_RXC_RSSHASH_M	0xffffffff	/* RSS hash value */
 #define VMXNET3_RXC_RSSHASH_S	0
+#define VMXNET3_RXC_SEG_CNT_M	0x000000ff	/* No. of seg. in LRO pkt */
 
 	u_int32_t		rxc_word2;
 #define VMXNET3_RXC_LEN_M	0x00003fff
@@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc {
 #define VMXNET3_RXC_FRAGMENT	0x00400000	/* IP fragment */
 #define VMXNET3_RXC_FCS		0x00800000	/* frame CRC correct */
 #define VMXNET3_RXC_TYPE_M	0x7f000000
+#define VMXNET3_RXC_TYPE_S	24
 #define VMXNET3_RXC_GEN_M	0x00000001U
 #define VMXNET3_RXC_GEN_S	31
 } __packed;