vmx(4): TCP Large Receive Offload
On Thu, May 23, 2024 at 12:10:57PM GMT, Hrvoje Popovski wrote:
> On 22.5.2024. 22:47, jan@openbsd.org wrote:
> > This diff introduces TCP Large Receive Offload (LRO) for vmx(4).
> >
> > The virtual device annotates LRO packets with receive descriptors of
> > type 4. We need this additional information to calculate a valid MSS
> > for this packet. Thus, we are able to route this kind of packet.
> > However, we only get type 4 descriptors if we claim support for
> > vmxnet3 revision 2.
> >
> > I tested it on ESXi 8 with Linux guests and external hosts. It
> > increases single-stream TCP performance to up to 20 Gbit/s in my
> > setup.
> >
> > Tests are welcome, especially with different ESXi versions and IP
> > forwarding.
>
> I'm running this diff with forwarding and an iperf setup at the same
> time, and the box seems fine. The VM is on ESXi 8.
Thanks for testing!
Here is an updated version of the diff with suggestions from bluhm:
- We negotiate LRO only on vmxnet3 rev 2 devices
- Remove the mbuf for loop
- Track last mbuf via lastmp variable
- Remove some useless lines
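For reviewers, here is roughly how the MSS for a forwarded LRO packet
falls out of the type 4 completion descriptor. This is a minimal
standalone sketch, not part of the diff; the helper name is made up,
but the arithmetic mirrors vmxnet3_rx_offload() below: seg_cnt is the
segment count reported via VMXNET3_RXC_SEG_CNT_M, and the header
lengths come from ether_extract_headers().

#include <stdint.h>

/*
 * Sketch only: the device reports how many TCP segments it
 * coalesced.  Subtracting the IP and TCP header lengths from the
 * IP total length gives the aggregated payload; dividing it evenly
 * by the segment count yields an MSS that TSO can reuse when the
 * stack re-segments the packet on output.
 */
static inline uint32_t
lro_derive_mss(uint32_t iplen, uint32_t iphlen, uint32_t tcphlen,
    uint32_t seg_cnt)
{
	uint32_t paylen = iplen - iphlen - tcphlen;

	if (seg_cnt < 2 || paylen < seg_cnt)
		return (0);	/* not a usable aggregate */
	return (paylen / seg_cnt);
}

For example, an aggregate with an IP total length of 43492, a 20 byte
IP header, a 32 byte TCP header, and a segment count of 30 gives an
MSS of (43492 - 52) / 30 = 1448.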
bye,
Jan
Index: dev/pci/if_vmx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
diff -u -p -r1.86 if_vmx.c
--- dev/pci/if_vmx.c 21 May 2024 19:49:06 -0000 1.86
+++ dev/pci/if_vmx.c 28 May 2024 22:23:56 -0000
@@ -114,6 +114,8 @@ struct vmxnet3_comp_ring {
};
u_int next;
u_int32_t gen;
+ struct mbuf *sendmp;
+ struct mbuf *lastmp;
};
struct vmxnet3_txqueue {
@@ -160,6 +162,7 @@ struct vmxnet3_softc {
struct vmxnet3_queue *sc_q;
struct intrmap *sc_intrmap;
+ u_int sc_vrrs;
struct vmxnet3_driver_shared *sc_ds;
u_int8_t *sc_mcast;
struct vmxnet3_upt1_rss_conf *sc_rss;
@@ -170,7 +173,7 @@ struct vmxnet3_softc {
#endif
};
-#define JUMBO_LEN (1024 * 9)
+#define JUMBO_LEN ((16 * 1024) - 1)
#define DMAADDR(map) ((map)->dm_segs[0].ds_addr)
#define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg)
@@ -273,15 +276,21 @@ vmxnet3_attach(struct device *parent, st
return;
}
+ /* Vmxnet3 Revision Report and Selection */
ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS);
- if ((ver & 0x1) == 0) {
+ if (ISSET(ver, 0x2)) {
+ sc->sc_vrrs = 2;
+ } else if (ISSET(ver, 0x1)) {
+ sc->sc_vrrs = 1;
+ } else {
printf(": unsupported hardware version 0x%x\n", ver);
return;
}
- WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
+ WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, sc->sc_vrrs);
+ /* UPT Version Report and Selection */
ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS);
- if ((ver & 0x1) == 0) {
+ if (!ISSET(ver, 0x1)) {
printf(": incompatible UPT version 0x%x\n", ver);
return;
}
@@ -410,6 +419,11 @@ vmxnet3_attach(struct device *parent, st
ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
+ if (sc->sc_vrrs == 2) {
+ ifp->if_xflags |= IFXF_LRO;
+ ifp->if_capabilities |= IFCAP_LRO;
+ }
+
#if NVLAN > 0
if (sc->sc_ds->upt_features & UPT1_F_VLAN)
ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
@@ -704,6 +718,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ri
uint32_t rgen;
uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S);
+ /* Second ring just contains packet bodies. */
+ if (ring->rid == 1)
+ type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S);
+
MUTEX_ASSERT_LOCKED(&ring->mtx);
slots = if_rxr_get(&ring->rxr, NRXDESC);
@@ -781,17 +799,17 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc,
VMX_DMA_LEN(&ring->dmamem));
bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE);
- }
- /* XXX only fill ring 0 */
- ring = &rq->cmd_ring[0];
- mtx_enter(&ring->mtx);
- vmxnet3_rxfill(ring);
- mtx_leave(&ring->mtx);
+ mtx_enter(&ring->mtx);
+ vmxnet3_rxfill(ring);
+ mtx_leave(&ring->mtx);
+ }
comp_ring = &rq->comp_ring;
comp_ring->next = 0;
comp_ring->gen = VMX_RXC_GEN;
+ comp_ring->sendmp = NULL;
+ comp_ring->lastmp = NULL;
memset(VMX_DMA_KVA(&comp_ring->dmamem), 0,
VMX_DMA_LEN(&comp_ring->dmamem));
@@ -1074,9 +1092,9 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
struct mbuf_list ml = MBUF_LIST_INITIALIZER();
struct mbuf *m;
bus_dmamap_t map;
- unsigned int idx, len;
+ unsigned int idx;
unsigned int next, rgen;
- unsigned int done = 0;
+ unsigned int rid, done[2] = {0, 0};
next = comp_ring->next;
rgen = comp_ring->gen;
@@ -1096,11 +1114,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) &
VMXNET3_RXC_IDX_M);
+
if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) &
VMXNET3_RXC_QID_M) < sc->sc_nqueues)
- ring = &rq->cmd_ring[0];
+ rid = 0;
else
- ring = &rq->cmd_ring[1];
+ rid = 1;
+
+ ring = &rq->cmd_ring[rid];
m = ring->m[idx];
KASSERT(m != NULL);
@@ -1111,31 +1132,60 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->sc_dmat, map);
- done++;
+ done[rid]++;
+
+ /*
+ * A receive descriptor of type 4 which is flagged as start of
+ * packet contains the number of TCP segments of an LRO packet.
+ */
+ if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >>
+ VMXNET3_RXC_TYPE_S) == 4 &&
+ ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) {
+ m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 &
+ VMXNET3_RXC_SEG_CNT_M);
+ }
+
+ m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
+ VMXNET3_RXC_LEN_M);
+
+ if (comp_ring->sendmp == NULL) {
+ comp_ring->sendmp = comp_ring->lastmp = m;
+ comp_ring->sendmp->m_pkthdr.len = 0;
+ } else {
+ CLR(m->m_flags, M_PKTHDR);
+ comp_ring->lastmp->m_next = m;
+ comp_ring->lastmp = m;
+ }
+ comp_ring->sendmp->m_pkthdr.len += m->m_len;
+
+ if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP))
+ continue;
+
+ /*
+ * End of Packet
+ */
if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) {
ifp->if_ierrors++;
- m_freem(m);
+ m_freem(comp_ring->sendmp);
continue;
}
- len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
- VMXNET3_RXC_LEN_M);
- if (len < VMXNET3_MIN_MTU) {
- m_freem(m);
+ if (comp_ring->sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) {
+ m_freem(comp_ring->sendmp);
continue;
}
- m->m_pkthdr.len = m->m_len = len;
-
- vmxnet3_rx_offload(rxcd, m);
if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) &
VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) {
- m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
- SET(m->m_pkthdr.csum_flags, M_FLOWID);
+ comp_ring->sendmp->m_pkthdr.ph_flowid =
+ letoh32(rxcd->rxc_word1);
+ SET(comp_ring->sendmp->m_pkthdr.csum_flags, M_FLOWID);
}
- ml_enqueue(&ml, m);
+ vmxnet3_rx_offload(rxcd, comp_ring->sendmp);
+ ml_enqueue(&ml, comp_ring->sendmp);
+ comp_ring->sendmp = comp_ring->lastmp = NULL;
}
bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem),
@@ -1144,19 +1194,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
comp_ring->next = next;
comp_ring->gen = rgen;
- if (done == 0)
- return;
+ for (int i = 0; i < 2; i++) {
+ if (done[i] == 0)
+ continue;
- ring = &rq->cmd_ring[0];
+ ring = &rq->cmd_ring[i];
- if (ifiq_input(rq->ifiq, &ml))
- if_rxr_livelocked(&ring->rxr);
+ if (ifiq_input(rq->ifiq, &ml))
+ if_rxr_livelocked(&ring->rxr);
- /* XXX Should we (try to) allocate buffers for ring 2 too? */
- mtx_enter(&ring->mtx);
- if_rxr_put(&ring->rxr, done);
- vmxnet3_rxfill(ring);
- mtx_leave(&ring->mtx);
+ mtx_enter(&ring->mtx);
+ if_rxr_put(&ring->rxr, done[i]);
+ vmxnet3_rxfill(ring);
+ mtx_leave(&ring->mtx);
+ }
}
void
@@ -1211,6 +1262,8 @@ vmxnet3_iff(struct vmxnet3_softc *sc)
void
vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
{
+ uint32_t pkts;
+
/*
* VLAN Offload
*/
@@ -1243,6 +1296,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcomp
else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP))
SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
}
+
+ /*
+ * TCP Large Receive Offload
+ */
+
+ pkts = m->m_pkthdr.ph_mss;
+ m->m_pkthdr.ph_mss = 0;
+
+ if (pkts > 1) {
+ struct ether_extracted ext;
+ uint32_t paylen;
+
+ ether_extract_headers(m, &ext);
+
+ paylen = ext.iplen;
+ if (ext.ip4 || ext.ip6)
+ paylen -= ext.iphlen;
+
+ if (ext.tcp) {
+ paylen -= ext.tcphlen;
+ tcpstat_inc(tcps_inhwlro);
+ tcpstat_add(tcps_inpktlro, pkts);
+ } else {
+ tcpstat_inc(tcps_inbadlro);
+ }
+
+ /*
+ * If we are going to forward this packet, we have to mark it as
+ * TSO, set a correct MSS, and recalculate the TCP checksum.
+ */
+ if (ext.tcp && paylen >= pkts) {
+ SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
+ m->m_pkthdr.ph_mss = paylen / pkts;
+ }
+ if (ext.tcp &&
+ ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) {
+ SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+ }
+ }
}
void
@@ -1308,6 +1400,13 @@ vmxnet3_init(struct vmxnet3_softc *sc)
vmxnet3_stop(ifp);
return EIO;
}
+
+ /* TCP Large Receive Offload */
+ if (ISSET(ifp->if_xflags, IFXF_LRO))
+ SET(sc->sc_ds->upt_features, UPT1_F_LRO);
+ else
+ CLR(sc->sc_ds->upt_features, UPT1_F_LRO);
+ WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE);
/* Program promiscuous mode and multicast filters. */
vmxnet3_iff(sc);
Index: dev/pci/if_vmxreg.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmxreg.h,v
diff -u -p -r1.9 if_vmxreg.h
--- dev/pci/if_vmxreg.h 7 Jul 2020 01:36:49 -0000 1.9
+++ dev/pci/if_vmxreg.h 28 May 2024 20:10:33 -0000
@@ -76,6 +76,7 @@ enum UPT1_RxStats {
#define VMXNET3_CMD_RESET 0xcafe0002 /* reset device */
#define VMXNET3_CMD_SET_RXMODE 0xcafe0003 /* set interface flags */
#define VMXNET3_CMD_SET_FILTER 0xcafe0004 /* set address filter */
+#define VMXNET3_CMD_SET_FEATURE 0xcafe0009 /* set features */
#define VMXNET3_CMD_GET_STATUS 0xf00d0000 /* get queue errors */
#define VMXNET3_CMD_GET_STATS 0xf00d0001
#define VMXNET3_CMD_GET_LINK 0xf00d0002 /* get link status */
@@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc {
u_int32_t rxc_word1;
#define VMXNET3_RXC_RSSHASH_M 0xffffffff /* RSS hash value */
#define VMXNET3_RXC_RSSHASH_S 0
+#define VMXNET3_RXC_SEG_CNT_M 0x000000ff /* No. of seg. in LRO pkt */
u_int32_t rxc_word2;
#define VMXNET3_RXC_LEN_M 0x00003fff
@@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc {
#define VMXNET3_RXC_FRAGMENT 0x00400000 /* IP fragment */
#define VMXNET3_RXC_FCS 0x00800000 /* frame CRC correct */
#define VMXNET3_RXC_TYPE_M 0x7f000000
+#define VMXNET3_RXC_TYPE_S 24
#define VMXNET3_RXC_GEN_M 0x00000001U
#define VMXNET3_RXC_GEN_S 31
} __packed;
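A note on testing: assuming the generic IFXF_LRO handling applies to
vmx(4) as to other drivers, the offload can be toggled at runtime with
"ifconfig vmx0 tcplro" / "ifconfig vmx0 -tcplro", and the
tcps_inhwlro, tcps_inpktlro and tcps_inbadlro counters bumped above
should be visible in the "netstat -s -p tcp" output.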