From: jan@openbsd.org Subject: vmx(4): TCP Large Receive Offload To: tech@openbsd.org Date: Wed, 22 May 2024 22:47:01 +0200 Hi, This diff introduces TCP Large Receive Offload (LRO) for vmx(4). The virtual device annotates LRO packets with receive descriptors of type 4. We need this additional information to calculate a valid MSS for this packet. Thus, we are able to route this kind of packet. However, we only get type 4 descriptors if we pretend to support vmxnet3 revision 2. I tested it on ESXi 8 with Linux guests and external hosts. It increases single-stream TCP performance to up to 20 GBit/s in my setup. Tests are welcome, especially with different ESXi versions and IP forwarding. bye, Jan Index: dev/pci/if_vmx.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v diff -u -p -r1.86 if_vmx.c --- dev/pci/if_vmx.c 21 May 2024 19:49:06 -0000 1.86 +++ dev/pci/if_vmx.c 22 May 2024 20:18:20 -0000 @@ -114,6 +114,7 @@ struct vmxnet3_comp_ring { }; u_int next; u_int32_t gen; + struct mbuf *m; }; struct vmxnet3_txqueue { @@ -170,7 +171,7 @@ struct vmxnet3_softc { #endif }; -#define JUMBO_LEN (1024 * 9) +#define JUMBO_LEN ((16 * 1024) - 1) #define DMAADDR(map) ((map)->dm_segs[0].ds_addr) #define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg) @@ -273,15 +274,20 @@ vmxnet3_attach(struct device *parent, st return; } + /* Vmxnet3 Revision Report and Selection */ ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS); - if ((ver & 0x1) == 0) { + if (ISSET(ver, 0x2)) { + WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 2); + } else if (ISSET(ver, 0x1)) { + WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1); + } else { printf(": unsupported hardware version 0x%x\n", ver); return; } - WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1); + /* UPT Version Report and Selection */ ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS); - if ((ver & 0x1) == 0) { + if (!ISSET(ver, 0x1)) { printf(": incompatible UPT version 0x%x\n", ver); return; } @@ -410,6 +416,9 @@ vmxnet3_attach(struct 
device *parent, st ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; + ifp->if_xflags |= IFXF_LRO; + ifp->if_capabilities |= IFCAP_LRO; + #if NVLAN > 0 if (sc->sc_ds->upt_features & UPT1_F_VLAN) ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; @@ -515,6 +524,7 @@ vmxnet3_dma_init(struct vmxnet3_softc *s #if NVLAN > 0 ds->upt_features |= UPT1_F_VLAN; #endif + ds->nrxsg_max = 0; ds->driver_data = ~0ULL; ds->driver_data_len = 0; ds->queue_shared = qs_pa; @@ -668,6 +678,7 @@ vmxnet3_txinit(struct vmxnet3_softc *sc, ring->gen = VMX_TX_GEN; comp_ring->next = 0; comp_ring->gen = VMX_TXC_GEN; + comp_ring->m = NULL; memset(VMX_DMA_KVA(&ring->dmamem), 0, VMX_DMA_LEN(&ring->dmamem)); bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem), @@ -704,6 +715,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ri uint32_t rgen; uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S); + /* Second ring just contains packet bodies. */ + if (ring->rid == 1) + type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S); + MUTEX_ASSERT_LOCKED(&ring->mtx); slots = if_rxr_get(&ring->rxr, NRXDESC); @@ -781,17 +796,16 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc, VMX_DMA_LEN(&ring->dmamem)); bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem), 0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE); - } - /* XXX only fill ring 0 */ - ring = &rq->cmd_ring[0]; - mtx_enter(&ring->mtx); - vmxnet3_rxfill(ring); - mtx_leave(&ring->mtx); + mtx_enter(&ring->mtx); + vmxnet3_rxfill(ring); + mtx_leave(&ring->mtx); + } comp_ring = &rq->comp_ring; comp_ring->next = 0; comp_ring->gen = VMX_RXC_GEN; + comp_ring->m = NULL; memset(VMX_DMA_KVA(&comp_ring->dmamem), 0, VMX_DMA_LEN(&comp_ring->dmamem)); @@ -1072,11 +1086,11 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, struct vmxnet3_rxring *ring; struct vmxnet3_rxcompdesc *rxcd; struct mbuf_list ml = MBUF_LIST_INITIALIZER(); - struct mbuf *m; + struct mbuf *m, *sendmp = NULL; bus_dmamap_t map; - unsigned int idx, len; + unsigned int idx; unsigned int next, rgen; - 
unsigned int done = 0; + unsigned int rid, done[2] = {0, 0}; next = comp_ring->next; rgen = comp_ring->gen; @@ -1096,11 +1110,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) & VMXNET3_RXC_IDX_M); + if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) & VMXNET3_RXC_QID_M) < sc->sc_nqueues) - ring = &rq->cmd_ring[0]; + rid = 0; else - ring = &rq->cmd_ring[1]; + rid = 1; + + ring = &rq->cmd_ring[rid]; m = ring->m[idx]; KASSERT(m != NULL); @@ -1111,31 +1128,65 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->sc_dmat, map); - done++; + done[rid]++; + + /* + * A receive descriptor of type 4 which is flaged as start of + * packet, contains the number of TCP segment of an LRO packet. + */ + if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >> + VMXNET3_RXC_TYPE_S) == 4 && + ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) { + m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 & + VMXNET3_RXC_SEG_CNT_M); + } + + m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) & + VMXNET3_RXC_LEN_M); + + sendmp = comp_ring->m; + comp_ring->m = NULL; + + if (sendmp == NULL) { + sendmp = m; + sendmp->m_pkthdr.len = m->m_len; + } else { + struct mbuf *mp; + + sendmp->m_pkthdr.len += m->m_len; + for (mp = sendmp; mp->m_next != NULL; mp = mp->m_next); + mp->m_next = m; + } + + if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP)) { + comp_ring->m = sendmp; + continue; + } + + /* + * End of Packet + */ if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) { ifp->if_ierrors++; - m_freem(m); + m_freem(sendmp); continue; } - len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) & - VMXNET3_RXC_LEN_M); - if (len < VMXNET3_MIN_MTU) { + if (sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) { m_freem(m); continue; } - m->m_pkthdr.len = m->m_len = len; - vmxnet3_rx_offload(rxcd, m); + vmxnet3_rx_offload(rxcd, sendmp); if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) & VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) { - 
m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1); - SET(m->m_pkthdr.csum_flags, M_FLOWID); + sendmp->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1); + SET(sendmp->m_pkthdr.csum_flags, M_FLOWID); } - ml_enqueue(&ml, m); + ml_enqueue(&ml, sendmp); } bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem), @@ -1144,19 +1195,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, comp_ring->next = next; comp_ring->gen = rgen; - if (done == 0) - return; + for (int i = 0; i < 2; i++) { + if (done[i] == 0) + continue; - ring = &rq->cmd_ring[0]; + ring = &rq->cmd_ring[i]; - if (ifiq_input(rq->ifiq, &ml)) - if_rxr_livelocked(&ring->rxr); + if (ifiq_input(rq->ifiq, &ml)) + if_rxr_livelocked(&ring->rxr); - /* XXX Should we (try to) allocate buffers for ring 2 too? */ - mtx_enter(&ring->mtx); - if_rxr_put(&ring->rxr, done); - vmxnet3_rxfill(ring); - mtx_leave(&ring->mtx); + mtx_enter(&ring->mtx); + if_rxr_put(&ring->rxr, done[i]); + vmxnet3_rxfill(ring); + mtx_leave(&ring->mtx); + } } void @@ -1207,10 +1259,11 @@ vmxnet3_iff(struct vmxnet3_softc *sc) WRITE_CMD(sc, VMXNET3_CMD_SET_RXMODE); } - void vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m) { + uint32_t pkts; + /* * VLAN Offload */ @@ -1243,6 +1296,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcomp else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP)) SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK); } + + /* + * TCP Large Receive Offload + */ + + pkts = m->m_pkthdr.ph_mss; + m->m_pkthdr.ph_mss = 0; + + if (pkts > 1) { + struct ether_extracted ext; + uint32_t paylen; + + ether_extract_headers(m, &ext); + + paylen = ext.iplen; + if (ext.ip4 || ext.ip6) + paylen -= ext.iphlen; + + if (ext.tcp) { + paylen -= ext.tcphlen; + tcpstat_inc(tcps_inhwlro); + tcpstat_add(tcps_inpktlro, pkts); + } else { + tcpstat_inc(tcps_inbadlro); + } + + /* + * If we gonna forward this packet, we have to mark it as TSO, + * set a correct mss, and recalculate the TCP checksum. 
+ */ + if (ext.tcp && paylen >= pkts) { + SET(m->m_pkthdr.csum_flags, M_TCP_TSO); + m->m_pkthdr.ph_mss = paylen / pkts; + } + if (ext.tcp && + ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) { + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); + } + } } void @@ -1308,6 +1400,13 @@ vmxnet3_init(struct vmxnet3_softc *sc) vmxnet3_stop(ifp); return EIO; } + + /* TCP Large Receive Offload */ + if (ISSET(ifp->if_xflags, IFXF_LRO)) + SET(sc->sc_ds->upt_features, UPT1_F_LRO); + else + CLR(sc->sc_ds->upt_features, UPT1_F_LRO); + WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE); /* Program promiscuous mode and multicast filters. */ vmxnet3_iff(sc); Index: dev/pci/if_vmxreg.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_vmxreg.h,v diff -u -p -r1.9 if_vmxreg.h --- dev/pci/if_vmxreg.h 7 Jul 2020 01:36:49 -0000 1.9 +++ dev/pci/if_vmxreg.h 22 May 2024 19:45:29 -0000 @@ -76,6 +76,7 @@ enum UPT1_RxStats { #define VMXNET3_CMD_RESET 0xcafe0002 /* reset device */ #define VMXNET3_CMD_SET_RXMODE 0xcafe0003 /* set interface flags */ #define VMXNET3_CMD_SET_FILTER 0xcafe0004 /* set address filter */ +#define VMXNET3_CMD_SET_FEATURE 0xcafe0009 /* set features */ #define VMXNET3_CMD_GET_STATUS 0xf00d0000 /* get queue errors */ #define VMXNET3_CMD_GET_STATS 0xf00d0001 #define VMXNET3_CMD_GET_LINK 0xf00d0002 /* get link status */ @@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc { u_int32_t rxc_word1; #define VMXNET3_RXC_RSSHASH_M 0xffffffff /* RSS hash value */ #define VMXNET3_RXC_RSSHASH_S 0 +#define VMXNET3_RXC_SEG_CNT_M 0x000000ff /* No. of seg. in LRO pkt */ u_int32_t rxc_word2; #define VMXNET3_RXC_LEN_M 0x00003fff @@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc { #define VMXNET3_RXC_FRAGMENT 0x00400000 /* IP fragment */ #define VMXNET3_RXC_FCS 0x00800000 /* frame CRC correct */ #define VMXNET3_RXC_TYPE_M 0x7f000000 +#define VMXNET3_RXC_TYPE_S 24 #define VMXNET3_RXC_GEN_M 0x00000001U #define VMXNET3_RXC_GEN_S 31 } __packed;