From: Alexander Bluhm
Subject: Re: vmx(4): TCP Large Receive Offload
To: Jan Klemkow
Cc: tech@openbsd.org
Date: Thu, 6 Jun 2024 22:31:46 +0200

On Fri, May 31, 2024 at 09:07:53PM +0200, Jan Klemkow wrote:
> On Wed, May 29, 2024 at 12:34:24AM +0200, jan@openbsd.org wrote:
> > On Thu, May 23, 2024 at 12:10:57PM GMT, Hrvoje Popovski wrote:
> > > On 22.5.2024. 22:47, jan@openbsd.org wrote:
> > > > This diff introduces TCP Large Receive Offload (LRO) for vmx(4).
> > > >
> > > > The virtual device annotates LRO packets with receive descriptors of
> > > > type 4. We need this additional information to calculate a valid MSS
> > > > for this packet. Thus, we are able to route this kind of packet.
> > > > But we only get type 4 descriptors if we pretend to support vmxnet3
> > > > revision 2.
> > > >
> > > > I tested it on ESXi 8 with Linux guests and external hosts. It
> > > > increases single-stream TCP performance up to 20 Gbit/s in my setup.
> > > >
> > > > Tests are welcome, especially with different ESXi versions and IP
> > > > forwarding.
> > > I'm running this diff with forwarding and iperf setup at the same time
> > > and the box seems fine. The VM is on ESXi 8.
> >
> > Thanks for testing!
> >
> > Here is an updated version of the diff with suggestions from bluhm.
> >
> > - We negotiate LRO only on vmxnet3 rev 2 devices
> > - Remove the mbuf for loop
> > - Track last mbuf via lastmp variable
> > - Remove some useless lines
>
> Hrvoje found some panics in my last diff. I just forgot to set
> send/lastmp to NULL after m_freem().
>
> Fixed diff below.

tested and OK bluhm@

> Index: dev/pci/if_vmx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
> diff -u -p -r1.86 if_vmx.c
> --- dev/pci/if_vmx.c 21 May 2024 19:49:06 -0000 1.86
> +++ dev/pci/if_vmx.c 31 May 2024 18:57:56 -0000
> @@ -114,6 +114,8 @@ struct vmxnet3_comp_ring {
> };
> u_int next;
> u_int32_t gen;
> + struct mbuf *sendmp;
> + struct mbuf *lastmp;
> };
>
> struct vmxnet3_txqueue {
> @@ -160,6 +162,7 @@ struct vmxnet3_softc {
> struct vmxnet3_queue *sc_q;
> struct intrmap *sc_intrmap;
>
> + u_int sc_vrrs;
> struct vmxnet3_driver_shared *sc_ds;
> u_int8_t *sc_mcast;
> struct vmxnet3_upt1_rss_conf *sc_rss;
> @@ -170,7 +173,7 @@ struct vmxnet3_softc {
> #endif
> };
>
> -#define JUMBO_LEN (1024 * 9)
> +#define JUMBO_LEN ((16 * 1024) - 1)
> #define DMAADDR(map) ((map)->dm_segs[0].ds_addr)
>
> #define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg)
> @@ -273,15 +276,21 @@ vmxnet3_attach(struct device *parent, st
> return;
> }
>
> + /* Vmxnet3 Revision Report and Selection */
> ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS);
> - if ((ver & 0x1) == 0) {
> + if (ISSET(ver, 0x2)) {
> + sc->sc_vrrs = 2;
> + } else if (ISSET(ver, 0x1)) {
> + sc->sc_vrrs = 1;
> + } else {
> printf(": unsupported hardware version 0x%x\n", ver);
> return;
> }
> - WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
> + WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, sc->sc_vrrs);
>
> + /* UPT Version Report and Selection */
> ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS);
> - if ((ver & 0x1) == 0) {
> + if (!ISSET(ver, 0x1)) {
> printf(": incompatible UPT version 0x%x\n", ver);
> return;
> }
> @@ -410,6 +419,11 @@ vmxnet3_attach(struct device *parent, st
>
> ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>
> + if (sc->sc_vrrs == 2) {
> + ifp->if_xflags |= IFXF_LRO;
> + ifp->if_capabilities |= IFCAP_LRO;
> + }
> +
> #if NVLAN > 0
> if (sc->sc_ds->upt_features & UPT1_F_VLAN)
> ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
> @@ -704,6 +718,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ri
> uint32_t rgen;
> uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S);
>
> + /* Second ring just contains packet bodies. */
> + if (ring->rid == 1)
> + type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S);
> +
> MUTEX_ASSERT_LOCKED(&ring->mtx);
>
> slots = if_rxr_get(&ring->rxr, NRXDESC);
> @@ -781,17 +799,17 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc,
> VMX_DMA_LEN(&ring->dmamem));
> bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
> 0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE);
> - }
>
> - /* XXX only fill ring 0 */
> - ring = &rq->cmd_ring[0];
> - mtx_enter(&ring->mtx);
> - vmxnet3_rxfill(ring);
> - mtx_leave(&ring->mtx);
> + mtx_enter(&ring->mtx);
> + vmxnet3_rxfill(ring);
> + mtx_leave(&ring->mtx);
> + }
>
> comp_ring = &rq->comp_ring;
> comp_ring->next = 0;
> comp_ring->gen = VMX_RXC_GEN;
> + comp_ring->sendmp = NULL;
> + comp_ring->lastmp = NULL;
>
> memset(VMX_DMA_KVA(&comp_ring->dmamem), 0,
> VMX_DMA_LEN(&comp_ring->dmamem));
> @@ -1074,9 +1092,9 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
> struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> struct mbuf *m;
> bus_dmamap_t map;
> - unsigned int idx, len;
> + unsigned int idx;
> unsigned int next, rgen;
> - unsigned int done = 0;
> + unsigned int rid, done[2] = {0, 0};
>
> next = comp_ring->next;
> rgen = comp_ring->gen;
> @@ -1096,11 +1114,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
>
> idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) &
> VMXNET3_RXC_IDX_M);
> +
> if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) &
> VMXNET3_RXC_QID_M) < sc->sc_nqueues)
> - ring = &rq->cmd_ring[0];
> + rid = 0;
> else
> - ring = &rq->cmd_ring[1];
> + rid = 1;
> +
> + ring = &rq->cmd_ring[rid];
>
> m = ring->m[idx];
> KASSERT(m != NULL);
> @@ -1111,31 +1132,62 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
> BUS_DMASYNC_POSTREAD);
> bus_dmamap_unload(sc->sc_dmat, map);
>
> - done++;
> + done[rid]++;
> +
> + /*
> + * A receive descriptor of type 4 which is flagged as start of
> + * packet contains the number of TCP segments of an LRO packet.
> + */
> + if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >>
> + VMXNET3_RXC_TYPE_S) == 4 &&
> + ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) {
> + m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 &
> + VMXNET3_RXC_SEG_CNT_M);
> + }
> +
> + m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
> + VMXNET3_RXC_LEN_M);
> +
> + if (comp_ring->sendmp == NULL) {
> + comp_ring->sendmp = comp_ring->lastmp = m;
> + comp_ring->sendmp->m_pkthdr.len = 0;
> + } else {
> + CLR(m->m_flags, M_PKTHDR);
> + comp_ring->lastmp->m_next = m;
> + comp_ring->lastmp = m;
> + }
> + comp_ring->sendmp->m_pkthdr.len += m->m_len;
> +
> + if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP))
> + continue;
> +
> + /*
> + * End of Packet
> + */
>
> if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) {
> ifp->if_ierrors++;
> - m_freem(m);
> + m_freem(comp_ring->sendmp);
> + comp_ring->sendmp = comp_ring->lastmp = NULL;
> continue;
> }
>
> - len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
> - VMXNET3_RXC_LEN_M);
> - if (len < VMXNET3_MIN_MTU) {
> - m_freem(m);
> + if (comp_ring->sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) {
> + m_freem(comp_ring->sendmp);
> + comp_ring->sendmp = comp_ring->lastmp = NULL;
> continue;
> }
> - m->m_pkthdr.len = m->m_len = len;
> -
> - vmxnet3_rx_offload(rxcd, m);
>
> if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) &
> VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) {
> - m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
> - SET(m->m_pkthdr.csum_flags, M_FLOWID);
> + comp_ring->sendmp->m_pkthdr.ph_flowid =
> + letoh32(rxcd->rxc_word1);
> + SET(comp_ring->sendmp->m_pkthdr.csum_flags, M_FLOWID);
> }
>
> - ml_enqueue(&ml, m);
> + vmxnet3_rx_offload(rxcd, comp_ring->sendmp);
> + ml_enqueue(&ml, comp_ring->sendmp);
> + comp_ring->sendmp = comp_ring->lastmp = NULL;
> }
>
> bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem),
> @@ -1144,19 +1196,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
> comp_ring->next = next;
> comp_ring->gen = rgen;
>
> - if (done == 0)
> - return;
> + for (int i = 0; i < 2; i++) {
> + if (done[i] == 0)
> + continue;
>
> - ring = &rq->cmd_ring[0];
> + ring = &rq->cmd_ring[i];
>
> - if (ifiq_input(rq->ifiq, &ml))
> - if_rxr_livelocked(&ring->rxr);
> + if (ifiq_input(rq->ifiq, &ml))
> + if_rxr_livelocked(&ring->rxr);
>
> - /* XXX Should we (try to) allocate buffers for ring 2 too? */
> - mtx_enter(&ring->mtx);
> - if_rxr_put(&ring->rxr, done);
> - vmxnet3_rxfill(ring);
> - mtx_leave(&ring->mtx);
> + mtx_enter(&ring->mtx);
> + if_rxr_put(&ring->rxr, done[i]);
> + vmxnet3_rxfill(ring);
> + mtx_leave(&ring->mtx);
> + }
> }
>
> void
> @@ -1211,6 +1264,8 @@ vmxnet3_iff(struct vmxnet3_softc *sc)
> void
> vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
> {
> + uint32_t pkts;
> +
> /*
> * VLAN Offload
> */
> @@ -1243,6 +1298,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcomp
> else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP))
> SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
> }
> +
> + /*
> + * TCP Large Receive Offload
> + */
> +
> + pkts = m->m_pkthdr.ph_mss;
> + m->m_pkthdr.ph_mss = 0;
> +
> + if (pkts > 1) {
> + struct ether_extracted ext;
> + uint32_t paylen;
> +
> + ether_extract_headers(m, &ext);
> +
> + paylen = ext.iplen;
> + if (ext.ip4 || ext.ip6)
> + paylen -= ext.iphlen;
> +
> + if (ext.tcp) {
> + paylen -= ext.tcphlen;
> + tcpstat_inc(tcps_inhwlro);
> + tcpstat_add(tcps_inpktlro, pkts);
> + } else {
> + tcpstat_inc(tcps_inbadlro);
> + }
> +
> + /*
> + * If we are going to forward this packet, we have to mark it
> + * as TSO, set a correct mss, and recalculate the TCP checksum.
> + */
> + if (ext.tcp && paylen >= pkts) {
> + SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
> + m->m_pkthdr.ph_mss = paylen / pkts;
> + }
> + if (ext.tcp &&
> + ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) {
> + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> + }
> + }
> }
>
> void
> @@ -1308,6 +1402,13 @@ vmxnet3_init(struct vmxnet3_softc *sc)
> vmxnet3_stop(ifp);
> return EIO;
> }
> +
> + /* TCP Large Receive Offload */
> + if (ISSET(ifp->if_xflags, IFXF_LRO))
> + SET(sc->sc_ds->upt_features, UPT1_F_LRO);
> + else
> + CLR(sc->sc_ds->upt_features, UPT1_F_LRO);
> + WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE);
>
> /* Program promiscuous mode and multicast filters. */
> vmxnet3_iff(sc);
> Index: dev/pci/if_vmxreg.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_vmxreg.h,v
> diff -u -p -r1.9 if_vmxreg.h
> --- dev/pci/if_vmxreg.h 7 Jul 2020 01:36:49 -0000 1.9
> +++ dev/pci/if_vmxreg.h 28 May 2024 20:10:33 -0000
> @@ -76,6 +76,7 @@ enum UPT1_RxStats {
> #define VMXNET3_CMD_RESET 0xcafe0002 /* reset device */
> #define VMXNET3_CMD_SET_RXMODE 0xcafe0003 /* set interface flags */
> #define VMXNET3_CMD_SET_FILTER 0xcafe0004 /* set address filter */
> +#define VMXNET3_CMD_SET_FEATURE 0xcafe0009 /* set features */
> #define VMXNET3_CMD_GET_STATUS 0xf00d0000 /* get queue errors */
> #define VMXNET3_CMD_GET_STATS 0xf00d0001
> #define VMXNET3_CMD_GET_LINK 0xf00d0002 /* get link status */
> @@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc {
> u_int32_t rxc_word1;
> #define VMXNET3_RXC_RSSHASH_M 0xffffffff /* RSS hash value */
> #define VMXNET3_RXC_RSSHASH_S 0
> +#define VMXNET3_RXC_SEG_CNT_M 0x000000ff /* No. of seg. in LRO pkt */
>
> u_int32_t rxc_word2;
> #define VMXNET3_RXC_LEN_M 0x00003fff
> @@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc {
> #define VMXNET3_RXC_FRAGMENT 0x00400000 /* IP fragment */
> #define VMXNET3_RXC_FCS 0x00800000 /* frame CRC correct */
> #define VMXNET3_RXC_TYPE_M 0x7f000000
> +#define VMXNET3_RXC_TYPE_S 24
> #define VMXNET3_RXC_GEN_M 0x00000001U
> #define VMXNET3_RXC_GEN_S 31
> } __packed;
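
For readers following the vmxnet3_rx_offload() hunk above: the MSS recovery it performs is a small piece of arithmetic. The type 4 start-of-packet descriptor reports how many TCP segments were coalesced, and dividing the TCP payload length by that count yields an MSS the stack can use to re-segment the packet with TSO when it is forwarded. The sketch below restates that calculation outside the kernel; the struct and function names are illustrative stand-ins and are not part of the diff.

/*
 * Standalone sketch of the LRO MSS derivation done in the diff's
 * vmxnet3_rx_offload(). Field names loosely mirror the patch; they are
 * hypothetical, not kernel structures.
 */
#include <stdint.h>
#include <stdio.h>

struct lro_info {
	uint32_t seg_cnt;	/* segment count from the type 4 SOP descriptor */
	uint32_t iplen;		/* total IP length of the coalesced packet */
	uint32_t iphlen;	/* IP header length */
	uint32_t tcphlen;	/* TCP header length */
};

/* Return the MSS to use for re-segmentation, or 0 if not applicable. */
static uint32_t
lro_mss(const struct lro_info *li)
{
	uint32_t paylen;

	if (li->seg_cnt <= 1)
		return 0;			/* not an LRO aggregate */

	paylen = li->iplen - li->iphlen - li->tcphlen;
	if (paylen < li->seg_cnt)
		return 0;			/* implausible descriptor, leave MSS unset */

	return paylen / li->seg_cnt;		/* same division as ph_mss = paylen / pkts */
}

int
main(void)
{
	/* Example: an aggregate built from 44 segments of a 1500 byte MTU flow. */
	struct lro_info li = { .seg_cnt = 44, .iplen = 64280, .iphlen = 20, .tcphlen = 32 };

	printf("derived MSS: %u\n", lro_mss(&li));	/* prints 1459 for this input */
	return 0;
}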
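Similarly, the sendmp/lastmp handling added to vmxnet3_rxintr() is a small accumulation state machine: the first completion of a packet starts a chain, body completions are appended via the tail pointer, and only the EOP completion hands the whole chain to the stack. A compressed, hypothetical model of that control flow, with plain structs standing in for the mbuf and completion-ring types:

/* Simplified, illustrative model of the SOP/EOP accumulation in the diff. */
#include <stdbool.h>
#include <stddef.h>

struct buf {
	struct buf *next;
	unsigned int len;
	unsigned int pktlen;	/* only meaningful on the head buffer */
};

struct comp_state {
	struct buf *sendmp;	/* head of the packet being assembled */
	struct buf *lastmp;	/* tail, so appending stays O(1) */
};

/* Feed one completed buffer; return the finished chain on EOP, else NULL. */
static struct buf *
rx_accumulate(struct comp_state *cs, struct buf *m, bool eop)
{
	struct buf *done;

	if (cs->sendmp == NULL) {
		cs->sendmp = cs->lastmp = m;	/* start of packet */
		cs->sendmp->pktlen = 0;
	} else {
		cs->lastmp->next = m;		/* append body buffer */
		cs->lastmp = m;
	}
	cs->sendmp->pktlen += m->len;

	if (!eop)
		return NULL;			/* wait for the rest of the packet */

	/* End of packet: hand the chain over and reset the state. */
	done = cs->sendmp;
	cs->sendmp = cs->lastmp = NULL;
	return done;
}

The error and short-packet paths in the real diff do the same reset (sendmp = lastmp = NULL) after m_freem(), which is exactly the detail the earlier version missed and that caused the panics Hrvoje reported.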