Download raw body.
vmx(4): TCP Large Receive Offload
On Fri, May 31, 2024 at 09:07:53PM +0200, Jan Klemkow wrote:
> On Wed, May 29, 2024 at 12:34:24AM +0200, jan@openbsd.org wrote:
> > On Thu, May 23, 2024 at 12:10:57PM GMT, Hrvoje Popovski wrote:
> > > On 22.5.2024. 22:47, jan@openbsd.org wrote:
> > > > This diff introduces TCP Large Receive Offload (LRO) for vmx(4).
> > > >
> > > > The virtual device annotates LRO packets with receive descriptors of
> > > > type 4. We need this additional information to calculate a valid MSS
> > > > for this packet. Thus, we are able to route this kind of packet.
> > > > But, we only get type 4 descriptors if we pretend to support
> > > > vmxnet3 revision 2.
> > > >
> > > > I tested it on ESXi 8 with Linux guests and external hosts. It
> > > > increases the single TCP performance up to 20 GBit/s in my setup.
> > > >
> > > > Tests are welcome, especially with different ESXi versions and IP
> > > > forwarding.
> > >
> > > I'm running this diff with forwarding and iperf setup at the same time
> > > and box seems fine. Vm is on ESXi8.
> >
> > Thanks for testing!
> >
> > Here is an updated version of the diff with suggestions from bluhm.
> >
> > - We negotiate LRO only on vmxnet3 rev 2 devices
> > - Remove the mbuf for loop
> > - Track last mbuf via lastmp variable
> > - Remove some useless lines
>
> Hrvoje found some panics in my last diff.  I just forgot to set
> sendmp/lastmp to NULL after m_freem().
>
> Fixed diff below.
tested and OK bluhm@
> Index: dev/pci/if_vmx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
> diff -u -p -r1.86 if_vmx.c
> --- dev/pci/if_vmx.c 21 May 2024 19:49:06 -0000 1.86
> +++ dev/pci/if_vmx.c 31 May 2024 18:57:56 -0000
> @@ -114,6 +114,8 @@ struct vmxnet3_comp_ring {
> };
> u_int next;
> u_int32_t gen;
> + struct mbuf *sendmp;
> + struct mbuf *lastmp;
> };
>
> struct vmxnet3_txqueue {
> @@ -160,6 +162,7 @@ struct vmxnet3_softc {
> struct vmxnet3_queue *sc_q;
> struct intrmap *sc_intrmap;
>
> + u_int sc_vrrs;
> struct vmxnet3_driver_shared *sc_ds;
> u_int8_t *sc_mcast;
> struct vmxnet3_upt1_rss_conf *sc_rss;
> @@ -170,7 +173,7 @@ struct vmxnet3_softc {
> #endif
> };
>
> -#define JUMBO_LEN (1024 * 9)
> +#define JUMBO_LEN ((16 * 1024) - 1)
> #define DMAADDR(map) ((map)->dm_segs[0].ds_addr)
>
> #define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg)
> @@ -273,15 +276,21 @@ vmxnet3_attach(struct device *parent, st
> return;
> }
>
> + /* Vmxnet3 Revision Report and Selection */
> ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS);
> - if ((ver & 0x1) == 0) {
> + if (ISSET(ver, 0x2)) {
> + sc->sc_vrrs = 2;
> + } else if (ISSET(ver, 0x1)) {
> + sc->sc_vrrs = 1;
> + } else {
> printf(": unsupported hardware version 0x%x\n", ver);
> return;
> }
> - WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
> + WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, sc->sc_vrrs);
>
> + /* UPT Version Report and Selection */
> ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS);
> - if ((ver & 0x1) == 0) {
> + if (!ISSET(ver, 0x1)) {
> printf(": incompatible UPT version 0x%x\n", ver);
> return;
> }
> @@ -410,6 +419,11 @@ vmxnet3_attach(struct device *parent, st
>
> ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>
> + if (sc->sc_vrrs == 2) {
> + ifp->if_xflags |= IFXF_LRO;
> + ifp->if_capabilities |= IFCAP_LRO;
> + }
> +
> #if NVLAN > 0
> if (sc->sc_ds->upt_features & UPT1_F_VLAN)
> ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
> @@ -704,6 +718,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ri
> uint32_t rgen;
> uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S);
>
> + /* Second ring just contains packet bodies. */
> + if (ring->rid == 1)
> + type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S);
> +
> MUTEX_ASSERT_LOCKED(&ring->mtx);
>
> slots = if_rxr_get(&ring->rxr, NRXDESC);
> @@ -781,17 +799,17 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc,
> VMX_DMA_LEN(&ring->dmamem));
> bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
> 0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE);
> - }
>
> - /* XXX only fill ring 0 */
> - ring = &rq->cmd_ring[0];
> - mtx_enter(&ring->mtx);
> - vmxnet3_rxfill(ring);
> - mtx_leave(&ring->mtx);
> + mtx_enter(&ring->mtx);
> + vmxnet3_rxfill(ring);
> + mtx_leave(&ring->mtx);
> + }
>
> comp_ring = &rq->comp_ring;
> comp_ring->next = 0;
> comp_ring->gen = VMX_RXC_GEN;
> + comp_ring->sendmp = NULL;
> + comp_ring->lastmp = NULL;
>
> memset(VMX_DMA_KVA(&comp_ring->dmamem), 0,
> VMX_DMA_LEN(&comp_ring->dmamem));
> @@ -1074,9 +1092,9 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
> struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> struct mbuf *m;
> bus_dmamap_t map;
> - unsigned int idx, len;
> + unsigned int idx;
> unsigned int next, rgen;
> - unsigned int done = 0;
> + unsigned int rid, done[2] = {0, 0};
>
> next = comp_ring->next;
> rgen = comp_ring->gen;
> @@ -1096,11 +1114,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
>
> idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) &
> VMXNET3_RXC_IDX_M);
> +
> if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) &
> VMXNET3_RXC_QID_M) < sc->sc_nqueues)
> - ring = &rq->cmd_ring[0];
> + rid = 0;
> else
> - ring = &rq->cmd_ring[1];
> + rid = 1;
> +
> + ring = &rq->cmd_ring[rid];
>
> m = ring->m[idx];
> KASSERT(m != NULL);
> @@ -1111,31 +1132,62 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
> BUS_DMASYNC_POSTREAD);
> bus_dmamap_unload(sc->sc_dmat, map);
>
> - done++;
> + done[rid]++;
> +
> + /*
> + * A receive descriptor of type 4 which is flaged as start of
> + * packet, contains the number of TCP segment of an LRO packet.
> + */
> + if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >>
> + VMXNET3_RXC_TYPE_S) == 4 &&
> + ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) {
> + m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 &
> + VMXNET3_RXC_SEG_CNT_M);
> + }
> +
> + m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
> + VMXNET3_RXC_LEN_M);
> +
> + if (comp_ring->sendmp == NULL) {
> + comp_ring->sendmp = comp_ring->lastmp = m;
> + comp_ring->sendmp->m_pkthdr.len = 0;
> + } else {
> + CLR(m->m_flags, M_PKTHDR);
> + comp_ring->lastmp->m_next = m;
> + comp_ring->lastmp = m;
> + }
> + comp_ring->sendmp->m_pkthdr.len += m->m_len;
> +
> + if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP))
> + continue;
> +
> + /*
> + * End of Packet
> + */
>
> if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) {
> ifp->if_ierrors++;
> - m_freem(m);
> + m_freem(comp_ring->sendmp);
> + comp_ring->sendmp = comp_ring->lastmp = NULL;
> continue;
> }
>
> - len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
> - VMXNET3_RXC_LEN_M);
> - if (len < VMXNET3_MIN_MTU) {
> - m_freem(m);
> + if (comp_ring->sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) {
> + m_freem(comp_ring->sendmp);
> + comp_ring->sendmp = comp_ring->lastmp = NULL;
> continue;
> }
> - m->m_pkthdr.len = m->m_len = len;
> -
> - vmxnet3_rx_offload(rxcd, m);
>
> if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) &
> VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) {
> - m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
> - SET(m->m_pkthdr.csum_flags, M_FLOWID);
> + comp_ring->sendmp->m_pkthdr.ph_flowid =
> + letoh32(rxcd->rxc_word1);
> + SET(comp_ring->sendmp->m_pkthdr.csum_flags, M_FLOWID);
> }
>
> - ml_enqueue(&ml, m);
> + vmxnet3_rx_offload(rxcd, comp_ring->sendmp);
> + ml_enqueue(&ml, comp_ring->sendmp);
> + comp_ring->sendmp = comp_ring->lastmp = NULL;
> }
>
> bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem),
> @@ -1144,19 +1196,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc,
> comp_ring->next = next;
> comp_ring->gen = rgen;
>
> - if (done == 0)
> - return;
> + for (int i = 0; i < 2; i++) {
> + if (done[i] == 0)
> + continue;
>
> - ring = &rq->cmd_ring[0];
> + ring = &rq->cmd_ring[i];
>
> - if (ifiq_input(rq->ifiq, &ml))
> - if_rxr_livelocked(&ring->rxr);
> + if (ifiq_input(rq->ifiq, &ml))
> + if_rxr_livelocked(&ring->rxr);
>
> - /* XXX Should we (try to) allocate buffers for ring 2 too? */
> - mtx_enter(&ring->mtx);
> - if_rxr_put(&ring->rxr, done);
> - vmxnet3_rxfill(ring);
> - mtx_leave(&ring->mtx);
> + mtx_enter(&ring->mtx);
> + if_rxr_put(&ring->rxr, done[i]);
> + vmxnet3_rxfill(ring);
> + mtx_leave(&ring->mtx);
> + }
> }
>
> void
> @@ -1211,6 +1264,8 @@ vmxnet3_iff(struct vmxnet3_softc *sc)
> void
> vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
> {
> + uint32_t pkts;
> +
> /*
> * VLAN Offload
> */
> @@ -1243,6 +1298,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcomp
> else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP))
> SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
> }
> +
> + /*
> + * TCP Large Receive Offload
> + */
> +
> + pkts = m->m_pkthdr.ph_mss;
> + m->m_pkthdr.ph_mss = 0;
> +
> + if (pkts > 1) {
> + struct ether_extracted ext;
> + uint32_t paylen;
> +
> + ether_extract_headers(m, &ext);
> +
> + paylen = ext.iplen;
> + if (ext.ip4 || ext.ip6)
> + paylen -= ext.iphlen;
> +
> + if (ext.tcp) {
> + paylen -= ext.tcphlen;
> + tcpstat_inc(tcps_inhwlro);
> + tcpstat_add(tcps_inpktlro, pkts);
> + } else {
> + tcpstat_inc(tcps_inbadlro);
> + }
> +
> + /*
> + * If we gonna forward this packet, we have to mark it as TSO,
> + * set a correct mss, and recalculate the TCP checksum.
> + */
> + if (ext.tcp && paylen >= pkts) {
> + SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
> + m->m_pkthdr.ph_mss = paylen / pkts;
> + }
> + if (ext.tcp &&
> + ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) {
> + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> + }
> + }
> }
>
> void
> @@ -1308,6 +1402,13 @@ vmxnet3_init(struct vmxnet3_softc *sc)
> vmxnet3_stop(ifp);
> return EIO;
> }
> +
> + /* TCP Large Receive Offload */
> + if (ISSET(ifp->if_xflags, IFXF_LRO))
> + SET(sc->sc_ds->upt_features, UPT1_F_LRO);
> + else
> + CLR(sc->sc_ds->upt_features, UPT1_F_LRO);
> + WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE);
>
> /* Program promiscuous mode and multicast filters. */
> vmxnet3_iff(sc);
> Index: dev/pci/if_vmxreg.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_vmxreg.h,v
> diff -u -p -r1.9 if_vmxreg.h
> --- dev/pci/if_vmxreg.h 7 Jul 2020 01:36:49 -0000 1.9
> +++ dev/pci/if_vmxreg.h 28 May 2024 20:10:33 -0000
> @@ -76,6 +76,7 @@ enum UPT1_RxStats {
> #define VMXNET3_CMD_RESET 0xcafe0002 /* reset device */
> #define VMXNET3_CMD_SET_RXMODE 0xcafe0003 /* set interface flags */
> #define VMXNET3_CMD_SET_FILTER 0xcafe0004 /* set address filter */
> +#define VMXNET3_CMD_SET_FEATURE 0xcafe0009 /* set features */
> #define VMXNET3_CMD_GET_STATUS 0xf00d0000 /* get queue errors */
> #define VMXNET3_CMD_GET_STATS 0xf00d0001
> #define VMXNET3_CMD_GET_LINK 0xf00d0002 /* get link status */
> @@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc {
> u_int32_t rxc_word1;
> #define VMXNET3_RXC_RSSHASH_M 0xffffffff /* RSS hash value */
> #define VMXNET3_RXC_RSSHASH_S 0
> +#define VMXNET3_RXC_SEG_CNT_M 0x000000ff /* No. of seg. in LRO pkt */
>
> u_int32_t rxc_word2;
> #define VMXNET3_RXC_LEN_M 0x00003fff
> @@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc {
> #define VMXNET3_RXC_FRAGMENT 0x00400000 /* IP fragment */
> #define VMXNET3_RXC_FCS 0x00800000 /* frame CRC correct */
> #define VMXNET3_RXC_TYPE_M 0x7f000000
> +#define VMXNET3_RXC_TYPE_S 24
> #define VMXNET3_RXC_GEN_M 0x00000001U
> #define VMXNET3_RXC_GEN_S 31
> } __packed;
vmx(4): TCP Large Receive Offload