Index | Thread | Search

From:
Jonathan Matthew <jonathan@d14n.org>
Subject:
Re: dwqe(4) Tx checksum offloading
To:
Mark Kettenis <mark.kettenis@xs4all.nl>
Cc:
Stefan Sperling <stsp@stsp.name>, tech@openbsd.org
Date:
Mon, 6 May 2024 11:30:45 +1000

Download raw body.

Thread
On Sat, May 04, 2024 at 09:28:41PM +0200, Mark Kettenis wrote:
> > Date: Sat, 4 May 2024 11:52:50 +0200
> > From: Stefan Sperling <stsp@stsp.name>
> > 
> > The relatively small diff below seems to be enough to get Tx checksum
> > offloading going on dwqe(4).
> > 
> > Tested on Elkhart Lake with tcpbench and tcpbench -u.
> > 
> > Throughput is excellent, though as far as I can tell unchanged on this
> > particular hardware with a 1.9GHz x6425RE Intel Atom:
> > Conn:   1 Mbps:      935.823 Peak Mbps:      935.823 Avg Mbps:      935.823
> > 
> > Tests were run between dwqe0 and em0, both in the same machine, but
> > separated by rdomains to get packets routed on the wire.
> > 
> > Before this patch, software checksums on dwqe0 would raise counters:
> > 
> > $ netstat -s | grep software-checksummed
> >         158030 input datagrams software-checksummed
> >         269208 output datagrams software-checksummed
> >                 269208 packets software-checksummed
> >                 158000 packets software-checksummed
> >         30 input packets software-checksummed
> >         0 output packets software-checksummed
> > 
> > With patch, testing with IPv4, both em0 and dwqe0 use hardware checksums
> > only:
> > 
> > $ netstat -s | grep software-checksummed
> >         0 input datagrams software-checksummed
> >         0 output datagrams software-checksummed
> >                 0 packets software-checksummed
> >                 0 packets software-checksummed
> >         0 input packets software-checksummed
> >         0 output packets software-checksummed
> > 
> > After testing with IPv6 (our stack doesn't offload v6 header checksums):
> > 
> > $ netstat -s | grep software-checksummed
> >         0 input datagrams software-checksummed
> >         0 output datagrams software-checksummed
> >                 0 packets software-checksummed
> >                 0 packets software-checksummed
> >         103219 input packets software-checksummed
> >         243694 output packets software-checksummed
> > 
> > This diff includes the full set of Tx descriptor macros from a previous
> > diff I sent. I can remove the unused ones before commit if preferred.
> > Though I suspect we'll need most or all of them eventually for VLAN
> > tagging offload (I plan to work on this next) and for TSO.
> > 
> > ok?
> 
> Seems to work on RK3568 as well.  Not seeing a significant reduction
> in the CPU load though.
> 
> ok kettenis@

On an RK3568 system with hw.setperf=26 (hw.cpuspeed=816) this improves
single-stream TCP send performance from ~700 Mbps to ~830 Mbps.

ok jmatthew@

> 
> > diff refs/heads/master refs/heads/dwqe-txcsum
> > commit - c604ab84837e7f949d28ecd1ff96b40c36777c45
> > commit + 645f033f44f178f2416685728c4ffddd7c25a8bb
> > blob - b683535744e196ef87db5dae9ca0e40753b985a5
> > blob + acb0ad807c1b80af1826cae99743d5acd492d39c
> > --- sys/dev/ic/dwqe.c
> > +++ sys/dev/ic/dwqe.c
> > @@ -94,6 +94,12 @@ struct mbuf *dwqe_alloc_mbuf(struct dwqe_softc *, bus_
> >  void	dwqe_fill_rx_ring(struct dwqe_softc *);
> >  
> >  int
> > +dwqe_have_tx_csum_offload(struct dwqe_softc *sc)
> > +{
> > +	return (sc->sc_hw_feature[0] & GMAC_MAC_HW_FEATURE0_TXCOESEL);
> > +}
> > +
> > +int
> >  dwqe_attach(struct dwqe_softc *sc)
> >  {
> >  	struct ifnet *ifp;
> > @@ -121,6 +127,11 @@ dwqe_attach(struct dwqe_softc *sc)
> >  	bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
> >  
> >  	ifp->if_capabilities = IFCAP_VLAN_MTU;
> > +	if (dwqe_have_tx_csum_offload(sc)) {
> > +		ifp->if_capabilities |= (IFCAP_CSUM_IPv4 |
> > +		    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> > +		    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6);
> > +	}
> >  
> >  	sc->sc_mii.mii_ifp = ifp;
> >  	sc->sc_mii.mii_readreg = dwqe_mii_readreg;
> > @@ -1077,6 +1088,25 @@ dwqe_iff(struct dwqe_softc *sc)
> >  	dwqe_write(sc, GMAC_MAC_PACKET_FILTER, reg);
> >  }
> >  
> > +void
> > +dwqe_tx_csum(struct dwqe_softc *sc, struct mbuf *m, struct dwqe_desc *txd)
> > +{
> > +	if (!dwqe_have_tx_csum_offload(sc))
> > +		return;
> > +
> > +	/* Checksum flags are valid only on first descriptor. */
> > +	if ((txd->sd_tdes3 & TDES3_FS) == 0)
> > +		return;
> > +
> > +	/* TSO and Tx checksum offloading are incompatible. */
> > +	if (txd->sd_tdes3 & TDES3_TSO_EN)
> > +		return;
> > +
> > +	if (m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
> > +	    M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
> > +		txd->sd_tdes3 |= TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR;
> > +}
> > +
> >  int
> >  dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int *idx, int *used)
> >  {
> > @@ -1107,8 +1137,10 @@ dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int 
> >  		txd->sd_tdes1 = (uint32_t)(map->dm_segs[i].ds_addr >> 32);
> >  		txd->sd_tdes2 = map->dm_segs[i].ds_len;
> >  		txd->sd_tdes3 = m->m_pkthdr.len;
> > -		if (i == 0)
> > +		if (i == 0) {
> >  			txd->sd_tdes3 |= TDES3_FS;
> > +			dwqe_tx_csum(sc, m, txd);
> > +		}
> >  		if (i == (map->dm_nsegs - 1)) {
> >  			txd->sd_tdes2 |= TDES2_IC;
> >  			txd->sd_tdes3 |= TDES3_LS;
> > blob - 7db8b2a40d56b4ab04eaafbaccfead0f941c656a
> > blob + badb42083521bd27ae6ac86b683d6d03dd6d0d2d
> > --- sys/dev/ic/dwqereg.h
> > +++ sys/dev/ic/dwqereg.h
> > @@ -230,14 +230,37 @@ struct dwqe_desc {
> >  	uint32_t sd_tdes3;
> >  };
> >  
> > -/* Tx bits */
> > +/* Tx bits (read format; host to device) */
> > +#define TDES2_HDR_LEN		0x000003ff	/* if TSO is enabled */
> > +#define TDES2_BUF1_LEN		0x00003fff	/* if TSO is disabled */
> > +#define TDES2_VLAN_TIR		0x0000c000
> > +#define   TDES2_NO_VLAN_TAGGING		(0x0 << 14)
> > +#define   TDES2_VLAN_TAG_STRIP		(0x1 << 14)
> > +#define   TDES2_VLAN_TAG_INSERT		(0x2 << 14)
> > +#define   TDES2_VLAN_TAG_REPLACE	(0x3 << 14)
> > +#define TDES2_BUF2_LEN		0x3fff0000
> > +#define TDES2_TX_TIMESTAMP_EN	(1 << 30)	/* if TSO is disabled */
> > +#define TDES2_TSO_EXTMEM_DIS	(1 << 30)	/* if TSO is enabled */
> >  #define TDES2_IC		(1U << 31)
> > -#define TDES3_ES		(1 << 15)
> > -#define TDES3_DE		(1 << 23)
> > +#define TDES3_TCP_PAYLOAD_LEN	0x0003ffff	/* if TSO is enabled */
> > +#define TDES3_FRAME_LEN		0x00007fff	/* if TSO is disabled */
> > +#define TDES3_CIC		0x00030000	/* if TSO is disabled */
> > +#define   TDES3_CSUM_DISABLE			(0x0 << 16)
> > +#define   TDES3_CSUM_IPHDR			(0x1 << 16)
> > +#define   TDES3_CSUM_IPHDR_PAYLOAD		(0x2 << 16)
> > +#define   TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR	(0x3 << 16)
> > +#define TDES3_TSO_EN		(1 << 18)
> >  #define TDES3_LS		(1 << 28)
> >  #define TDES3_FS		(1 << 29)
> >  #define TDES3_OWN		(1U << 31)
> >  
> > +/* Tx bits (writeback format; device to host) */
> > +#define TDES3_ES		(1 << 15)
> > +#define TDES3_DE		(1 << 23)
> > +/* Bit 28 is the LS bit, as in "read" format. */
> > +/* Bit 29 is the FS bit, as in "read" format. */
> > +/* Bit 31 is the OWN bit, as in "read" format. */
> > +
> >  /* Rx bits (read format; host to device) */
> >  #define RDES3_BUF1V		(1 << 24)
> >  #define RDES3_BUF2V		(1 << 25)
> > 
> > 
> > 
>