Index | Thread | Search

From:
Mark Kettenis <mark.kettenis@xs4all.nl>
Subject:
Re: dwqe(4) Tx checksum offloading
To:
Stefan Sperling <stsp@stsp.name>
Cc:
tech@openbsd.org
Date:
Sat, 04 May 2024 21:28:41 +0200

Download raw body.

Thread
> Date: Sat, 4 May 2024 11:52:50 +0200
> From: Stefan Sperling <stsp@stsp.name>
> 
> The relatively small diff below seems to be enough to get Tx checksum
> offloading going on dwqe(4).
> 
> Tested on Elkhart Lake with tcpbench and tcpbench -u.
> 
> Throughput is excellent, though as far as I can tell unchanged on this
> particular hardware with a 1.9GHz x6425RE Intel Atom:
> Conn:   1 Mbps:      935.823 Peak Mbps:      935.823 Avg Mbps:      935.823
> 
> Tests were run between dwqe0 and em0, both in the same machine, but
> separated by rdomains to get packets routed on the wire.
> 
> Before this patch, software checksums on dwqe0 would raise counters:
> 
> $ netstat -s | grep software-checksummed
>         158030 input datagrams software-checksummed
>         269208 output datagrams software-checksummed
>                 269208 packets software-checksummed
>                 158000 packets software-checksummed
>         30 input packets software-checksummed
>         0 output packets software-checksummed
> 
> With the patch applied and testing over IPv4, both em0 and dwqe0 use
> hardware checksums only:
> 
> $ netstat -s | grep software-checksummed
>         0 input datagrams software-checksummed
>         0 output datagrams software-checksummed
>                 0 packets software-checksummed
>                 0 packets software-checksummed
>         0 input packets software-checksummed
>         0 output packets software-checksummed
> 
> After testing with IPv6 (our stack doesn't offload v6 header checksums):
> 
> $ netstat -s | grep software-checksummed
>         0 input datagrams software-checksummed
>         0 output datagrams software-checksummed
>                 0 packets software-checksummed
>                 0 packets software-checksummed
>         103219 input packets software-checksummed
>         243694 output packets software-checksummed
> 
> This diff includes the full set of Tx descriptor macros from a previous
> diff I sent. I can remove the unused ones before commit if preferred.
> Though I suspect we'll need most or all of them eventually for VLAN
> tagging offload (I plan to work on this next) and for TSO.
> 
> ok?

Seems to work on RK3568 as well.  Not seeing a significant reduction
in the CPU load though.

ok kettenis@

> diff refs/heads/master refs/heads/dwqe-txcsum
> commit - c604ab84837e7f949d28ecd1ff96b40c36777c45
> commit + 645f033f44f178f2416685728c4ffddd7c25a8bb
> blob - b683535744e196ef87db5dae9ca0e40753b985a5
> blob + acb0ad807c1b80af1826cae99743d5acd492d39c
> --- sys/dev/ic/dwqe.c
> +++ sys/dev/ic/dwqe.c
> @@ -94,6 +94,12 @@ struct mbuf *dwqe_alloc_mbuf(struct dwqe_softc *, bus_
>  void	dwqe_fill_rx_ring(struct dwqe_softc *);
>  
>  int
> +dwqe_have_tx_csum_offload(struct dwqe_softc *sc)
> +{
> +	return (sc->sc_hw_feature[0] & GMAC_MAC_HW_FEATURE0_TXCOESEL);
> +}
> +
> +int
>  dwqe_attach(struct dwqe_softc *sc)
>  {
>  	struct ifnet *ifp;
> @@ -121,6 +127,11 @@ dwqe_attach(struct dwqe_softc *sc)
>  	bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
>  
>  	ifp->if_capabilities = IFCAP_VLAN_MTU;
> +	if (dwqe_have_tx_csum_offload(sc)) {
> +		ifp->if_capabilities |= (IFCAP_CSUM_IPv4 |
> +		    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> +		    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6);
> +	}
>  
>  	sc->sc_mii.mii_ifp = ifp;
>  	sc->sc_mii.mii_readreg = dwqe_mii_readreg;
> @@ -1077,6 +1088,25 @@ dwqe_iff(struct dwqe_softc *sc)
>  	dwqe_write(sc, GMAC_MAC_PACKET_FILTER, reg);
>  }
>  
> +void
> +dwqe_tx_csum(struct dwqe_softc *sc, struct mbuf *m, struct dwqe_desc *txd)
> +{
> +	if (!dwqe_have_tx_csum_offload(sc))
> +		return;
> +
> +	/* Checksum flags are valid only on first descriptor. */
> +	if ((txd->sd_tdes3 & TDES3_FS) == 0)
> +		return;
> +
> +	/* TSO and Tx checksum offloading are incompatible. */
> +	if (txd->sd_tdes3 & TDES3_TSO_EN)
> +		return;
> +
> +	if (m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
> +	    M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
> +		txd->sd_tdes3 |= TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR;
> +}
> +
>  int
>  dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int *idx, int *used)
>  {
> @@ -1107,8 +1137,10 @@ dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int 
>  		txd->sd_tdes1 = (uint32_t)(map->dm_segs[i].ds_addr >> 32);
>  		txd->sd_tdes2 = map->dm_segs[i].ds_len;
>  		txd->sd_tdes3 = m->m_pkthdr.len;
> -		if (i == 0)
> +		if (i == 0) {
>  			txd->sd_tdes3 |= TDES3_FS;
> +			dwqe_tx_csum(sc, m, txd);
> +		}
>  		if (i == (map->dm_nsegs - 1)) {
>  			txd->sd_tdes2 |= TDES2_IC;
>  			txd->sd_tdes3 |= TDES3_LS;
> blob - 7db8b2a40d56b4ab04eaafbaccfead0f941c656a
> blob + badb42083521bd27ae6ac86b683d6d03dd6d0d2d
> --- sys/dev/ic/dwqereg.h
> +++ sys/dev/ic/dwqereg.h
> @@ -230,14 +230,37 @@ struct dwqe_desc {
>  	uint32_t sd_tdes3;
>  };
>  
> -/* Tx bits */
> +/* Tx bits (read format; host to device) */
> +#define TDES2_HDR_LEN		0x000003ff	/* if TSO is enabled */
> +#define TDES2_BUF1_LEN		0x00003fff	/* if TSO is disabled */
> +#define TDES2_VLAN_TIR		0x0000c000
> +#define   TDES2_NO_VLAN_TAGGING		(0x0 << 14)
> +#define   TDES2_VLAN_TAG_STRIP		(0x1 << 14)
> +#define   TDES2_VLAN_TAG_INSERT		(0x2 << 14)
> +#define   TDES2_VLAN_TAG_REPLACE	(0x3 << 14)
> +#define TDES2_BUF2_LEN		0x3fff0000
> +#define TDES2_TX_TIMESTAMP_EN	(1 << 30)	/* if TSO is disabled */
> +#define TDES2_TSO_EXTMEM_DIS	(1 << 30)	/* if TSO is enabled */
>  #define TDES2_IC		(1U << 31)
> -#define TDES3_ES		(1 << 15)
> -#define TDES3_DE		(1 << 23)
> +#define TDES3_TCP_PAYLOAD_LEN	0x0003ffff	/* if TSO is enabled */
> +#define TDES3_FRAME_LEN		0x00007fff	/* if TSO is disabled */
> +#define TDES3_CIC		0x00030000	/* if TSO is disabled */
> +#define   TDES3_CSUM_DISABLE			(0x0 << 16)
> +#define   TDES3_CSUM_IPHDR			(0x1 << 16)
> +#define   TDES3_CSUM_IPHDR_PAYLOAD		(0x2 << 16)
> +#define   TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR	(0x3 << 16)
> +#define TDES3_TSO_EN		(1 << 18)
>  #define TDES3_LS		(1 << 28)
>  #define TDES3_FS		(1 << 29)
>  #define TDES3_OWN		(1U << 31)
>  
> +/* Tx bits (writeback format; device to host) */
> +#define TDES3_ES		(1 << 15)
> +#define TDES3_DE		(1 << 23)
> +/* Bit 28 is the LS bit, as in "read" format. */
> +/* Bit 29 is the FS bit, as in "read" format. */
> +/* Bit 31 is the OWN bit, as in "read" format. */
> +
>  /* Rx bits (read format; host to device) */
>  #define RDES3_BUF1V		(1 << 24)
>  #define RDES3_BUF2V		(1 << 25)
> 
> 
>