Download raw body.
dwqe(4) Tx checksum offloading
> Date: Sat, 4 May 2024 11:52:50 +0200
> From: Stefan Sperling <stsp@stsp.name>
>
> The relatively small diff below seems to be enough to get Tx checksum
> offloading going on dwqe(4).
>
> Tested on Elkhart Lake with tcpbench and tcpbench -u.
>
> Throughput is excellent, though as far as I can tell unchanged on this
> particular hardware with a 1.9GHz x6425RE Intel Atom:
> Conn: 1 Mbps: 935.823 Peak Mbps: 935.823 Avg Mbps: 935.823
>
> Tests were run between dwqe0 and em0, both in the same machine, but
> separated by rdomains to get packets routed on the wire.
>
> Before this patch, software checksums on dwqe0 would raise counters:
>
> $ netstat -s | grep software-checksummed
> 158030 input datagrams software-checksummed
> 269208 output datagrams software-checksummed
> 269208 packets software-checksummed
> 158000 packets software-checksummed
> 30 input packets software-checksummed
> 0 output packets software-checksummed
>
> With the patch applied and testing over IPv4, both em0 and dwqe0 use
> hardware checksums only:
>
> $ netstat -s | grep software-checksummed
> 0 input datagrams software-checksummed
> 0 output datagrams software-checksummed
> 0 packets software-checksummed
> 0 packets software-checksummed
> 0 input packets software-checksummed
> 0 output packets software-checksummed
>
> After testing with IPv6 (our stack doesn't offload v6 header checksums):
>
> $ netstat -s | grep software-checksummed
> 0 input datagrams software-checksummed
> 0 output datagrams software-checksummed
> 0 packets software-checksummed
> 0 packets software-checksummed
> 103219 input packets software-checksummed
> 243694 output packets software-checksummed
>
> This diff includes the full set of Tx descriptor macros from a previous
> diff I sent. I can remove the unused ones before commit if preferred,
> though I suspect we'll need most or all of them eventually for VLAN
> tagging offload (which I plan to work on next) and for TSO.
>
> ok?
This seems to work on RK3568 as well. I am not seeing a significant
reduction in CPU load, though.
ok kettenis@
> diff refs/heads/master refs/heads/dwqe-txcsum
> commit - c604ab84837e7f949d28ecd1ff96b40c36777c45
> commit + 645f033f44f178f2416685728c4ffddd7c25a8bb
> blob - b683535744e196ef87db5dae9ca0e40753b985a5
> blob + acb0ad807c1b80af1826cae99743d5acd492d39c
> --- sys/dev/ic/dwqe.c
> +++ sys/dev/ic/dwqe.c
> @@ -94,6 +94,12 @@ struct mbuf *dwqe_alloc_mbuf(struct dwqe_softc *, bus_
> void dwqe_fill_rx_ring(struct dwqe_softc *);
>
> int
> +dwqe_have_tx_csum_offload(struct dwqe_softc *sc)
> +{
> + return (sc->sc_hw_feature[0] & GMAC_MAC_HW_FEATURE0_TXCOESEL);
> +}
> +
> +int
> dwqe_attach(struct dwqe_softc *sc)
> {
> struct ifnet *ifp;
> @@ -121,6 +127,11 @@ dwqe_attach(struct dwqe_softc *sc)
> bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
>
> ifp->if_capabilities = IFCAP_VLAN_MTU;
> + if (dwqe_have_tx_csum_offload(sc)) {
> + ifp->if_capabilities |= (IFCAP_CSUM_IPv4 |
> + IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6);
> + }
>
> sc->sc_mii.mii_ifp = ifp;
> sc->sc_mii.mii_readreg = dwqe_mii_readreg;
> @@ -1077,6 +1088,25 @@ dwqe_iff(struct dwqe_softc *sc)
> dwqe_write(sc, GMAC_MAC_PACKET_FILTER, reg);
> }
>
> +void
> +dwqe_tx_csum(struct dwqe_softc *sc, struct mbuf *m, struct dwqe_desc *txd)
> +{
> + if (!dwqe_have_tx_csum_offload(sc))
> + return;
> +
> + /* Checksum flags are valid only on first descriptor. */
> + if ((txd->sd_tdes3 & TDES3_FS) == 0)
> + return;
> +
> + /* TSO and Tx checksum offloading are incompatible. */
> + if (txd->sd_tdes3 & TDES3_TSO_EN)
> + return;
> +
> + if (m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
> + M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
> + txd->sd_tdes3 |= TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR;
> +}
> +
> int
> dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int *idx, int *used)
> {
> @@ -1107,8 +1137,10 @@ dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int
> txd->sd_tdes1 = (uint32_t)(map->dm_segs[i].ds_addr >> 32);
> txd->sd_tdes2 = map->dm_segs[i].ds_len;
> txd->sd_tdes3 = m->m_pkthdr.len;
> - if (i == 0)
> + if (i == 0) {
> txd->sd_tdes3 |= TDES3_FS;
> + dwqe_tx_csum(sc, m, txd);
> + }
> if (i == (map->dm_nsegs - 1)) {
> txd->sd_tdes2 |= TDES2_IC;
> txd->sd_tdes3 |= TDES3_LS;
> blob - 7db8b2a40d56b4ab04eaafbaccfead0f941c656a
> blob + badb42083521bd27ae6ac86b683d6d03dd6d0d2d
> --- sys/dev/ic/dwqereg.h
> +++ sys/dev/ic/dwqereg.h
> @@ -230,14 +230,37 @@ struct dwqe_desc {
> uint32_t sd_tdes3;
> };
>
> -/* Tx bits */
> +/* Tx bits (read format; host to device) */
> +#define TDES2_HDR_LEN 0x000003ff /* if TSO is enabled */
> +#define TDES2_BUF1_LEN 0x00003fff /* if TSO is disabled */
> +#define TDES2_VLAN_TIR 0x0000c000
> +#define TDES2_NO_VLAN_TAGGING (0x0 << 14)
> +#define TDES2_VLAN_TAG_STRIP (0x1 << 14)
> +#define TDES2_VLAN_TAG_INSERT (0x2 << 14)
> +#define TDES2_VLAN_TAG_REPLACE (0x3 << 14)
> +#define TDES2_BUF2_LEN 0x3fff0000
> +#define TDES2_TX_TIMESTAMP_EN (1 << 30) /* if TSO is disabled */
> +#define TDES2_TSO_EXTMEM_DIS (1 << 30) /* if TSO is enabled */
> #define TDES2_IC (1U << 31)
> -#define TDES3_ES (1 << 15)
> -#define TDES3_DE (1 << 23)
> +#define TDES3_TCP_PAYLOAD_LEN 0x0003ffff /* if TSO is enabled */
> +#define TDES3_FRAME_LEN 0x00007fff /* if TSO is disabled */
> +#define TDES3_CIC 0x00030000 /* if TSO is disabled */
> +#define TDES3_CSUM_DISABLE (0x0 << 16)
> +#define TDES3_CSUM_IPHDR (0x1 << 16)
> +#define TDES3_CSUM_IPHDR_PAYLOAD (0x2 << 16)
> +#define TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR (0x3 << 16)
> +#define TDES3_TSO_EN (1 << 18)
> #define TDES3_LS (1 << 28)
> #define TDES3_FS (1 << 29)
> #define TDES3_OWN (1U << 31)
>
> +/* Tx bits (writeback format; device to host) */
> +#define TDES3_ES (1 << 15)
> +#define TDES3_DE (1 << 23)
> +/* Bit 28 is the LS bit, as in "read" format. */
> +/* Bit 29 is the FS bit, as in "read" format. */
> +/* Bit 31 is the OWN bit, as in "read" format. */
> +
> /* Rx bits (read format; host to device) */
> #define RDES3_BUF1V (1 << 24)
> #define RDES3_BUF2V (1 << 25)
>
>
>
dwqe(4) Tx checksum offloading