Download raw body.
dwqe(4) Tx checksum offloading
On Sat, May 04, 2024 at 09:28:41PM +0200, Mark Kettenis wrote:
> > Date: Sat, 4 May 2024 11:52:50 +0200
> > From: Stefan Sperling <stsp@stsp.name>
> >
> > The relatively small diff below seems to be enough to get Tx checksum
> > offloading going on dwqe(4).
> >
> > Tested on Elkhart Lake with tcpbench and tcpbench -u.
> >
> > Throughput is excellent, though, as far as I can tell, unchanged by this
> > patch on this particular hardware with a 1.9GHz x6425RE Intel Atom:
> > Conn: 1 Mbps: 935.823 Peak Mbps: 935.823 Avg Mbps: 935.823
> >
> > Tests were run between dwqe0 and em0, both in the same machine, but
> > separated by rdomains to get packets routed on the wire.
> >
> > Before this patch, software checksums on dwqe0 would raise counters:
> >
> > $ netstat -s | grep software-checksummed
> > 158030 input datagrams software-checksummed
> > 269208 output datagrams software-checksummed
> > 269208 packets software-checksummed
> > 158000 packets software-checksummed
> > 30 input packets software-checksummed
> > 0 output packets software-checksummed
> >
> > With patch, testing with IPv4, both em0 and dwqe0 use hardware checksums
> > only:
> >
> > $ netstat -s | grep software-checksummed
> > 0 input datagrams software-checksummed
> > 0 output datagrams software-checksummed
> > 0 packets software-checksummed
> > 0 packets software-checksummed
> > 0 input packets software-checksummed
> > 0 output packets software-checksummed
> >
> > After testing with IPv6 (our stack doesn't offload v6 header checksums):
> >
> > $ netstat -s | grep software-checksummed
> > 0 input datagrams software-checksummed
> > 0 output datagrams software-checksummed
> > 0 packets software-checksummed
> > 0 packets software-checksummed
> > 103219 input packets software-checksummed
> > 243694 output packets software-checksummed
> >
> > This diff includes the full set of Tx descriptor macros from a previous
> > diff I sent. I can remove the unused ones before commit if preferred.
> > Though I suspect we'll need most or all of them eventually for VLAN
> > tagging offload (I plan to work on this next) and for TSO.
> >
> > ok?
>
> Seems to work on RK3568 as well. Not seeing a significant reduction
> in the CPU load though.
>
> ok kettenis@
On an RK3568 system with hw.setperf=26 (hw.cpuspeed=816), this improves
single-stream TCP send performance from ~700Mbps to ~830Mbps.
ok jmatthew@
>
> > diff refs/heads/master refs/heads/dwqe-txcsum
> > commit - c604ab84837e7f949d28ecd1ff96b40c36777c45
> > commit + 645f033f44f178f2416685728c4ffddd7c25a8bb
> > blob - b683535744e196ef87db5dae9ca0e40753b985a5
> > blob + acb0ad807c1b80af1826cae99743d5acd492d39c
> > --- sys/dev/ic/dwqe.c
> > +++ sys/dev/ic/dwqe.c
> > @@ -94,6 +94,12 @@ struct mbuf *dwqe_alloc_mbuf(struct dwqe_softc *, bus_
> > void dwqe_fill_rx_ring(struct dwqe_softc *);
> >
> > int
> > +dwqe_have_tx_csum_offload(struct dwqe_softc *sc)
> > +{
> > + return (sc->sc_hw_feature[0] & GMAC_MAC_HW_FEATURE0_TXCOESEL);
> > +}
> > +
> > +int
> > dwqe_attach(struct dwqe_softc *sc)
> > {
> > struct ifnet *ifp;
> > @@ -121,6 +127,11 @@ dwqe_attach(struct dwqe_softc *sc)
> > bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
> >
> > ifp->if_capabilities = IFCAP_VLAN_MTU;
> > + if (dwqe_have_tx_csum_offload(sc)) {
> > + ifp->if_capabilities |= (IFCAP_CSUM_IPv4 |
> > + IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> > + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6);
> > + }
> >
> > sc->sc_mii.mii_ifp = ifp;
> > sc->sc_mii.mii_readreg = dwqe_mii_readreg;
> > @@ -1077,6 +1088,25 @@ dwqe_iff(struct dwqe_softc *sc)
> > dwqe_write(sc, GMAC_MAC_PACKET_FILTER, reg);
> > }
> >
> > +void
> > +dwqe_tx_csum(struct dwqe_softc *sc, struct mbuf *m, struct dwqe_desc *txd)
> > +{
> > + if (!dwqe_have_tx_csum_offload(sc))
> > + return;
> > +
> > + /* Checksum flags are valid only on first descriptor. */
> > + if ((txd->sd_tdes3 & TDES3_FS) == 0)
> > + return;
> > +
> > + /* TSO and Tx checksum offloading are incompatible. */
> > + if (txd->sd_tdes3 & TDES3_TSO_EN)
> > + return;
> > +
> > + if (m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
> > + M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
> > + txd->sd_tdes3 |= TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR;
> > +}
> > +
> > int
> > dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int *idx, int *used)
> > {
> > @@ -1107,8 +1137,10 @@ dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int
> > txd->sd_tdes1 = (uint32_t)(map->dm_segs[i].ds_addr >> 32);
> > txd->sd_tdes2 = map->dm_segs[i].ds_len;
> > txd->sd_tdes3 = m->m_pkthdr.len;
> > - if (i == 0)
> > + if (i == 0) {
> > txd->sd_tdes3 |= TDES3_FS;
> > + dwqe_tx_csum(sc, m, txd);
> > + }
> > if (i == (map->dm_nsegs - 1)) {
> > txd->sd_tdes2 |= TDES2_IC;
> > txd->sd_tdes3 |= TDES3_LS;
> > blob - 7db8b2a40d56b4ab04eaafbaccfead0f941c656a
> > blob + badb42083521bd27ae6ac86b683d6d03dd6d0d2d
> > --- sys/dev/ic/dwqereg.h
> > +++ sys/dev/ic/dwqereg.h
> > @@ -230,14 +230,37 @@ struct dwqe_desc {
> > uint32_t sd_tdes3;
> > };
> >
> > -/* Tx bits */
> > +/* Tx bits (read format; host to device) */
> > +#define TDES2_HDR_LEN 0x000003ff /* if TSO is enabled */
> > +#define TDES2_BUF1_LEN 0x00003fff /* if TSO is disabled */
> > +#define TDES2_VLAN_TIR 0x0000c000
> > +#define TDES2_NO_VLAN_TAGGING (0x0 << 14)
> > +#define TDES2_VLAN_TAG_STRIP (0x1 << 14)
> > +#define TDES2_VLAN_TAG_INSERT (0x2 << 14)
> > +#define TDES2_VLAN_TAG_REPLACE (0x3 << 14)
> > +#define TDES2_BUF2_LEN 0x3fff0000
> > +#define TDES2_TX_TIMESTAMP_EN (1 << 30) /* if TSO is disabled */
> > +#define TDES2_TSO_EXTMEM_DIS (1 << 30) /* if TSO is enabled */
> > #define TDES2_IC (1U << 31)
> > -#define TDES3_ES (1 << 15)
> > -#define TDES3_DE (1 << 23)
> > +#define TDES3_TCP_PAYLOAD_LEN 0x0003ffff /* if TSO is enabled */
> > +#define TDES3_FRAME_LEN 0x00007fff /* if TSO is disabled */
> > +#define TDES3_CIC 0x00030000 /* if TSO is disabled */
> > +#define TDES3_CSUM_DISABLE (0x0 << 16)
> > +#define TDES3_CSUM_IPHDR (0x1 << 16)
> > +#define TDES3_CSUM_IPHDR_PAYLOAD (0x2 << 16)
> > +#define TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR (0x3 << 16)
> > +#define TDES3_TSO_EN (1 << 18)
> > #define TDES3_LS (1 << 28)
> > #define TDES3_FS (1 << 29)
> > #define TDES3_OWN (1U << 31)
> >
> > +/* Tx bits (writeback format; device to host) */
> > +#define TDES3_ES (1 << 15)
> > +#define TDES3_DE (1 << 23)
> > +/* Bit 28 is the LS bit, as in "read" format. */
> > +/* Bit 29 is the FS bit, as in "read" format. */
> > +/* Bit 31 is the OWN bit, as in "read" format. */
> > +
> > /* Rx bits (read format; host to device) */
> > #define RDES3_BUF1V (1 << 24)
> > #define RDES3_BUF2V (1 << 25)
> >
> >
> >
>
dwqe(4) Tx checksum offloading