From: Stefan Sperling
Subject: dwqe(4) Tx checksum offloading
To: tech@openbsd.org
Date: Sat, 4 May 2024 11:52:50 +0200

The relatively small diff below seems to be enough to get Tx checksum
offloading going on dwqe(4).

Tested on Elkhart Lake with tcpbench and tcpbench -u. Throughput is
excellent, though as far as I can tell unchanged on this particular
hardware with a 1.9GHz x6425RE Intel Atom:

    Conn: 1 Mbps: 935.823 Peak Mbps: 935.823 Avg Mbps: 935.823

Tests were run between dwqe0 and em0, both in the same machine, but
separated by rdomains to get packets routed on the wire.

Before this patch, software checksums on dwqe0 would raise counters:

$ netstat -s | grep software-checksummed
    158030 input datagrams software-checksummed
    269208 output datagrams software-checksummed
    269208 packets software-checksummed
    158000 packets software-checksummed
    30 input packets software-checksummed
    0 output packets software-checksummed

With patch, testing with IPv4, both em0 and dwqe0 use hardware
checksums only:

$ netstat -s | grep software-checksummed
    0 input datagrams software-checksummed
    0 output datagrams software-checksummed
    0 packets software-checksummed
    0 packets software-checksummed
    0 input packets software-checksummed
    0 output packets software-checksummed

After testing with IPv6 (our stack doesn't offload v6 header checksums):

$ netstat -s | grep software-checksummed
    0 input datagrams software-checksummed
    0 output datagrams software-checksummed
    0 packets software-checksummed
    0 packets software-checksummed
    103219 input packets software-checksummed
    243694 output packets software-checksummed

This diff includes the full set of Tx descriptor macros from a previous
diff I sent. I can remove the unused ones before commit if preferred.
Though I suspect we'll need most or all of them eventually for VLAN
tagging offload (I plan to work on this next) and for TSO.

ok?

diff refs/heads/master refs/heads/dwqe-txcsum
commit - c604ab84837e7f949d28ecd1ff96b40c36777c45
commit + 645f033f44f178f2416685728c4ffddd7c25a8bb
blob - b683535744e196ef87db5dae9ca0e40753b985a5
blob + acb0ad807c1b80af1826cae99743d5acd492d39c
--- sys/dev/ic/dwqe.c
+++ sys/dev/ic/dwqe.c
@@ -94,6 +94,12 @@ struct mbuf *dwqe_alloc_mbuf(struct dwqe_softc *, bus_
 void	dwqe_fill_rx_ring(struct dwqe_softc *);
 
 int
+dwqe_have_tx_csum_offload(struct dwqe_softc *sc)
+{
+	return (sc->sc_hw_feature[0] & GMAC_MAC_HW_FEATURE0_TXCOESEL);
+}
+
+int
 dwqe_attach(struct dwqe_softc *sc)
 {
 	struct ifnet *ifp;
@@ -121,6 +127,11 @@ dwqe_attach(struct dwqe_softc *sc)
 	bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
 
 	ifp->if_capabilities = IFCAP_VLAN_MTU;
+	if (dwqe_have_tx_csum_offload(sc)) {
+		ifp->if_capabilities |= (IFCAP_CSUM_IPv4 |
+		    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
+		    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6);
+	}
 
 	sc->sc_mii.mii_ifp = ifp;
 	sc->sc_mii.mii_readreg = dwqe_mii_readreg;
@@ -1077,6 +1088,25 @@ dwqe_iff(struct dwqe_softc *sc)
 	dwqe_write(sc, GMAC_MAC_PACKET_FILTER, reg);
 }
 
+void
+dwqe_tx_csum(struct dwqe_softc *sc, struct mbuf *m, struct dwqe_desc *txd)
+{
+	if (!dwqe_have_tx_csum_offload(sc))
+		return;
+
+	/* Checksum flags are only valid in a packet's first descriptor. */
+	if ((txd->sd_tdes3 & TDES3_FS) == 0)
+		return;
+
+	/* Incompatible with TSO, which reuses the CIC bits for payload len. */
+	if (txd->sd_tdes3 & TDES3_TSO_EN)
+		return;
+
+	if (m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
+	    M_TCP_CSUM_OUT | M_UDP_CSUM_OUT))
+		txd->sd_tdes3 |= TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR;
+}
+
 int
 dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int *idx, int *used)
 {
@@ -1107,8 +1137,10 @@ dwqe_encap(struct dwqe_softc *sc, struct mbuf *m, int
 		txd->sd_tdes1 = (uint32_t)(map->dm_segs[i].ds_addr >> 32);
 		txd->sd_tdes2 = map->dm_segs[i].ds_len;
 		txd->sd_tdes3 = m->m_pkthdr.len;
-		if (i == 0)
+		if (i == 0) {
 			txd->sd_tdes3 |= TDES3_FS;
+			dwqe_tx_csum(sc, m, txd);
+		}
 		if (i == (map->dm_nsegs - 1)) {
 			txd->sd_tdes2 |= TDES2_IC;
 			txd->sd_tdes3 |= TDES3_LS;
blob - 7db8b2a40d56b4ab04eaafbaccfead0f941c656a
blob + badb42083521bd27ae6ac86b683d6d03dd6d0d2d
--- sys/dev/ic/dwqereg.h
+++ sys/dev/ic/dwqereg.h
@@ -230,14 +230,37 @@ struct dwqe_desc {
 	uint32_t sd_tdes3;
 };
 
-/* Tx bits */
+/* Tx bits (read format; host to device) */
+#define TDES2_HDR_LEN		0x000003ff	/* if TSO is enabled */
+#define TDES2_BUF1_LEN		0x00003fff	/* if TSO is disabled */
+#define TDES2_VLAN_TIR		0x0000c000	/* VLAN tag action, see below */
+#define TDES2_NO_VLAN_TAGGING	(0x0 << 14)
+#define TDES2_VLAN_TAG_STRIP	(0x1 << 14)
+#define TDES2_VLAN_TAG_INSERT	(0x2 << 14)
+#define TDES2_VLAN_TAG_REPLACE	(0x3 << 14)
+#define TDES2_BUF2_LEN		0x3fff0000
+#define TDES2_TX_TIMESTAMP_EN	(1 << 30)	/* if TSO is disabled */
+#define TDES2_TSO_EXTMEM_DIS	(1 << 30)	/* if TSO is enabled */
 #define TDES2_IC		(1U << 31)
-#define TDES3_ES		(1 << 15)
-#define TDES3_DE		(1 << 23)
+#define TDES3_TCP_PAYLOAD_LEN	0x0003ffff	/* if TSO is enabled */
+#define TDES3_FRAME_LEN		0x00007fff	/* if TSO is disabled */
+#define TDES3_CIC		0x00030000	/* if TSO is disabled */
+#define TDES3_CSUM_DISABLE			(0x0 << 16)
+#define TDES3_CSUM_IPHDR			(0x1 << 16)
+#define TDES3_CSUM_IPHDR_PAYLOAD		(0x2 << 16)
+#define TDES3_CSUM_IPHDR_PAYLOAD_PSEUDOHDR	(0x3 << 16)
+#define TDES3_TSO_EN		(1 << 18)
 #define TDES3_LS		(1 << 28)
 #define TDES3_FS		(1 << 29)
 #define TDES3_OWN		(1U << 31)
 
+/* Tx bits (writeback format; device to host) */
+#define TDES3_ES		(1 << 15)
+#define TDES3_DE		(1 << 23)
+/* Bit 28 is the LS bit, as in "read" format. */
+/* Bit 29 is the FS bit, as in "read" format. */
+/* Bit 31 is the OWN bit, as in "read" format. */
+
 /* Rx bits (read format; host to device) */
 #define RDES3_BUF1V		(1 << 24)
 #define RDES3_BUF2V		(1 << 25)