ice(4) TSO support
On Tue, Jun 17, 2025 at 11:16:58AM +0200, Stefan Sperling wrote:
> The patch below adds TSO (TCP Segmentation Offload) support to ice(4).
>
> TSO has been tested both with and without ice-ddp firmware loaded.
> On my test device TSO works in both cases.
>
> # ifconfig ice0 mtu 9710
>
> $ iperf -c 192.168.99.1 --mss 512
>
> [ 1] local 192.168.99.2 port 20750 connected with 192.168.99.1 port 5001
> [ ID] Interval Transfer Bandwidth
> [ 1] 0.00-10.02 sec 11.5 GBytes 9.89 Gbits/sec
>
> $ iperf -c 2001:db8::1 --mss 512
>
> [ 1] local 2001:db8:: port 11682 connected with 2001:db8::1 port 5001
> [ ID] Interval Transfer Bandwidth
> [ 1] 0.00-10.02 sec 11.5 GBytes 9.84 Gbits/sec
>
> (This 25Gb E810 device is connected to a 10Gb switch.)
>
> $ netstat -s | grep TSO
> 0 output TSO large packets chopped in software
> 5434855 output TSO large packets to device
> 13327052 output TSO generated packets sent to network
> 0 bad TSO packets dropped
>
> The known limitation of ixl(4) where packets spanning more than 8 DMA
> segments cannot be processed by the device also affects ice(4).
> ice(4) will drop such packets rather than passing them to hardware.
> This avoids the following error after which the device becomes non-functional:
>
> ice0: Malicious Driver Detection Tx Descriptor check event 'Packet too small or too big' on Tx queue 0 PF# 0 VF# 0
> ice_request_stack_reinit: not implemented
>
> As far as I know, a general fix to make the forwarding path deal with
> this limitation is still in progress.
> Disabling TSO on routers seems to be a good idea in general anyway.
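> TSO can be switched off per interface with ifconfig(8), for example:
>
> # ifconfig ice0 -tso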
>
>
> ok?
I have tested it; we now reach 10 Gbit line speed when sending TCP.
Do we actually need ice_tso_detect_sparse()? In theory
bus_dmamap_load_mbuf() should detect an unsupported mbuf layout and
return EFBIG, and m_defrag() in ice_load_mbuf() then fixes it
(see the sketch below).
We see problems in ixl(4) that jan@ and I do not fully understand
yet. Does this also not work with ice(4)? Have you seen bad
behavior?
Is the ICE_MAX_TSO_HDR_SEGS check stricter than _bus_dmamap_load_buffer()?
Is it possible to call m_defrag() to send the packet instead of
dropping it?
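
For reference, the load-and-defrag pattern I mean looks roughly like
this (just a sketch, assuming ice_load_mbuf() follows the usual ixl(4)
idiom; the exact flags are an assumption, not taken from the patch):

int
ice_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m)
{
	int error;

	/* Try to map the mbuf chain as-is. */
	error = bus_dmamap_load_mbuf(dmat, map, m,
	    BUS_DMA_STREAMING | BUS_DMA_NOWAIT);
	if (error != EFBIG)
		return (error);

	/* Too many DMA segments: compact the chain, then retry once. */
	error = m_defrag(m, M_DONTWAIT);
	if (error != 0)
		return (error);

	return (bus_dmamap_load_mbuf(dmat, map, m,
	    BUS_DMA_STREAMING | BUS_DMA_NOWAIT));
}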
bluhm
> M sys/dev/pci/if_ice.c | 168+ 22-
>
> 1 file changed, 168 insertions(+), 22 deletions(-)
>
> commit - dce1da27fcc7e851ab497aa79c511b8be816a365
> commit + 6f6738a01020d1372d5b357faef2b01a533c9c33
> blob - 21fbff1b1dbd5c169175759b4454ba7dd4bf7536
> blob + 3de2bd9d21f6e06772735a706cd3f5256fac7c78
> --- sys/dev/pci/if_ice.c
> +++ sys/dev/pci/if_ice.c
> @@ -84,6 +84,9 @@
>
> #include <netinet/in.h>
> #include <netinet/if_ether.h>
> +#include <netinet/tcp.h>
> +#include <netinet/tcp_timer.h>
> +#include <netinet/tcp_var.h>
> #include <netinet/udp.h>
>
> #define STRUCT_HACK_VAR_LEN
> @@ -102,6 +105,12 @@
> #include "if_icereg.h"
> #include "if_icevar.h"
>
> +/*
> + * Our network stack cannot handle packets greater than MAXMCLBYTES.
> + * This interface cannot handle packets greater than ICE_TSO_SIZE.
> + */
> +CTASSERT(MAXMCLBYTES < ICE_TSO_SIZE);
> +
> /**
> * @var ice_driver_version
> * @brief driver version string
> @@ -13641,11 +13650,97 @@ ice_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
> return error;
> }
>
> +/**
> + * ice_tso_detect_sparse - detect TSO packets with too many segments
> + *
> + * Hardware only transmits packets with a maximum of 8 descriptors. For TSO
> + * packets, hardware needs to be able to build the split packets using 8 or
> + * fewer descriptors. Additionally, the header must be contained within at
> + * most 3 descriptors.
> + *
> + * To verify this, we walk the headers to find out how many descriptors the
> + * headers require (usually 1). Then we ensure that, for each TSO segment, its
> + * data plus the headers are contained within 8 or fewer descriptors.
> + */
> +int
> +ice_tso_detect_sparse(struct mbuf *m, struct ether_extracted *ext,
> + bus_dmamap_t map)
> +{
> + int count, curseg, i, hlen, segsz, seglen, hdrs, maxsegs;
> + bus_dma_segment_t *segs;
> + uint64_t paylen, outlen, nsegs;
> +
> + curseg = hdrs = 0;
> +
> + hlen = ETHER_HDR_LEN + ext->iphlen + ext->tcphlen;
> + outlen = MIN(9668, MAX(64, m->m_pkthdr.ph_mss));
> + paylen = m->m_pkthdr.len - hlen;
> + nsegs = (paylen + outlen - 1) / outlen;
> +
> + segs = map->dm_segs;
> +
> + /* First, count the number of descriptors for the header.
> + * Additionally, make sure it does not span more than 3 segments.
> + */
> + i = 0;
> + curseg = segs[0].ds_len;
> + while (hlen > 0) {
> + hdrs++;
> + if (hdrs > ICE_MAX_TSO_HDR_SEGS)
> + return (1);
> + if (curseg == 0) {
> + i++;
> + if (i == nsegs)
> + return (1);
> +
> + curseg = segs[i].ds_len;
> + }
> + seglen = MIN(curseg, hlen);
> + curseg -= seglen;
> + hlen -= seglen;
> + }
> +
> + maxsegs = ICE_MAX_TX_SEGS - hdrs;
> +
> + /* We must count the headers, in order to verify that they take up
> + * 3 or fewer descriptors. However, we don't need to check the data
> + * if the total segments is small.
> + */
> + if (nsegs <= maxsegs)
> + return (0);
> +
> + count = 0;
> +
> + /* Now check the data to make sure that each TSO segment is made up of
> + * no more than maxsegs descriptors. This ensures that hardware will
> + * be capable of performing TSO offload.
> + */
> + while (paylen > 0) {
> + segsz = m->m_pkthdr.ph_mss;
> + while (segsz > 0 && paylen != 0) {
> + count++;
> + if (count > maxsegs)
> + return (1);
> + if (curseg == 0) {
> + i++;
> + if (i == nsegs)
> + return (1);
> + curseg = segs[i].ds_len;
> + }
> + seglen = MIN(curseg, segsz);
> + segsz -= seglen;
> + curseg -= seglen;
> + paylen -= seglen;
> + }
> + count = 0;
> + }
> +
> + return (0);
> +}
> +
> uint64_t
> -ice_tx_setup_offload(struct mbuf *m0, struct ice_tx_queue *txq,
> - unsigned int prod)
> +ice_tx_setup_offload(struct mbuf *m0, struct ether_extracted *ext)
> {
> - struct ether_extracted ext;
> uint64_t offload = 0, hlen;
>
> #if NVLAN > 0
> @@ -13659,17 +13754,16 @@ ice_tx_setup_offload(struct mbuf *m0, struct ice_tx_qu
> M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO))
> return offload;
>
> - ether_extract_headers(m0, &ext);
> - hlen = ext.iphlen;
> + hlen = ext->iphlen;
>
> - if (ext.ip4) {
> + if (ext->ip4) {
> if (ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT))
> offload |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM <<
> ICE_TXD_QW1_CMD_S;
> else
> offload |= ICE_TX_DESC_CMD_IIPT_IPV4 <<
> ICE_TXD_QW1_CMD_S;
> - } else if (ext.ip6)
> + } else if (ext->ip6)
> offload |= ICE_TX_DESC_CMD_IIPT_IPV6 << ICE_TXD_QW1_CMD_S;
> else
> return offload;
> @@ -13679,13 +13773,13 @@ ice_tx_setup_offload(struct mbuf *m0, struct ice_tx_qu
> offload |= ((hlen >> 2) << ICE_TX_DESC_LEN_IPLEN_S) <<
> ICE_TXD_QW1_OFFSET_S;
>
> - if (ext.tcp && ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
> + if (ext->tcp && ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
> offload |= ICE_TX_DESC_CMD_L4T_EOFT_TCP << ICE_TXD_QW1_CMD_S;
> - offload |= ((uint64_t)(ext.tcphlen >> 2) <<
> + offload |= ((uint64_t)(ext->tcphlen >> 2) <<
> ICE_TX_DESC_LEN_L4_LEN_S) << ICE_TXD_QW1_OFFSET_S;
> - } else if (ext.udp && ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
> + } else if (ext->udp && ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
> offload |= ICE_TX_DESC_CMD_L4T_EOFT_UDP << ICE_TXD_QW1_CMD_S;
> - offload |= ((uint64_t)(sizeof(*ext.udp) >> 2) <<
> + offload |= ((uint64_t)(sizeof(*ext->udp) >> 2) <<
> ICE_TX_DESC_LEN_L4_LEN_S) << ICE_TXD_QW1_OFFSET_S;
> }
>
> @@ -13711,6 +13805,36 @@ ice_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, st
> }
>
> void
> +ice_set_tso_context(struct mbuf *m0, struct ice_tx_queue *txq,
> + unsigned int prod, struct ether_extracted *ext)
> +{
> + struct ice_tx_desc *ring;
> + struct ice_tx_ctx_desc *txd;
> + uint64_t qword1 = 0, paylen, outlen;
> +
> + /*
> + * The MSS should not be set to a lower value than 64.
> + */
> + outlen = MAX(64, m0->m_pkthdr.ph_mss);
> + paylen = m0->m_pkthdr.len - ETHER_HDR_LEN - ext->iphlen - ext->tcphlen;
> +
> + ring = ICE_DMA_KVA(&txq->tx_desc_mem);
> + txd = (struct ice_tx_ctx_desc *)&ring[prod];
> +
> + qword1 |= ICE_TX_DESC_DTYPE_CTX;
> + qword1 |= ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S;
> + qword1 |= paylen << ICE_TXD_CTX_QW1_TSO_LEN_S;
> + qword1 |= outlen << ICE_TXD_CTX_QW1_MSS_S;
> +
> + htolem32(&txd->tunneling_params, 0);
> + htolem16(&txd->l2tag2, 0);
> + htolem16(&txd->rsvd, 0);
> + htolem64(&txd->qw1, qword1);
> +
> + tcpstat_add(tcps_outpkttso, (paylen + outlen - 1) / outlen);
> +}
> +
> +void
> ice_start(struct ifqueue *ifq)
> {
> struct ifnet *ifp = ifq->ifq_if;
> @@ -13727,6 +13851,7 @@ ice_start(struct ifqueue *ifq)
> uint64_t offload;
> uint64_t paddr;
> uint64_t seglen;
> + struct ether_extracted ext;
> #if NBPFILTER > 0
> caddr_t if_bpf;
> #endif
> @@ -13759,17 +13884,22 @@ ice_start(struct ifqueue *ifq)
> if (m == NULL)
> break;
>
> - offload = ice_tx_setup_offload(m, txq, prod);
> + ether_extract_headers(m, &ext);
> + offload = ice_tx_setup_offload(m, &ext);
>
> + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> + if (ext.tcp == NULL || m->m_pkthdr.ph_mss == 0 ||
> + m->m_pkthdr.ph_mss > ICE_TXD_CTX_MAX_MSS) {
> + tcpstat_inc(tcps_outbadtso);
> + ifq->ifq_errors++;
> + m_freem(m);
> + continue;
> + }
> + }
> +
> txm = &txq->tx_map[prod];
> map = txm->txm_map;
> -#if 0
> - if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> - prod++;
> - prod &= mask;
> - free--;
> - }
> -#endif
> +
> if (ice_load_mbuf(sc->sc_dmat, map, m) != 0) {
> ifq->ifq_errors++;
> m_freem(m);
> @@ -13779,6 +13909,21 @@ ice_start(struct ifqueue *ifq)
> bus_dmamap_sync(sc->sc_dmat, map, 0,
> map->dm_mapsize, BUS_DMASYNC_PREWRITE);
>
> + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> + if (ice_tso_detect_sparse(m, &ext, map)) {
> + bus_dmamap_unload(sc->sc_dmat, map);
> + tcpstat_inc(tcps_outbadtso);
> + ifq->ifq_errors++;
> + m_freem(m);
> + continue;
> + }
> +
> + ice_set_tso_context(m, txq, prod, &ext);
> + prod++;
> + prod &= mask;
> + free--;
> + }
> +
> for (i = 0; i < map->dm_nsegs; i++) {
> txd = &ring[prod];
>
> @@ -29345,8 +29490,8 @@ ice_tx_queues_alloc(struct ice_softc *sc)
>
> for (j = 0; j < sc->isc_ntxd[i]; j++) {
> map = &txq->tx_map[j];
> - if (bus_dmamap_create(sc->sc_dmat, ICE_MAX_FRAME_SIZE,
> - ICE_MAX_TX_SEGS, ICE_MAX_FRAME_SIZE, 0,
> + if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
> + ICE_MAX_TX_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
> BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> &map->txm_map) != 0) {
> printf("%s: could not allocate Tx DMA map\n",
> @@ -30276,7 +30421,8 @@ ice_attach_hook(struct device *self)
> #endif
> ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
> IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> - IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
> + IFCAP_TSOv4 | IFCAP_TSOv6;
>
> if_attach(ifp);
> ether_ifattach(ifp);
>
>