From: Jan Klemkow
Subject: Re: ixl/ice(4): Avoid unnecessary defrag of TSO packets
To: Alexander Bluhm
Cc: Mark Kettenis, tech@openbsd.org
Date: Wed, 17 Sep 2025 14:12:22 +0200

On Tue, Sep 16, 2025 at 09:20:43PM +0200, Alexander Bluhm wrote:
> On Sat, Sep 13, 2025 at 04:00:04PM +0200, Jan Klemkow wrote:
> > On Fri, Sep 12, 2025 at 05:57:49PM +0200, Mark Kettenis wrote:
> > > > Date: Fri, 12 Sep 2025 16:00:39 +0200
> > > > From: Jan Klemkow
> > > >
> > > > ixl(4) and ice(4) network cards can handle only up to 8 DMA segments
> > > > for a regular packet, but 128 segments for a TSO packet.  TSO packets
> > > > reach the 8 segment limit very quickly.  Thus, we run into unnecessary
> > > > m_defrag() calls, which cost throughput.
> > > >
> > > > This diff sets the limit to 128 segments for ice(4) and ixl(4) and
> > > > adds an additional check for non-TSO packets.
> > > >
> > > > ok?
> > >
> > > I don't think so.  This bypasses all the bus_dma(9) logic.  There
> > > isn't necessarily a relationship between the number of elements in an
> > > mbuf chain and the number of bus_dma(9) segments.
> > >
> > > I think you should use separate bus_dma(9) maps for TSO.
> >
> > Like the following diff?
>
> I have tested it successfully.  Comments inline.
>
> > Index: dev/pci/if_ice.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/pci/if_ice.c,v
> > diff -u -p -r1.58 if_ice.c
> > --- dev/pci/if_ice.c	19 Aug 2025 11:46:52 -0000	1.58
> > +++ dev/pci/if_ice.c	13 Sep 2025 13:29:55 -0000
> > @@ -13868,7 +13870,7 @@ ice_tso_detect_sparse(struct mbuf *m, st
> >  		hlen -= seglen;
> >  	}
> >
> > -	maxsegs = ICE_MAX_TX_SEGS - hdrs;
> > +	maxsegs = ICE_MAX_TSO_SEGS - hdrs;
> >
> >  	/* We must count the headers, in order to verify that they take up
> >  	 * 3 or fewer descriptors. However, we don't need to check the data
>
> The comment above ice_tso_detect_sparse() talks about 8.  That
> should be 128 in some cases.  Please adapt.

done

> In general, parts of ice_tso_detect_sparse() do what bus_dmamap_load_mbuf()
> checks anyway.  Other drivers don't have these checks although I
> guess they have similar hardware limitations.  I think this function
> has to be reconsidered.  But not in this diff.
>
> > @@ -29674,6 +29688,16 @@ ice_tx_queues_alloc(struct ice_softc *sc
> >  		    ICE_MAX_TX_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
> >  		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> >  		    &map->txm_map) != 0) {
> > +			printf("%s: could not allocate Tx DMA map\n",
> > +			    sc->sc_dev.dv_xname);
> > +			err = ENOMEM;
> > +			goto free_tx_queues;
> > +		}
> > +
> > +		if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
> > +		    ICE_MAX_TSO_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
> > +		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> > +		    &map->txm_map_tso) != 0) {
> >  			printf("%s: could not allocate Tx DMA map\n",
> >  			    sc->sc_dev.dv_xname);
> >  			err = ENOMEM;
>
> Please use unique error messages.  Rename the second one to
> "%s: could not allocate TSO DMA map\n"

done
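To spell the idea out for the archives: each transmit slot now carries a
second, larger DMA map that is used only for TSO packets, so regular
packets keep the strict 8 segment limit while a TSO packet may scatter
across up to 128 segments without m_defrag().  A minimal sketch of the
allocation side (the 8/128 segment counts and the flags are the ones
from the diff below; tx_map_create() and its maxsegsz parameter are
invented names for illustration, not driver code):

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/mbuf.h>
#include <machine/bus.h>

struct tx_map {
	struct mbuf	*txm_m;
	bus_dmamap_t	 txm_map;	/* regular packets, 8 segments */
	bus_dmamap_t	 txm_map_tso;	/* TSO packets, 128 segments */
};

int
tx_map_create(bus_dma_tag_t dmat, struct tx_map *txm, bus_size_t maxsegsz)
{
	/* BUS_DMA_ALLOCNOW: allocate resources now, not at load time. */
	if (bus_dmamap_create(dmat, MAXMCLBYTES, 8, maxsegsz, 0,
	    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
	    &txm->txm_map) != 0)
		return (ENOMEM);

	if (bus_dmamap_create(dmat, MAXMCLBYTES, 128, maxsegsz, 0,
	    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
	    &txm->txm_map_tso) != 0) {
		/* Unwind the first map so the slot stays consistent. */
		bus_dmamap_destroy(dmat, txm->txm_map);
		return (ENOMEM);
	}

	return (0);
}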
> > Index: dev/pci/if_ixl.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
> > diff -u -p -r1.108 if_ixl.c
> > --- dev/pci/if_ixl.c	24 Jun 2025 11:03:10 -0000	1.108
> > +++ dev/pci/if_ixl.c	13 Sep 2025 13:30:47 -0000
> > @@ -2604,6 +2612,11 @@ uncreate:
> >  			continue;
> >
> >  		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
> > +
> > +		if (txm->txm_map_tso == NULL)
> > +			continue;
> > +
> > +		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);
> >  	}
> >
> >  	ixl_dmamem_free(sc, &txr->txr_mem);
>
> This continue is wrong, as the first one also affects the second map.
>
> 	if (txm->txm_map != NULL)
> 		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
> 	if (txm->txm_map_tso != NULL)
> 		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);

done
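One invariant worth noting: whichever map a packet was loaded with must
also be the map that is synced and unloaded later.  That is why the
start, txeof, and queue clean paths in the diff below all repeat the
same M_TCP_TSO test.  As a sketch, reusing the struct from the sketch
above (the helper name is invented; the diff simply open-codes this
check in each path):

/* Pick the slot's map from the packet's TSO flag. */
static inline bus_dmamap_t
tx_map_select(struct tx_map *txm, struct mbuf *m)
{
	if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO))
		return (txm->txm_map_tso);
	return (txm->txm_map);
}

In the completion and clean paths the flag is re-read from the stored
txm_m, which is still valid until the map is unloaded.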
ok?

Thanks,
jan

Index: dev/pci/if_ice.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ice.c,v
diff -u -p -r1.58 if_ice.c
--- dev/pci/if_ice.c	19 Aug 2025 11:46:52 -0000	1.58
+++ dev/pci/if_ice.c	17 Sep 2025 11:57:46 -0000
@@ -13432,8 +13432,10 @@ ice_txq_clean(struct ice_softc *sc, stru
 		if (txm->txm_m == NULL)
 			continue;
-
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(sc->sc_dmat, map);
@@ -13868,10 +13870,10 @@ ice_tso_detect_sparse(struct mbuf *m, st
 		hlen -= seglen;
 	}
 
-	maxsegs = ICE_MAX_TX_SEGS - hdrs;
+	maxsegs = ICE_MAX_TSO_SEGS - hdrs;
 
 	/* We must count the headers, in order to verify that they take up
-	 * 3 or fewer descriptors. However, we don't need to check the data
+	 * 128 or fewer descriptors. However, we don't need to check the data
 	 * if the total segments is small.
 	 */
 	if (nsegs <= maxsegs)
@@ -14066,7 +14068,11 @@ ice_start(struct ifqueue *ifq)
 		}
 
 		txm = &txq->tx_map[prod];
-		map = txm->txm_map;
+
+		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		if (ice_load_mbuf(sc->sc_dmat, map, m) != 0) {
 			ifq->ifq_errors++;
@@ -29478,7 +29484,10 @@ ice_txeof(struct ice_softc *sc, struct i
 		if (dtype != htole64(ICE_TX_DESC_DTYPE_DESC_DONE))
 			break;
 
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);
@@ -29601,6 +29610,11 @@ ice_free_tx_queues(struct ice_softc *sc)
 				bus_dmamap_destroy(sc->sc_dmat, map->txm_map);
 				map->txm_map = NULL;
 			}
+			if (map->txm_map_tso != NULL) {
+				bus_dmamap_destroy(sc->sc_dmat,
+				    map->txm_map_tso);
+				map->txm_map_tso = NULL;
+			}
 		}
 		free(txq->tx_map, M_DEVBUF, txq->desc_count * sizeof(*map));
 		txq->tx_map = NULL;
@@ -29675,6 +29689,16 @@ ice_tx_queues_alloc(struct ice_softc *sc
 		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
 		    &map->txm_map) != 0) {
 			printf("%s: could not allocate Tx DMA map\n",
+			    sc->sc_dev.dv_xname);
+			err = ENOMEM;
+			goto free_tx_queues;
+		}
+
+		if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
+		    ICE_MAX_TSO_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
+		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
+		    &map->txm_map_tso) != 0) {
+			printf("%s: could not allocate TSO DMA map\n",
 			    sc->sc_dev.dv_xname);
 			err = ENOMEM;
 			goto free_tx_queues;
Index: dev/pci/if_icevar.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_icevar.h,v
diff -u -p -r1.8 if_icevar.h
--- dev/pci/if_icevar.h	19 Aug 2025 11:46:52 -0000	1.8
+++ dev/pci/if_icevar.h	17 Sep 2025 11:57:46 -0000
@@ -4537,6 +4537,7 @@ struct ice_pf_sw_stats {
 struct ice_tx_map {
 	struct mbuf		*txm_m;
 	bus_dmamap_t		 txm_map;
+	bus_dmamap_t		 txm_map_tso;
 	unsigned int		 txm_eop;
 };
 
Index: dev/pci/if_ixl.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
diff -u -p -r1.108 if_ixl.c
--- dev/pci/if_ixl.c	24 Jun 2025 11:03:10 -0000	1.108
+++ dev/pci/if_ixl.c	17 Sep 2025 11:57:46 -0000
@@ -900,6 +900,7 @@ struct ixl_rx_wb_desc_32 {
 } __packed __aligned(16);
 
 #define IXL_TX_PKT_DESCS		8
+#define IXL_TX_TSO_PKT_DESCS		128
 #define IXL_TX_QUEUE_ALIGN		128
 #define IXL_RX_QUEUE_ALIGN		128
 
@@ -1142,6 +1143,7 @@ struct ixl_chip;
 struct ixl_tx_map {
 	struct mbuf		*txm_m;
 	bus_dmamap_t		 txm_map;
+	bus_dmamap_t		 txm_map_tso;
 	unsigned int		 txm_eop;
 };
 
@@ -2584,6 +2586,12 @@ ixl_txr_alloc(struct ixl_softc *sc, unsi
 		    &txm->txm_map) != 0)
 			goto uncreate;
 
+		if (bus_dmamap_create(sc->sc_dmat,
+		    MAXMCLBYTES, IXL_TX_TSO_PKT_DESCS, IXL_MAX_DMA_SEG_SIZE, 0,
+		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
+		    &txm->txm_map_tso) != 0)
+			goto uncreate;
+
 		txm->txm_eop = -1;
 		txm->txm_m = NULL;
 	}
@@ -2600,10 +2608,10 @@ uncreate:
 	for (i = 0; i < sc->sc_tx_ring_ndescs; i++) {
 		txm = &maps[i];
 
-		if (txm->txm_map == NULL)
-			continue;
-
-		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
+		if (txm->txm_map != NULL)
+			bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
+		if (txm->txm_map_tso != NULL)
+			bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);
 	}
 
 	ixl_dmamem_free(sc, &txr->txr_mem);
@@ -2680,7 +2688,10 @@ ixl_txr_clean(struct ixl_softc *sc, stru
 		if (txm->txm_m == NULL)
 			continue;
 
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(sc->sc_dmat, map);
@@ -2739,6 +2750,7 @@ ixl_txr_free(struct ixl_softc *sc, struc
 		txm = &maps[i];
 
 		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
+		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);
 	}
 
 	ixl_dmamem_free(sc, &txr->txr_mem);
@@ -2885,7 +2897,7 @@ ixl_start(struct ifqueue *ifq)
 
 	for (;;) {
 		/* We need one extra descriptor for TSO packets. */
-		if (free <= (IXL_TX_PKT_DESCS + 1)) {
+		if (free <= (IXL_TX_TSO_PKT_DESCS + 1)) {
 			ifq_set_oactive(ifq);
 			break;
 		}
@@ -2897,7 +2909,10 @@ ixl_start(struct ifqueue *ifq)
 		offload = ixl_tx_setup_offload(m, txr, prod);
 
 		txm = &txr->txr_maps[prod];
-		map = txm->txm_map;
+		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
 			prod++;
@@ -2988,7 +3003,10 @@ ixl_txeof(struct ixl_softc *sc, struct i
 		if (dtype != htole64(IXL_TX_DESC_DTYPE_DONE))
 			break;
 
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);