From: Jan Klemkow <jan@openbsd.org>
Subject: Re: ixl/ice(4): Avoid unnecessary defrag of TSO packets
To: Alexander Bluhm <bluhm@openbsd.org>
Cc: Mark Kettenis <mark.kettenis@xs4all.nl>, tech@openbsd.org
Date: Wed, 17 Sep 2025 14:12:22 +0200

On Tue, Sep 16, 2025 at 09:20:43PM +0200, Alexander Bluhm wrote:
> On Sat, Sep 13, 2025 at 04:00:04PM +0200, Jan Klemkow wrote:
> > On Fri, Sep 12, 2025 at 05:57:49PM +0200, Mark Kettenis wrote:
> > > > Date: Fri, 12 Sep 2025 16:00:39 +0200
> > > > From: Jan Klemkow <jan@openbsd.org>
> > > > ixl(4) and ice(4) network cards can handle only up to 8 DMA segments
> > > > for a regular packet, but 128 segments for a TSO packet.  TSO packets
> > > > reach the 8-segment limit very quickly, so we run into unnecessary
> > > > m_defrag() calls, which cost throughput.
> > > > 
> > > > This diff sets the limit to 128 segments for ice(4) and ixl(4) and adds an
> > > > additional check for non-TSO packets.
> > > > 
> > > > ok?
> > > 
> > > I don't think so.  This bypasses all the bus_dma(9) logic.  There
> > > isn't necessarily a relationship between the number of elements in an
> > > mbuf chain and the number of bus_dma(9) segments.
> > > 
> > > I think you should use separate bus_dma(9) maps for TSO.
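
For context, the m_defrag() fallback at issue follows the usual
load/defrag/retry pattern in these drivers; a minimal sketch, close to
ixl_load_mbuf() in if_ixl.c (the name xxx_load_mbuf is illustrative).
With a separate map created for 128 segments, TSO packets take the
EFBIG path far less often:

/*
 * bus_dmamap_load_mbuf(9) fails with EFBIG when the mbuf chain needs
 * more segments than the map was created with; the driver compacts
 * the chain with m_defrag(9) and retries once.
 */
static inline int
xxx_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m)
{
	int error;

	error = bus_dmamap_load_mbuf(dmat, map, m,
	    BUS_DMA_STREAMING | BUS_DMA_NOWAIT);
	if (error != EFBIG)
		return (error);

	/* Too many segments: linearize the chain, then retry. */
	error = m_defrag(m, M_DONTWAIT);
	if (error != 0)
		return (error);

	return (bus_dmamap_load_mbuf(dmat, map, m,
	    BUS_DMA_STREAMING | BUS_DMA_NOWAIT));
}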
> > 
> > Like the following diff?
> 
> I have tested it successfully.  Comments inline.
> 
> > Index: dev/pci/if_ice.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/pci/if_ice.c,v
> > diff -u -p -r1.58 if_ice.c
> > --- dev/pci/if_ice.c	19 Aug 2025 11:46:52 -0000	1.58
> > +++ dev/pci/if_ice.c	13 Sep 2025 13:29:55 -0000
> > @@ -13868,7 +13870,7 @@ ice_tso_detect_sparse(struct mbuf *m, st
> >  		hlen -= seglen;
> >  	}
> >  
> > -	maxsegs = ICE_MAX_TX_SEGS - hdrs;
> > +	maxsegs = ICE_MAX_TSO_SEGS - hdrs;
> >  
> >  	/* We must count the headers, in order to verify that they take up
> >  	 * 3 or fewer descriptors. However, we don't need to check the data
> 
> The comment above ice_tso_detect_sparse() talks about 8.  That
> should be 128 in some cases.  Please adapt.

done

> In general, parts of ice_tso_detect_sparse() do what bus_dmamap_load_mbuf()
> checks anyway.  Other drivers don't have these checks, although I
> guess they have similar hardware limitations.  I think this function
> has to be reconsidered, but not in this diff.
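
The overlapping part of that check boils down to walking the mbuf
chain and counting elements.  A hypothetical sketch, not the driver's
actual code; as Mark notes above, one mbuf does not necessarily map to
exactly one DMA segment, so bus_dmamap_load_mbuf(9) remains the
authoritative check:

int
xxx_mbuf_chain_nsegs(struct mbuf *m)
{
	int nsegs = 0;

	/* Each non-empty mbuf needs at least one DMA segment. */
	for (; m != NULL; m = m->m_next) {
		if (m->m_len > 0)
			nsegs++;
	}

	return (nsegs);
}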
> 
> > @@ -29674,6 +29688,16 @@ ice_tx_queues_alloc(struct ice_softc *sc
> >  			    ICE_MAX_TX_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
> >  			    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> >  			    &map->txm_map) != 0) {
> > +				printf("%s: could not allocate Tx DMA map\n",
> > +				    sc->sc_dev.dv_xname);
> > +				err = ENOMEM;
> > +				goto free_tx_queues;
> > +			}
> > +
> > +			if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
> > +			    ICE_MAX_TSO_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
> > +			    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> > +			    &map->txm_map_tso) != 0) {
> >  				printf("%s: could not allocate Tx DMA map\n",
> >  				    sc->sc_dev.dv_xname);
> >  				err = ENOMEM;
> 
> Please use unique error messages.  Rename the second one to
> "%s: could not allocate TSO DMA map\n"

done

> > Index: dev/pci/if_ixl.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
> > diff -u -p -r1.108 if_ixl.c
> > --- dev/pci/if_ixl.c	24 Jun 2025 11:03:10 -0000	1.108
> > +++ dev/pci/if_ixl.c	13 Sep 2025 13:30:47 -0000
> > @@ -2604,6 +2612,11 @@ uncreate:
> >  			continue;
> >  
> >  		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
> > +
> > +		if (txm->txm_map_tso == NULL)
> > +			continue;
> > +
> > +		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);
> >  	}
> >  
> >  	ixl_dmamem_free(sc, &txr->txr_mem);
> 
> This continue is wrong, as the first one also skips destroying the
> second map.
> 
> 		if (txm->txm_map != NULL)
> 			bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
> 		if (txm->txm_map_tso != NULL)
> 			bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);

done

ok?

Thanks,
jan

Index: dev/pci/if_ice.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ice.c,v
diff -u -p -r1.58 if_ice.c
--- dev/pci/if_ice.c	19 Aug 2025 11:46:52 -0000	1.58
+++ dev/pci/if_ice.c	17 Sep 2025 11:57:46 -0000
@@ -13432,8 +13432,10 @@ ice_txq_clean(struct ice_softc *sc, stru
 
 		if (txm->txm_m == NULL)
 			continue;
-
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(sc->sc_dmat, map);
@@ -13868,10 +13870,10 @@ ice_tso_detect_sparse(struct mbuf *m, st
 		hlen -= seglen;
 	}
 
-	maxsegs = ICE_MAX_TX_SEGS - hdrs;
+	maxsegs = ICE_MAX_TSO_SEGS - hdrs;
 
 	/* We must count the headers, in order to verify that they take up
-	 * 3 or fewer descriptors. However, we don't need to check the data
+	 * 128 or fewer descriptors. However, we don't need to check the data
 	 * if the total segments is small.
 	 */
 	if (nsegs <= maxsegs)
@@ -14066,7 +14068,11 @@ ice_start(struct ifqueue *ifq)
 		}
 
 		txm = &txq->tx_map[prod];
-		map = txm->txm_map;
+
+		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		if (ice_load_mbuf(sc->sc_dmat, map, m) != 0) {
 			ifq->ifq_errors++;
@@ -29478,7 +29484,10 @@ ice_txeof(struct ice_softc *sc, struct i
 		if (dtype != htole64(ICE_TX_DESC_DTYPE_DESC_DONE))
 			break;
 
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);
@@ -29601,6 +29610,11 @@ ice_free_tx_queues(struct ice_softc *sc)
 				bus_dmamap_destroy(sc->sc_dmat, map->txm_map);
 				map->txm_map = NULL;
 			}
+			if (map->txm_map_tso != NULL) {
+				bus_dmamap_destroy(sc->sc_dmat,
+				    map->txm_map_tso);
+				map->txm_map_tso = NULL;
+			}
 		}
 		free(txq->tx_map, M_DEVBUF, txq->desc_count * sizeof(*map));
 		txq->tx_map = NULL;
@@ -29675,6 +29689,16 @@ ice_tx_queues_alloc(struct ice_softc *sc
 			    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
 			    &map->txm_map) != 0) {
 				printf("%s: could not allocate Tx DMA map\n",
+				    sc->sc_dev.dv_xname);
+				err = ENOMEM;
+				goto free_tx_queues;
+			}
+
+			if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
+			    ICE_MAX_TSO_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
+			    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
+			    &map->txm_map_tso) != 0) {
+				printf("%s: could not allocate TSO DMA map\n",
 				    sc->sc_dev.dv_xname);
 				err = ENOMEM;
 				goto free_tx_queues;
Index: dev/pci/if_icevar.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_icevar.h,v
diff -u -p -r1.8 if_icevar.h
--- dev/pci/if_icevar.h	19 Aug 2025 11:46:52 -0000	1.8
+++ dev/pci/if_icevar.h	17 Sep 2025 11:57:46 -0000
@@ -4537,6 +4537,7 @@ struct ice_pf_sw_stats {
 struct ice_tx_map {
 	struct mbuf		*txm_m;
 	bus_dmamap_t		 txm_map;
+	bus_dmamap_t		 txm_map_tso;
 	unsigned int		 txm_eop;
 };
 
Index: dev/pci/if_ixl.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
diff -u -p -r1.108 if_ixl.c
--- dev/pci/if_ixl.c	24 Jun 2025 11:03:10 -0000	1.108
+++ dev/pci/if_ixl.c	17 Sep 2025 11:57:46 -0000
@@ -900,6 +900,7 @@ struct ixl_rx_wb_desc_32 {
 } __packed __aligned(16);
 
 #define IXL_TX_PKT_DESCS		8
+#define IXL_TX_TSO_PKT_DESCS		128
 #define IXL_TX_QUEUE_ALIGN		128
 #define IXL_RX_QUEUE_ALIGN		128
 
@@ -1142,6 +1143,7 @@ struct ixl_chip;
 struct ixl_tx_map {
 	struct mbuf		*txm_m;
 	bus_dmamap_t		 txm_map;
+	bus_dmamap_t		 txm_map_tso;
 	unsigned int		 txm_eop;
 };
 
@@ -2584,6 +2586,12 @@ ixl_txr_alloc(struct ixl_softc *sc, unsi
 		    &txm->txm_map) != 0)
 			goto uncreate;
 
+		if (bus_dmamap_create(sc->sc_dmat,
+		    MAXMCLBYTES, IXL_TX_TSO_PKT_DESCS, IXL_MAX_DMA_SEG_SIZE, 0,
+		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
+		    &txm->txm_map_tso) != 0)
+			goto uncreate;
+
 		txm->txm_eop = -1;
 		txm->txm_m = NULL;
 	}
@@ -2600,10 +2608,10 @@ uncreate:
 	for (i = 0; i < sc->sc_tx_ring_ndescs; i++) {
 		txm = &maps[i];
 
-		if (txm->txm_map == NULL)
-			continue;
-
-		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
+		if (txm->txm_map != NULL)
+			bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
+		if (txm->txm_map_tso != NULL)
+			bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);
 	}
 
 	ixl_dmamem_free(sc, &txr->txr_mem);
@@ -2680,7 +2688,10 @@ ixl_txr_clean(struct ixl_softc *sc, stru
 		if (txm->txm_m == NULL)
 			continue;
 
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(sc->sc_dmat, map);
@@ -2739,6 +2750,7 @@ ixl_txr_free(struct ixl_softc *sc, struc
 		txm = &maps[i];
 
 		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map);
+		bus_dmamap_destroy(sc->sc_dmat, txm->txm_map_tso);
 	}
 
 	ixl_dmamem_free(sc, &txr->txr_mem);
@@ -2885,7 +2897,7 @@ ixl_start(struct ifqueue *ifq)
 
 	for (;;) {
 		/* We need one extra descriptor for TSO packets. */
-		if (free <= (IXL_TX_PKT_DESCS + 1)) {
+		if (free <= (IXL_TX_TSO_PKT_DESCS + 1)) {
 			ifq_set_oactive(ifq);
 			break;
 		}
@@ -2897,7 +2909,10 @@ ixl_start(struct ifqueue *ifq)
 		offload = ixl_tx_setup_offload(m, txr, prod);
 
 		txm = &txr->txr_maps[prod];
-		map = txm->txm_map;
+		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
 			prod++;
@@ -2988,7 +3003,10 @@ ixl_txeof(struct ixl_softc *sc, struct i
 		if (dtype != htole64(IXL_TX_DESC_DTYPE_DONE))
 			break;
 
-		map = txm->txm_map;
+		if (ISSET(txm->txm_m->m_pkthdr.csum_flags, M_TCP_TSO))
+			map = txm->txm_map_tso;
+		else
+			map = txm->txm_map;
 
 		bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize,
 		    BUS_DMASYNC_POSTWRITE);