Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: bge/bnx/iavf/igc/ix/ixl/ngbe/pcn: ifq_restart() fix
To:
tech@openbsd.org
Date:
Mon, 23 Jun 2025 20:04:06 +0200

Download raw body.

Thread
On Fri, Jun 20, 2025 at 12:12:14PM +0200, Stefan Sperling wrote:
> A bug has been fixed by yasuaok@ in vmx(4) where the driver was
> calling ifq_restart() without actually having made any space on
> a full Tx ring. Calling ifq_restart() in this case can lead to
> a condition where the interface gets stuck in OACTIVE until the
> interface is reset with ifconfig.
> 
> I could trigger the same bug on ice(4) with iperf as follows:
>   for i in `seq 5`; do (iperf -l0 -t 0 -c 2001:db8::1 -u &) ; done
> This needs another machine which runs iperf -u -s (and -V if using IPv6).
> 
> I have checked all ethernet drivers in dev/pci which call ifq_restart().
> A few already avoid calling ifq_restart() unnecessarily.
> The same bug affects bge, bnx, iavf, igc, ix, ixl, ngbe, and pcn.
> 
> The bad pattern looks like:
> 
>  	while (cons != prod) {
> 		descriptor = &ring[cons];
> 
> 		if (descriptor is not done)
> 			break;
> 
> 		free descriptor;
> 	}
> 
> 	if (ifq_is_oactive(ifq))
> 		ifq_restart(ifq);
> 
> If we break out of the loop without freeing any descriptor, we will
> call ifq_restart() without having made any space on the ring.
> For example, this can happen when drivers invoke both Tx and Rx interrupt
> handlers during the same MSIX interrupt cause, An Rx interrupt can then
> lead to processing of a Tx ring which has made no progress.
> 
> Fixed code looks like:
> 
> 	done = 0;
> 
>  	while (cons != prod) {
> 		desc = &ring[cons];
> 
> 		if (desc is not done)
> 			break;
> 
> 		free descriptor;
> 		done = 1;
> 	}
> 
> 	if (done && ifq_is_oactive(ifq))
> 		ifq_restart(ifq);
> 
> 
> I am looking for tests and OKs.
> I myself don't have any hardware for these drivers to test with.

I have tested ix, ixl, igc.  OK bluhm@ for these.

In August server room reconstruction should be finished, then I
will have a setup with bge again.

jan@ could test iavf, but it needs manual setup and is not that
easy.

And what should we do with the drivers noone has hardware and wants
to test?  Just commit?

Any volunteers for bnx, iavf, ngbe, pcn?

bluhm

> M  sys/dev/pci/if_bge.c   |  4+  3-
> M  sys/dev/pci/if_bnx.c   |  1+  1-
> M  sys/dev/pci/if_iavf.c  |  1+  1-
> M  sys/dev/pci/if_igc.c   |  1+  1-
> M  sys/dev/pci/if_ix.c    |  3+  1-
> M  sys/dev/pci/if_ixl.c   |  1+  1-
> M  sys/dev/pci/if_ngbe.c  |  3+  1-
> M  sys/dev/pci/if_pcn.c   |  3+  1-
> 
> 8 files changed, 17 insertions(+), 10 deletions(-)
> 
> commit - a89c75a2984a37717d6fffa3850bd269e697c3c7
> commit + b6ed26d3094a0bd6d44e939b604136db06b1c0db
> blob - 5b022a8bf67db05f03aecfd699cc8fc1701191e4
> blob + c3112a656819b5b109ca032d610ff24f9fe743d6
> --- sys/dev/pci/if_bge.c
> +++ sys/dev/pci/if_bge.c
> @@ -3739,9 +3739,10 @@ bge_txeof(struct bge_softc *sc)
>  
>  	sc->bge_tx_saved_considx = cons;
>  
> -	if (ifq_is_oactive(&ifp->if_snd))
> -		ifq_restart(&ifp->if_snd);
> -	else if (txcnt == 0)
> +	if (ifq_is_oactive(&ifp->if_snd)) {
> +		if (freed)
> +			ifq_restart(&ifp->if_snd);
> +	} else if (txcnt == 0)
>  		ifp->if_timer = 0;
>  }
>  
> blob - 6d18e3a22795dede966a95c3f81d74f71fd20b12
> blob + 79382274aee75c3d51f54d747f6eda4ae84f0d99
> --- sys/dev/pci/if_bnx.c
> +++ sys/dev/pci/if_bnx.c
> @@ -4579,7 +4579,7 @@ bnx_tx_intr(struct bnx_softc *sc)
>  	if (used == 0)
>  		ifp->if_timer = 0;
>  
> -	if (ifq_is_oactive(&ifp->if_snd))
> +	if (freed && ifq_is_oactive(&ifp->if_snd))
>  		ifq_restart(&ifp->if_snd);
>  }
>  
> blob - bdfeefe85158bea04895190c8e151bd17b9cb6a4
> blob + 9ae28af22a0fdbc6fb47db73f7df8db4d1fa400a
> --- sys/dev/pci/if_iavf.c
> +++ sys/dev/pci/if_iavf.c
> @@ -2178,7 +2178,7 @@ iavf_txeof(struct iavf_softc *sc, struct ifqueue *ifq)
>  
>  	//ixl_enable(sc, txr->txr_msix);
>  
> -	if (ifq_is_oactive(ifq))
> +	if (done && ifq_is_oactive(ifq))
>  		ifq_restart(ifq);
>  
>  	return (done);
> blob - 6f9aaed06ed82859719b420c8198e9f5042fe823
> blob + 69800e703841c3635185849dabe61aa430d5f32a
> --- sys/dev/pci/if_igc.c
> +++ sys/dev/pci/if_igc.c
> @@ -1119,7 +1119,7 @@ igc_txeof(struct igc_txring *txr)
>  
>  	txr->next_to_clean = cons;
>  
> -	if (ifq_is_oactive(ifq))
> +	if (done && ifq_is_oactive(ifq))
>  		ifq_restart(ifq);
>  
>  	return (done);
> blob - eece3848ec6deeaf57a0b3da4412119979c5a74f
> blob + e35807fa84586257960ff739743e4c1a05ed65de
> --- sys/dev/pci/if_ix.c
> +++ sys/dev/pci/if_ix.c
> @@ -2641,6 +2641,7 @@ ixgbe_txeof(struct ix_txring *txr)
>  	unsigned int			 head, tail, last;
>  	struct ixgbe_tx_buf		*tx_buffer;
>  	struct ixgbe_legacy_tx_desc	*tx_desc;
> +	int done = 0;
>  
>  	if (!ISSET(ifp->if_flags, IFF_RUNNING))
>  		return FALSE;
> @@ -2673,6 +2674,7 @@ ixgbe_txeof(struct ix_txring *txr)
>  		tx_buffer->m_head = NULL;
>  		tx_buffer->eop_index = -1;
>  
> +		done = 1;
>  		tail = last + 1;
>  		if (tail == sc->num_tx_desc)
>  			tail = 0;
> @@ -2691,7 +2693,7 @@ ixgbe_txeof(struct ix_txring *txr)
>  
>  	txr->next_to_clean = tail;
>  
> -	if (ifq_is_oactive(ifq))
> +	if (done && ifq_is_oactive(ifq))
>  		ifq_restart(ifq);
>  
>  	return TRUE;
> blob - c0c958c59581caac0e1fca5261fd1956e152f193
> blob + e338ee24ecda59a9155a02fabaafc7f9f79b2173
> --- sys/dev/pci/if_ixl.c
> +++ sys/dev/pci/if_ixl.c
> @@ -3011,7 +3011,7 @@ ixl_txeof(struct ixl_softc *sc, struct ixl_tx_ring *tx
>  
>  	//ixl_enable(sc, txr->txr_msix);
>  
> -	if (ifq_is_oactive(ifq))
> +	if (done && ifq_is_oactive(ifq))
>  		ifq_restart(ifq);
>  
>  	return (done);
> blob - 7c7a6eb24b69070c80ee4feff9847168188a30fb
> blob + 3e5e939ddba83973bf8df4893206ddd5aad5f1e7
> --- sys/dev/pci/if_ngbe.c
> +++ sys/dev/pci/if_ngbe.c
> @@ -4461,6 +4461,7 @@ ngbe_txeof(struct tx_ring *txr)
>  	struct ngbe_tx_buf *tx_buffer;
>  	union ngbe_tx_desc *tx_desc;
>  	unsigned int prod, cons, last;
> +	int done = 0;
>  
>  	if (!ISSET(ifp->if_flags, IFF_RUNNING))
>  		return;
> @@ -4486,6 +4487,7 @@ ngbe_txeof(struct tx_ring *txr)
>  		    0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE);
>  		bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map);
>  		m_freem(tx_buffer->m_head);
> +		done = 1;
>  
>  		tx_buffer->m_head = NULL;
>  		tx_buffer->eop_index = -1;
> @@ -4505,7 +4507,7 @@ ngbe_txeof(struct tx_ring *txr)
>  
>  	txr->next_to_clean = cons;
>  
> -	if (ifq_is_oactive(ifq))
> +	if (done && ifq_is_oactive(ifq))
>  		ifq_restart(ifq);
>  }
>  
> blob - 3d93a269ba97b398505e353329db00db6cf0e937
> blob + 97780dba62a5b32a6f7eacccc003f9e0097ab5da
> --- sys/dev/pci/if_pcn.c
> +++ sys/dev/pci/if_pcn.c
> @@ -1143,6 +1143,7 @@ pcn_txintr(struct pcn_softc *sc)
>  	struct pcn_txsoft *txs;
>  	uint32_t tmd1, tmd2, tmd;
>  	int i, j;
> +	int done = 0;
>  
>  	/*
>  	 * Go through our Tx list and free mbufs for those
> @@ -1219,6 +1220,7 @@ pcn_txintr(struct pcn_softc *sc)
>  		bus_dmamap_unload(sc->sc_dmat, txs->txs_dmamap);
>  		m_freem(txs->txs_mbuf);
>  		txs->txs_mbuf = NULL;
> +		done = 1;
>  	}
>  
>  	/* Update the dirty transmit buffer pointer. */
> @@ -1231,7 +1233,7 @@ pcn_txintr(struct pcn_softc *sc)
>  	if (sc->sc_txsfree == PCN_TXQUEUELEN)
>  		ifp->if_timer = 0;
>  
> -	if (ifq_is_oactive(&ifp->if_snd))
> +	if (done && ifq_is_oactive(&ifp->if_snd))
>  		ifq_restart(&ifp->if_snd);
>  }
>