Index | Thread | Search

From:
Kevin Lo <kevlo@kevlo.org>
Subject:
Re: bge/bnx/iavf/igc/ix/ixl/ngbe/pcn: ifq_restart() fix
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Tue, 24 Jun 2025 11:18:31 +0800

Download raw body.

Thread
On Mon, Jun 23, 2025 at 08:04:06PM +0200, Alexander Bluhm wrote:
>
> On Fri, Jun 20, 2025 at 12:12:14PM +0200, Stefan Sperling wrote:
> > A bug has been fixed by yasuaok@ in vmx(4) where the driver was
> > calling ifq_restart() without actually having made any space on
> > a full Tx ring. Calling ifq_restart() in this case can lead to
> > a condition where the interface gets stuck in OACTIVE until the
> > interface is reset with ifconfig.
> >
> > I could trigger the same bug on ice(4) with iperf as follows:
> >   for i in `seq 5`; do (iperf -l0 -t 0 -c 2001:db8::1 -u &) ; done
> > This needs another machine which runs iperf -u -s (and -V if using IPv6).
> >
> > I have checked all ethernet drivers in dev/pci which call ifq_restart().
> > A few already avoid calling ifq_restart() unnecessarily.
> > The same bug affects bge, bnx, iavf, igc, ix, ixl, ngbe, and pcn.
> >
> > The bad pattern looks like:
> >
> >  	while (cons != prod) {
> > 		descriptor = &ring[cons];
> >
> > 		if (descriptor is not done)
> > 			break;
> >
> > 		free descriptor;
> > 	}
> >
> > 	if (ifq_is_oactive(ifq))
> > 		ifq_restart(ifq);
> >
> > If we break out of the loop without freeing any descriptor, we will
> > call ifq_restart() without having made any space on the ring.
> > For example, this can happen when drivers invoke both Tx and Rx interrupt
> > handlers during the same MSIX interrupt cause, An Rx interrupt can then
> > lead to processing of a Tx ring which has made no progress.
> >
> > Fixed code looks like:
> >
> > 	done = 0;
> >
> >  	while (cons != prod) {
> > 		desc = &ring[cons];
> >
> > 		if (desc is not done)
> > 			break;
> >
> > 		free descriptor;
> > 		done = 1;
> > 	}
> >
> > 	if (done && ifq_is_oactive(ifq))
> > 		ifq_restart(ifq);
> >
> >
> > I am looking for tests and OKs.
> > I myself don't have any hardware for these drivers to test with.
>
> I have tested ix, ixl, igc.  OK bluhm@ for these.
>
> In August server room reconstruction should be finished, then I
> will have a setup with bge again.
>
> jan@ could test iavf, but it needs manual setup and is not that
> easy.
>
> And what should we do with the drivers noone has hardware and wants
> to test?  Just commit?
>
> Any volunteers for bnx, iavf, ngbe, pcn?

I've tested ix(4), igc(4) and ngbe(4):

ix0 at pci5 dev 0 function 0 "Intel X550T" rev 0x01, msix, 6 queues, address a0:36:9f:xx:xx:xx
igc0 at pci6 dev 0 function 0 "Intel I225-V" rev 0x03, msix, 4 queues, address 2c:53:4a:xx:xx:xx
ngbe0 at pci4 dev 0 function 0 "Beijing WangXun Technology WX1860AL1" rev 0x01, msix, 4 queues, address 6c:b3:11:xx:xx:xx

As stsp@ mentioned, when I run iperf -u -l0 -t0 -c 192.168.8.22,
the interface indeed gets stuck in OACTIVE until it is reset with ifconfig.

stsp@'s diff fixes the problem, thanks.

ok kevlo@

> bluhm
>
> > M  sys/dev/pci/if_bge.c   |  4+  3-
> > M  sys/dev/pci/if_bnx.c   |  1+  1-
> > M  sys/dev/pci/if_iavf.c  |  1+  1-
> > M  sys/dev/pci/if_igc.c   |  1+  1-
> > M  sys/dev/pci/if_ix.c    |  3+  1-
> > M  sys/dev/pci/if_ixl.c   |  1+  1-
> > M  sys/dev/pci/if_ngbe.c  |  3+  1-
> > M  sys/dev/pci/if_pcn.c   |  3+  1-
> >
> > 8 files changed, 17 insertions(+), 10 deletions(-)
> >
> > commit - a89c75a2984a37717d6fffa3850bd269e697c3c7
> > commit + b6ed26d3094a0bd6d44e939b604136db06b1c0db
> > blob - 5b022a8bf67db05f03aecfd699cc8fc1701191e4
> > blob + c3112a656819b5b109ca032d610ff24f9fe743d6
> > --- sys/dev/pci/if_bge.c
> > +++ sys/dev/pci/if_bge.c
> > @@ -3739,9 +3739,10 @@ bge_txeof(struct bge_softc *sc)
> >
> >  	sc->bge_tx_saved_considx = cons;
> >
> > -	if (ifq_is_oactive(&ifp->if_snd))
> > -		ifq_restart(&ifp->if_snd);
> > -	else if (txcnt == 0)
> > +	if (ifq_is_oactive(&ifp->if_snd)) {
> > +		if (freed)
> > +			ifq_restart(&ifp->if_snd);
> > +	} else if (txcnt == 0)
> >  		ifp->if_timer = 0;
> >  }
> >
> > blob - 6d18e3a22795dede966a95c3f81d74f71fd20b12
> > blob + 79382274aee75c3d51f54d747f6eda4ae84f0d99
> > --- sys/dev/pci/if_bnx.c
> > +++ sys/dev/pci/if_bnx.c
> > @@ -4579,7 +4579,7 @@ bnx_tx_intr(struct bnx_softc *sc)
> >  	if (used == 0)
> >  		ifp->if_timer = 0;
> >
> > -	if (ifq_is_oactive(&ifp->if_snd))
> > +	if (freed && ifq_is_oactive(&ifp->if_snd))
> >  		ifq_restart(&ifp->if_snd);
> >  }
> >
> > blob - bdfeefe85158bea04895190c8e151bd17b9cb6a4
> > blob + 9ae28af22a0fdbc6fb47db73f7df8db4d1fa400a
> > --- sys/dev/pci/if_iavf.c
> > +++ sys/dev/pci/if_iavf.c
> > @@ -2178,7 +2178,7 @@ iavf_txeof(struct iavf_softc *sc, struct ifqueue *ifq)
> >
> >  	//ixl_enable(sc, txr->txr_msix);
> >
> > -	if (ifq_is_oactive(ifq))
> > +	if (done && ifq_is_oactive(ifq))
> >  		ifq_restart(ifq);
> >
> >  	return (done);
> > blob - 6f9aaed06ed82859719b420c8198e9f5042fe823
> > blob + 69800e703841c3635185849dabe61aa430d5f32a
> > --- sys/dev/pci/if_igc.c
> > +++ sys/dev/pci/if_igc.c
> > @@ -1119,7 +1119,7 @@ igc_txeof(struct igc_txring *txr)
> >
> >  	txr->next_to_clean = cons;
> >
> > -	if (ifq_is_oactive(ifq))
> > +	if (done && ifq_is_oactive(ifq))
> >  		ifq_restart(ifq);
> >
> >  	return (done);
> > blob - eece3848ec6deeaf57a0b3da4412119979c5a74f
> > blob + e35807fa84586257960ff739743e4c1a05ed65de
> > --- sys/dev/pci/if_ix.c
> > +++ sys/dev/pci/if_ix.c
> > @@ -2641,6 +2641,7 @@ ixgbe_txeof(struct ix_txring *txr)
> >  	unsigned int			 head, tail, last;
> >  	struct ixgbe_tx_buf		*tx_buffer;
> >  	struct ixgbe_legacy_tx_desc	*tx_desc;
> > +	int done = 0;
> >
> >  	if (!ISSET(ifp->if_flags, IFF_RUNNING))
> >  		return FALSE;
> > @@ -2673,6 +2674,7 @@ ixgbe_txeof(struct ix_txring *txr)
> >  		tx_buffer->m_head = NULL;
> >  		tx_buffer->eop_index = -1;
> >
> > +		done = 1;
> >  		tail = last + 1;
> >  		if (tail == sc->num_tx_desc)
> >  			tail = 0;
> > @@ -2691,7 +2693,7 @@ ixgbe_txeof(struct ix_txring *txr)
> >
> >  	txr->next_to_clean = tail;
> >
> > -	if (ifq_is_oactive(ifq))
> > +	if (done && ifq_is_oactive(ifq))
> >  		ifq_restart(ifq);
> >
> >  	return TRUE;
> > blob - c0c958c59581caac0e1fca5261fd1956e152f193
> > blob + e338ee24ecda59a9155a02fabaafc7f9f79b2173
> > --- sys/dev/pci/if_ixl.c
> > +++ sys/dev/pci/if_ixl.c
> > @@ -3011,7 +3011,7 @@ ixl_txeof(struct ixl_softc *sc, struct ixl_tx_ring *tx
> >
> >  	//ixl_enable(sc, txr->txr_msix);
> >
> > -	if (ifq_is_oactive(ifq))
> > +	if (done && ifq_is_oactive(ifq))
> >  		ifq_restart(ifq);
> >
> >  	return (done);
> > blob - 7c7a6eb24b69070c80ee4feff9847168188a30fb
> > blob + 3e5e939ddba83973bf8df4893206ddd5aad5f1e7
> > --- sys/dev/pci/if_ngbe.c
> > +++ sys/dev/pci/if_ngbe.c
> > @@ -4461,6 +4461,7 @@ ngbe_txeof(struct tx_ring *txr)
> >  	struct ngbe_tx_buf *tx_buffer;
> >  	union ngbe_tx_desc *tx_desc;
> >  	unsigned int prod, cons, last;
> > +	int done = 0;
> >
> >  	if (!ISSET(ifp->if_flags, IFF_RUNNING))
> >  		return;
> > @@ -4486,6 +4487,7 @@ ngbe_txeof(struct tx_ring *txr)
> >  		    0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE);
> >  		bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map);
> >  		m_freem(tx_buffer->m_head);
> > +		done = 1;
> >
> >  		tx_buffer->m_head = NULL;
> >  		tx_buffer->eop_index = -1;
> > @@ -4505,7 +4507,7 @@ ngbe_txeof(struct tx_ring *txr)
> >
> >  	txr->next_to_clean = cons;
> >
> > -	if (ifq_is_oactive(ifq))
> > +	if (done && ifq_is_oactive(ifq))
> >  		ifq_restart(ifq);
> >  }
> >
> > blob - 3d93a269ba97b398505e353329db00db6cf0e937
> > blob + 97780dba62a5b32a6f7eacccc003f9e0097ab5da
> > --- sys/dev/pci/if_pcn.c
> > +++ sys/dev/pci/if_pcn.c
> > @@ -1143,6 +1143,7 @@ pcn_txintr(struct pcn_softc *sc)
> >  	struct pcn_txsoft *txs;
> >  	uint32_t tmd1, tmd2, tmd;
> >  	int i, j;
> > +	int done = 0;
> >
> >  	/*
> >  	 * Go through our Tx list and free mbufs for those
> > @@ -1219,6 +1220,7 @@ pcn_txintr(struct pcn_softc *sc)
> >  		bus_dmamap_unload(sc->sc_dmat, txs->txs_dmamap);
> >  		m_freem(txs->txs_mbuf);
> >  		txs->txs_mbuf = NULL;
> > +		done = 1;
> >  	}
> >
> >  	/* Update the dirty transmit buffer pointer. */
> > @@ -1231,7 +1233,7 @@ pcn_txintr(struct pcn_softc *sc)
> >  	if (sc->sc_txsfree == PCN_TXQUEUELEN)
> >  		ifp->if_timer = 0;
> >
> > -	if (ifq_is_oactive(&ifp->if_snd))
> > +	if (done && ifq_is_oactive(&ifp->if_snd))
> >  		ifq_restart(&ifp->if_snd);
> >  }
> >
>