From: Kevin Lo Subject: Re: bge/bnx/iavf/igc/ix/ixl/ngbe/pcn: ifq_restart() fix To: Alexander Bluhm Cc: tech@openbsd.org Date: Tue, 24 Jun 2025 11:18:31 +0800 On Mon, Jun 23, 2025 at 08:04:06PM +0200, Alexander Bluhm wrote: > > On Fri, Jun 20, 2025 at 12:12:14PM +0200, Stefan Sperling wrote: > > A bug has been fixed by yasuaok@ in vmx(4) where the driver was > > calling ifq_restart() without actually having made any space on > > a full Tx ring. Calling ifq_restart() in this case can lead to > > a condition where the interface gets stuck in OACTIVE until the > > interface is reset with ifconfig. > > > > I could trigger the same bug on ice(4) with iperf as follows: > > for i in `seq 5`; do (iperf -l0 -t 0 -c 2001:db8::1 -u &) ; done > > This needs another machine which runs iperf -u -s (and -V if using IPv6). > > > > I have checked all ethernet drivers in dev/pci which call ifq_restart(). > > A few already avoid calling ifq_restart() unnecessarily. > > The same bug affects bge, bnx, iavf, igc, ix, ixl, ngbe, and pcn. > > > > The bad pattern looks like: > > > > while (cons != prod) { > > descriptor = &ring[cons]; > > > > if (descriptor is not done) > > break; > > > > free descriptor; > > } > > > > if (ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > > > If we break out of the loop without freeing any descriptor, we will > > call ifq_restart() without having made any space on the ring. > > For example, this can happen when drivers invoke both Tx and Rx interrupt > > handlers during the same MSIX interrupt cause, An Rx interrupt can then > > lead to processing of a Tx ring which has made no progress. > > > > Fixed code looks like: > > > > done = 0; > > > > while (cons != prod) { > > desc = &ring[cons]; > > > > if (desc is not done) > > break; > > > > free descriptor; > > done = 1; > > } > > > > if (done && ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > > > > > I am looking for tests and OKs. > > I myself don't have any hardware for these drivers to test with. > > I have tested ix, ixl, igc. OK bluhm@ for these. > > In August server room reconstruction should be finished, then I > will have a setup with bge again. > > jan@ could test iavf, but it needs manual setup and is not that > easy. > > And what should we do with the drivers noone has hardware and wants > to test? Just commit? > > Any volunteers for bnx, iavf, ngbe, pcn? I've tested ix(4), igc(4) and ngbe(4): ix0 at pci5 dev 0 function 0 "Intel X550T" rev 0x01, msix, 6 queues, address a0:36:9f:xx:xx:xx igc0 at pci6 dev 0 function 0 "Intel I225-V" rev 0x03, msix, 4 queues, address 2c:53:4a:xx:xx:xx ngbe0 at pci4 dev 0 function 0 "Beijing WangXun Technology WX1860AL1" rev 0x01, msix, 4 queues, address 6c:b3:11:xx:xx:xx As stsp@ mentioned, when I run iperf -u -l0 -t0 -c 192.168.8.22, the interface indeed gets stuck in OACTIVE until it is reset with ifconfig. stsp@'s diff fixes the problem, thanks. ok kevlo@ > bluhm > > > M sys/dev/pci/if_bge.c | 4+ 3- > > M sys/dev/pci/if_bnx.c | 1+ 1- > > M sys/dev/pci/if_iavf.c | 1+ 1- > > M sys/dev/pci/if_igc.c | 1+ 1- > > M sys/dev/pci/if_ix.c | 3+ 1- > > M sys/dev/pci/if_ixl.c | 1+ 1- > > M sys/dev/pci/if_ngbe.c | 3+ 1- > > M sys/dev/pci/if_pcn.c | 3+ 1- > > > > 8 files changed, 17 insertions(+), 10 deletions(-) > > > > commit - a89c75a2984a37717d6fffa3850bd269e697c3c7 > > commit + b6ed26d3094a0bd6d44e939b604136db06b1c0db > > blob - 5b022a8bf67db05f03aecfd699cc8fc1701191e4 > > blob + c3112a656819b5b109ca032d610ff24f9fe743d6 > > --- sys/dev/pci/if_bge.c > > +++ sys/dev/pci/if_bge.c > > @@ -3739,9 +3739,10 @@ bge_txeof(struct bge_softc *sc) > > > > sc->bge_tx_saved_considx = cons; > > > > - if (ifq_is_oactive(&ifp->if_snd)) > > - ifq_restart(&ifp->if_snd); > > - else if (txcnt == 0) > > + if (ifq_is_oactive(&ifp->if_snd)) { > > + if (freed) > > + ifq_restart(&ifp->if_snd); > > + } else if (txcnt == 0) > > ifp->if_timer = 0; > > } > > > > blob - 6d18e3a22795dede966a95c3f81d74f71fd20b12 > > blob + 79382274aee75c3d51f54d747f6eda4ae84f0d99 > > --- sys/dev/pci/if_bnx.c > > +++ sys/dev/pci/if_bnx.c > > @@ -4579,7 +4579,7 @@ bnx_tx_intr(struct bnx_softc *sc) > > if (used == 0) > > ifp->if_timer = 0; > > > > - if (ifq_is_oactive(&ifp->if_snd)) > > + if (freed && ifq_is_oactive(&ifp->if_snd)) > > ifq_restart(&ifp->if_snd); > > } > > > > blob - bdfeefe85158bea04895190c8e151bd17b9cb6a4 > > blob + 9ae28af22a0fdbc6fb47db73f7df8db4d1fa400a > > --- sys/dev/pci/if_iavf.c > > +++ sys/dev/pci/if_iavf.c > > @@ -2178,7 +2178,7 @@ iavf_txeof(struct iavf_softc *sc, struct ifqueue *ifq) > > > > //ixl_enable(sc, txr->txr_msix); > > > > - if (ifq_is_oactive(ifq)) > > + if (done && ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > > > return (done); > > blob - 6f9aaed06ed82859719b420c8198e9f5042fe823 > > blob + 69800e703841c3635185849dabe61aa430d5f32a > > --- sys/dev/pci/if_igc.c > > +++ sys/dev/pci/if_igc.c > > @@ -1119,7 +1119,7 @@ igc_txeof(struct igc_txring *txr) > > > > txr->next_to_clean = cons; > > > > - if (ifq_is_oactive(ifq)) > > + if (done && ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > > > return (done); > > blob - eece3848ec6deeaf57a0b3da4412119979c5a74f > > blob + e35807fa84586257960ff739743e4c1a05ed65de > > --- sys/dev/pci/if_ix.c > > +++ sys/dev/pci/if_ix.c > > @@ -2641,6 +2641,7 @@ ixgbe_txeof(struct ix_txring *txr) > > unsigned int head, tail, last; > > struct ixgbe_tx_buf *tx_buffer; > > struct ixgbe_legacy_tx_desc *tx_desc; > > + int done = 0; > > > > if (!ISSET(ifp->if_flags, IFF_RUNNING)) > > return FALSE; > > @@ -2673,6 +2674,7 @@ ixgbe_txeof(struct ix_txring *txr) > > tx_buffer->m_head = NULL; > > tx_buffer->eop_index = -1; > > > > + done = 1; > > tail = last + 1; > > if (tail == sc->num_tx_desc) > > tail = 0; > > @@ -2691,7 +2693,7 @@ ixgbe_txeof(struct ix_txring *txr) > > > > txr->next_to_clean = tail; > > > > - if (ifq_is_oactive(ifq)) > > + if (done && ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > > > return TRUE; > > blob - c0c958c59581caac0e1fca5261fd1956e152f193 > > blob + e338ee24ecda59a9155a02fabaafc7f9f79b2173 > > --- sys/dev/pci/if_ixl.c > > +++ sys/dev/pci/if_ixl.c > > @@ -3011,7 +3011,7 @@ ixl_txeof(struct ixl_softc *sc, struct ixl_tx_ring *tx > > > > //ixl_enable(sc, txr->txr_msix); > > > > - if (ifq_is_oactive(ifq)) > > + if (done && ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > > > return (done); > > blob - 7c7a6eb24b69070c80ee4feff9847168188a30fb > > blob + 3e5e939ddba83973bf8df4893206ddd5aad5f1e7 > > --- sys/dev/pci/if_ngbe.c > > +++ sys/dev/pci/if_ngbe.c > > @@ -4461,6 +4461,7 @@ ngbe_txeof(struct tx_ring *txr) > > struct ngbe_tx_buf *tx_buffer; > > union ngbe_tx_desc *tx_desc; > > unsigned int prod, cons, last; > > + int done = 0; > > > > if (!ISSET(ifp->if_flags, IFF_RUNNING)) > > return; > > @@ -4486,6 +4487,7 @@ ngbe_txeof(struct tx_ring *txr) > > 0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE); > > bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map); > > m_freem(tx_buffer->m_head); > > + done = 1; > > > > tx_buffer->m_head = NULL; > > tx_buffer->eop_index = -1; > > @@ -4505,7 +4507,7 @@ ngbe_txeof(struct tx_ring *txr) > > > > txr->next_to_clean = cons; > > > > - if (ifq_is_oactive(ifq)) > > + if (done && ifq_is_oactive(ifq)) > > ifq_restart(ifq); > > } > > > > blob - 3d93a269ba97b398505e353329db00db6cf0e937 > > blob + 97780dba62a5b32a6f7eacccc003f9e0097ab5da > > --- sys/dev/pci/if_pcn.c > > +++ sys/dev/pci/if_pcn.c > > @@ -1143,6 +1143,7 @@ pcn_txintr(struct pcn_softc *sc) > > struct pcn_txsoft *txs; > > uint32_t tmd1, tmd2, tmd; > > int i, j; > > + int done = 0; > > > > /* > > * Go through our Tx list and free mbufs for those > > @@ -1219,6 +1220,7 @@ pcn_txintr(struct pcn_softc *sc) > > bus_dmamap_unload(sc->sc_dmat, txs->txs_dmamap); > > m_freem(txs->txs_mbuf); > > txs->txs_mbuf = NULL; > > + done = 1; > > } > > > > /* Update the dirty transmit buffer pointer. */ > > @@ -1231,7 +1233,7 @@ pcn_txintr(struct pcn_softc *sc) > > if (sc->sc_txsfree == PCN_TXQUEUELEN) > > ifp->if_timer = 0; > > > > - if (ifq_is_oactive(&ifp->if_snd)) > > + if (done && ifq_is_oactive(&ifp->if_snd)) > > ifq_restart(&ifp->if_snd); > > } > > >