From: Stefan Sperling Subject: bge/bnx/iavf/igc/ix/ixl/ngbe/pcn: ifq_restart() fix To: tech@openbsd.org Date: Fri, 20 Jun 2025 12:12:14 +0200 A bug has been fixed by yasuaok@ in vmx(4) where the driver was calling ifq_restart() without actually having made any space on a full Tx ring. Calling ifq_restart() in this case can lead to a condition where the interface gets stuck in OACTIVE until the interface is reset with ifconfig. I could trigger the same bug on ice(4) with iperf as follows: for i in `seq 5`; do (iperf -l0 -t 0 -c 2001:db8::1 -u &) ; done This needs another machine which runs iperf -u -s (and -V if using IPv6). I have checked all ethernet drivers in dev/pci which call ifq_restart(). A few already avoid calling ifq_restart() unnecessarily. The same bug affects bge, bnx, iavf, igc, ix, ixl, ngbe, and pcn. The bad pattern looks like: while (cons != prod) { descriptor = &ring[cons]; if (descriptor is not done) break; free descriptor; } if (ifq_is_oactive(ifq)) ifq_restart(ifq); If we break out of the loop without freeing any descriptor, we will call ifq_restart() without having made any space on the ring. For example, this can happen when drivers invoke both Tx and Rx interrupt handlers during the same MSIX interrupt cause, An Rx interrupt can then lead to processing of a Tx ring which has made no progress. Fixed code looks like: done = 0; while (cons != prod) { desc = &ring[cons]; if (desc is not done) break; free descriptor; done = 1; } if (done && ifq_is_oactive(ifq)) ifq_restart(ifq); I am looking for tests and OKs. I myself don't have any hardware for these drivers to test with. M sys/dev/pci/if_bge.c | 4+ 3- M sys/dev/pci/if_bnx.c | 1+ 1- M sys/dev/pci/if_iavf.c | 1+ 1- M sys/dev/pci/if_igc.c | 1+ 1- M sys/dev/pci/if_ix.c | 3+ 1- M sys/dev/pci/if_ixl.c | 1+ 1- M sys/dev/pci/if_ngbe.c | 3+ 1- M sys/dev/pci/if_pcn.c | 3+ 1- 8 files changed, 17 insertions(+), 10 deletions(-) commit - a89c75a2984a37717d6fffa3850bd269e697c3c7 commit + b6ed26d3094a0bd6d44e939b604136db06b1c0db blob - 5b022a8bf67db05f03aecfd699cc8fc1701191e4 blob + c3112a656819b5b109ca032d610ff24f9fe743d6 --- sys/dev/pci/if_bge.c +++ sys/dev/pci/if_bge.c @@ -3739,9 +3739,10 @@ bge_txeof(struct bge_softc *sc) sc->bge_tx_saved_considx = cons; - if (ifq_is_oactive(&ifp->if_snd)) - ifq_restart(&ifp->if_snd); - else if (txcnt == 0) + if (ifq_is_oactive(&ifp->if_snd)) { + if (freed) + ifq_restart(&ifp->if_snd); + } else if (txcnt == 0) ifp->if_timer = 0; } blob - 6d18e3a22795dede966a95c3f81d74f71fd20b12 blob + 79382274aee75c3d51f54d747f6eda4ae84f0d99 --- sys/dev/pci/if_bnx.c +++ sys/dev/pci/if_bnx.c @@ -4579,7 +4579,7 @@ bnx_tx_intr(struct bnx_softc *sc) if (used == 0) ifp->if_timer = 0; - if (ifq_is_oactive(&ifp->if_snd)) + if (freed && ifq_is_oactive(&ifp->if_snd)) ifq_restart(&ifp->if_snd); } blob - bdfeefe85158bea04895190c8e151bd17b9cb6a4 blob + 9ae28af22a0fdbc6fb47db73f7df8db4d1fa400a --- sys/dev/pci/if_iavf.c +++ sys/dev/pci/if_iavf.c @@ -2178,7 +2178,7 @@ iavf_txeof(struct iavf_softc *sc, struct ifqueue *ifq) //ixl_enable(sc, txr->txr_msix); - if (ifq_is_oactive(ifq)) + if (done && ifq_is_oactive(ifq)) ifq_restart(ifq); return (done); blob - 6f9aaed06ed82859719b420c8198e9f5042fe823 blob + 69800e703841c3635185849dabe61aa430d5f32a --- sys/dev/pci/if_igc.c +++ sys/dev/pci/if_igc.c @@ -1119,7 +1119,7 @@ igc_txeof(struct igc_txring *txr) txr->next_to_clean = cons; - if (ifq_is_oactive(ifq)) + if (done && ifq_is_oactive(ifq)) ifq_restart(ifq); return (done); blob - eece3848ec6deeaf57a0b3da4412119979c5a74f blob + e35807fa84586257960ff739743e4c1a05ed65de --- sys/dev/pci/if_ix.c +++ sys/dev/pci/if_ix.c @@ -2641,6 +2641,7 @@ ixgbe_txeof(struct ix_txring *txr) unsigned int head, tail, last; struct ixgbe_tx_buf *tx_buffer; struct ixgbe_legacy_tx_desc *tx_desc; + int done = 0; if (!ISSET(ifp->if_flags, IFF_RUNNING)) return FALSE; @@ -2673,6 +2674,7 @@ ixgbe_txeof(struct ix_txring *txr) tx_buffer->m_head = NULL; tx_buffer->eop_index = -1; + done = 1; tail = last + 1; if (tail == sc->num_tx_desc) tail = 0; @@ -2691,7 +2693,7 @@ ixgbe_txeof(struct ix_txring *txr) txr->next_to_clean = tail; - if (ifq_is_oactive(ifq)) + if (done && ifq_is_oactive(ifq)) ifq_restart(ifq); return TRUE; blob - c0c958c59581caac0e1fca5261fd1956e152f193 blob + e338ee24ecda59a9155a02fabaafc7f9f79b2173 --- sys/dev/pci/if_ixl.c +++ sys/dev/pci/if_ixl.c @@ -3011,7 +3011,7 @@ ixl_txeof(struct ixl_softc *sc, struct ixl_tx_ring *tx //ixl_enable(sc, txr->txr_msix); - if (ifq_is_oactive(ifq)) + if (done && ifq_is_oactive(ifq)) ifq_restart(ifq); return (done); blob - 7c7a6eb24b69070c80ee4feff9847168188a30fb blob + 3e5e939ddba83973bf8df4893206ddd5aad5f1e7 --- sys/dev/pci/if_ngbe.c +++ sys/dev/pci/if_ngbe.c @@ -4461,6 +4461,7 @@ ngbe_txeof(struct tx_ring *txr) struct ngbe_tx_buf *tx_buffer; union ngbe_tx_desc *tx_desc; unsigned int prod, cons, last; + int done = 0; if (!ISSET(ifp->if_flags, IFF_RUNNING)) return; @@ -4486,6 +4487,7 @@ ngbe_txeof(struct tx_ring *txr) 0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map); m_freem(tx_buffer->m_head); + done = 1; tx_buffer->m_head = NULL; tx_buffer->eop_index = -1; @@ -4505,7 +4507,7 @@ ngbe_txeof(struct tx_ring *txr) txr->next_to_clean = cons; - if (ifq_is_oactive(ifq)) + if (done && ifq_is_oactive(ifq)) ifq_restart(ifq); } blob - 3d93a269ba97b398505e353329db00db6cf0e937 blob + 97780dba62a5b32a6f7eacccc003f9e0097ab5da --- sys/dev/pci/if_pcn.c +++ sys/dev/pci/if_pcn.c @@ -1143,6 +1143,7 @@ pcn_txintr(struct pcn_softc *sc) struct pcn_txsoft *txs; uint32_t tmd1, tmd2, tmd; int i, j; + int done = 0; /* * Go through our Tx list and free mbufs for those @@ -1219,6 +1220,7 @@ pcn_txintr(struct pcn_softc *sc) bus_dmamap_unload(sc->sc_dmat, txs->txs_dmamap); m_freem(txs->txs_mbuf); txs->txs_mbuf = NULL; + done = 1; } /* Update the dirty transmit buffer pointer. */ @@ -1231,7 +1233,7 @@ pcn_txintr(struct pcn_softc *sc) if (sc->sc_txsfree == PCN_TXQUEUELEN) ifp->if_timer = 0; - if (ifq_is_oactive(&ifp->if_snd)) + if (done && ifq_is_oactive(&ifp->if_snd)) ifq_restart(&ifp->if_snd); }