Download raw body.
bge/bnx/iavf/igc/ix/ixl/ngbe/pcn: ifq_restart() fix
Hi Stephan and tech@
I have such a great NIC as AMD Lance FastEthernet (and another one, much
more ancient 10mbit with RJ-45 and AUI interface).
> pcn0 at pci4 dev 6 function 0 "AMD 79c970 PCnet-PCI" rev 0x25, Am79c971,
rev 5: apic 24 int 21, address 00:60:b0:eb:73:c5
> lxtphy0 at pcn0 phy 1: LXT970, rev. 0
> ukphy0 at pcn0 phy 31: Generic IEEE 802.3u media interface, rev. 1: OUI
0x00001a, model 0x0001
I can not manage the interface to get stuck forever. Yes, when I run a
thing such as
> iperf -u -l0 -t0 -c ${serveraddr}
I see a tons of qdrops in kstat:
>pcn0:0:txq:0
> packets: 36982186 packets
> bytes: 1675836505 bytes
> qdrops: 48256453 packets
> errors: 0 packets
> qlen: 0 packets
> maxqlen: 511 packets
> oactive: false
> oactives: 4607637
and oactivee is set to true.
Also I can see ping drops (no buffer space avail) and delays.
But when I stop the iperf client everything is going just fine. oactive in
kstat becomes false again. Pings are ok, browser works fine.
Tested with 7.7 syspatch from 04 may AND with -current kernel + old 7.7
user space + you modifications with done variable in pcn_txintr func.
Behavior is the same in both cases.
If one is interested in more deep investigations, drop me a line privately.
On Fri, Jun 20, 2025 at 1:14 PM Stefan Sperling <stsp@stsp.name> wrote:
> A bug has been fixed by yasuaok@ in vmx(4) where the driver was
> calling ifq_restart() without actually having made any space on
> a full Tx ring. Calling ifq_restart() in this case can lead to
> a condition where the interface gets stuck in OACTIVE until the
> interface is reset with ifconfig.
>
> I could trigger the same bug on ice(4) with iperf as follows:
> for i in `seq 5`; do (iperf -l0 -t 0 -c 2001:db8::1 -u &) ; done
> This needs another machine which runs iperf -u -s (and -V if using IPv6).
>
> I have checked all ethernet drivers in dev/pci which call ifq_restart().
> A few already avoid calling ifq_restart() unnecessarily.
> The same bug affects bge, bnx, iavf, igc, ix, ixl, ngbe, and pcn.
>
> The bad pattern looks like:
>
> while (cons != prod) {
> descriptor = &ring[cons];
>
> if (descriptor is not done)
> break;
>
> free descriptor;
> }
>
> if (ifq_is_oactive(ifq))
> ifq_restart(ifq);
>
> If we break out of the loop without freeing any descriptor, we will
> call ifq_restart() without having made any space on the ring.
> For example, this can happen when drivers invoke both Tx and Rx interrupt
> handlers during the same MSIX interrupt cause, An Rx interrupt can then
> lead to processing of a Tx ring which has made no progress.
>
> Fixed code looks like:
>
> done = 0;
>
> while (cons != prod) {
> desc = &ring[cons];
>
> if (desc is not done)
> break;
>
> free descriptor;
> done = 1;
> }
>
> if (done && ifq_is_oactive(ifq))
> ifq_restart(ifq);
>
>
> I am looking for tests and OKs.
> I myself don't have any hardware for these drivers to test with.
>
>
> M sys/dev/pci/if_bge.c | 4+ 3-
> M sys/dev/pci/if_bnx.c | 1+ 1-
> M sys/dev/pci/if_iavf.c | 1+ 1-
> M sys/dev/pci/if_igc.c | 1+ 1-
> M sys/dev/pci/if_ix.c | 3+ 1-
> M sys/dev/pci/if_ixl.c | 1+ 1-
> M sys/dev/pci/if_ngbe.c | 3+ 1-
> M sys/dev/pci/if_pcn.c | 3+ 1-
>
> 8 files changed, 17 insertions(+), 10 deletions(-)
>
> commit - a89c75a2984a37717d6fffa3850bd269e697c3c7
> commit + b6ed26d3094a0bd6d44e939b604136db06b1c0db
> blob - 5b022a8bf67db05f03aecfd699cc8fc1701191e4
> blob + c3112a656819b5b109ca032d610ff24f9fe743d6
> --- sys/dev/pci/if_bge.c
> +++ sys/dev/pci/if_bge.c
> @@ -3739,9 +3739,10 @@ bge_txeof(struct bge_softc *sc)
>
> sc->bge_tx_saved_considx = cons;
>
> - if (ifq_is_oactive(&ifp->if_snd))
> - ifq_restart(&ifp->if_snd);
> - else if (txcnt == 0)
> + if (ifq_is_oactive(&ifp->if_snd)) {
> + if (freed)
> + ifq_restart(&ifp->if_snd);
> + } else if (txcnt == 0)
> ifp->if_timer = 0;
> }
>
> blob - 6d18e3a22795dede966a95c3f81d74f71fd20b12
> blob + 79382274aee75c3d51f54d747f6eda4ae84f0d99
> --- sys/dev/pci/if_bnx.c
> +++ sys/dev/pci/if_bnx.c
> @@ -4579,7 +4579,7 @@ bnx_tx_intr(struct bnx_softc *sc)
> if (used == 0)
> ifp->if_timer = 0;
>
> - if (ifq_is_oactive(&ifp->if_snd))
> + if (freed && ifq_is_oactive(&ifp->if_snd))
> ifq_restart(&ifp->if_snd);
> }
>
> blob - bdfeefe85158bea04895190c8e151bd17b9cb6a4
> blob + 9ae28af22a0fdbc6fb47db73f7df8db4d1fa400a
> --- sys/dev/pci/if_iavf.c
> +++ sys/dev/pci/if_iavf.c
> @@ -2178,7 +2178,7 @@ iavf_txeof(struct iavf_softc *sc, struct ifqueue
> *ifq)
>
> //ixl_enable(sc, txr->txr_msix);
>
> - if (ifq_is_oactive(ifq))
> + if (done && ifq_is_oactive(ifq))
> ifq_restart(ifq);
>
> return (done);
> blob - 6f9aaed06ed82859719b420c8198e9f5042fe823
> blob + 69800e703841c3635185849dabe61aa430d5f32a
> --- sys/dev/pci/if_igc.c
> +++ sys/dev/pci/if_igc.c
> @@ -1119,7 +1119,7 @@ igc_txeof(struct igc_txring *txr)
>
> txr->next_to_clean = cons;
>
> - if (ifq_is_oactive(ifq))
> + if (done && ifq_is_oactive(ifq))
> ifq_restart(ifq);
>
> return (done);
> blob - eece3848ec6deeaf57a0b3da4412119979c5a74f
> blob + e35807fa84586257960ff739743e4c1a05ed65de
> --- sys/dev/pci/if_ix.c
> +++ sys/dev/pci/if_ix.c
> @@ -2641,6 +2641,7 @@ ixgbe_txeof(struct ix_txring *txr)
> unsigned int head, tail, last;
> struct ixgbe_tx_buf *tx_buffer;
> struct ixgbe_legacy_tx_desc *tx_desc;
> + int done = 0;
>
> if (!ISSET(ifp->if_flags, IFF_RUNNING))
> return FALSE;
> @@ -2673,6 +2674,7 @@ ixgbe_txeof(struct ix_txring *txr)
> tx_buffer->m_head = NULL;
> tx_buffer->eop_index = -1;
>
> + done = 1;
> tail = last + 1;
> if (tail == sc->num_tx_desc)
> tail = 0;
> @@ -2691,7 +2693,7 @@ ixgbe_txeof(struct ix_txring *txr)
>
> txr->next_to_clean = tail;
>
> - if (ifq_is_oactive(ifq))
> + if (done && ifq_is_oactive(ifq))
> ifq_restart(ifq);
>
> return TRUE;
> blob - c0c958c59581caac0e1fca5261fd1956e152f193
> blob + e338ee24ecda59a9155a02fabaafc7f9f79b2173
> --- sys/dev/pci/if_ixl.c
> +++ sys/dev/pci/if_ixl.c
> @@ -3011,7 +3011,7 @@ ixl_txeof(struct ixl_softc *sc, struct ixl_tx_ring
> *tx
>
> //ixl_enable(sc, txr->txr_msix);
>
> - if (ifq_is_oactive(ifq))
> + if (done && ifq_is_oactive(ifq))
> ifq_restart(ifq);
>
> return (done);
> blob - 7c7a6eb24b69070c80ee4feff9847168188a30fb
> blob + 3e5e939ddba83973bf8df4893206ddd5aad5f1e7
> --- sys/dev/pci/if_ngbe.c
> +++ sys/dev/pci/if_ngbe.c
> @@ -4461,6 +4461,7 @@ ngbe_txeof(struct tx_ring *txr)
> struct ngbe_tx_buf *tx_buffer;
> union ngbe_tx_desc *tx_desc;
> unsigned int prod, cons, last;
> + int done = 0;
>
> if (!ISSET(ifp->if_flags, IFF_RUNNING))
> return;
> @@ -4486,6 +4487,7 @@ ngbe_txeof(struct tx_ring *txr)
> 0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE);
> bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map);
> m_freem(tx_buffer->m_head);
> + done = 1;
>
> tx_buffer->m_head = NULL;
> tx_buffer->eop_index = -1;
> @@ -4505,7 +4507,7 @@ ngbe_txeof(struct tx_ring *txr)
>
> txr->next_to_clean = cons;
>
> - if (ifq_is_oactive(ifq))
> + if (done && ifq_is_oactive(ifq))
> ifq_restart(ifq);
> }
>
> blob - 3d93a269ba97b398505e353329db00db6cf0e937
> blob + 97780dba62a5b32a6f7eacccc003f9e0097ab5da
> --- sys/dev/pci/if_pcn.c
> +++ sys/dev/pci/if_pcn.c
> @@ -1143,6 +1143,7 @@ pcn_txintr(struct pcn_softc *sc)
> struct pcn_txsoft *txs;
> uint32_t tmd1, tmd2, tmd;
> int i, j;
> + int done = 0;
>
> /*
> * Go through our Tx list and free mbufs for those
> @@ -1219,6 +1220,7 @@ pcn_txintr(struct pcn_softc *sc)
> bus_dmamap_unload(sc->sc_dmat, txs->txs_dmamap);
> m_freem(txs->txs_mbuf);
> txs->txs_mbuf = NULL;
> + done = 1;
> }
>
> /* Update the dirty transmit buffer pointer. */
> @@ -1231,7 +1233,7 @@ pcn_txintr(struct pcn_softc *sc)
> if (sc->sc_txsfree == PCN_TXQUEUELEN)
> ifp->if_timer = 0;
>
> - if (ifq_is_oactive(&ifp->if_snd))
> + if (done && ifq_is_oactive(&ifp->if_snd))
> ifq_restart(&ifp->if_snd);
> }
>
>
>
bge/bnx/iavf/igc/ix/ixl/ngbe/pcn: ifq_restart() fix