From: Mark Kettenis Subject: Re: make qwx(4) flush Rx rings when the interface goes down To: Stefan Sperling Cc: tech@openbsd.org Date: Fri, 25 Apr 2025 00:23:25 +0200 > Date: Thu, 24 Apr 2025 20:26:13 +0200 > From: Stefan Sperling > > On Thu, Apr 24, 2025 at 11:38:37AM +0200, Stefan Sperling wrote: > > There is a known issue where qwx(4) triggers a pool corruption check: > > > > panic: pool_cache_item_magic_check: mcl2k cpu free list modified > > > > As an attempt to fix this, the patch below makes the driver discard > > pending packets on all Rx rings. > > > > My theory is that the hardware is doing a DMA write to an mbuf which is > > on one of these rings. While the interface goes down we are freeing mbufs > > but we are not updating the ring state from the device's point of view > > before doing so. > > The panic happens later when qwx or another network driver attempts to > > allocate a new mbuf from the 2K cluster pool. > > > > The issue has been observed during suspend/resume, though if my theory > > is correct it might just as well happen during ifconfig down/up. > > > > Also, port over qwx_dp_mon_link_free() from ath11k. I noticed that this > > was missing while searching the Linux driver code for clues. > > Theo pointed out that disabling interrupts won't necessarily stop DMA. > > So here's a new version which flushes the rings after the STOP command > has been sent to (and acknowledged by) firmware. This will hopefully > have the desired effect of stopping DMA. > > I've also renamed the "reap" function to "flush", to avoid an implication > that rings were being completely destroyed/freed. Sadly this doesn't fix the panic I'm seeing on the x13s. What I'm doing to provoke the panic is: 1. Log in to the machine over ssh 2. Run 'ifconfig qwx0 down' 3. Pick up the laptop 4. Run 'ifconfig qwx0 up' There seems to be something really fishy going on with bringing the interface down when there are packets in flight... > M sys/dev/ic/qwx.c | 112+ 21- > > 1 file changed, 112 insertions(+), 21 deletions(-) > > commit - 31338944480d0345b0ff1635ba18a96228bd37ce > commit + 733c622d5b43cf595e1a2ddfe07326238879463a > blob - fdafd249b8801186ec0c12b2d0acb26131dcd963 > blob + fcb17633d81d7f0734e5f38ac655d446d0659a33 > --- sys/dev/ic/qwx.c > +++ sys/dev/ic/qwx.c > @@ -15454,6 +15454,16 @@ config_refill_ring: > } > > void > +qwx_dp_mon_link_free(struct qwx_softc *sc) > +{ > + struct qwx_pdev_dp *dp = &sc->pdev_dp; > + struct qwx_mon_data *pmon = &dp->mon_data; > + > + qwx_dp_link_desc_cleanup(sc, pmon->link_desc_banks, > + HAL_RXDMA_MONITOR_DESC, &dp->rxdma_mon_desc_ring); > +} > + > +void > qwx_dp_pdev_free(struct qwx_softc *sc) > { > int i; > @@ -15462,6 +15472,8 @@ qwx_dp_pdev_free(struct qwx_softc *sc) > > for (i = 0; i < sc->num_radios; i++) > qwx_dp_rx_pdev_free(sc, i); > + > + qwx_dp_mon_link_free(sc); > } > > int > @@ -16058,7 +16070,7 @@ qwx_dp_process_rx_err_buf(struct qwx_softc *sc, uint32 > } > > int > -qwx_dp_process_rx_err(struct qwx_softc *sc) > +qwx_dp_process_rx_err(struct qwx_softc *sc, int purge) > { > struct ieee80211com *ic = &sc->sc_ic; > struct ifnet *ifp = &ic->ic_if; > @@ -16077,7 +16089,7 @@ qwx_dp_process_rx_err(struct qwx_softc *sc) > uint64_t paddr; > uint32_t *desc; > int is_frag; > - uint8_t drop = 0; > + uint8_t drop = purge ? 1 : 0; > > tot_n_bufs_reaped = 0; > > @@ -16149,14 +16161,17 @@ qwx_dp_process_rx_err(struct qwx_softc *sc) > #ifdef notyet > spin_unlock_bh(&srng->lock); > #endif > - for (i = 0; i < sc->num_radios; i++) { > - if (!n_bufs_reaped[i]) > - continue; > + if (!purge) { > + for (i = 0; i < sc->num_radios; i++) { > + if (!n_bufs_reaped[i]) > + continue; > > - rx_ring = &sc->pdev_dp.rx_refill_buf_ring; > + rx_ring = &sc->pdev_dp.rx_refill_buf_ring; > > - qwx_dp_rxbufs_replenish(sc, i, rx_ring, n_bufs_reaped[i], > - sc->hw_params.hal_params->rx_buf_rbm); > + qwx_dp_rxbufs_replenish(sc, i, rx_ring, > + n_bufs_reaped[i], > + sc->hw_params.hal_params->rx_buf_rbm); > + } > } > > ifp->if_ierrors += tot_n_bufs_reaped; > @@ -16306,7 +16321,7 @@ qwx_dp_rx_wbm_err(struct qwx_softc *sc, struct qwx_rx_ > } > > int > -qwx_dp_rx_process_wbm_err(struct qwx_softc *sc) > +qwx_dp_rx_process_wbm_err(struct qwx_softc *sc, int purge) > { > struct ieee80211com *ic = &sc->sc_ic; > struct ifnet *ifp = &ic->ic_if; > @@ -16382,6 +16397,18 @@ qwx_dp_rx_process_wbm_err(struct qwx_softc *sc) > if (!total_num_buffs_reaped) > goto done; > > + if (purge) { > + for (i = 0; i < sc->num_radios; i++) { > + while ((msdu = TAILQ_FIRST(msdu_list))) { > + TAILQ_REMOVE(msdu_list, msdu, entry); > + m_freem(msdu->m); > + msdu->m = NULL; > + } > + } > + > + goto done; > + } > + > for (i = 0; i < sc->num_radios; i++) { > if (!num_buffs_reaped[i]) > continue; > @@ -16894,7 +16921,7 @@ qwx_dp_rx_process_received_packets(struct qwx_softc *s > } > > int > -qwx_dp_process_rx(struct qwx_softc *sc, int ring_id) > +qwx_dp_process_rx(struct qwx_softc *sc, int ring_id, int purge) > { > struct qwx_dp *dp = &sc->dp; > struct qwx_pdev_dp *pdev_dp = &sc->pdev_dp; > @@ -17006,6 +17033,16 @@ try_again: > if (!num_buffs_reaped[i]) > continue; > > + if (purge) { > + while ((msdu = TAILQ_FIRST(&msdu_list[i]))) { > + TAILQ_REMOVE(msdu_list, msdu, entry); > + m_freem(msdu->m); > + msdu->m = NULL; > + } > + > + continue; > + } > + > qwx_dp_rx_process_received_packets(sc, &msdu_list[i], i); > > rx_ring = &sc->pdev_dp.rx_refill_buf_ring; > @@ -17073,7 +17110,7 @@ fail_free_mbuf: > > int > qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, int mac_id, > - struct mbuf_list *ml) > + struct mbuf_list *ml, int purge) > { > const struct ath11k_hw_hal_params *hal_params; > struct qwx_pdev_dp *dp; > @@ -17154,6 +17191,15 @@ qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, i > pmon->buf_state = DP_MON_STATUS_REPLINISH; > } > move_next: > + if (purge) { > + hal_params = sc->hw_params.hal_params; > + qwx_hal_rx_buf_addr_info_set(rx_mon_status_desc, 0, 0, > + hal_params->rx_buf_rbm); > + qwx_hal_srng_src_get_next_entry(sc, srng); > + num_buffs_reaped++; > + continue; > + } > + > m = qwx_dp_rx_alloc_mon_status_buf(sc, rx_ring, &buf_idx); > if (!m) { > hal_params = sc->hw_params.hal_params; > @@ -17209,7 +17255,7 @@ qwx_dp_rx_process_mon_status(struct qwx_softc *sc, int > #endif > struct hal_rx_mon_ppdu_info *ppdu_info = &pmon->mon_ppdu_info; > > - num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml); > + num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml, 0); > if (!num_buffs_reaped) > goto exit; > > @@ -17310,7 +17356,7 @@ qwx_dp_service_mon_ring(void *arg) > } > > int > -qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id) > +qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id, int purge) > { > struct ieee80211com *ic = &sc->sc_ic; > struct ifnet *ifp = &ic->ic_if; > @@ -17382,7 +17428,7 @@ qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac > #ifdef notyet > spin_unlock_bh(&srng->lock); > #endif > - if (num_buf_freed) > + if (num_buf_freed && !purge) > qwx_dp_rxbufs_replenish(sc, mac_id, rx_ring, num_buf_freed, > sc->hw_params.hal_params->rx_buf_rbm); > > @@ -17605,7 +17651,7 @@ qwx_hal_reo_update_rx_reo_queue_status(struct qwx_soft > } > > int > -qwx_dp_process_reo_status(struct qwx_softc *sc) > +qwx_dp_process_reo_status(struct qwx_softc *sc, int purge) > { > struct qwx_dp *dp = &sc->dp; > struct hal_srng *srng; > @@ -17623,8 +17669,11 @@ qwx_dp_process_reo_status(struct qwx_softc *sc) > qwx_hal_srng_access_begin(sc, srng); > > while ((reo_desc = qwx_hal_srng_dst_get_next_entry(sc, srng))) { > - ret = 1; > + ret++; > > + if (purge) > + continue; > + > tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc); > switch (tag) { > case HAL_REO_GET_QUEUE_STATS_STATUS: > @@ -17705,16 +17754,16 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id) > } > > if (sc->hw_params.ring_mask->rx_err[grp_id] && > - qwx_dp_process_rx_err(sc)) > + qwx_dp_process_rx_err(sc, 0)) > ret = 1; > > if (sc->hw_params.ring_mask->rx_wbm_rel[grp_id] && > - qwx_dp_rx_process_wbm_err(sc)) > + qwx_dp_rx_process_wbm_err(sc, 0)) > ret = 1; > > if (sc->hw_params.ring_mask->rx[grp_id]) { > i = fls(sc->hw_params.ring_mask->rx[grp_id]) - 1; > - if (qwx_dp_process_rx(sc, i)) > + if (qwx_dp_process_rx(sc, i, 0)) > ret = 1; > } > > @@ -17732,7 +17781,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id) > } > > if (sc->hw_params.ring_mask->reo_status[grp_id] && > - qwx_dp_process_reo_status(sc)) > + qwx_dp_process_reo_status(sc, 0)) > ret = 1; > > for (i = 0; i < sc->num_radios; i++) { > @@ -17741,7 +17790,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id) > > if (sc->hw_params.ring_mask->rxdma2host[grp_id] & > (1 << (id))) { > - if (qwx_dp_process_rxdma_err(sc, id)) > + if (qwx_dp_process_rxdma_err(sc, id, 0)) > ret = 1; > } > > @@ -19860,11 +19909,53 @@ err_wmi_detach: > } > > void > +qwx_flush_rx_rings(struct qwx_softc *sc) > +{ > + struct mbuf_list ml = MBUF_LIST_INITIALIZER(); > + int i, j, n; > + > + do { > + n = qwx_dp_process_rx_err(sc, 1); > + } while (n > 0); > + > + do { > + n = qwx_dp_rx_process_wbm_err(sc, 1); > + } while (n > 0); > + > + do { > + n = qwx_dp_process_reo_status(sc, 1); > + } while (n > 0); > + > + for (i = 0; i < DP_REO_DST_RING_MAX; i++) { > + do { > + n = qwx_dp_process_rx(sc, i, 1); > + } while (n > 0); > + } > + > + for (i = 0; i < sc->num_radios; i++) { > + for (j = 0; j < sc->hw_params.num_rxmda_per_pdev; j++) { > + int mac_id = i * sc->hw_params.num_rxmda_per_pdev + j; > + > + do { > + n = qwx_dp_process_rxdma_err(sc, mac_id, 1); > + } while (n > 0); > + do { > + n = qwx_dp_rx_reap_mon_status_ring(sc, > + mac_id, &ml, 1); > + ml_purge(&ml); > + } while (n > 0); > + } > + } > +} > + > +void > qwx_core_stop(struct qwx_softc *sc) > { > if (!test_bit(ATH11K_FLAG_CRASH_FLUSH, sc->sc_flags)) > qwx_qmi_firmware_stop(sc); > > + qwx_flush_rx_rings(sc); > + > sc->ops.stop(sc); > qwx_wmi_detach(sc); > qwx_dp_pdev_reo_cleanup(sc); > >