From: Rafael Sadowski Subject: Re: make qwx(4) flush Rx rings when the interface goes down To: tech@openbsd.org Date: Thu, 24 Apr 2025 20:53:07 +0200 On Thu Apr 24, 2025 at 08:26:13PM +0200, Stefan Sperling wrote: > On Thu, Apr 24, 2025 at 11:38:37AM +0200, Stefan Sperling wrote: > > There is a known issue where qwx(4) triggers a pool corruption check: > > > > panic: pool_cache_item_magic_check: mcl2k cpu free list modified > > > > As an attempt to fix this, the patch below makes the driver discard > > pending packets on all Rx rings. > > > > My theory is that the hardware is doing a DMA write to an mbuf which is > > on one of these rings. While the interface goes down we are freeing mbufs > > but we are not updating the ring state from the device's point of view > > before doing so. > > The panic happens later when qwx or another network driver attempts to > > allocate a new mbuf from the 2K cluster pool. > > > > The issue has been observed during suspend/resume, though if my theory > > is correct it might just as well happen during ifconfig down/up. > > > > Also, port over qwx_dp_mon_link_free() from ath11k. I noticed that this > > was missing while searching the Linux driver code for clues. > > Theo pointed out that disabling interrupts won't necessarily stop DMA. > > So here's a new version which flushes the rings after the STOP command > has been sent to (and acknowledged by) firmware. This will hopefully > have the desired effect of stopping DMA. > > I've also renamed the "reap" function to "flush", to avoid an implication > that rings were being completely destroyed/freed. Thanks Stefan. I tested your previous and the diff below with qwx0 at pci2 dev 0 function 0 "Qualcomm QCNFA765" rev 0x01: msi No issues so far. > > M sys/dev/ic/qwx.c | 112+ 21- > > 1 file changed, 112 insertions(+), 21 deletions(-) > > commit - 31338944480d0345b0ff1635ba18a96228bd37ce > commit + 733c622d5b43cf595e1a2ddfe07326238879463a > blob - fdafd249b8801186ec0c12b2d0acb26131dcd963 > blob + fcb17633d81d7f0734e5f38ac655d446d0659a33 > --- sys/dev/ic/qwx.c > +++ sys/dev/ic/qwx.c > @@ -15454,6 +15454,16 @@ config_refill_ring: > } > > void > +qwx_dp_mon_link_free(struct qwx_softc *sc) > +{ > + struct qwx_pdev_dp *dp = &sc->pdev_dp; > + struct qwx_mon_data *pmon = &dp->mon_data; > + > + qwx_dp_link_desc_cleanup(sc, pmon->link_desc_banks, > + HAL_RXDMA_MONITOR_DESC, &dp->rxdma_mon_desc_ring); > +} > + > +void > qwx_dp_pdev_free(struct qwx_softc *sc) > { > int i; > @@ -15462,6 +15472,8 @@ qwx_dp_pdev_free(struct qwx_softc *sc) > > for (i = 0; i < sc->num_radios; i++) > qwx_dp_rx_pdev_free(sc, i); > + > + qwx_dp_mon_link_free(sc); > } > > int > @@ -16058,7 +16070,7 @@ qwx_dp_process_rx_err_buf(struct qwx_softc *sc, uint32 > } > > int > -qwx_dp_process_rx_err(struct qwx_softc *sc) > +qwx_dp_process_rx_err(struct qwx_softc *sc, int purge) > { > struct ieee80211com *ic = &sc->sc_ic; > struct ifnet *ifp = &ic->ic_if; > @@ -16077,7 +16089,7 @@ qwx_dp_process_rx_err(struct qwx_softc *sc) > uint64_t paddr; > uint32_t *desc; > int is_frag; > - uint8_t drop = 0; > + uint8_t drop = purge ? 1 : 0; > > tot_n_bufs_reaped = 0; > > @@ -16149,14 +16161,17 @@ qwx_dp_process_rx_err(struct qwx_softc *sc) > #ifdef notyet > spin_unlock_bh(&srng->lock); > #endif > - for (i = 0; i < sc->num_radios; i++) { > - if (!n_bufs_reaped[i]) > - continue; > + if (!purge) { > + for (i = 0; i < sc->num_radios; i++) { > + if (!n_bufs_reaped[i]) > + continue; > > - rx_ring = &sc->pdev_dp.rx_refill_buf_ring; > + rx_ring = &sc->pdev_dp.rx_refill_buf_ring; > > - qwx_dp_rxbufs_replenish(sc, i, rx_ring, n_bufs_reaped[i], > - sc->hw_params.hal_params->rx_buf_rbm); > + qwx_dp_rxbufs_replenish(sc, i, rx_ring, > + n_bufs_reaped[i], > + sc->hw_params.hal_params->rx_buf_rbm); > + } > } > > ifp->if_ierrors += tot_n_bufs_reaped; > @@ -16306,7 +16321,7 @@ qwx_dp_rx_wbm_err(struct qwx_softc *sc, struct qwx_rx_ > } > > int > -qwx_dp_rx_process_wbm_err(struct qwx_softc *sc) > +qwx_dp_rx_process_wbm_err(struct qwx_softc *sc, int purge) > { > struct ieee80211com *ic = &sc->sc_ic; > struct ifnet *ifp = &ic->ic_if; > @@ -16382,6 +16397,18 @@ qwx_dp_rx_process_wbm_err(struct qwx_softc *sc) > if (!total_num_buffs_reaped) > goto done; > > + if (purge) { > + for (i = 0; i < sc->num_radios; i++) { > + while ((msdu = TAILQ_FIRST(msdu_list))) { > + TAILQ_REMOVE(msdu_list, msdu, entry); > + m_freem(msdu->m); > + msdu->m = NULL; > + } > + } > + > + goto done; > + } > + > for (i = 0; i < sc->num_radios; i++) { > if (!num_buffs_reaped[i]) > continue; > @@ -16894,7 +16921,7 @@ qwx_dp_rx_process_received_packets(struct qwx_softc *s > } > > int > -qwx_dp_process_rx(struct qwx_softc *sc, int ring_id) > +qwx_dp_process_rx(struct qwx_softc *sc, int ring_id, int purge) > { > struct qwx_dp *dp = &sc->dp; > struct qwx_pdev_dp *pdev_dp = &sc->pdev_dp; > @@ -17006,6 +17033,16 @@ try_again: > if (!num_buffs_reaped[i]) > continue; > > + if (purge) { > + while ((msdu = TAILQ_FIRST(&msdu_list[i]))) { > + TAILQ_REMOVE(msdu_list, msdu, entry); > + m_freem(msdu->m); > + msdu->m = NULL; > + } > + > + continue; > + } > + > qwx_dp_rx_process_received_packets(sc, &msdu_list[i], i); > > rx_ring = &sc->pdev_dp.rx_refill_buf_ring; > @@ -17073,7 +17110,7 @@ fail_free_mbuf: > > int > qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, int mac_id, > - struct mbuf_list *ml) > + struct mbuf_list *ml, int purge) > { > const struct ath11k_hw_hal_params *hal_params; > struct qwx_pdev_dp *dp; > @@ -17154,6 +17191,15 @@ qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, i > pmon->buf_state = DP_MON_STATUS_REPLINISH; > } > move_next: > + if (purge) { > + hal_params = sc->hw_params.hal_params; > + qwx_hal_rx_buf_addr_info_set(rx_mon_status_desc, 0, 0, > + hal_params->rx_buf_rbm); > + qwx_hal_srng_src_get_next_entry(sc, srng); > + num_buffs_reaped++; > + continue; > + } > + > m = qwx_dp_rx_alloc_mon_status_buf(sc, rx_ring, &buf_idx); > if (!m) { > hal_params = sc->hw_params.hal_params; > @@ -17209,7 +17255,7 @@ qwx_dp_rx_process_mon_status(struct qwx_softc *sc, int > #endif > struct hal_rx_mon_ppdu_info *ppdu_info = &pmon->mon_ppdu_info; > > - num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml); > + num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml, 0); > if (!num_buffs_reaped) > goto exit; > > @@ -17310,7 +17356,7 @@ qwx_dp_service_mon_ring(void *arg) > } > > int > -qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id) > +qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id, int purge) > { > struct ieee80211com *ic = &sc->sc_ic; > struct ifnet *ifp = &ic->ic_if; > @@ -17382,7 +17428,7 @@ qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac > #ifdef notyet > spin_unlock_bh(&srng->lock); > #endif > - if (num_buf_freed) > + if (num_buf_freed && !purge) > qwx_dp_rxbufs_replenish(sc, mac_id, rx_ring, num_buf_freed, > sc->hw_params.hal_params->rx_buf_rbm); > > @@ -17605,7 +17651,7 @@ qwx_hal_reo_update_rx_reo_queue_status(struct qwx_soft > } > > int > -qwx_dp_process_reo_status(struct qwx_softc *sc) > +qwx_dp_process_reo_status(struct qwx_softc *sc, int purge) > { > struct qwx_dp *dp = &sc->dp; > struct hal_srng *srng; > @@ -17623,8 +17669,11 @@ qwx_dp_process_reo_status(struct qwx_softc *sc) > qwx_hal_srng_access_begin(sc, srng); > > while ((reo_desc = qwx_hal_srng_dst_get_next_entry(sc, srng))) { > - ret = 1; > + ret++; > > + if (purge) > + continue; > + > tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc); > switch (tag) { > case HAL_REO_GET_QUEUE_STATS_STATUS: > @@ -17705,16 +17754,16 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id) > } > > if (sc->hw_params.ring_mask->rx_err[grp_id] && > - qwx_dp_process_rx_err(sc)) > + qwx_dp_process_rx_err(sc, 0)) > ret = 1; > > if (sc->hw_params.ring_mask->rx_wbm_rel[grp_id] && > - qwx_dp_rx_process_wbm_err(sc)) > + qwx_dp_rx_process_wbm_err(sc, 0)) > ret = 1; > > if (sc->hw_params.ring_mask->rx[grp_id]) { > i = fls(sc->hw_params.ring_mask->rx[grp_id]) - 1; > - if (qwx_dp_process_rx(sc, i)) > + if (qwx_dp_process_rx(sc, i, 0)) > ret = 1; > } > > @@ -17732,7 +17781,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id) > } > > if (sc->hw_params.ring_mask->reo_status[grp_id] && > - qwx_dp_process_reo_status(sc)) > + qwx_dp_process_reo_status(sc, 0)) > ret = 1; > > for (i = 0; i < sc->num_radios; i++) { > @@ -17741,7 +17790,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id) > > if (sc->hw_params.ring_mask->rxdma2host[grp_id] & > (1 << (id))) { > - if (qwx_dp_process_rxdma_err(sc, id)) > + if (qwx_dp_process_rxdma_err(sc, id, 0)) > ret = 1; > } > > @@ -19860,11 +19909,53 @@ err_wmi_detach: > } > > void > +qwx_flush_rx_rings(struct qwx_softc *sc) > +{ > + struct mbuf_list ml = MBUF_LIST_INITIALIZER(); > + int i, j, n; > + > + do { > + n = qwx_dp_process_rx_err(sc, 1); > + } while (n > 0); > + > + do { > + n = qwx_dp_rx_process_wbm_err(sc, 1); > + } while (n > 0); > + > + do { > + n = qwx_dp_process_reo_status(sc, 1); > + } while (n > 0); > + > + for (i = 0; i < DP_REO_DST_RING_MAX; i++) { > + do { > + n = qwx_dp_process_rx(sc, i, 1); > + } while (n > 0); > + } > + > + for (i = 0; i < sc->num_radios; i++) { > + for (j = 0; j < sc->hw_params.num_rxmda_per_pdev; j++) { > + int mac_id = i * sc->hw_params.num_rxmda_per_pdev + j; > + > + do { > + n = qwx_dp_process_rxdma_err(sc, mac_id, 1); > + } while (n > 0); > + do { > + n = qwx_dp_rx_reap_mon_status_ring(sc, > + mac_id, &ml, 1); > + ml_purge(&ml); > + } while (n > 0); > + } > + } > +} > + > +void > qwx_core_stop(struct qwx_softc *sc) > { > if (!test_bit(ATH11K_FLAG_CRASH_FLUSH, sc->sc_flags)) > qwx_qmi_firmware_stop(sc); > > + qwx_flush_rx_rings(sc); > + > sc->ops.stop(sc); > qwx_wmi_detach(sc); > qwx_dp_pdev_reo_cleanup(sc); >