Index | Thread | Search

From:
Rafael Sadowski <rafael@sizeofvoid.org>
Subject:
Re: make qwx(4) flush Rx rings when the interface goes down
To:
tech@openbsd.org
Date:
Thu, 24 Apr 2025 20:53:07 +0200

Download raw body.

Thread
On Thu Apr 24, 2025 at 08:26:13PM +0200, Stefan Sperling wrote:
> On Thu, Apr 24, 2025 at 11:38:37AM +0200, Stefan Sperling wrote:
> > There is a known issue where qwx(4) triggers a pool corruption check:
> > 
> >   panic: pool_cache_item_magic_check: mcl2k cpu free list modified
> > 
> > As an attempt to fix this, the patch below makes the driver discard
> > pending packets on all Rx rings.
> > 
> > My theory is that the hardware is doing a DMA write to an mbuf which is
> > on one of these rings. While the interface goes down we are freeing mbufs
> > but we are not updating the ring state from the device's point of view
> > before doing so.
> > The panic happens later when qwx or another network driver attempts to
> > allocate a new mbuf from the 2K cluster pool.
> > 
> > The issue has been observed during suspend/resume, though if my theory
> > is correct it might just as well happen during ifconfig down/up.
> > 
> > Also, port over qwx_dp_mon_link_free() from ath11k. I noticed that this
> > was missing while searching the Linux driver code for clues.
> 
> Theo pointed out that disabling interrupts won't necessarily stop DMA.
> 
> So here's a new version which flushes the rings after the STOP command
> has been sent to (and acknowledged by) firmware. This will hopefully
> have the desired effect of stopping DMA.
> 
> I've also renamed the "reap" function to "flush", to avoid an implication
> that rings were being completely destroyed/freed.

Thanks Stefan. I tested your previous and the diff below with

qwx0 at pci2 dev 0 function 0 "Qualcomm QCNFA765" rev 0x01: msi

No issues so far.

> 
> M  sys/dev/ic/qwx.c  |  112+  21-
> 
> 1 file changed, 112 insertions(+), 21 deletions(-)
> 
> commit - 31338944480d0345b0ff1635ba18a96228bd37ce
> commit + 733c622d5b43cf595e1a2ddfe07326238879463a
> blob - fdafd249b8801186ec0c12b2d0acb26131dcd963
> blob + fcb17633d81d7f0734e5f38ac655d446d0659a33
> --- sys/dev/ic/qwx.c
> +++ sys/dev/ic/qwx.c
> @@ -15454,6 +15454,16 @@ config_refill_ring:
>  }
>  
>  void
> +qwx_dp_mon_link_free(struct qwx_softc *sc)
> +{
> +	struct qwx_pdev_dp *dp = &sc->pdev_dp;
> +	struct qwx_mon_data *pmon = &dp->mon_data;
> +
> +	qwx_dp_link_desc_cleanup(sc, pmon->link_desc_banks,
> +	    HAL_RXDMA_MONITOR_DESC, &dp->rxdma_mon_desc_ring);
> +}
> +
> +void
>  qwx_dp_pdev_free(struct qwx_softc *sc)
>  {
>  	int i;
> @@ -15462,6 +15472,8 @@ qwx_dp_pdev_free(struct qwx_softc *sc)
>  
>  	for (i = 0; i < sc->num_radios; i++)
>  		qwx_dp_rx_pdev_free(sc, i);
> +	
> +	qwx_dp_mon_link_free(sc);
>  }
>  
>  int
> @@ -16058,7 +16070,7 @@ qwx_dp_process_rx_err_buf(struct qwx_softc *sc, uint32
>  }
>  
>  int
> -qwx_dp_process_rx_err(struct qwx_softc *sc)
> +qwx_dp_process_rx_err(struct qwx_softc *sc, int purge)
>  {
>  	struct ieee80211com *ic = &sc->sc_ic;
>  	struct ifnet *ifp = &ic->ic_if;
> @@ -16077,7 +16089,7 @@ qwx_dp_process_rx_err(struct qwx_softc *sc)
>  	uint64_t paddr;
>  	uint32_t *desc;
>  	int is_frag;
> -	uint8_t drop = 0;
> +	uint8_t drop = purge ? 1 : 0;
>  
>  	tot_n_bufs_reaped = 0;
>  
> @@ -16149,14 +16161,17 @@ qwx_dp_process_rx_err(struct qwx_softc *sc)
>  #ifdef notyet
>  	spin_unlock_bh(&srng->lock);
>  #endif
> -	for (i = 0; i < sc->num_radios; i++) {
> -		if (!n_bufs_reaped[i])
> -			continue;
> +	if (!purge) {
> +		for (i = 0; i < sc->num_radios; i++) {
> +			if (!n_bufs_reaped[i])
> +				continue;
>  
> -		rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
> +			rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
>  
> -		qwx_dp_rxbufs_replenish(sc, i, rx_ring, n_bufs_reaped[i],
> -		    sc->hw_params.hal_params->rx_buf_rbm);
> +			qwx_dp_rxbufs_replenish(sc, i, rx_ring,
> +			    n_bufs_reaped[i],
> +			    sc->hw_params.hal_params->rx_buf_rbm);
> +		}
>  	}
>  
>  	ifp->if_ierrors += tot_n_bufs_reaped;
> @@ -16306,7 +16321,7 @@ qwx_dp_rx_wbm_err(struct qwx_softc *sc, struct qwx_rx_
>  }
>  
>  int
> -qwx_dp_rx_process_wbm_err(struct qwx_softc *sc)
> +qwx_dp_rx_process_wbm_err(struct qwx_softc *sc, int purge)
>  {
>  	struct ieee80211com *ic = &sc->sc_ic;
>  	struct ifnet *ifp = &ic->ic_if;
> @@ -16382,6 +16397,18 @@ qwx_dp_rx_process_wbm_err(struct qwx_softc *sc)
>  	if (!total_num_buffs_reaped)
>  		goto done;
>  
> +	if (purge) {
> +		for (i = 0; i < sc->num_radios; i++) {
> +			while ((msdu = TAILQ_FIRST(msdu_list))) {
> +				TAILQ_REMOVE(msdu_list, msdu, entry);
> +				m_freem(msdu->m);
> +				msdu->m = NULL;
> +			}
> +		}
> +
> +		goto done;
> +	}
> +
>  	for (i = 0; i < sc->num_radios; i++) {
>  		if (!num_buffs_reaped[i])
>  			continue;
> @@ -16894,7 +16921,7 @@ qwx_dp_rx_process_received_packets(struct qwx_softc *s
>  }
>  
>  int
> -qwx_dp_process_rx(struct qwx_softc *sc, int ring_id)
> +qwx_dp_process_rx(struct qwx_softc *sc, int ring_id, int purge)
>  {
>  	struct qwx_dp *dp = &sc->dp;
>  	struct qwx_pdev_dp *pdev_dp = &sc->pdev_dp;
> @@ -17006,6 +17033,16 @@ try_again:
>  		if (!num_buffs_reaped[i])
>  			continue;
>  
> +		if (purge) {
> +			while ((msdu = TAILQ_FIRST(&msdu_list[i]))) {
> +				TAILQ_REMOVE(msdu_list, msdu, entry);
> +				m_freem(msdu->m);
> +				msdu->m = NULL;
> +			}
> +
> +			continue;
> +		}
> +
>  		qwx_dp_rx_process_received_packets(sc, &msdu_list[i], i);
>  
>  		rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
> @@ -17073,7 +17110,7 @@ fail_free_mbuf:
>  
>  int
>  qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, int mac_id,
> -    struct mbuf_list *ml)
> +    struct mbuf_list *ml, int purge)
>  {
>  	const struct ath11k_hw_hal_params *hal_params;
>  	struct qwx_pdev_dp *dp;
> @@ -17154,6 +17191,15 @@ qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, i
>  			pmon->buf_state = DP_MON_STATUS_REPLINISH;
>  		}
>  move_next:
> +		if (purge) {
> +			hal_params = sc->hw_params.hal_params;
> +			qwx_hal_rx_buf_addr_info_set(rx_mon_status_desc, 0, 0,
> +			    hal_params->rx_buf_rbm);
> +			qwx_hal_srng_src_get_next_entry(sc, srng);
> +			num_buffs_reaped++;
> +			continue;
> +		}
> +
>  		m = qwx_dp_rx_alloc_mon_status_buf(sc, rx_ring, &buf_idx);
>  		if (!m) {
>  			hal_params = sc->hw_params.hal_params;
> @@ -17209,7 +17255,7 @@ qwx_dp_rx_process_mon_status(struct qwx_softc *sc, int
>  #endif
>  	struct hal_rx_mon_ppdu_info *ppdu_info = &pmon->mon_ppdu_info;
>  
> -	num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml);
> +	num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml, 0);
>  	if (!num_buffs_reaped)
>  		goto exit;
>  
> @@ -17310,7 +17356,7 @@ qwx_dp_service_mon_ring(void *arg)
>  }
>  
>  int
> -qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id)
> +qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id, int purge)
>  {
>  	struct ieee80211com *ic = &sc->sc_ic;
>  	struct ifnet *ifp = &ic->ic_if;
> @@ -17382,7 +17428,7 @@ qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac
>  #ifdef notyet
>  	spin_unlock_bh(&srng->lock);
>  #endif
> -	if (num_buf_freed)
> +	if (num_buf_freed && !purge)
>  		qwx_dp_rxbufs_replenish(sc, mac_id, rx_ring, num_buf_freed,
>  		    sc->hw_params.hal_params->rx_buf_rbm);
>  
> @@ -17605,7 +17651,7 @@ qwx_hal_reo_update_rx_reo_queue_status(struct qwx_soft
>  }
>  
>  int
> -qwx_dp_process_reo_status(struct qwx_softc *sc)
> +qwx_dp_process_reo_status(struct qwx_softc *sc, int purge)
>  {
>  	struct qwx_dp *dp = &sc->dp;
>  	struct hal_srng *srng;
> @@ -17623,8 +17669,11 @@ qwx_dp_process_reo_status(struct qwx_softc *sc)
>  	qwx_hal_srng_access_begin(sc, srng);
>  
>  	while ((reo_desc = qwx_hal_srng_dst_get_next_entry(sc, srng))) {
> -		ret = 1;
> +		ret++;
>  
> +		if (purge)
> +			continue;
> +
>  		tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc);
>  		switch (tag) {
>  		case HAL_REO_GET_QUEUE_STATS_STATUS:
> @@ -17705,16 +17754,16 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
>  	}
>  
>  	if (sc->hw_params.ring_mask->rx_err[grp_id] &&
> -	    qwx_dp_process_rx_err(sc))
> +	    qwx_dp_process_rx_err(sc, 0))
>  		ret = 1;
>  
>  	if (sc->hw_params.ring_mask->rx_wbm_rel[grp_id] &&
> -	    qwx_dp_rx_process_wbm_err(sc))
> +	    qwx_dp_rx_process_wbm_err(sc, 0))
>  		ret = 1;
>  
>  	if (sc->hw_params.ring_mask->rx[grp_id]) {
>  		i = fls(sc->hw_params.ring_mask->rx[grp_id]) - 1;
> -		if (qwx_dp_process_rx(sc, i))
> +		if (qwx_dp_process_rx(sc, i, 0))
>  			ret = 1;
>  	}
>  
> @@ -17732,7 +17781,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
>  	}
>  
>  	if (sc->hw_params.ring_mask->reo_status[grp_id] &&
> -	    qwx_dp_process_reo_status(sc))
> +	    qwx_dp_process_reo_status(sc, 0))
>  		ret = 1;
>  
>  	for (i = 0; i < sc->num_radios; i++) {
> @@ -17741,7 +17790,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
>  
>  			if (sc->hw_params.ring_mask->rxdma2host[grp_id] &
>  			   (1 << (id))) {
> -				if (qwx_dp_process_rxdma_err(sc, id))
> +				if (qwx_dp_process_rxdma_err(sc, id, 0))
>  					ret = 1;
>  			}
>  
> @@ -19860,11 +19909,53 @@ err_wmi_detach:
>  }
>  
>  void
> +qwx_flush_rx_rings(struct qwx_softc *sc)
> +{
> +	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> +	int i, j, n;
> +
> +	do {
> +		n = qwx_dp_process_rx_err(sc, 1);
> +	} while (n > 0);
> +
> +	do {
> +		n = qwx_dp_rx_process_wbm_err(sc, 1);
> +	} while (n > 0);
> +
> +	do {
> +		n = qwx_dp_process_reo_status(sc, 1);
> +	} while (n > 0);
> +
> +	for (i = 0; i < DP_REO_DST_RING_MAX; i++) {
> +		do {
> +			n = qwx_dp_process_rx(sc, i, 1);
> +		} while (n > 0);
> +	}
> +
> +	for (i = 0; i < sc->num_radios; i++) {
> +		for (j = 0; j < sc->hw_params.num_rxmda_per_pdev; j++) {
> +			int mac_id = i * sc->hw_params.num_rxmda_per_pdev + j;
> +
> +			do {
> +				n = qwx_dp_process_rxdma_err(sc, mac_id, 1);
> +			} while (n > 0);
> +			do {
> +				n = qwx_dp_rx_reap_mon_status_ring(sc,
> +				    mac_id, &ml, 1);
> +				ml_purge(&ml);
> +			} while (n > 0);
> +		}
> +	}
> +}
> +
> +void
>  qwx_core_stop(struct qwx_softc *sc)
>  {
>  	if (!test_bit(ATH11K_FLAG_CRASH_FLUSH, sc->sc_flags))
>  		qwx_qmi_firmware_stop(sc);
>  	
> +	qwx_flush_rx_rings(sc);
> +
>  	sc->ops.stop(sc);
>  	qwx_wmi_detach(sc);
>  	qwx_dp_pdev_reo_cleanup(sc);
>