Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: make qwx(4) flush Rx rings when the interface goes down
To:
tech@openbsd.org
Date:
Thu, 24 Apr 2025 20:26:13 +0200

Download raw body.

Thread
On Thu, Apr 24, 2025 at 11:38:37AM +0200, Stefan Sperling wrote:
> There is a known issue where qwx(4) triggers a pool corruption check:
> 
>   panic: pool_cache_item_magic_check: mcl2k cpu free list modified
> 
> As an attempt to fix this, the patch below makes the driver discard
> pending packets on all Rx rings.
> 
> My theory is that the hardware is doing a DMA write to an mbuf which is
> on one of these rings. While the interface goes down we are freeing mbufs
> but we are not updating the ring state from the device's point of view
> before doing so.
> The panic happens later when qwx or another network driver attempts to
> allocate a new mbuf from the 2K cluster pool.
> 
> The issue has been observed during suspend/resume, though if my theory
> is correct it might just as well happen during ifconfig down/up.
> 
> Also, port over qwx_dp_mon_link_free() from ath11k. I noticed that this
> was missing while searching the Linux driver code for clues.

Theo pointed out that disabling interrupts won't necessarily stop DMA.

So here's a new version which flushes the rings after the STOP command
has been sent to (and acknowledged by) firmware. This will hopefully
have the desired effect of stopping DMA.

I've also renamed the "reap" function to "flush", to avoid implying
that the rings are being completely destroyed/freed.

M  sys/dev/ic/qwx.c  |  112+  21-

1 file changed, 112 insertions(+), 21 deletions(-)

commit - 31338944480d0345b0ff1635ba18a96228bd37ce
commit + 733c622d5b43cf595e1a2ddfe07326238879463a
blob - fdafd249b8801186ec0c12b2d0acb26131dcd963
blob + fcb17633d81d7f0734e5f38ac655d446d0659a33
--- sys/dev/ic/qwx.c
+++ sys/dev/ic/qwx.c
@@ -15454,6 +15454,16 @@ config_refill_ring:
 }
 
 void
+qwx_dp_mon_link_free(struct qwx_softc *sc)
+{
+	struct qwx_pdev_dp *dp = &sc->pdev_dp;
+	struct qwx_mon_data *pmon = &dp->mon_data;
+
+	qwx_dp_link_desc_cleanup(sc, pmon->link_desc_banks,
+	    HAL_RXDMA_MONITOR_DESC, &dp->rxdma_mon_desc_ring);
+}
+
+void
 qwx_dp_pdev_free(struct qwx_softc *sc)
 {
 	int i;
@@ -15462,6 +15472,8 @@ qwx_dp_pdev_free(struct qwx_softc *sc)
 
 	for (i = 0; i < sc->num_radios; i++)
 		qwx_dp_rx_pdev_free(sc, i);
+	
+	qwx_dp_mon_link_free(sc);
 }
 
 int
@@ -16058,7 +16070,7 @@ qwx_dp_process_rx_err_buf(struct qwx_softc *sc, uint32
 }
 
 int
-qwx_dp_process_rx_err(struct qwx_softc *sc)
+qwx_dp_process_rx_err(struct qwx_softc *sc, int purge)
 {
 	struct ieee80211com *ic = &sc->sc_ic;
 	struct ifnet *ifp = &ic->ic_if;
@@ -16077,7 +16089,7 @@ qwx_dp_process_rx_err(struct qwx_softc *sc)
 	uint64_t paddr;
 	uint32_t *desc;
 	int is_frag;
-	uint8_t drop = 0;
+	uint8_t drop = purge ? 1 : 0;
 
 	tot_n_bufs_reaped = 0;
 
@@ -16149,14 +16161,17 @@ qwx_dp_process_rx_err(struct qwx_softc *sc)
 #ifdef notyet
 	spin_unlock_bh(&srng->lock);
 #endif
-	for (i = 0; i < sc->num_radios; i++) {
-		if (!n_bufs_reaped[i])
-			continue;
+	if (!purge) {
+		for (i = 0; i < sc->num_radios; i++) {
+			if (!n_bufs_reaped[i])
+				continue;
 
-		rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
+			rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
 
-		qwx_dp_rxbufs_replenish(sc, i, rx_ring, n_bufs_reaped[i],
-		    sc->hw_params.hal_params->rx_buf_rbm);
+			qwx_dp_rxbufs_replenish(sc, i, rx_ring,
+			    n_bufs_reaped[i],
+			    sc->hw_params.hal_params->rx_buf_rbm);
+		}
 	}
 
 	ifp->if_ierrors += tot_n_bufs_reaped;
@@ -16306,7 +16321,7 @@ qwx_dp_rx_wbm_err(struct qwx_softc *sc, struct qwx_rx_
 }
 
 int
-qwx_dp_rx_process_wbm_err(struct qwx_softc *sc)
+qwx_dp_rx_process_wbm_err(struct qwx_softc *sc, int purge)
 {
 	struct ieee80211com *ic = &sc->sc_ic;
 	struct ifnet *ifp = &ic->ic_if;
@@ -16382,6 +16397,18 @@ qwx_dp_rx_process_wbm_err(struct qwx_softc *sc)
 	if (!total_num_buffs_reaped)
 		goto done;
 
+	if (purge) {
+		for (i = 0; i < sc->num_radios; i++) {
+			while ((msdu = TAILQ_FIRST(msdu_list))) {
+				TAILQ_REMOVE(msdu_list, msdu, entry);
+				m_freem(msdu->m);
+				msdu->m = NULL;
+			}
+		}
+
+		goto done;
+	}
+
 	for (i = 0; i < sc->num_radios; i++) {
 		if (!num_buffs_reaped[i])
 			continue;
@@ -16894,7 +16921,7 @@ qwx_dp_rx_process_received_packets(struct qwx_softc *s
 }
 
 int
-qwx_dp_process_rx(struct qwx_softc *sc, int ring_id)
+qwx_dp_process_rx(struct qwx_softc *sc, int ring_id, int purge)
 {
 	struct qwx_dp *dp = &sc->dp;
 	struct qwx_pdev_dp *pdev_dp = &sc->pdev_dp;
@@ -17006,6 +17033,16 @@ try_again:
 		if (!num_buffs_reaped[i])
 			continue;
 
+		if (purge) {
+			while ((msdu = TAILQ_FIRST(&msdu_list[i]))) {
+				TAILQ_REMOVE(msdu_list, msdu, entry);
+				m_freem(msdu->m);
+				msdu->m = NULL;
+			}
+
+			continue;
+		}
+
 		qwx_dp_rx_process_received_packets(sc, &msdu_list[i], i);
 
 		rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
@@ -17073,7 +17110,7 @@ fail_free_mbuf:
 
 int
 qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, int mac_id,
-    struct mbuf_list *ml)
+    struct mbuf_list *ml, int purge)
 {
 	const struct ath11k_hw_hal_params *hal_params;
 	struct qwx_pdev_dp *dp;
@@ -17154,6 +17191,15 @@ qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, i
 			pmon->buf_state = DP_MON_STATUS_REPLINISH;
 		}
 move_next:
+		if (purge) {
+			hal_params = sc->hw_params.hal_params;
+			qwx_hal_rx_buf_addr_info_set(rx_mon_status_desc, 0, 0,
+			    hal_params->rx_buf_rbm);
+			qwx_hal_srng_src_get_next_entry(sc, srng);
+			num_buffs_reaped++;
+			continue;
+		}
+
 		m = qwx_dp_rx_alloc_mon_status_buf(sc, rx_ring, &buf_idx);
 		if (!m) {
 			hal_params = sc->hw_params.hal_params;
@@ -17209,7 +17255,7 @@ qwx_dp_rx_process_mon_status(struct qwx_softc *sc, int
 #endif
 	struct hal_rx_mon_ppdu_info *ppdu_info = &pmon->mon_ppdu_info;
 
-	num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml);
+	num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml, 0);
 	if (!num_buffs_reaped)
 		goto exit;
 
@@ -17310,7 +17356,7 @@ qwx_dp_service_mon_ring(void *arg)
 }
 
 int
-qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id)
+qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id, int purge)
 {
 	struct ieee80211com *ic = &sc->sc_ic;
 	struct ifnet *ifp = &ic->ic_if;
@@ -17382,7 +17428,7 @@ qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac
 #ifdef notyet
 	spin_unlock_bh(&srng->lock);
 #endif
-	if (num_buf_freed)
+	if (num_buf_freed && !purge)
 		qwx_dp_rxbufs_replenish(sc, mac_id, rx_ring, num_buf_freed,
 		    sc->hw_params.hal_params->rx_buf_rbm);
 
@@ -17605,7 +17651,7 @@ qwx_hal_reo_update_rx_reo_queue_status(struct qwx_soft
 }
 
 int
-qwx_dp_process_reo_status(struct qwx_softc *sc)
+qwx_dp_process_reo_status(struct qwx_softc *sc, int purge)
 {
 	struct qwx_dp *dp = &sc->dp;
 	struct hal_srng *srng;
@@ -17623,8 +17669,11 @@ qwx_dp_process_reo_status(struct qwx_softc *sc)
 	qwx_hal_srng_access_begin(sc, srng);
 
 	while ((reo_desc = qwx_hal_srng_dst_get_next_entry(sc, srng))) {
-		ret = 1;
+		ret++;
 
+		if (purge)
+			continue;
+
 		tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc);
 		switch (tag) {
 		case HAL_REO_GET_QUEUE_STATS_STATUS:
@@ -17705,16 +17754,16 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
 	}
 
 	if (sc->hw_params.ring_mask->rx_err[grp_id] &&
-	    qwx_dp_process_rx_err(sc))
+	    qwx_dp_process_rx_err(sc, 0))
 		ret = 1;
 
 	if (sc->hw_params.ring_mask->rx_wbm_rel[grp_id] &&
-	    qwx_dp_rx_process_wbm_err(sc))
+	    qwx_dp_rx_process_wbm_err(sc, 0))
 		ret = 1;
 
 	if (sc->hw_params.ring_mask->rx[grp_id]) {
 		i = fls(sc->hw_params.ring_mask->rx[grp_id]) - 1;
-		if (qwx_dp_process_rx(sc, i))
+		if (qwx_dp_process_rx(sc, i, 0))
 			ret = 1;
 	}
 
@@ -17732,7 +17781,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
 	}
 
 	if (sc->hw_params.ring_mask->reo_status[grp_id] &&
-	    qwx_dp_process_reo_status(sc))
+	    qwx_dp_process_reo_status(sc, 0))
 		ret = 1;
 
 	for (i = 0; i < sc->num_radios; i++) {
@@ -17741,7 +17790,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
 
 			if (sc->hw_params.ring_mask->rxdma2host[grp_id] &
 			   (1 << (id))) {
-				if (qwx_dp_process_rxdma_err(sc, id))
+				if (qwx_dp_process_rxdma_err(sc, id, 0))
 					ret = 1;
 			}
 
@@ -19860,11 +19909,53 @@ err_wmi_detach:
 }
 
 void
+qwx_flush_rx_rings(struct qwx_softc *sc)
+{
+	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+	int i, j, n;
+
+	do {
+		n = qwx_dp_process_rx_err(sc, 1);
+	} while (n > 0);
+
+	do {
+		n = qwx_dp_rx_process_wbm_err(sc, 1);
+	} while (n > 0);
+
+	do {
+		n = qwx_dp_process_reo_status(sc, 1);
+	} while (n > 0);
+
+	for (i = 0; i < DP_REO_DST_RING_MAX; i++) {
+		do {
+			n = qwx_dp_process_rx(sc, i, 1);
+		} while (n > 0);
+	}
+
+	for (i = 0; i < sc->num_radios; i++) {
+		for (j = 0; j < sc->hw_params.num_rxmda_per_pdev; j++) {
+			int mac_id = i * sc->hw_params.num_rxmda_per_pdev + j;
+
+			do {
+				n = qwx_dp_process_rxdma_err(sc, mac_id, 1);
+			} while (n > 0);
+			do {
+				n = qwx_dp_rx_reap_mon_status_ring(sc,
+				    mac_id, &ml, 1);
+				ml_purge(&ml);
+			} while (n > 0);
+		}
+	}
+}
+
+void
 qwx_core_stop(struct qwx_softc *sc)
 {
 	if (!test_bit(ATH11K_FLAG_CRASH_FLUSH, sc->sc_flags))
 		qwx_qmi_firmware_stop(sc);
 	
+	qwx_flush_rx_rings(sc);
+
 	sc->ops.stop(sc);
 	qwx_wmi_detach(sc);
 	qwx_dp_pdev_reo_cleanup(sc);