From: Stefan Sperling <stsp@stsp.name>
Subject: make qwx(4) flush Rx rings when the interface goes down
To: tech@openbsd.org
Date: Thu, 24 Apr 2025 11:38:37 +0200

There is a known issue where qwx(4) trips a pool corruption check:

  panic: pool_cache_item_magic_check: mcl2k cpu free list modified

In an attempt to fix this, the patch below makes the driver discard
pending packets on all Rx rings when the interface goes down.

My theory is that the hardware is doing a DMA write into an mbuf which is
still sitting on one of these rings. While the interface goes down we free
these mbufs, but we do not update the ring state from the device's point of
view before doing so, so the device can still write into a cluster that has
already been returned to the pool.
The panic happens later, when qwx or another network driver attempts to
allocate a new mbuf from the 2K cluster pool and the pool's consistency
check finds that a supposedly free item was modified.

The issue has been observed during suspend/resume, though if my theory
is correct it could just as well happen during ifconfig down/up.

Also, port over qwx_dp_mon_link_free() from ath11k. I noticed that this
was missing while searching the Linux driver code for clues.

ok?

M  sys/dev/ic/qwx.c  |  113+  21-

1 file changed, 113 insertions(+), 21 deletions(-)

commit - 31338944480d0345b0ff1635ba18a96228bd37ce
commit + 9baa8ca3b3b937f4efba886b1bdb392d310be602
blob - fdafd249b8801186ec0c12b2d0acb26131dcd963
blob + 428cbbc3db1d54c855bf76eebe4bce806a787d68
--- sys/dev/ic/qwx.c
+++ sys/dev/ic/qwx.c
@@ -15454,6 +15454,16 @@ config_refill_ring:
 }
 
 void
+qwx_dp_mon_link_free(struct qwx_softc *sc)
+{
+	struct qwx_pdev_dp *dp = &sc->pdev_dp;
+	struct qwx_mon_data *pmon = &dp->mon_data;
+
+	qwx_dp_link_desc_cleanup(sc, pmon->link_desc_banks,
+	    HAL_RXDMA_MONITOR_DESC, &dp->rxdma_mon_desc_ring);
+}
+
+void
 qwx_dp_pdev_free(struct qwx_softc *sc)
 {
 	int i;
@@ -15462,6 +15472,8 @@ qwx_dp_pdev_free(struct qwx_softc *sc)
 
 	for (i = 0; i < sc->num_radios; i++)
 		qwx_dp_rx_pdev_free(sc, i);
+
+	qwx_dp_mon_link_free(sc);
 }
 
 int
@@ -16058,7 +16070,7 @@ qwx_dp_process_rx_err_buf(struct qwx_softc *sc, uint32
 }
 
 int
-qwx_dp_process_rx_err(struct qwx_softc *sc)
+qwx_dp_process_rx_err(struct qwx_softc *sc, int purge)
 {
 	struct ieee80211com *ic = &sc->sc_ic;
 	struct ifnet *ifp = &ic->ic_if;
@@ -16077,7 +16089,7 @@ qwx_dp_process_rx_err(struct qwx_softc *sc)
 	uint64_t paddr;
 	uint32_t *desc;
 	int is_frag;
-	uint8_t drop = 0;
+	uint8_t drop = purge ? 1 : 0;
 
 	tot_n_bufs_reaped = 0;
 
@@ -16149,14 +16161,17 @@ qwx_dp_process_rx_err(struct qwx_softc *sc)
 #ifdef notyet
 	spin_unlock_bh(&srng->lock);
 #endif
-	for (i = 0; i < sc->num_radios; i++) {
-		if (!n_bufs_reaped[i])
-			continue;
+	if (!purge) {
+		for (i = 0; i < sc->num_radios; i++) {
+			if (!n_bufs_reaped[i])
+				continue;
 
-		rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
+			rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
 
-		qwx_dp_rxbufs_replenish(sc, i, rx_ring, n_bufs_reaped[i],
-		    sc->hw_params.hal_params->rx_buf_rbm);
+			qwx_dp_rxbufs_replenish(sc, i, rx_ring,
+			    n_bufs_reaped[i],
+			    sc->hw_params.hal_params->rx_buf_rbm);
+		}
 	}
 
 	ifp->if_ierrors += tot_n_bufs_reaped;
@@ -16306,7 +16321,7 @@ qwx_dp_rx_wbm_err(struct qwx_softc *sc, struct qwx_rx_
 }
 
 int
-qwx_dp_rx_process_wbm_err(struct qwx_softc *sc)
+qwx_dp_rx_process_wbm_err(struct qwx_softc *sc, int purge)
 {
 	struct ieee80211com *ic = &sc->sc_ic;
 	struct ifnet *ifp = &ic->ic_if;
@@ -16382,6 +16397,18 @@ qwx_dp_rx_process_wbm_err(struct qwx_softc *sc)
 	if (!total_num_buffs_reaped)
 		goto done;
 
+	if (purge) {
+		for (i = 0; i < sc->num_radios; i++) {
+			while ((msdu = TAILQ_FIRST(&msdu_list[i]))) {
+				TAILQ_REMOVE(&msdu_list[i], msdu, entry);
+				m_freem(msdu->m);
+				msdu->m = NULL;
+			}
+		}
+
+		goto done;
+	}
+
 	for (i = 0; i < sc->num_radios; i++) {
 		if (!num_buffs_reaped[i])
 			continue;
@@ -16894,7 +16921,7 @@ qwx_dp_rx_process_received_packets(struct qwx_softc *s
 }
 
 int
-qwx_dp_process_rx(struct qwx_softc *sc, int ring_id)
+qwx_dp_process_rx(struct qwx_softc *sc, int ring_id, int purge)
 {
 	struct qwx_dp *dp = &sc->dp;
 	struct qwx_pdev_dp *pdev_dp = &sc->pdev_dp;
@@ -17006,6 +17033,16 @@ try_again:
 		if (!num_buffs_reaped[i])
 			continue;
 
+		if (purge) {
+			while ((msdu = TAILQ_FIRST(&msdu_list[i]))) {
+				TAILQ_REMOVE(&msdu_list[i], msdu, entry);
+				m_freem(msdu->m);
+				msdu->m = NULL;
+			}
+
+			continue;
+		}
+
 		qwx_dp_rx_process_received_packets(sc, &msdu_list[i], i);
 
 		rx_ring = &sc->pdev_dp.rx_refill_buf_ring;
@@ -17073,7 +17110,7 @@ fail_free_mbuf:
 
 int
 qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, int mac_id,
-    struct mbuf_list *ml)
+    struct mbuf_list *ml, int purge)
 {
 	const struct ath11k_hw_hal_params *hal_params;
 	struct qwx_pdev_dp *dp;
@@ -17111,6 +17148,7 @@ qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, i
 
 		qwx_hal_rx_buf_addr_info_get(rx_mon_status_desc, &paddr,
 		    &cookie, &rbm);
+
 		if (paddr) {
 			buf_idx = FIELD_GET(DP_RXDMA_BUF_COOKIE_BUF_ID, cookie);
 			if (buf_idx >= rx_ring->bufs_max ||
@@ -17154,6 +17192,15 @@ qwx_dp_rx_reap_mon_status_ring(struct qwx_softc *sc, i
 			pmon->buf_state = DP_MON_STATUS_REPLINISH;
 		}
 move_next:
+		if (purge) {
+			hal_params = sc->hw_params.hal_params;
+			qwx_hal_rx_buf_addr_info_set(rx_mon_status_desc, 0, 0,
+			    hal_params->rx_buf_rbm);
+			qwx_hal_srng_src_get_next_entry(sc, srng);
+			num_buffs_reaped++;
+			continue;
+		}
+
 		m = qwx_dp_rx_alloc_mon_status_buf(sc, rx_ring, &buf_idx);
 		if (!m) {
 			hal_params = sc->hw_params.hal_params;
@@ -17209,7 +17256,7 @@ qwx_dp_rx_process_mon_status(struct qwx_softc *sc, int
 #endif
 	struct hal_rx_mon_ppdu_info *ppdu_info = &pmon->mon_ppdu_info;
 
-	num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml);
+	num_buffs_reaped = qwx_dp_rx_reap_mon_status_ring(sc, mac_id, &ml, 0);
 	if (!num_buffs_reaped)
 		goto exit;
 
@@ -17310,7 +17357,7 @@ qwx_dp_service_mon_ring(void *arg)
 }
 
 int
-qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id)
+qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac_id, int purge)
 {
 	struct ieee80211com *ic = &sc->sc_ic;
 	struct ifnet *ifp = &ic->ic_if;
@@ -17382,7 +17429,7 @@ qwx_dp_process_rxdma_err(struct qwx_softc *sc, int mac
 #ifdef notyet
 	spin_unlock_bh(&srng->lock);
 #endif
-	if (num_buf_freed)
+	if (num_buf_freed && !purge)
 		qwx_dp_rxbufs_replenish(sc, mac_id, rx_ring, num_buf_freed,
 		    sc->hw_params.hal_params->rx_buf_rbm);
 
@@ -17605,7 +17652,7 @@ qwx_hal_reo_update_rx_reo_queue_status(struct qwx_soft
 }
 
 int
-qwx_dp_process_reo_status(struct qwx_softc *sc)
+qwx_dp_process_reo_status(struct qwx_softc *sc, int purge)
 {
 	struct qwx_dp *dp = &sc->dp;
 	struct hal_srng *srng;
@@ -17623,8 +17670,11 @@ qwx_dp_process_reo_status(struct qwx_softc *sc)
 	qwx_hal_srng_access_begin(sc, srng);
 
 	while ((reo_desc = qwx_hal_srng_dst_get_next_entry(sc, srng))) {
-		ret = 1;
+		ret++;
 
+		if (purge)
+			continue;
+
 		tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc);
 		switch (tag) {
 		case HAL_REO_GET_QUEUE_STATS_STATUS:
@@ -17705,16 +17755,16 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
 	}
 
 	if (sc->hw_params.ring_mask->rx_err[grp_id] &&
-	    qwx_dp_process_rx_err(sc))
+	    qwx_dp_process_rx_err(sc, 0))
 		ret = 1;
 
 	if (sc->hw_params.ring_mask->rx_wbm_rel[grp_id] &&
-	    qwx_dp_rx_process_wbm_err(sc))
+	    qwx_dp_rx_process_wbm_err(sc, 0))
 		ret = 1;
 
 	if (sc->hw_params.ring_mask->rx[grp_id]) {
 		i = fls(sc->hw_params.ring_mask->rx[grp_id]) - 1;
-		if (qwx_dp_process_rx(sc, i))
+		if (qwx_dp_process_rx(sc, i, 0))
 			ret = 1;
 	}
 
@@ -17732,7 +17782,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
 	}
 
 	if (sc->hw_params.ring_mask->reo_status[grp_id] &&
-	    qwx_dp_process_reo_status(sc))
+	    qwx_dp_process_reo_status(sc, 0))
 		ret = 1;
 
 	for (i = 0; i < sc->num_radios; i++) {
@@ -17741,7 +17791,7 @@ qwx_dp_service_srng(struct qwx_softc *sc, int grp_id)
 
 			if (sc->hw_params.ring_mask->rxdma2host[grp_id] &
 			   (1 << (id))) {
-				if (qwx_dp_process_rxdma_err(sc, id))
+				if (qwx_dp_process_rxdma_err(sc, id, 0))
 					ret = 1;
 			}
 
@@ -19926,6 +19976,46 @@ err_pdev_debug:
 }
 
 void
+qwx_reap_rx_rings(struct qwx_softc *sc)
+{
+	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+	int i, j, n;
+
+	do {
+		n = qwx_dp_process_rx_err(sc, 1);
+	} while (n > 0);
+
+	do {
+		n = qwx_dp_rx_process_wbm_err(sc, 1);
+	} while (n > 0);
+
+	do {
+		n = qwx_dp_process_reo_status(sc, 1);
+	} while (n > 0);
+
+	for (i = 0; i < DP_REO_DST_RING_MAX; i++) {
+		do {
+			n = qwx_dp_process_rx(sc, i, 1);
+		} while (n > 0);
+	}
+
+	for (i = 0; i < sc->num_radios; i++) {
+		for (j = 0; j < sc->hw_params.num_rxmda_per_pdev; j++) {
+			int mac_id = i * sc->hw_params.num_rxmda_per_pdev + j;
+
+			do {
+				n = qwx_dp_process_rxdma_err(sc, mac_id, 1);
+			} while (n > 0);
+			do {
+				n = qwx_dp_rx_reap_mon_status_ring(sc,
+				    mac_id, &ml, 1);
+				ml_purge(&ml);
+			} while (n > 0);
+		}
+	}
+}
+
+void
 qwx_core_deinit(struct qwx_softc *sc)
 {
 	struct ath11k_hal *hal = &sc->hal;
@@ -19936,6 +20026,8 @@ qwx_core_deinit(struct qwx_softc *sc)
 #endif
 	sc->ops.irq_disable(sc);
 
+	qwx_reap_rx_rings(sc);
+
 	qwx_core_stop(sc);
 	qwx_core_pdev_destroy(sc);
 #ifdef notyet