Index | Thread | Search

From:
Mark Kettenis <mark.kettenis@xs4all.nl>
Subject:
Re: enable rss/multiqueue for newer aq(4) models
To:
Jonathan Matthew <jonathan@d14n.org>
Cc:
tech@openbsd.org, kettenis@openbsd.org
Date:
Tue, 28 Jan 2025 20:22:38 +0100

Download raw body.

Thread
> Date: Sun, 26 Jan 2025 17:50:27 +1000
> From: Jonathan Matthew <jonathan@d14n.org>
> 
> This fills in the missing bits for RSS/multiqueue on 'aq2' hardware
> (AQC113 up to AQC116).  Like earlier models, aq2 is limited to 8 queues.
> 
> ok?

Doesn't seem to break aq(4) on my M2 Pro Mac mini.  However, this made
me realize that aplintc(4) doesn't actually support running interrupts
on other CPUs.  So all the queues end up on the primary CPU.  That in
itself shouldn't be a problem, but I think it means that
intr_barrier(9) is broken on these machines.

Need to dig into this a bit deeper and see if I can fix this.  This
hardware is interesting since it implements a mode where the hardware
picks the most appropriate CPU to run the interrupt on.  Not exactly
sure how it does that, but I believe this helps save power since it
can direct interrupts to an active CPU to avoid waking up a CPU that
is in a deep sleep state.

Did you test this diff on non-Apple hardware?


> Index: if_aq_pci.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_aq_pci.c,v
> diff -u -p -u -p -r1.28 if_aq_pci.c
> --- if_aq_pci.c	24 May 2024 06:02:53 -0000	1.28
> +++ if_aq_pci.c	26 Jan 2025 07:43:45 -0000
> @@ -365,6 +365,7 @@
>  #define  TPB_TX_BUF_SCP_INS_EN			(1 << 2)
>  #define  TPB_TX_BUF_CLK_GATE_EN			(1 << 5)
>  #define  TPB_TX_BUF_TC_MODE_EN			(1 << 8)
> +#define  TPB_TX_BUF_TC_Q_RAND_MAP_EN		(1 << 9)
>  
>  
>  /* TPB_TXB_BUFSIZE_REG[AQ_TRAFFICCLASS_NUM] 0x7910-7990 */
> @@ -467,7 +468,7 @@
>  
>  #define AQ2_RPF_REDIR2_REG			0x54c8
>  #define  AQ2_RPF_REDIR2_INDEX			(1 << 12)
> -#define  AQ2_RPF_REDIR2_HASHTYPE		0x00000100
> +#define  AQ2_RPF_REDIR2_HASHTYPE		0x000001FF
>  #define  AQ2_RPF_REDIR2_HASHTYPE_NONE		0
>  #define  AQ2_RPF_REDIR2_HASHTYPE_IP		(1 << 0)
>  #define  AQ2_RPF_REDIR2_HASHTYPE_TCP4		(1 << 1)
> @@ -478,7 +479,16 @@
>  #define  AQ2_RPF_REDIR2_HASHTYPE_IP6EX		(1 << 6)
>  #define  AQ2_RPF_REDIR2_HASHTYPE_TCP6EX		(1 << 7)
>  #define  AQ2_RPF_REDIR2_HASHTYPE_UDP6EX		(1 << 8)
> -#define  AQ2_RPF_REDIR2_HASHTYPE_ALL		0x00000100
> +#define  AQ2_RPF_REDIR2_HASHTYPE_ALL		0x000001FF
> +
> +#define AQ2_RX_Q_TC_MAP_REG(i)			(0x5900 + (i) * 4)
> +#define AQ2_TX_Q_TC_MAP_REG(i)			(0x799c + (i) * 4)
> +
> +#define AQ2_RPF_RSS_REDIR_MAX			64
> +#define AQ2_RPF_RSS_REDIR_REG(tc, i)		\
> +	 (0x6200 + (0x100 * ((tc) >> 2)) + (i) * 4)
> +#define AQ2_RPF_RSS_REDIR_TC_MASK(tc)		\
> +	 (0x1f << (5 * ((tc) & 3)))
>  
>  #define AQ2_RPF_REC_TAB_ENABLE_REG		0x6ff0
>  #define  AQ2_RPF_REC_TAB_ENABLE_MASK		0x0000ffff
> @@ -1282,8 +1292,7 @@ aq_attach(struct device *parent, struct 
>  
>  	if (pci_intr_map_msix(pa, 0, &ih) == 0) {
>  		int nmsix = pci_intr_msix_count(pa);
> -		/* don't do rss on aq2 yet */
> -		if (aqp->aq_hwtype == HWTYPE_AQ1 && nmsix > 1) {
> +		if (nmsix > 1) {
>  			nmsix--;
>  			sc->sc_intrmap = intrmap_create(&sc->sc_dev,
>  			    nmsix, AQ_MAXQ, INTRMAP_POWEROF2);
> @@ -2803,6 +2812,26 @@ aq_hw_qos_set(struct aq_softc *sc)
>  		AQ_WRITE_REG_BIT(sc, RPF_RPB_RX_TC_UPT_REG,
>  		    RPF_RPB_RX_TC_UPT_MASK(i_priority), 0);
>  	}
> +
> +	/* ring to TC mapping */
> +	if (HWTYPE_AQ2_P(sc)) {
> +		AQ_WRITE_REG_BIT(sc, TPB_TX_BUF_REG,
> +		    TPB_TX_BUF_TC_Q_RAND_MAP_EN, 1);
> +
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(0), 0x00000000);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(1), 0x00000000);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(2), 0x01010101);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(3), 0x01010101);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(4), 0x02020202);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(5), 0x02020202);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(6), 0x03030303);
> +		AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(7), 0x03030303);
> +
> +		AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(0), 0x00000000);
> +		AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(1), 0x11111111);
> +		AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(2), 0x22222222);
> +		AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(3), 0x33333333);
> +	}
>  }
>  
>  int
> @@ -2816,6 +2845,19 @@ aq_init_rss(struct aq_softc *sc)
>  	
>  	if (sc->sc_nqueues == 1)
>  		return 0;
> +
> +	if (HWTYPE_AQ2_P(sc)) {
> +		AQ_WRITE_REG_BIT(sc, AQ2_RPF_REDIR2_REG, AQ2_RPF_REDIR2_INDEX, 0);
> +		for (i = 0; i < AQ2_RPF_RSS_REDIR_MAX; i++) {
> +			int tc;
> +			int q;
> +			for (tc = 0; tc < 4; tc++) {
> +				q = (tc * 8) + (i % sc->sc_nqueues);
> +				AQ_WRITE_REG_BIT(sc, AQ2_RPF_RSS_REDIR_REG(tc, i),
> +				    AQ2_RPF_RSS_REDIR_TC_MASK(tc), q);
> +			}
> +		}
> +	}
>  
>  	/* rss key is composed of 32 bit registers */
>  	stoeplitz_to_key(rss_key, sizeof(rss_key));
> 
>