From: Jonathan Matthew Subject: Re: enable rss/multiqueue for newer aq(4) models To: Mark Kettenis Cc: tech@openbsd.org, kettenis@openbsd.org Date: Wed, 29 Jan 2025 11:17:31 +1000 On Tue, Jan 28, 2025 at 08:22:38PM +0100, Mark Kettenis wrote: > > Date: Sun, 26 Jan 2025 17:50:27 +1000 > > From: Jonathan Matthew > > > > This fills in the missing bits for RSS/multiqueue on 'aq2' hardware > > (AQC113 up to AQC116). Like earlier models, aq2 is limited to 8 queues. > > > > ok? > > Doesn't seem to break aq(4) on my M2 Pro Mac mini. However, this made > me realize that aplintc(4) doesn't actually support running interrupts > on other CPUs. So all the queues end up on the primary CPU. That in > itself shouldn't be a problem, but I think it means that > intr_barrier(9) is broken on these machines. > > Need to dig into this a bit deeper and see if I can fix this. This > hardware is interesting since it implements a mode where the hardware > picks the most appropriate CPU to run the interrupt on. Not exactly > sure how it does that, but I believe this helps save power since it > can direct interrupts to an active CPU to avoid waking up a CPU that > is in a deep sleep state. > > Did you test this diff on non-Apple hardware? I worked on this on a rockpro64 with an AQC113 pcie card, and dlg tested it on an amd64 system with an onboard AQC11x. Should I hold off committing this until you figure out what to do with aplintc(4)? It does look like intr_barrier() won't work properly, but I don't think that's a big problem for aq(4) specifically. Not having interrupts distributed across cpus also shouldn't be a problem. 
> > > > Index: if_aq_pci.c > > =================================================================== > > RCS file: /cvs/src/sys/dev/pci/if_aq_pci.c,v > > diff -u -p -u -p -r1.28 if_aq_pci.c > > --- if_aq_pci.c 24 May 2024 06:02:53 -0000 1.28 > > +++ if_aq_pci.c 26 Jan 2025 07:43:45 -0000 > > @@ -365,6 +365,7 @@ > > #define TPB_TX_BUF_SCP_INS_EN (1 << 2) > > #define TPB_TX_BUF_CLK_GATE_EN (1 << 5) > > #define TPB_TX_BUF_TC_MODE_EN (1 << 8) > > +#define TPB_TX_BUF_TC_Q_RAND_MAP_EN (1 << 9) > > > > > > /* TPB_TXB_BUFSIZE_REG[AQ_TRAFFICCLASS_NUM] 0x7910-7990 */ > > @@ -467,7 +468,7 @@ > > > > #define AQ2_RPF_REDIR2_REG 0x54c8 > > #define AQ2_RPF_REDIR2_INDEX (1 << 12) > > -#define AQ2_RPF_REDIR2_HASHTYPE 0x00000100 > > +#define AQ2_RPF_REDIR2_HASHTYPE 0x000001FF > > #define AQ2_RPF_REDIR2_HASHTYPE_NONE 0 > > #define AQ2_RPF_REDIR2_HASHTYPE_IP (1 << 0) > > #define AQ2_RPF_REDIR2_HASHTYPE_TCP4 (1 << 1) > > @@ -478,7 +479,16 @@ > > #define AQ2_RPF_REDIR2_HASHTYPE_IP6EX (1 << 6) > > #define AQ2_RPF_REDIR2_HASHTYPE_TCP6EX (1 << 7) > > #define AQ2_RPF_REDIR2_HASHTYPE_UDP6EX (1 << 8) > > -#define AQ2_RPF_REDIR2_HASHTYPE_ALL 0x00000100 > > +#define AQ2_RPF_REDIR2_HASHTYPE_ALL 0x000001FF > > + > > +#define AQ2_RX_Q_TC_MAP_REG(i) (0x5900 + (i) * 4) > > +#define AQ2_TX_Q_TC_MAP_REG(i) (0x799c + (i) * 4) > > + > > +#define AQ2_RPF_RSS_REDIR_MAX 64 > > +#define AQ2_RPF_RSS_REDIR_REG(tc, i) \ > > + (0x6200 + (0x100 * ((tc) >> 2)) + (i) * 4) > > +#define AQ2_RPF_RSS_REDIR_TC_MASK(tc) \ > > + (0x1f << (5 * ((tc) & 3))) > > > > #define AQ2_RPF_REC_TAB_ENABLE_REG 0x6ff0 > > #define AQ2_RPF_REC_TAB_ENABLE_MASK 0x0000ffff > > @@ -1282,8 +1292,7 @@ aq_attach(struct device *parent, struct > > > > if (pci_intr_map_msix(pa, 0, &ih) == 0) { > > int nmsix = pci_intr_msix_count(pa); > > - /* don't do rss on aq2 yet */ > > - if (aqp->aq_hwtype == HWTYPE_AQ1 && nmsix > 1) { > > + if (nmsix > 1) { > > nmsix--; > > sc->sc_intrmap = intrmap_create(&sc->sc_dev, > > nmsix, AQ_MAXQ, INTRMAP_POWEROF2); > > 
@@ -2803,6 +2812,26 @@ aq_hw_qos_set(struct aq_softc *sc) > > AQ_WRITE_REG_BIT(sc, RPF_RPB_RX_TC_UPT_REG, > > RPF_RPB_RX_TC_UPT_MASK(i_priority), 0); > > } > > + > > + /* ring to TC mapping */ > > + if (HWTYPE_AQ2_P(sc)) { > > + AQ_WRITE_REG_BIT(sc, TPB_TX_BUF_REG, > > + TPB_TX_BUF_TC_Q_RAND_MAP_EN, 1); > > + > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(0), 0x00000000); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(1), 0x00000000); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(2), 0x01010101); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(3), 0x01010101); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(4), 0x02020202); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(5), 0x02020202); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(6), 0x03030303); > > + AQ_WRITE_REG(sc, AQ2_TX_Q_TC_MAP_REG(7), 0x03030303); > > + > > + AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(0), 0x00000000); > > + AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(1), 0x11111111); > > + AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(2), 0x22222222); > > + AQ_WRITE_REG(sc, AQ2_RX_Q_TC_MAP_REG(3), 0x33333333); > > + } > > } > > > > int > > @@ -2816,6 +2845,19 @@ aq_init_rss(struct aq_softc *sc) > > > > if (sc->sc_nqueues == 1) > > return 0; > > + > > + if (HWTYPE_AQ2_P(sc)) { > > + AQ_WRITE_REG_BIT(sc, AQ2_RPF_REDIR2_REG, AQ2_RPF_REDIR2_INDEX, 0); > > + for (i = 0; i < AQ2_RPF_RSS_REDIR_MAX; i++) { > > + int tc; > > + int q; > > + for (tc = 0; tc < 4; tc++) { > > + q = (tc * 8) + (i % sc->sc_nqueues); > > + AQ_WRITE_REG_BIT(sc, AQ2_RPF_RSS_REDIR_REG(tc, i), > > + AQ2_RPF_RSS_REDIR_TC_MASK(tc), q); > > + } > > + } > > + } > > > > /* rss key is composed of 32 bit registers */ > > stoeplitz_to_key(rss_key, sizeof(rss_key)); > > > > >