Index | Thread | Search

From:
Kirill A. Korinsky <kirill@korins.ky>
Subject:
Re: octeon: commuliative patch LRO, cnmac queue and softens
To:
Visa Hankala <visa@hankala.org>
Cc:
tech@openbsd.org
Date:
Sun, 05 Apr 2026 15:46:51 +0200

Download raw body.

Thread
On Sun, 05 Apr 2026 13:52:19 +0200,
Visa Hankala <visa@hankala.org> wrote:
> 
> On Sun, Apr 05, 2026 at 11:35:44AM +0200, Kirill A. Korinsky wrote:
> > On Fri, 03 Apr 2026 16:28:34 +0200,
> > Visa Hankala <visa@hankala.org> wrote:
> > > 
> > > > @@ -108,22 +110,30 @@ cn30xxpip_port_config(struct cn30xxpip_s
> > > >  	/* SKIP=0 */
> > > >  
> > > >  	prt_tag = 0;
> > > > +	SET(prt_tag, PIP_PRT_TAGN_INC_VLAN);
> > > >  	SET(prt_tag, PIP_PRT_TAGN_INC_PRT);
> > > 
> > > I wonder if VLAN id and input port number should be left out from
> > > the packet tag. This would make the tag symmetric with regards to IP
> > > addresses and TCP/UDP ports, and let the same CPU core handle both
> > > directions of TCP/UDP flows. This might improve CPU cache locality
> > > and performance when forwarding multiple flows. Of course, the
> > > symmetricity is lost if packets are transformed for example by NAT
> > > or tunneling.
> > >
> > 
> > Not sure that I get the idea of symmetric. Right now it uses SRC and DST
> > addresses and ports, and for the opposite direction it should have reversed
> > addresses and ports, shouldn't it?
> 
> By symmetric hash/tag I mean that
> hash(saddr, sport, daddr, dport) = hash(daddr, dport, saddr, sport) .
>

But... saddr and sport don't match daddr and dport.

Let's assume that I send traffic from 1.2.3.4:1234 to 1.1.1.1:80. Then for
the forward direction the tag value is hash(1.2.3.4, 1234, 1.1.1.1, 80), and
when the remote server sends something back it is
hash(1.1.1.1, 80, 1.2.3.4, 1234).

What am I missing here?

> > Also, VLAN id and input port number are here to make different tags for:
> >  - routing traffic between vlans on the same port
> >  - and routing traffic between ports on the same vlan
> 
> In my opinion both directions of a flow should be processed by the same
> CPU core when possible. This should improve scaling in terms of total
> throughput because the state tracking data do not need to go back and
> forth between CPU cores; faster access, less contention.
>

Makes sense.

> > Numbers for iperf in single thread when two machines are in different vlan
> > but on the same cnmac:
> 
> I think multi-queue processing should be benchmarked with multiple
> flows. Also, I believe it is more common to forward traffic between
> different ports.
> 
> With a single flow and four cores, you are leaving processing capacity
> unused.
>

Sure, and this was just a naive benchmark to explain why I used the vlan in
the tag in the first place.

On my ER4 with vlan and pf, when traffic is routed from one vlan to another
vlan on the same cnmac, I get ~923 Mbit/s with 4 streams of iperf.

Here is a cumulative diff (LRO + multiple queues) which addresses all your
remarks and excludes the parts already committed by kn@ and me.

Index: sys/arch/octeon/dev/cn30xxpip.c
===================================================================
RCS file: /home/cvs/src/sys/arch/octeon/dev/cn30xxpip.c,v
diff -u -p -r1.11 cn30xxpip.c
--- sys/arch/octeon/dev/cn30xxpip.c	28 Dec 2022 01:39:21 -0000	1.11
+++ sys/arch/octeon/dev/cn30xxpip.c	5 Apr 2026 13:24:51 -0000
@@ -57,6 +57,7 @@ cn30xxpip_init(struct cn30xxpip_attach_a
 	sc->sc_regt = aa->aa_regt;
 	sc->sc_tag_type = aa->aa_tag_type;
 	sc->sc_receive_group = aa->aa_receive_group;
+	sc->sc_receive_group_order = aa->aa_receive_group_order;
 	sc->sc_ip_offset = aa->aa_ip_offset;
 
 	status = bus_space_map(sc->sc_regt, PIP_BASE, PIP_SIZE, 0,
@@ -88,6 +89,7 @@ cn30xxpip_port_config(struct cn30xxpip_s
 	uint64_t prt_cfg;
 	uint64_t prt_tag;
 	uint64_t ip_offset;
+	uint64_t group_mask;
 
 	/*
 	 * Process the headers and place the IP header in the work queue
@@ -108,22 +110,30 @@ cn30xxpip_port_config(struct cn30xxpip_s
 	/* SKIP=0 */
 
 	prt_tag = 0;
-	SET(prt_tag, PIP_PRT_TAGN_INC_PRT);
-	CLR(prt_tag, PIP_PRT_TAGN_IP6_DPRT);
-	CLR(prt_tag, PIP_PRT_TAGN_IP4_DPRT);
-	CLR(prt_tag, PIP_PRT_TAGN_IP6_SPRT);
-	CLR(prt_tag, PIP_PRT_TAGN_IP4_SPRT);
+	CLR(prt_tag, PIP_PRT_TAGN_INC_VLAN);
+	CLR(prt_tag, PIP_PRT_TAGN_INC_PRT);
+	SET(prt_tag, PIP_PRT_TAGN_IP6_DPRT);
+	SET(prt_tag, PIP_PRT_TAGN_IP4_DPRT);
+	SET(prt_tag, PIP_PRT_TAGN_IP6_SPRT);
+	SET(prt_tag, PIP_PRT_TAGN_IP4_SPRT);
 	CLR(prt_tag, PIP_PRT_TAGN_IP6_NXTH);
 	CLR(prt_tag, PIP_PRT_TAGN_IP4_PCTL);
-	CLR(prt_tag, PIP_PRT_TAGN_IP6_DST);
-	CLR(prt_tag, PIP_PRT_TAGN_IP4_SRC);
-	CLR(prt_tag, PIP_PRT_TAGN_IP6_SRC);
-	CLR(prt_tag, PIP_PRT_TAGN_IP4_DST);
+	SET(prt_tag, PIP_PRT_TAGN_IP6_DST);
+	SET(prt_tag, PIP_PRT_TAGN_IP4_SRC);
+	SET(prt_tag, PIP_PRT_TAGN_IP6_SRC);
+	SET(prt_tag, PIP_PRT_TAGN_IP4_DST);
 	SET(prt_tag, PIP_PRT_TAGN_TCP6_TAG_ORDERED);
 	SET(prt_tag, PIP_PRT_TAGN_TCP4_TAG_ORDERED);
 	SET(prt_tag, PIP_PRT_TAGN_IP6_TAG_ORDERED);
 	SET(prt_tag, PIP_PRT_TAGN_IP4_TAG_ORDERED);
 	SET(prt_tag, PIP_PRT_TAGN_NON_TAG_ORDERED);
+	if (sc->sc_receive_group_order > 0) {
+		group_mask = ~((1U << sc->sc_receive_group_order) - 1U);
+		SET(prt_tag, ((uint64_t)sc->sc_receive_group << 36) &
+		    PIP_PRT_TAGN_GRPTAGBASE);
+		SET(prt_tag, (group_mask << 32) & PIP_PRT_TAGN_GRPTAGMASK);
+		SET(prt_tag, PIP_PRT_TAGN_GRPTAG);
+	}
 	SET(prt_tag, sc->sc_receive_group & PIP_PRT_TAGN_GRP);
 
 	ip_offset = 0;
Index: sys/arch/octeon/dev/cn30xxpipvar.h
===================================================================
RCS file: /home/cvs/src/sys/arch/octeon/dev/cn30xxpipvar.h,v
diff -u -p -r1.6 cn30xxpipvar.h
--- sys/arch/octeon/dev/cn30xxpipvar.h	20 May 2024 23:13:33 -0000	1.6
+++ sys/arch/octeon/dev/cn30xxpipvar.h	5 Apr 2026 08:57:01 -0000
@@ -41,6 +41,7 @@ struct cn30xxpip_softc {
 	bus_space_handle_t	sc_regh_stat;
 	int			sc_tag_type;
 	int			sc_receive_group;
+	int			sc_receive_group_order;
 	size_t			sc_ip_offset;
 };
 
@@ -50,6 +51,7 @@ struct cn30xxpip_attach_args {
 	bus_space_tag_t		aa_regt;
 	int			aa_tag_type;
 	int			aa_receive_group;
+	int			aa_receive_group_order;
 	size_t			aa_ip_offset;
 };
 
Index: sys/arch/octeon/dev/if_cnmac.c
===================================================================
RCS file: /home/cvs/src/sys/arch/octeon/dev/if_cnmac.c,v
diff -u -p -r1.86 if_cnmac.c
--- sys/arch/octeon/dev/if_cnmac.c	20 May 2024 23:13:33 -0000	1.86
+++ sys/arch/octeon/dev/if_cnmac.c	5 Apr 2026 08:57:03 -0000
@@ -55,6 +55,11 @@
 #include <net/if_media.h>
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
+#ifndef SMALL_KERNEL
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#endif
 
 #if NBPFILTER > 0
 #include <net/bpf.h>
@@ -154,6 +159,11 @@ int	cnmac_send(struct cnmac_softc *, str
 int	cnmac_reset(struct cnmac_softc *);
 int	cnmac_configure(struct cnmac_softc *);
 int	cnmac_configure_common(struct cnmac_softc *);
+unsigned int cnmac_rx_group_count(void);
+unsigned int cnmac_rx_group_order(unsigned int);
+void	cnmac_rx_groups_init(void);
+void	cnmac_rx_groups_config(struct cn30xxpow_softc *);
+void	cnmac_rx_groups_barrier(void);
 
 void	cnmac_free_task(void *);
 void	cnmac_tick_free(void *arg);
@@ -182,6 +192,15 @@ const struct cfattach cnmac_ca = {
 
 struct cfdriver cnmac_cd = { NULL, "cnmac", DV_IFNET };
 
+#define CNMAC_PIP_PORT_MAX	64
+
+struct cnmac_rx_group {
+	unsigned int		crg_group;
+	void			*crg_ih;
+	char			crg_name[IFNAMSIZ];
+	struct mbuf_list	crg_rx_batch[CNMAC_PIP_PORT_MAX];
+};
+
 /* ---- buffer management */
 
 const struct cnmac_pool_param {
@@ -204,7 +223,10 @@ uint64_t cnmac_mac_addr = 0;
 uint32_t cnmac_mac_addr_offset = 0;
 
 int	cnmac_mbufs_to_alloc;
-int	cnmac_npowgroups = 0;
+unsigned int cnmac_nrxgroups = 0;
+unsigned int cnmac_nrxgroups_order = 0;
+struct cnmac_softc *cnmac_port_softc[CNMAC_PIP_PORT_MAX];
+struct cnmac_rx_group cnmac_rx_groups[OCTEON_POW_GROUP_MAX];
 
 void
 cnmac_buf_init(struct cnmac_softc *sc)
@@ -225,6 +247,72 @@ cnmac_buf_init(struct cnmac_softc *sc)
 	}
 }
 
+unsigned int
+cnmac_rx_group_count(void)
+{
+	unsigned int count = 1;
+	unsigned int target = softnet_count();
+
+	while (count < target && count < OCTEON_POW_GROUP_MAX)
+		count <<= 1;
+
+	return count;
+}
+
+unsigned int
+cnmac_rx_group_order(unsigned int count)
+{
+	unsigned int order = 0;
+
+	while ((1U << order) < count)
+		order++;
+
+	return order;
+}
+
+void
+cnmac_rx_groups_init(void)
+{
+	struct cnmac_rx_group *crg;
+	unsigned int i;
+
+	if (cnmac_nrxgroups != 0)
+		return;
+
+	cnmac_nrxgroups = cnmac_rx_group_count();
+	cnmac_nrxgroups_order = cnmac_rx_group_order(cnmac_nrxgroups);
+
+	for (i = 0; i < cnmac_nrxgroups; i++) {
+		crg = &cnmac_rx_groups[i];
+		crg->crg_group = i;
+		snprintf(crg->crg_name, sizeof(crg->crg_name),
+		    "cnmacrx%u", i);
+		crg->crg_ih = octeon_intr_establish(POW_WORKQ_IRQ(i),
+		    IPL_NET | IPL_MPSAFE, cnmac_intr, crg, crg->crg_name);
+		if (crg->crg_ih == NULL)
+			panic("%s: could not set up interrupt",
+			    crg->crg_name);
+	}
+}
+
+void
+cnmac_rx_groups_config(struct cn30xxpow_softc *pow)
+{
+	unsigned int i;
+
+	for (i = 0; i < cnmac_nrxgroups; i++)
+		cn30xxpow_config(pow, i);
+}
+
+void
+cnmac_rx_groups_barrier(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < cnmac_nrxgroups; i++)
+		intr_barrier(cnmac_rx_groups[i].crg_ih);
+}
+
 /* ---- autoconf */
 
 int
@@ -246,11 +334,6 @@ cnmac_attach(struct device *parent, stru
 	struct cn30xxgmx_attach_args *ga = aux;
 	struct ifnet *ifp = &sc->sc_arpcom.ac_if;
 
-	if (cnmac_npowgroups >= OCTEON_POW_GROUP_MAX) {
-		printf(": out of POW groups\n");
-		return;
-	}
-
 	atomic_add_int(&cnmac_mbufs_to_alloc,
 	    cnmac_mbuf_alloc(CNMAC_MBUFS_PER_PORT));
 
@@ -262,7 +345,6 @@ cnmac_attach(struct device *parent, stru
 	sc->sc_gmx_port = ga->ga_gmx_port;
 	sc->sc_smi = ga->ga_smi;
 	sc->sc_phy_addr = ga->ga_phy_addr;
-	sc->sc_powgroup = cnmac_npowgroups++;
 
 	sc->sc_init_flag = 0;
 
@@ -282,6 +364,10 @@ cnmac_attach(struct device *parent, stru
 	task_set(&sc->sc_free_task, cnmac_free_task, sc);
 	timeout_set(&sc->sc_tick_misc_ch, cnmac_tick_misc, sc);
 	timeout_set(&sc->sc_tick_free_ch, cnmac_tick_free, sc);
+	cnmac_rx_groups_init();
+	KASSERT(sc->sc_port < nitems(cnmac_port_softc));
+	KASSERT(cnmac_port_softc[sc->sc_port] == NULL);
+	cnmac_port_softc[sc->sc_port] = sc;
 
 	cn30xxfau_op_init(&sc->sc_fau_done,
 	    OCTEON_CVMSEG_ETHER_OFFSET(sc->sc_dev.dv_unit, csm_ether_fau_done),
@@ -307,6 +393,9 @@ cnmac_attach(struct device *parent, stru
 	ifp->if_softc = sc;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_xflags = IFXF_MPSAFE;
+#ifndef SMALL_KERNEL
+	ifp->if_xflags |= IFXF_LRO;
+#endif
 	ifp->if_ioctl = cnmac_ioctl;
 	ifp->if_qstart = cnmac_start;
 	ifp->if_watchdog = cnmac_watchdog;
@@ -315,22 +404,21 @@ cnmac_attach(struct device *parent, stru
 
 	ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_TCPv4 |
 	    IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+#ifndef SMALL_KERNEL
+	ifp->if_capabilities |= IFCAP_LRO;
+#endif
 
 	cn30xxgmx_set_filter(sc->sc_gmx_port);
 
 	if_attach(ifp);
 	ether_ifattach(ifp);
+	if_attach_iqueues(ifp, cnmac_nrxgroups);
 
 	cnmac_buf_init(sc);
 
 #if NKSTAT > 0
 	cnmac_kstat_attach(sc);
 #endif
-
-	sc->sc_ih = octeon_intr_establish(POW_WORKQ_IRQ(sc->sc_powgroup),
-	    IPL_NET | IPL_MPSAFE, cnmac_intr, sc, sc->sc_dev.dv_xname);
-	if (sc->sc_ih == NULL)
-		panic("%s: could not set up interrupt", sc->sc_dev.dv_xname);
 }
 
 /* ---- submodules */
@@ -343,7 +431,8 @@ cnmac_pip_init(struct cnmac_softc *sc)
 	pip_aa.aa_port = sc->sc_port;
 	pip_aa.aa_regt = sc->sc_regt;
 	pip_aa.aa_tag_type = POW_TAG_TYPE_ORDERED/* XXX */;
-	pip_aa.aa_receive_group = sc->sc_powgroup;
+	pip_aa.aa_receive_group = 0;
+	pip_aa.aa_receive_group_order = cnmac_nrxgroups_order;
 	pip_aa.aa_ip_offset = sc->sc_ip_offset;
 	cn30xxpip_init(&pip_aa, &sc->sc_pip);
 	cn30xxpip_port_config(sc->sc_pip);
@@ -1026,7 +1115,7 @@ cnmac_stop(struct ifnet *ifp, int disabl
 
 	cn30xxgmx_port_enable(sc->sc_gmx_port, 0);
 
-	intr_barrier(sc->sc_ih);
+	cnmac_rx_groups_barrier();
 	ifq_barrier(&ifp->if_snd);
 
 	ifq_clr_oactive(&ifp->if_snd);
@@ -1058,7 +1147,7 @@ cnmac_configure(struct cnmac_softc *sc)
 
 	cn30xxpko_port_config(sc->sc_pko);
 	cn30xxpko_port_enable(sc->sc_pko, 1);
-	cn30xxpow_config(sc->sc_pow, sc->sc_powgroup);
+	cnmac_rx_groups_config(sc->sc_pow);
 
 	cn30xxgmx_port_enable(sc->sc_gmx_port, 1);
 
@@ -1212,9 +1301,13 @@ cnmac_recv(struct cnmac_softc *sc, uint6
 {
 	struct ifnet *ifp = &sc->sc_arpcom.ac_if;
 	struct mbuf *m;
-	uint64_t word2;
+	uint64_t word1, word2;
 	int nmbuf = 0;
+#ifndef SMALL_KERNEL
+	struct ether_extracted ext;
+#endif
 
+	word1 = work[1];
 	word2 = work[2];
 
 	if (!(ifp->if_flags & IFF_RUNNING))
@@ -1232,6 +1325,8 @@ cnmac_recv(struct cnmac_softc *sc, uint6
 	}
 
 	m->m_pkthdr.csum_flags = 0;
+	m->m_pkthdr.ph_flowid = word1 & PIP_WQE_WORD1_TAG;
+	SET(m->m_pkthdr.csum_flags, M_FLOWID);
 	if (__predict_true(!ISSET(word2, PIP_WQE_WORD2_IP_NI))) {
 		/* Check IP checksum status. */
 		if (!ISSET(word2, PIP_WQE_WORD2_IP_V6) &&
@@ -1246,7 +1341,19 @@ cnmac_recv(struct cnmac_softc *sc, uint6
 			    M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK;
 	}
 
-	ml_enqueue(ml, m);
+#ifndef SMALL_KERNEL
+	if (__predict_true(ISSET(ifp->if_xflags, IFXF_LRO)) &&
+	    __predict_true(!ISSET(word2, PIP_WQE_WORD2_IP_NI)) &&
+	    ISSET(word2, PIP_WQE_WORD2_IP_TU) &&
+	    !ISSET(word2, PIP_WQE_WORD2_IP_FR | PIP_WQE_WORD2_IP_LE)) {
+		ether_extract_headers(m, &ext);
+		if (ext.tcp != NULL)
+			tcp_softlro_glue(ml, m, ifp);
+		else
+			ml_enqueue(ml, m);
+	} else
+#endif
+		ml_enqueue(ml, m);
 
 	return nmbuf;
 
@@ -1258,16 +1365,20 @@ drop:
 int
 cnmac_intr(void *arg)
 {
-	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
-	struct cnmac_softc *sc = arg;
-	struct ifnet *ifp = &sc->sc_arpcom.ac_if;
+	struct cnmac_rx_group *crg = arg;
+	struct cn30xxpow_softc *pow = &cn30xxpow_softc;
+	struct cnmac_softc *sc;
+	struct ifnet *ifp;
+	struct mbuf_list *ml;
 	uint64_t *work;
-	uint64_t wqmask = 1ull << sc->sc_powgroup;
+	uint64_t pending = 0;
+	uint64_t wqmask = 1ull << crg->crg_group;
 	uint32_t coreid = octeon_get_coreid();
-	uint32_t port;
+	unsigned int port;
+	unsigned int i;
 	int nmbuf = 0;
 
-	_POW_WR8(sc->sc_pow, POW_PP_GRP_MSK_OFFSET(coreid), wqmask);
+	_POW_WR8(pow, POW_PP_GRP_MSK_OFFSET(coreid), wqmask);
 
 	cn30xxpow_tag_sw_wait();
 	cn30xxpow_work_request_async(OCTEON_CVMSEG_OFFSET(csm_pow_intr),
@@ -1284,18 +1395,30 @@ cnmac_intr(void *arg)
 		    OCTEON_CVMSEG_OFFSET(csm_pow_intr), POW_NO_WAIT);
 
 		port = (work[1] & PIP_WQE_WORD1_IPRT) >> 42;
-		if (port != sc->sc_port) {
-			printf("%s: unexpected wqe port %u, should be %u\n",
-			    sc->sc_dev.dv_xname, port, sc->sc_port);
+		if (port >= nitems(cnmac_port_softc) ||
+		    (sc = cnmac_port_softc[port]) == NULL) {
+			printf("%s: unexpected wqe port %u\n",
+			    crg->crg_name, port);
 			goto wqe_error;
 		}
 
-		nmbuf += cnmac_recv(sc, work, &ml);
+		if ((pending & (1ULL << port)) == 0) {
+			ml_init(&crg->crg_rx_batch[port]);
+			pending |= 1ULL << port;
+		}
+		nmbuf += cnmac_recv(sc, work, &crg->crg_rx_batch[port]);
 	}
 
-	_POW_WR8(sc->sc_pow, POW_WQ_INT_OFFSET, wqmask);
+	_POW_WR8(pow, POW_WQ_INT_OFFSET, wqmask);
 
-	if_input(ifp, &ml);
+	while (pending) {
+		i = __builtin_ffsll(pending) - 1;
+		sc = cnmac_port_softc[i];
+		ifp = &sc->sc_arpcom.ac_if;
+		ml = &crg->crg_rx_batch[i];
+		ifiq_input(ifp->if_iqs[crg->crg_group], ml);
+		pending &= pending - 1;
+	}
 
 	nmbuf = cnmac_mbuf_alloc(nmbuf);
 	if (nmbuf != 0)
Index: sys/arch/octeon/dev/if_cnmacvar.h
===================================================================
RCS file: /home/cvs/src/sys/arch/octeon/dev/if_cnmacvar.h,v
diff -u -p -r1.20 if_cnmacvar.h
--- sys/arch/octeon/dev/if_cnmacvar.h	28 Dec 2022 01:39:21 -0000	1.20
+++ sys/arch/octeon/dev/if_cnmacvar.h	5 Apr 2026 08:57:01 -0000
@@ -63,7 +63,6 @@ struct cnmac_softc {
 
 	bus_dmamap_t		sc_dmap;
 
-	void			*sc_ih;
 	struct cn30xxpip_softc	*sc_pip;
 	struct cn30xxipd_softc	*sc_ipd;
 	struct cn30xxpko_softc	*sc_pko;
@@ -92,7 +91,6 @@ struct cnmac_softc {
 	uint32_t		sc_port_type;
 	uint32_t		sc_init_flag;
 	int			sc_phy_addr;
-	int			sc_powgroup;
 
 	/*
 	 * Redirection - received (input) packets are redirected (directly sent)


-- 
wbr, Kirill