Index | Thread | Search

From:
Visa Hankala <visa@hankala.org>
Subject:
Re: octeon: commuliative patch LRO, cnmac queue and softens
To:
"Kirill A. Korinsky" <kirill@korins.ky>
Cc:
tech@openbsd.org
Date:
Sun, 26 Apr 2026 16:22:35 +0000

Download raw body.

Thread
On Sun, Apr 05, 2026 at 03:46:51PM +0200, Kirill A. Korinsky wrote:
> Here a cumulative diff (LRO + multiple queue) which address all your remakrs
> and excluded already commited parts by kn@ and me.

Some minor comments below:

> Index: sys/arch/octeon/dev/cn30xxpip.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/cn30xxpip.c,v
> diff -u -p -r1.11 cn30xxpip.c
> --- sys/arch/octeon/dev/cn30xxpip.c	28 Dec 2022 01:39:21 -0000	1.11
> +++ sys/arch/octeon/dev/cn30xxpip.c	5 Apr 2026 13:24:51 -0000
> @@ -57,6 +57,7 @@ cn30xxpip_init(struct cn30xxpip_attach_a
>  	sc->sc_regt = aa->aa_regt;
>  	sc->sc_tag_type = aa->aa_tag_type;
>  	sc->sc_receive_group = aa->aa_receive_group;
> +	sc->sc_receive_group_order = aa->aa_receive_group_order;
>  	sc->sc_ip_offset = aa->aa_ip_offset;
>  
>  	status = bus_space_map(sc->sc_regt, PIP_BASE, PIP_SIZE, 0,
> @@ -88,6 +89,7 @@ cn30xxpip_port_config(struct cn30xxpip_s
>  	uint64_t prt_cfg;
>  	uint64_t prt_tag;
>  	uint64_t ip_offset;
> +	uint64_t group_mask;
>  
>  	/*
>  	 * Process the headers and place the IP header in the work queue
> @@ -108,22 +110,30 @@ cn30xxpip_port_config(struct cn30xxpip_s
>  	/* SKIP=0 */
>  
>  	prt_tag = 0;
> -	SET(prt_tag, PIP_PRT_TAGN_INC_PRT);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP6_DPRT);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP4_DPRT);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP6_SPRT);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP4_SPRT);
> +	CLR(prt_tag, PIP_PRT_TAGN_INC_VLAN);
> +	CLR(prt_tag, PIP_PRT_TAGN_INC_PRT);
> +	SET(prt_tag, PIP_PRT_TAGN_IP6_DPRT);
> +	SET(prt_tag, PIP_PRT_TAGN_IP4_DPRT);
> +	SET(prt_tag, PIP_PRT_TAGN_IP6_SPRT);
> +	SET(prt_tag, PIP_PRT_TAGN_IP4_SPRT);
>  	CLR(prt_tag, PIP_PRT_TAGN_IP6_NXTH);
>  	CLR(prt_tag, PIP_PRT_TAGN_IP4_PCTL);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP6_DST);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP4_SRC);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP6_SRC);
> -	CLR(prt_tag, PIP_PRT_TAGN_IP4_DST);
> +	SET(prt_tag, PIP_PRT_TAGN_IP6_DST);
> +	SET(prt_tag, PIP_PRT_TAGN_IP4_SRC);
> +	SET(prt_tag, PIP_PRT_TAGN_IP6_SRC);
> +	SET(prt_tag, PIP_PRT_TAGN_IP4_DST);
>  	SET(prt_tag, PIP_PRT_TAGN_TCP6_TAG_ORDERED);
>  	SET(prt_tag, PIP_PRT_TAGN_TCP4_TAG_ORDERED);
>  	SET(prt_tag, PIP_PRT_TAGN_IP6_TAG_ORDERED);
>  	SET(prt_tag, PIP_PRT_TAGN_IP4_TAG_ORDERED);
>  	SET(prt_tag, PIP_PRT_TAGN_NON_TAG_ORDERED);
> +	if (sc->sc_receive_group_order > 0) {
> +		group_mask = ~((1U << sc->sc_receive_group_order) - 1U);
> +		SET(prt_tag, ((uint64_t)sc->sc_receive_group << 36) &
> +		    PIP_PRT_TAGN_GRPTAGBASE);
> +		SET(prt_tag, (group_mask << 32) & PIP_PRT_TAGN_GRPTAGMASK);
> +		SET(prt_tag, PIP_PRT_TAGN_GRPTAG);
> +	}
>  	SET(prt_tag, sc->sc_receive_group & PIP_PRT_TAGN_GRP);
>  
>  	ip_offset = 0;
> Index: sys/arch/octeon/dev/cn30xxpipvar.h
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/cn30xxpipvar.h,v
> diff -u -p -r1.6 cn30xxpipvar.h
> --- sys/arch/octeon/dev/cn30xxpipvar.h	20 May 2024 23:13:33 -0000	1.6
> +++ sys/arch/octeon/dev/cn30xxpipvar.h	5 Apr 2026 08:57:01 -0000
> @@ -41,6 +41,7 @@ struct cn30xxpip_softc {
>  	bus_space_handle_t	sc_regh_stat;
>  	int			sc_tag_type;
>  	int			sc_receive_group;
> +	int			sc_receive_group_order;
>  	size_t			sc_ip_offset;
>  };
>  
> @@ -50,6 +51,7 @@ struct cn30xxpip_attach_args {
>  	bus_space_tag_t		aa_regt;
>  	int			aa_tag_type;
>  	int			aa_receive_group;
> +	int			aa_receive_group_order;
>  	size_t			aa_ip_offset;
>  };
>  
> Index: sys/arch/octeon/dev/if_cnmac.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/if_cnmac.c,v
> diff -u -p -r1.86 if_cnmac.c
> --- sys/arch/octeon/dev/if_cnmac.c	20 May 2024 23:13:33 -0000	1.86
> +++ sys/arch/octeon/dev/if_cnmac.c	5 Apr 2026 08:57:03 -0000
> @@ -55,6 +55,11 @@
>  #include <net/if_media.h>
>  #include <netinet/in.h>
>  #include <netinet/if_ether.h>
> +#ifndef SMALL_KERNEL
> +#include <netinet/tcp.h>
> +#include <netinet/tcp_timer.h>
> +#include <netinet/tcp_var.h>
> +#endif
>  
>  #if NBPFILTER > 0
>  #include <net/bpf.h>
> @@ -154,6 +159,11 @@ int	cnmac_send(struct cnmac_softc *, str
>  int	cnmac_reset(struct cnmac_softc *);
>  int	cnmac_configure(struct cnmac_softc *);
>  int	cnmac_configure_common(struct cnmac_softc *);
> +unsigned int cnmac_rx_group_count(void);
> +unsigned int cnmac_rx_group_order(unsigned int);
> +void	cnmac_rx_groups_init(void);
> +void	cnmac_rx_groups_config(struct cn30xxpow_softc *);
> +void	cnmac_rx_groups_barrier(void);
>  
>  void	cnmac_free_task(void *);
>  void	cnmac_tick_free(void *arg);
> @@ -182,6 +192,15 @@ const struct cfattach cnmac_ca = {
>  
>  struct cfdriver cnmac_cd = { NULL, "cnmac", DV_IFNET };
>  
> +#define CNMAC_PIP_PORT_MAX	64
> +
> +struct cnmac_rx_group {
> +	unsigned int		crg_group;
> +	void			*crg_ih;
> +	char			crg_name[IFNAMSIZ];
> +	struct mbuf_list	crg_rx_batch[CNMAC_PIP_PORT_MAX];
> +};

Because each instance of this struct is per-core state, it might be
good to use __aligned(CACHELINESIZE).

> +
>  /* ---- buffer management */
>  
>  const struct cnmac_pool_param {
> @@ -204,7 +223,10 @@ uint64_t cnmac_mac_addr = 0;
>  uint32_t cnmac_mac_addr_offset = 0;
>  
>  int	cnmac_mbufs_to_alloc;
> -int	cnmac_npowgroups = 0;
> +unsigned int cnmac_nrxgroups = 0;
> +unsigned int cnmac_nrxgroups_order = 0;
> +struct cnmac_softc *cnmac_port_softc[CNMAC_PIP_PORT_MAX];
> +struct cnmac_rx_group cnmac_rx_groups[OCTEON_POW_GROUP_MAX];
>  
>  void
>  cnmac_buf_init(struct cnmac_softc *sc)
> @@ -225,6 +247,72 @@ cnmac_buf_init(struct cnmac_softc *sc)
>  	}
>  }
>  
> +unsigned int
> +cnmac_rx_group_count(void)
> +{
> +	unsigned int count = 1;
> +	unsigned int target = softnet_count();
> +
> +	while (count < target && count < OCTEON_POW_GROUP_MAX)
> +		count <<= 1;

This finds the smallest power of two that is at least 'target'.
For example, if you have three active cores (for some weird reason),
you get four groups. Is this overshoot intentional?

> +
> +	return count;
> +}
> +
> +unsigned int
> +cnmac_rx_group_order(unsigned int count)
> +{
> +	unsigned int order = 0;
> +
> +	while ((1U << order) < count)
> +		order++;
> +
> +	return order;
> +}
> +
> +void
> +cnmac_rx_groups_init(void)
> +{
> +	struct cnmac_rx_group *crg;
> +	unsigned int i;
> +
> +	if (cnmac_nrxgroups != 0)
> +		return;
> +
> +	cnmac_nrxgroups = cnmac_rx_group_count();
> +	cnmac_nrxgroups_order = cnmac_rx_group_order(cnmac_nrxgroups);

I think this cnmac_nrxgroups calculation could be shortened, and kept
inline in cnmac_rx_groups_init():

	target = softnet_count();
	target = min(target, OCTEON_POW_GROUP_MAX);
	cnmac_nrxgroups_order = fls(target) - 1;
	cnmac_nrxgroups = 1U << cnmac_nrxgroups_order;

The above caps the number of groups with 'target'.

> +
> +	for (i = 0; i < cnmac_nrxgroups; i++) {
> +		crg = &cnmac_rx_groups[i];
> +		crg->crg_group = i;
> +		snprintf(crg->crg_name, sizeof(crg->crg_name),
> +		    "cnmacrx%u", i);
> +		crg->crg_ih = octeon_intr_establish(POW_WORKQ_IRQ(i),
> +		    IPL_NET | IPL_MPSAFE, cnmac_intr, crg, crg->crg_name);
> +		if (crg->crg_ih == NULL)
> +			panic("%s: could not set up interrupt",
> +			    crg->crg_name);
> +	}
> +}
> +
> +void
> +cnmac_rx_groups_config(struct cn30xxpow_softc *pow)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < cnmac_nrxgroups; i++)
> +		cn30xxpow_config(pow, i);
> +}
> +
> +void
> +cnmac_rx_groups_barrier(void)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < cnmac_nrxgroups; i++)
> +		intr_barrier(cnmac_rx_groups[i].crg_ih);
> +}
> +
>  /* ---- autoconf */
>  
>  int
> @@ -246,11 +334,6 @@ cnmac_attach(struct device *parent, stru
>  	struct cn30xxgmx_attach_args *ga = aux;
>  	struct ifnet *ifp = &sc->sc_arpcom.ac_if;
>  
> -	if (cnmac_npowgroups >= OCTEON_POW_GROUP_MAX) {
> -		printf(": out of POW groups\n");
> -		return;
> -	}
> -
>  	atomic_add_int(&cnmac_mbufs_to_alloc,
>  	    cnmac_mbuf_alloc(CNMAC_MBUFS_PER_PORT));
>  
> @@ -262,7 +345,6 @@ cnmac_attach(struct device *parent, stru
>  	sc->sc_gmx_port = ga->ga_gmx_port;
>  	sc->sc_smi = ga->ga_smi;
>  	sc->sc_phy_addr = ga->ga_phy_addr;
> -	sc->sc_powgroup = cnmac_npowgroups++;
>  
>  	sc->sc_init_flag = 0;
>  
> @@ -282,6 +364,10 @@ cnmac_attach(struct device *parent, stru
>  	task_set(&sc->sc_free_task, cnmac_free_task, sc);
>  	timeout_set(&sc->sc_tick_misc_ch, cnmac_tick_misc, sc);
>  	timeout_set(&sc->sc_tick_free_ch, cnmac_tick_free, sc);
> +	cnmac_rx_groups_init();
> +	KASSERT(sc->sc_port < nitems(cnmac_port_softc));
> +	KASSERT(cnmac_port_softc[sc->sc_port] == NULL);
> +	cnmac_port_softc[sc->sc_port] = sc;
>  
>  	cn30xxfau_op_init(&sc->sc_fau_done,
>  	    OCTEON_CVMSEG_ETHER_OFFSET(sc->sc_dev.dv_unit, csm_ether_fau_done),
> @@ -307,6 +393,9 @@ cnmac_attach(struct device *parent, stru
>  	ifp->if_softc = sc;
>  	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
>  	ifp->if_xflags = IFXF_MPSAFE;
> +#ifndef SMALL_KERNEL
> +	ifp->if_xflags |= IFXF_LRO;
> +#endif
>  	ifp->if_ioctl = cnmac_ioctl;
>  	ifp->if_qstart = cnmac_start;
>  	ifp->if_watchdog = cnmac_watchdog;
> @@ -315,22 +404,21 @@ cnmac_attach(struct device *parent, stru
>  
>  	ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_TCPv4 |
>  	    IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> +#ifndef SMALL_KERNEL
> +	ifp->if_capabilities |= IFCAP_LRO;
> +#endif
>  
>  	cn30xxgmx_set_filter(sc->sc_gmx_port);
>  
>  	if_attach(ifp);
>  	ether_ifattach(ifp);
> +	if_attach_iqueues(ifp, cnmac_nrxgroups);
>  
>  	cnmac_buf_init(sc);
>  
>  #if NKSTAT > 0
>  	cnmac_kstat_attach(sc);
>  #endif
> -
> -	sc->sc_ih = octeon_intr_establish(POW_WORKQ_IRQ(sc->sc_powgroup),
> -	    IPL_NET | IPL_MPSAFE, cnmac_intr, sc, sc->sc_dev.dv_xname);
> -	if (sc->sc_ih == NULL)
> -		panic("%s: could not set up interrupt", sc->sc_dev.dv_xname);
>  }
>  
>  /* ---- submodules */
> @@ -343,7 +431,8 @@ cnmac_pip_init(struct cnmac_softc *sc)
>  	pip_aa.aa_port = sc->sc_port;
>  	pip_aa.aa_regt = sc->sc_regt;
>  	pip_aa.aa_tag_type = POW_TAG_TYPE_ORDERED/* XXX */;
> -	pip_aa.aa_receive_group = sc->sc_powgroup;
> +	pip_aa.aa_receive_group = 0;
> +	pip_aa.aa_receive_group_order = cnmac_nrxgroups_order;
>  	pip_aa.aa_ip_offset = sc->sc_ip_offset;
>  	cn30xxpip_init(&pip_aa, &sc->sc_pip);
>  	cn30xxpip_port_config(sc->sc_pip);
> @@ -1026,7 +1115,7 @@ cnmac_stop(struct ifnet *ifp, int disabl
>  
>  	cn30xxgmx_port_enable(sc->sc_gmx_port, 0);
>  
> -	intr_barrier(sc->sc_ih);
> +	cnmac_rx_groups_barrier();
>  	ifq_barrier(&ifp->if_snd);
>  
>  	ifq_clr_oactive(&ifp->if_snd);
> @@ -1058,7 +1147,7 @@ cnmac_configure(struct cnmac_softc *sc)
>  
>  	cn30xxpko_port_config(sc->sc_pko);
>  	cn30xxpko_port_enable(sc->sc_pko, 1);
> -	cn30xxpow_config(sc->sc_pow, sc->sc_powgroup);
> +	cnmac_rx_groups_config(sc->sc_pow);
>  
>  	cn30xxgmx_port_enable(sc->sc_gmx_port, 1);
>  
> @@ -1212,9 +1301,13 @@ cnmac_recv(struct cnmac_softc *sc, uint6
>  {
>  	struct ifnet *ifp = &sc->sc_arpcom.ac_if;
>  	struct mbuf *m;
> -	uint64_t word2;
> +	uint64_t word1, word2;
>  	int nmbuf = 0;
> +#ifndef SMALL_KERNEL
> +	struct ether_extracted ext;
> +#endif
>  
> +	word1 = work[1];
>  	word2 = work[2];
>  
>  	if (!(ifp->if_flags & IFF_RUNNING))
> @@ -1232,6 +1325,8 @@ cnmac_recv(struct cnmac_softc *sc, uint6
>  	}
>  
>  	m->m_pkthdr.csum_flags = 0;
> +	m->m_pkthdr.ph_flowid = word1 & PIP_WQE_WORD1_TAG;
> +	SET(m->m_pkthdr.csum_flags, M_FLOWID);
>  	if (__predict_true(!ISSET(word2, PIP_WQE_WORD2_IP_NI))) {
>  		/* Check IP checksum status. */
>  		if (!ISSET(word2, PIP_WQE_WORD2_IP_V6) &&
> @@ -1246,7 +1341,19 @@ cnmac_recv(struct cnmac_softc *sc, uint6
>  			    M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK;
>  	}
>  
> -	ml_enqueue(ml, m);
> +#ifndef SMALL_KERNEL
> +	if (__predict_true(ISSET(ifp->if_xflags, IFXF_LRO)) &&
> +	    __predict_true(!ISSET(word2, PIP_WQE_WORD2_IP_NI)) &&
> +	    ISSET(word2, PIP_WQE_WORD2_IP_TU) &&
> +	    !ISSET(word2, PIP_WQE_WORD2_IP_FR | PIP_WQE_WORD2_IP_LE)) {
> +		ether_extract_headers(m, &ext);
> +		if (ext.tcp != NULL)
> +			tcp_softlro_glue(ml, m, ifp);
> +		else
> +			ml_enqueue(ml, m);
> +	} else
> +#endif
> +		ml_enqueue(ml, m);
>  
>  	return nmbuf;
>  
> @@ -1258,16 +1365,20 @@ drop:
>  int
>  cnmac_intr(void *arg)
>  {
> -	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> -	struct cnmac_softc *sc = arg;
> -	struct ifnet *ifp = &sc->sc_arpcom.ac_if;
> +	struct cnmac_rx_group *crg = arg;
> +	struct cn30xxpow_softc *pow = &cn30xxpow_softc;
> +	struct cnmac_softc *sc;
> +	struct ifnet *ifp;
> +	struct mbuf_list *ml;
>  	uint64_t *work;
> -	uint64_t wqmask = 1ull << sc->sc_powgroup;
> +	uint64_t pending = 0;
> +	uint64_t wqmask = 1ull << crg->crg_group;
>  	uint32_t coreid = octeon_get_coreid();
> -	uint32_t port;
> +	unsigned int port;
> +	unsigned int i;
>  	int nmbuf = 0;
>  
> -	_POW_WR8(sc->sc_pow, POW_PP_GRP_MSK_OFFSET(coreid), wqmask);
> +	_POW_WR8(pow, POW_PP_GRP_MSK_OFFSET(coreid), wqmask);
>  
>  	cn30xxpow_tag_sw_wait();
>  	cn30xxpow_work_request_async(OCTEON_CVMSEG_OFFSET(csm_pow_intr),
> @@ -1284,18 +1395,30 @@ cnmac_intr(void *arg)
>  		    OCTEON_CVMSEG_OFFSET(csm_pow_intr), POW_NO_WAIT);
>  
>  		port = (work[1] & PIP_WQE_WORD1_IPRT) >> 42;
> -		if (port != sc->sc_port) {
> -			printf("%s: unexpected wqe port %u, should be %u\n",
> -			    sc->sc_dev.dv_xname, port, sc->sc_port);
> +		if (port >= nitems(cnmac_port_softc) ||
> +		    (sc = cnmac_port_softc[port]) == NULL) {
> +			printf("%s: unexpected wqe port %u\n",
> +			    crg->crg_name, port);
>  			goto wqe_error;
>  		}
>  
> -		nmbuf += cnmac_recv(sc, work, &ml);
> +		if ((pending & (1ULL << port)) == 0) {
> +			ml_init(&crg->crg_rx_batch[port]);
> +			pending |= 1ULL << port;
> +		}
> +		nmbuf += cnmac_recv(sc, work, &crg->crg_rx_batch[port]);
>  	}
>  
> -	_POW_WR8(sc->sc_pow, POW_WQ_INT_OFFSET, wqmask);
> +	_POW_WR8(pow, POW_WQ_INT_OFFSET, wqmask);
>  
> -	if_input(ifp, &ml);
> +	while (pending) {
> +		i = __builtin_ffsll(pending) - 1;
> +		sc = cnmac_port_softc[i];
> +		ifp = &sc->sc_arpcom.ac_if;
> +		ml = &crg->crg_rx_batch[i];
> +		ifiq_input(ifp->if_iqs[crg->crg_group], ml);
> +		pending &= pending - 1;

Right, this is a nifty way to clear the least significant 1-bit.

> +	}
>  
>  	nmbuf = cnmac_mbuf_alloc(nmbuf);
>  	if (nmbuf != 0)
> Index: sys/arch/octeon/dev/if_cnmacvar.h
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/if_cnmacvar.h,v
> diff -u -p -r1.20 if_cnmacvar.h
> --- sys/arch/octeon/dev/if_cnmacvar.h	28 Dec 2022 01:39:21 -0000	1.20
> +++ sys/arch/octeon/dev/if_cnmacvar.h	5 Apr 2026 08:57:01 -0000
> @@ -63,7 +63,6 @@ struct cnmac_softc {
>  
>  	bus_dmamap_t		sc_dmap;
>  
> -	void			*sc_ih;
>  	struct cn30xxpip_softc	*sc_pip;
>  	struct cn30xxipd_softc	*sc_ipd;
>  	struct cn30xxpko_softc	*sc_pko;
> @@ -92,7 +91,6 @@ struct cnmac_softc {
>  	uint32_t		sc_port_type;
>  	uint32_t		sc_init_flag;
>  	int			sc_phy_addr;
> -	int			sc_powgroup;
>  
>  	/*
>  	 * Redirection - received (input) packets are redirected (directly sent)
> 
> 
> -- 
> wbr, Kirill
>