Download raw body.
octeon: commuliative patch LRO, cnmac queue and softens
On Sun, Apr 05, 2026 at 03:46:51PM +0200, Kirill A. Korinsky wrote:
> Here a cumulative diff (LRO + multiple queue) which address all your remakrs
> and excluded already commited parts by kn@ and me.
Some minor comments below:
> Index: sys/arch/octeon/dev/cn30xxpip.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/cn30xxpip.c,v
> diff -u -p -r1.11 cn30xxpip.c
> --- sys/arch/octeon/dev/cn30xxpip.c 28 Dec 2022 01:39:21 -0000 1.11
> +++ sys/arch/octeon/dev/cn30xxpip.c 5 Apr 2026 13:24:51 -0000
> @@ -57,6 +57,7 @@ cn30xxpip_init(struct cn30xxpip_attach_a
> sc->sc_regt = aa->aa_regt;
> sc->sc_tag_type = aa->aa_tag_type;
> sc->sc_receive_group = aa->aa_receive_group;
> + sc->sc_receive_group_order = aa->aa_receive_group_order;
> sc->sc_ip_offset = aa->aa_ip_offset;
>
> status = bus_space_map(sc->sc_regt, PIP_BASE, PIP_SIZE, 0,
> @@ -88,6 +89,7 @@ cn30xxpip_port_config(struct cn30xxpip_s
> uint64_t prt_cfg;
> uint64_t prt_tag;
> uint64_t ip_offset;
> + uint64_t group_mask;
>
> /*
> * Process the headers and place the IP header in the work queue
> @@ -108,22 +110,30 @@ cn30xxpip_port_config(struct cn30xxpip_s
> /* SKIP=0 */
>
> prt_tag = 0;
> - SET(prt_tag, PIP_PRT_TAGN_INC_PRT);
> - CLR(prt_tag, PIP_PRT_TAGN_IP6_DPRT);
> - CLR(prt_tag, PIP_PRT_TAGN_IP4_DPRT);
> - CLR(prt_tag, PIP_PRT_TAGN_IP6_SPRT);
> - CLR(prt_tag, PIP_PRT_TAGN_IP4_SPRT);
> + CLR(prt_tag, PIP_PRT_TAGN_INC_VLAN);
> + CLR(prt_tag, PIP_PRT_TAGN_INC_PRT);
> + SET(prt_tag, PIP_PRT_TAGN_IP6_DPRT);
> + SET(prt_tag, PIP_PRT_TAGN_IP4_DPRT);
> + SET(prt_tag, PIP_PRT_TAGN_IP6_SPRT);
> + SET(prt_tag, PIP_PRT_TAGN_IP4_SPRT);
> CLR(prt_tag, PIP_PRT_TAGN_IP6_NXTH);
> CLR(prt_tag, PIP_PRT_TAGN_IP4_PCTL);
> - CLR(prt_tag, PIP_PRT_TAGN_IP6_DST);
> - CLR(prt_tag, PIP_PRT_TAGN_IP4_SRC);
> - CLR(prt_tag, PIP_PRT_TAGN_IP6_SRC);
> - CLR(prt_tag, PIP_PRT_TAGN_IP4_DST);
> + SET(prt_tag, PIP_PRT_TAGN_IP6_DST);
> + SET(prt_tag, PIP_PRT_TAGN_IP4_SRC);
> + SET(prt_tag, PIP_PRT_TAGN_IP6_SRC);
> + SET(prt_tag, PIP_PRT_TAGN_IP4_DST);
> SET(prt_tag, PIP_PRT_TAGN_TCP6_TAG_ORDERED);
> SET(prt_tag, PIP_PRT_TAGN_TCP4_TAG_ORDERED);
> SET(prt_tag, PIP_PRT_TAGN_IP6_TAG_ORDERED);
> SET(prt_tag, PIP_PRT_TAGN_IP4_TAG_ORDERED);
> SET(prt_tag, PIP_PRT_TAGN_NON_TAG_ORDERED);
> + if (sc->sc_receive_group_order > 0) {
> + group_mask = ~((1U << sc->sc_receive_group_order) - 1U);
> + SET(prt_tag, ((uint64_t)sc->sc_receive_group << 36) &
> + PIP_PRT_TAGN_GRPTAGBASE);
> + SET(prt_tag, (group_mask << 32) & PIP_PRT_TAGN_GRPTAGMASK);
> + SET(prt_tag, PIP_PRT_TAGN_GRPTAG);
> + }
> SET(prt_tag, sc->sc_receive_group & PIP_PRT_TAGN_GRP);
>
> ip_offset = 0;
> Index: sys/arch/octeon/dev/cn30xxpipvar.h
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/cn30xxpipvar.h,v
> diff -u -p -r1.6 cn30xxpipvar.h
> --- sys/arch/octeon/dev/cn30xxpipvar.h 20 May 2024 23:13:33 -0000 1.6
> +++ sys/arch/octeon/dev/cn30xxpipvar.h 5 Apr 2026 08:57:01 -0000
> @@ -41,6 +41,7 @@ struct cn30xxpip_softc {
> bus_space_handle_t sc_regh_stat;
> int sc_tag_type;
> int sc_receive_group;
> + int sc_receive_group_order;
> size_t sc_ip_offset;
> };
>
> @@ -50,6 +51,7 @@ struct cn30xxpip_attach_args {
> bus_space_tag_t aa_regt;
> int aa_tag_type;
> int aa_receive_group;
> + int aa_receive_group_order;
> size_t aa_ip_offset;
> };
>
> Index: sys/arch/octeon/dev/if_cnmac.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/if_cnmac.c,v
> diff -u -p -r1.86 if_cnmac.c
> --- sys/arch/octeon/dev/if_cnmac.c 20 May 2024 23:13:33 -0000 1.86
> +++ sys/arch/octeon/dev/if_cnmac.c 5 Apr 2026 08:57:03 -0000
> @@ -55,6 +55,11 @@
> #include <net/if_media.h>
> #include <netinet/in.h>
> #include <netinet/if_ether.h>
> +#ifndef SMALL_KERNEL
> +#include <netinet/tcp.h>
> +#include <netinet/tcp_timer.h>
> +#include <netinet/tcp_var.h>
> +#endif
>
> #if NBPFILTER > 0
> #include <net/bpf.h>
> @@ -154,6 +159,11 @@ int cnmac_send(struct cnmac_softc *, str
> int cnmac_reset(struct cnmac_softc *);
> int cnmac_configure(struct cnmac_softc *);
> int cnmac_configure_common(struct cnmac_softc *);
> +unsigned int cnmac_rx_group_count(void);
> +unsigned int cnmac_rx_group_order(unsigned int);
> +void cnmac_rx_groups_init(void);
> +void cnmac_rx_groups_config(struct cn30xxpow_softc *);
> +void cnmac_rx_groups_barrier(void);
>
> void cnmac_free_task(void *);
> void cnmac_tick_free(void *arg);
> @@ -182,6 +192,15 @@ const struct cfattach cnmac_ca = {
>
> struct cfdriver cnmac_cd = { NULL, "cnmac", DV_IFNET };
>
> +#define CNMAC_PIP_PORT_MAX 64
> +
> +struct cnmac_rx_group {
> + unsigned int crg_group;
> + void *crg_ih;
> + char crg_name[IFNAMSIZ];
> + struct mbuf_list crg_rx_batch[CNMAC_PIP_PORT_MAX];
> +};
Because each instance of this struct is per-core state, it might be
good to use __aligned(CACHELINESIZE).
> +
> /* ---- buffer management */
>
> const struct cnmac_pool_param {
> @@ -204,7 +223,10 @@ uint64_t cnmac_mac_addr = 0;
> uint32_t cnmac_mac_addr_offset = 0;
>
> int cnmac_mbufs_to_alloc;
> -int cnmac_npowgroups = 0;
> +unsigned int cnmac_nrxgroups = 0;
> +unsigned int cnmac_nrxgroups_order = 0;
> +struct cnmac_softc *cnmac_port_softc[CNMAC_PIP_PORT_MAX];
> +struct cnmac_rx_group cnmac_rx_groups[OCTEON_POW_GROUP_MAX];
>
> void
> cnmac_buf_init(struct cnmac_softc *sc)
> @@ -225,6 +247,72 @@ cnmac_buf_init(struct cnmac_softc *sc)
> }
> }
>
> +unsigned int
> +cnmac_rx_group_count(void)
> +{
> + unsigned int count = 1;
> + unsigned int target = softnet_count();
> +
> + while (count < target && count < OCTEON_POW_GROUP_MAX)
> + count <<= 1;
This finds the smallest power of two that is at least 'target'.
For example, if you have three active cores (for some weird reason),
you get four groups. Is this overshoot intentional?
> +
> + return count;
> +}
> +
> +unsigned int
> +cnmac_rx_group_order(unsigned int count)
> +{
> + unsigned int order = 0;
> +
> + while ((1U << order) < count)
> + order++;
> +
> + return order;
> +}
> +
> +void
> +cnmac_rx_groups_init(void)
> +{
> + struct cnmac_rx_group *crg;
> + unsigned int i;
> +
> + if (cnmac_nrxgroups != 0)
> + return;
> +
> + cnmac_nrxgroups = cnmac_rx_group_count();
> + cnmac_nrxgroups_order = cnmac_rx_group_order(cnmac_nrxgroups);
I think this cnmac_nrxgroups calculation could be shortened, and kept
inline in cnmac_rx_groups_init():
target = softnet_count();
target = min(target, OCTEON_POW_GROUP_MAX);
cnmac_nrxgroups_order = fls(target) - 1;
cnmac_nrxgroups = 1U << cnmac_nrxgroups_order;
The above caps the number of groups with 'target'.
> +
> + for (i = 0; i < cnmac_nrxgroups; i++) {
> + crg = &cnmac_rx_groups[i];
> + crg->crg_group = i;
> + snprintf(crg->crg_name, sizeof(crg->crg_name),
> + "cnmacrx%u", i);
> + crg->crg_ih = octeon_intr_establish(POW_WORKQ_IRQ(i),
> + IPL_NET | IPL_MPSAFE, cnmac_intr, crg, crg->crg_name);
> + if (crg->crg_ih == NULL)
> + panic("%s: could not set up interrupt",
> + crg->crg_name);
> + }
> +}
> +
> +void
> +cnmac_rx_groups_config(struct cn30xxpow_softc *pow)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < cnmac_nrxgroups; i++)
> + cn30xxpow_config(pow, i);
> +}
> +
> +void
> +cnmac_rx_groups_barrier(void)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < cnmac_nrxgroups; i++)
> + intr_barrier(cnmac_rx_groups[i].crg_ih);
> +}
> +
> /* ---- autoconf */
>
> int
> @@ -246,11 +334,6 @@ cnmac_attach(struct device *parent, stru
> struct cn30xxgmx_attach_args *ga = aux;
> struct ifnet *ifp = &sc->sc_arpcom.ac_if;
>
> - if (cnmac_npowgroups >= OCTEON_POW_GROUP_MAX) {
> - printf(": out of POW groups\n");
> - return;
> - }
> -
> atomic_add_int(&cnmac_mbufs_to_alloc,
> cnmac_mbuf_alloc(CNMAC_MBUFS_PER_PORT));
>
> @@ -262,7 +345,6 @@ cnmac_attach(struct device *parent, stru
> sc->sc_gmx_port = ga->ga_gmx_port;
> sc->sc_smi = ga->ga_smi;
> sc->sc_phy_addr = ga->ga_phy_addr;
> - sc->sc_powgroup = cnmac_npowgroups++;
>
> sc->sc_init_flag = 0;
>
> @@ -282,6 +364,10 @@ cnmac_attach(struct device *parent, stru
> task_set(&sc->sc_free_task, cnmac_free_task, sc);
> timeout_set(&sc->sc_tick_misc_ch, cnmac_tick_misc, sc);
> timeout_set(&sc->sc_tick_free_ch, cnmac_tick_free, sc);
> + cnmac_rx_groups_init();
> + KASSERT(sc->sc_port < nitems(cnmac_port_softc));
> + KASSERT(cnmac_port_softc[sc->sc_port] == NULL);
> + cnmac_port_softc[sc->sc_port] = sc;
>
> cn30xxfau_op_init(&sc->sc_fau_done,
> OCTEON_CVMSEG_ETHER_OFFSET(sc->sc_dev.dv_unit, csm_ether_fau_done),
> @@ -307,6 +393,9 @@ cnmac_attach(struct device *parent, stru
> ifp->if_softc = sc;
> ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
> ifp->if_xflags = IFXF_MPSAFE;
> +#ifndef SMALL_KERNEL
> + ifp->if_xflags |= IFXF_LRO;
> +#endif
> ifp->if_ioctl = cnmac_ioctl;
> ifp->if_qstart = cnmac_start;
> ifp->if_watchdog = cnmac_watchdog;
> @@ -315,22 +404,21 @@ cnmac_attach(struct device *parent, stru
>
> ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_TCPv4 |
> IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> +#ifndef SMALL_KERNEL
> + ifp->if_capabilities |= IFCAP_LRO;
> +#endif
>
> cn30xxgmx_set_filter(sc->sc_gmx_port);
>
> if_attach(ifp);
> ether_ifattach(ifp);
> + if_attach_iqueues(ifp, cnmac_nrxgroups);
>
> cnmac_buf_init(sc);
>
> #if NKSTAT > 0
> cnmac_kstat_attach(sc);
> #endif
> -
> - sc->sc_ih = octeon_intr_establish(POW_WORKQ_IRQ(sc->sc_powgroup),
> - IPL_NET | IPL_MPSAFE, cnmac_intr, sc, sc->sc_dev.dv_xname);
> - if (sc->sc_ih == NULL)
> - panic("%s: could not set up interrupt", sc->sc_dev.dv_xname);
> }
>
> /* ---- submodules */
> @@ -343,7 +431,8 @@ cnmac_pip_init(struct cnmac_softc *sc)
> pip_aa.aa_port = sc->sc_port;
> pip_aa.aa_regt = sc->sc_regt;
> pip_aa.aa_tag_type = POW_TAG_TYPE_ORDERED/* XXX */;
> - pip_aa.aa_receive_group = sc->sc_powgroup;
> + pip_aa.aa_receive_group = 0;
> + pip_aa.aa_receive_group_order = cnmac_nrxgroups_order;
> pip_aa.aa_ip_offset = sc->sc_ip_offset;
> cn30xxpip_init(&pip_aa, &sc->sc_pip);
> cn30xxpip_port_config(sc->sc_pip);
> @@ -1026,7 +1115,7 @@ cnmac_stop(struct ifnet *ifp, int disabl
>
> cn30xxgmx_port_enable(sc->sc_gmx_port, 0);
>
> - intr_barrier(sc->sc_ih);
> + cnmac_rx_groups_barrier();
> ifq_barrier(&ifp->if_snd);
>
> ifq_clr_oactive(&ifp->if_snd);
> @@ -1058,7 +1147,7 @@ cnmac_configure(struct cnmac_softc *sc)
>
> cn30xxpko_port_config(sc->sc_pko);
> cn30xxpko_port_enable(sc->sc_pko, 1);
> - cn30xxpow_config(sc->sc_pow, sc->sc_powgroup);
> + cnmac_rx_groups_config(sc->sc_pow);
>
> cn30xxgmx_port_enable(sc->sc_gmx_port, 1);
>
> @@ -1212,9 +1301,13 @@ cnmac_recv(struct cnmac_softc *sc, uint6
> {
> struct ifnet *ifp = &sc->sc_arpcom.ac_if;
> struct mbuf *m;
> - uint64_t word2;
> + uint64_t word1, word2;
> int nmbuf = 0;
> +#ifndef SMALL_KERNEL
> + struct ether_extracted ext;
> +#endif
>
> + word1 = work[1];
> word2 = work[2];
>
> if (!(ifp->if_flags & IFF_RUNNING))
> @@ -1232,6 +1325,8 @@ cnmac_recv(struct cnmac_softc *sc, uint6
> }
>
> m->m_pkthdr.csum_flags = 0;
> + m->m_pkthdr.ph_flowid = word1 & PIP_WQE_WORD1_TAG;
> + SET(m->m_pkthdr.csum_flags, M_FLOWID);
> if (__predict_true(!ISSET(word2, PIP_WQE_WORD2_IP_NI))) {
> /* Check IP checksum status. */
> if (!ISSET(word2, PIP_WQE_WORD2_IP_V6) &&
> @@ -1246,7 +1341,19 @@ cnmac_recv(struct cnmac_softc *sc, uint6
> M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK;
> }
>
> - ml_enqueue(ml, m);
> +#ifndef SMALL_KERNEL
> + if (__predict_true(ISSET(ifp->if_xflags, IFXF_LRO)) &&
> + __predict_true(!ISSET(word2, PIP_WQE_WORD2_IP_NI)) &&
> + ISSET(word2, PIP_WQE_WORD2_IP_TU) &&
> + !ISSET(word2, PIP_WQE_WORD2_IP_FR | PIP_WQE_WORD2_IP_LE)) {
> + ether_extract_headers(m, &ext);
> + if (ext.tcp != NULL)
> + tcp_softlro_glue(ml, m, ifp);
> + else
> + ml_enqueue(ml, m);
> + } else
> +#endif
> + ml_enqueue(ml, m);
>
> return nmbuf;
>
> @@ -1258,16 +1365,20 @@ drop:
> int
> cnmac_intr(void *arg)
> {
> - struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> - struct cnmac_softc *sc = arg;
> - struct ifnet *ifp = &sc->sc_arpcom.ac_if;
> + struct cnmac_rx_group *crg = arg;
> + struct cn30xxpow_softc *pow = &cn30xxpow_softc;
> + struct cnmac_softc *sc;
> + struct ifnet *ifp;
> + struct mbuf_list *ml;
> uint64_t *work;
> - uint64_t wqmask = 1ull << sc->sc_powgroup;
> + uint64_t pending = 0;
> + uint64_t wqmask = 1ull << crg->crg_group;
> uint32_t coreid = octeon_get_coreid();
> - uint32_t port;
> + unsigned int port;
> + unsigned int i;
> int nmbuf = 0;
>
> - _POW_WR8(sc->sc_pow, POW_PP_GRP_MSK_OFFSET(coreid), wqmask);
> + _POW_WR8(pow, POW_PP_GRP_MSK_OFFSET(coreid), wqmask);
>
> cn30xxpow_tag_sw_wait();
> cn30xxpow_work_request_async(OCTEON_CVMSEG_OFFSET(csm_pow_intr),
> @@ -1284,18 +1395,30 @@ cnmac_intr(void *arg)
> OCTEON_CVMSEG_OFFSET(csm_pow_intr), POW_NO_WAIT);
>
> port = (work[1] & PIP_WQE_WORD1_IPRT) >> 42;
> - if (port != sc->sc_port) {
> - printf("%s: unexpected wqe port %u, should be %u\n",
> - sc->sc_dev.dv_xname, port, sc->sc_port);
> + if (port >= nitems(cnmac_port_softc) ||
> + (sc = cnmac_port_softc[port]) == NULL) {
> + printf("%s: unexpected wqe port %u\n",
> + crg->crg_name, port);
> goto wqe_error;
> }
>
> - nmbuf += cnmac_recv(sc, work, &ml);
> + if ((pending & (1ULL << port)) == 0) {
> + ml_init(&crg->crg_rx_batch[port]);
> + pending |= 1ULL << port;
> + }
> + nmbuf += cnmac_recv(sc, work, &crg->crg_rx_batch[port]);
> }
>
> - _POW_WR8(sc->sc_pow, POW_WQ_INT_OFFSET, wqmask);
> + _POW_WR8(pow, POW_WQ_INT_OFFSET, wqmask);
>
> - if_input(ifp, &ml);
> + while (pending) {
> + i = __builtin_ffsll(pending) - 1;
> + sc = cnmac_port_softc[i];
> + ifp = &sc->sc_arpcom.ac_if;
> + ml = &crg->crg_rx_batch[i];
> + ifiq_input(ifp->if_iqs[crg->crg_group], ml);
> + pending &= pending - 1;
Right, this is a nifty way to clear the least significant 1-bit.
> + }
>
> nmbuf = cnmac_mbuf_alloc(nmbuf);
> if (nmbuf != 0)
> Index: sys/arch/octeon/dev/if_cnmacvar.h
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/octeon/dev/if_cnmacvar.h,v
> diff -u -p -r1.20 if_cnmacvar.h
> --- sys/arch/octeon/dev/if_cnmacvar.h 28 Dec 2022 01:39:21 -0000 1.20
> +++ sys/arch/octeon/dev/if_cnmacvar.h 5 Apr 2026 08:57:01 -0000
> @@ -63,7 +63,6 @@ struct cnmac_softc {
>
> bus_dmamap_t sc_dmap;
>
> - void *sc_ih;
> struct cn30xxpip_softc *sc_pip;
> struct cn30xxipd_softc *sc_ipd;
> struct cn30xxpko_softc *sc_pko;
> @@ -92,7 +91,6 @@ struct cnmac_softc {
> uint32_t sc_port_type;
> uint32_t sc_init_flag;
> int sc_phy_addr;
> - int sc_powgroup;
>
> /*
> * Redirection - received (input) packets are redirected (directly sent)
>
>
> --
> wbr, Kirill
>
octeon: commuliative patch LRO, cnmac queue and softens