hardware LRO support for bnxt(4)
On Tue, Oct 28, 2025 at 01:58:54PM +0100, Stefan Sperling wrote:
> This patch adds support for hardware LRO to bnxt(4).
>
> With this patch, all currently supported devices start using hardware LRO.
> The softLRO path is now unused. But it is kept intact because we will still
> need it for newer devices ("P5" and up) which jmatthew@ is planning to add.
>
> Buffers on TPA-enabled rings can be completed out-of-order. Rather than
> freeing consumed slots on aggregation rings sequentially we must maintain
> a bitmap which keeps track of free slots. This avoids freeing slots the
> device is still using.
>
> Tested on:
>
> bnxt0 at pci14 dev 0 function 0 "Broadcom BCM57414" rev 0x01: fw ver 234.0.150, msix, 8 queues, address xx:xx:xx:xx:xx:xx
>
>
> 357020 1238298984 9906.392 100.00%
> Conn: 1 Mbps: 9906.392 Peak Mbps: 9916.791 Avg Mbps: 9906.392
>
> 0 input LRO generated packets glued in software
> 143069252 input LRO generated packets from device
> 16803037 input LRO small packets received from network
> 0 bad LRO packets dropped
>
> The patch requires my very recently committed "bnxt pci id fix" patch
> to compile.
>
> ok?
I haven't actually tried this yet; I've just been going through how it works
in my head. It generally looks good, but I think there are some edge cases
that need better handling, and some things I'm not sure about.
>
>
> M sys/dev/pci/if_bnxt.c | 352+ 80-
>
> 1 file changed, 352 insertions(+), 80 deletions(-)
>
> commit - 79c7ebb7c5ed81e0595f0844e09ea98df93f37e1
> commit + 1337fda361508a90fb10fd17a1e4af6e8c2dbad8
> blob - cfa5c59544fcf775cb7b29dddb2090b439ba4104
> blob + 36b0f798db8dc834291959fc98ad1e40c62a410e
> --- sys/dev/pci/if_bnxt.c
> +++ sys/dev/pci/if_bnxt.c
> @@ -97,6 +97,8 @@
> #define BNXT_MAX_TX_SEGS 31
> #define BNXT_TX_SLOTS(bs) (bs->bs_map->dm_nsegs + 1)
>
> +#define BNXT_MAX_RX_SEGS 32
Using this as max_agg_segs in the vnic tpa config means it'll combine up to
this many received TCP segments off the wire into an aggregation; does this
mean it'll also use that many buffers off the ag ring, or does it pack
multiple segments into each ag buffer? I guess it packs them, since you can
let it aggregate far more segments than the number of buffers it can use in
a single aggregation.
> +
> #define BNXT_HWRM_SHORT_REQ_LEN sizeof(struct hwrm_short_input)
>
> #define BNXT_HWRM_LOCK_INIT(_sc, _name) \
> @@ -111,6 +113,7 @@
> #define BNXT_FLAG_WOL_CAP 0x0004
> #define BNXT_FLAG_SHORT_CMD 0x0008
> #define BNXT_FLAG_MSIX 0x0010
> +#define BNXT_FLAG_TPA 0x0020 /* hardware LRO */
>
> /* NVRam stuff has a five minute timeout */
> #define BNXT_NVM_TIMEO (5 * 60 * 1000)
> @@ -185,6 +188,11 @@ struct bnxt_dmamem {
> #define BNXT_DMA_DVA(_bdm) ((u_int64_t)(_bdm)->bdm_map->dm_segs[0].ds_addr)
> #define BNXT_DMA_KVA(_bdm) ((void *)(_bdm)->bdm_kva)
>
> +struct bnxt_full_tpa_start {
> + struct rx_tpa_start_cmpl low;
> + struct rx_tpa_start_cmpl_hi high;
> +};
> +
> struct bnxt_rx_queue {
> struct bnxt_softc *rx_softc;
> struct ifiqueue *rx_ifiq;
> @@ -197,8 +205,9 @@ struct bnxt_rx_queue {
> int rx_prod;
> int rx_cons;
> int rx_ag_prod;
> - int rx_ag_cons;
> + uint8_t *rx_ag_freemap;
> struct timeout rx_refill;
> + struct bnxt_full_tpa_start *rx_tpas;
> };
>
> struct bnxt_tx_queue {
> @@ -224,6 +233,13 @@ struct bnxt_queue {
> struct bnxt_grp_info q_rg;
> };
>
> +struct bnxt_hw_lro {
> + uint16_t is_mode_gro;
> + uint16_t max_agg_segs;
> + uint16_t max_aggs;
> + uint32_t min_agg_len;
> +};
> +
> struct bnxt_softc {
> struct device sc_dev;
> struct arpcom sc_ac;
> @@ -263,6 +279,8 @@ struct bnxt_softc {
> int sc_nqueues;
> struct intrmap *sc_intrmap;
> struct bnxt_queue sc_queues[BNXT_MAX_QUEUES];
> +
> + struct bnxt_hw_lro sc_hw_lro;
I realise this is copied from the FreeBSD driver, but there it seems to be a
separate structure to support their sysctl implementation. We're not exposing
the LRO settings as sysctls, so it doesn't really do anything for us.
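If we drop the struct, bnxt_hwrm_vnic_tpa_cfg() could just use the values
directly, something like this (untested, same numbers as in this patch):

	/* fixed TPA settings, since we expose no knobs for them */
	req.max_agg_segs = htole16(5);	/* log2: 2^5 = 32 = BNXT_MAX_RX_SEGS */
	req.max_aggs = htole16(HWRM_VNIC_TPA_CFG_INPUT_MAX_AGGS_4);
	req.min_agg_len = htole32(512);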
> };
> #define DEVNAME(_sc) ((_sc)->sc_dev.dv_xname)
>
> @@ -317,12 +335,18 @@ void bnxt_write_tx_doorbell(struct bnxt_softc *, stru
>
> int bnxt_rx_fill(struct bnxt_queue *);
> int bnxt_rx_fill_ag(struct bnxt_queue *);
> -u_int bnxt_rx_fill_slots(struct bnxt_softc *, struct bnxt_ring *, void *,
> - struct bnxt_slot *, uint *, int, uint16_t, u_int);
> +u_int bnxt_rx_fill_slots(struct bnxt_softc *, struct bnxt_ring *,
> + void *, struct bnxt_slot *, uint *, uint8_t *, uint32_t,
> + int, uint16_t, u_int);
> void bnxt_refill(void *);
> int bnxt_rx(struct bnxt_softc *, struct bnxt_rx_queue *,
> struct bnxt_cp_ring *, struct mbuf_list *, struct mbuf_list *,
> int *, int *, struct cmpl_base *);
> +void bnxt_rx_tpa_start(struct bnxt_softc *, struct bnxt_rx_queue *,
> + struct bnxt_cp_ring *, struct cmpl_base *);
> +int bnxt_rx_tpa(struct bnxt_softc *, struct bnxt_rx_queue *,
> + struct bnxt_cp_ring *, struct mbuf_list *, int *, int *,
> + struct cmpl_base *);
>
> void bnxt_txeof(struct bnxt_softc *, struct bnxt_tx_queue *, int *,
> struct cmpl_base *);
> @@ -372,6 +396,7 @@ int bnxt_hwrm_free_filter(struct bnxt_softc *,
> int bnxt_hwrm_vnic_rss_cfg(struct bnxt_softc *,
> struct bnxt_vnic_info *, uint32_t, daddr_t, daddr_t);
> int bnxt_cfg_async_cr(struct bnxt_softc *, struct bnxt_cp_ring *);
> +int bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *);
> int bnxt_hwrm_nvm_get_dev_info(struct bnxt_softc *, uint16_t *,
> uint16_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *);
> int bnxt_hwrm_port_phy_qcfg(struct bnxt_softc *,
> @@ -385,9 +410,6 @@ int bnxt_hwrm_func_drv_unrgtr(struct bnxt_softc *softc
>
> int bnxt_hwrm_port_qstats(struct bnxt_softc *softc);
>
> -
> -int bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *softc);
> -void bnxt_validate_hw_lro_settings(struct bnxt_softc *softc);
> int bnxt_hwrm_fw_reset(struct bnxt_softc *softc, uint8_t processor,
> uint8_t *selfreset);
> int bnxt_hwrm_fw_qstatus(struct bnxt_softc *softc, uint8_t type,
> @@ -652,6 +674,39 @@ bnxt_attach(struct device *parent, struct device *self
> #endif
> ifp->if_capabilities |= IFCAP_LRO;
>
> + switch (PCI_PRODUCT(pa->pa_id)) {
> + case PCI_PRODUCT_BROADCOM_BCM57301:
> + case PCI_PRODUCT_BROADCOM_BCM57302:
> + case PCI_PRODUCT_BROADCOM_BCM57304:
> + case PCI_PRODUCT_BROADCOM_BCM57311:
> + case PCI_PRODUCT_BROADCOM_BCM57312:
> + case PCI_PRODUCT_BROADCOM_BCM57314:
> + case PCI_PRODUCT_BROADCOM_BCM57402:
> + case PCI_PRODUCT_BROADCOM_BCM57404:
> + case PCI_PRODUCT_BROADCOM_BCM57406:
> + case PCI_PRODUCT_BROADCOM_BCM57407:
> + case PCI_PRODUCT_BROADCOM_BCM57412:
> + case PCI_PRODUCT_BROADCOM_BCM57414:
> + case PCI_PRODUCT_BROADCOM_BCM57416:
> + case PCI_PRODUCT_BROADCOM_BCM57416_SFP:
> + case PCI_PRODUCT_BROADCOM_BCM57417:
> + case PCI_PRODUCT_BROADCOM_BCM57417_SFP:
> + sc->sc_flags |= BNXT_FLAG_TPA;
> + break;
> + default:
> + break;
> + }
> +
> + /*
> + * TPA aggregation thresholds should be kept in sync with
> + * if_rxr_init() such that the ag ring's low watermark provides
> + * sufficient space for all aggregations allowed here.
> + */
> + sc->sc_hw_lro.is_mode_gro = 0;
> + sc->sc_hw_lro.max_agg_segs = 5; /* 2^5 = 32 segs = BNXT_MAX_RX_SEGS */
> + sc->sc_hw_lro.max_aggs = HWRM_VNIC_TPA_CFG_INPUT_MAX_AGGS_4;
> + sc->sc_hw_lro.min_agg_len = 512;
> +
> ifq_init_maxlen(&ifp->if_snd, 1024); /* ? */
>
> ifmedia_init(&sc->sc_media, IFM_IMASK, bnxt_media_change,
> @@ -791,6 +846,7 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
> struct bnxt_tx_queue *tx = &bq->q_tx;
> struct bnxt_grp_info *rg = &bq->q_rg;
> struct bnxt_slot *bs;
> + size_t num_tpa_agg_ids = 0;
> int i;
>
> tx->tx_ring_mem = bnxt_dmamem_alloc(sc, PAGE_SIZE);
> @@ -920,23 +976,42 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
> goto destroy_rx_slots;
> }
>
> + rx->rx_ag_freemap = malloc(rx->rx_ag_ring.ring_size / 8,
> + M_DEVBUF, M_WAITOK | M_ZERO);
> + if (rx->rx_ag_freemap == NULL) {
> + printf("%s: failed to allocate rx ag freemap\n", DEVNAME(sc));
> + goto destroy_rx_ag_slots;
> + }
> + memset(rx->rx_ag_freemap, 0xff, rx->rx_ag_ring.ring_size / 8);
> +
> for (i = 0; i < rx->rx_ag_ring.ring_size; i++) {
> bs = &rx->rx_ag_slots[i];
> - if (bus_dmamap_create(sc->sc_dmat, BNXT_AG_BUFFER_SIZE, 1,
> - BNXT_AG_BUFFER_SIZE, 0,
> + if (bus_dmamap_create(sc->sc_dmat, BNXT_AG_BUFFER_SIZE,
> + BNXT_MAX_RX_SEGS, BNXT_AG_BUFFER_SIZE, 0,
I don't think this needs to be changed, as we're not changing how the
aggregation buffers are allocated, and we still have a dma map per buffer.
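i.e. the original single-segment map creation can stay:

	bus_dmamap_create(sc->sc_dmat, BNXT_AG_BUFFER_SIZE, 1,
	    BNXT_AG_BUFFER_SIZE, 0,
	    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT, &bs->bs_map)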
> BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> &bs->bs_map) != 0) {
> printf("%s: failed to allocate rx ag dma maps\n",
> DEVNAME(sc));
> - goto destroy_rx_ag_slots;
> + goto destroy_rx_ag_freemap;
> }
> }
>
> + if ((sc->sc_flags & BNXT_FLAG_TPA) && (ifp->if_xflags & IFXF_LRO)) {
> + num_tpa_agg_ids = (RX_TPA_START_CMPL_AGG_ID_MASK >>
> + RX_TPA_START_CMPL_AGG_ID_SFT),
> + rx->rx_tpas = mallocarray(sizeof(rx->rx_tpas[0]),
> + num_tpa_agg_ids, M_DEVBUF, M_WAITOK | M_ZERO);
> + if (rx->rx_tpas == NULL) {
> + printf("%s: failed to allocate TPA\n", DEVNAME(sc));
> + goto destroy_rx_ag_freemap;
> + }
> + }
> +
> tx->tx_slots = mallocarray(sizeof(*bs), tx->tx_ring.ring_size,
> M_DEVBUF, M_WAITOK | M_ZERO);
> if (tx->tx_slots == NULL) {
> printf("%s: failed to allocate tx slots\n", DEVNAME(sc));
> - goto destroy_rx_ag_slots;
> + goto destroy_rx_tpas;
> }
>
> for (i = 0; i < tx->tx_ring.ring_size; i++) {
> @@ -958,12 +1033,12 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
> * once the whole ring has been used once, we should be able to back off
> * to 2 or so slots, but we currently don't have a way of doing that.
> */
> - if_rxr_init(&rx->rxr[0], 32, rx->rx_ring.ring_size - 1);
> - if_rxr_init(&rx->rxr[1], 32, rx->rx_ag_ring.ring_size - 1);
> + if_rxr_init(&rx->rxr[0], BNXT_MAX_RX_SEGS, rx->rx_ring.ring_size - 1);
> + if_rxr_init(&rx->rxr[1], BNXT_MAX_RX_SEGS * 4,
> + rx->rx_ag_ring.ring_size - 1);
Not sure I follow this part. The requirement to fill this much of the rx ring
(at least at first) isn't related to LRO/TPA at all, as far as I can tell.
It was there before we even used the aggregation ring. Was it necessary to
increase this to 128 to get TPA working?
> rx->rx_prod = 0;
> rx->rx_cons = 0;
> rx->rx_ag_prod = 0;
> - rx->rx_ag_cons = 0;
> bnxt_rx_fill(bq);
> bnxt_rx_fill_ag(bq);
>
> @@ -980,6 +1055,12 @@ destroy_tx_slots:
> tx->tx_slots = NULL;
>
> i = rx->rx_ag_ring.ring_size;
> +destroy_rx_tpas:
> + free(rx->rx_tpas, M_DEVBUF, num_tpa_agg_ids * sizeof(rx->rx_tpas[0]));
> + rx->rx_tpas = NULL;
> +destroy_rx_ag_freemap:
> + free(rx->rx_ag_freemap, M_DEVBUF, rx->rx_ag_ring.ring_size / 8);
> + rx->rx_ag_freemap = NULL;
> destroy_rx_ag_slots:
> bnxt_free_slots(sc, rx->rx_ag_slots, i, rx->rx_ag_ring.ring_size);
> rx->rx_ag_slots = NULL;
> @@ -1031,6 +1112,9 @@ bnxt_queue_down(struct bnxt_softc *sc, struct bnxt_que
> tx->tx_ring.ring_size);
> tx->tx_slots = NULL;
>
> + free(rx->rx_ag_freemap, M_DEVBUF, rx->rx_ag_ring.ring_size / 8);
> + rx->rx_ag_freemap = NULL;
> +
> bnxt_free_slots(sc, rx->rx_ag_slots, rx->rx_ag_ring.ring_size,
> rx->rx_ag_ring.ring_size);
> rx->rx_ag_slots = NULL;
> @@ -1161,6 +1245,13 @@ bnxt_up(struct bnxt_softc *sc)
> }
> }
>
> + if (bnxt_hwrm_vnic_tpa_cfg(sc) != 0) {
> + printf("%s: failed to set hardware LRO config\n",
> + DEVNAME(sc));
> + ret = EIO;
> + goto dealloc_vnic;
> + }
> +
> bnxt_iff(sc);
> SET(ifp->if_flags, IFF_RUNNING);
>
> @@ -1301,10 +1392,12 @@ bnxt_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data
> }
>
> if (error == ENETRESET) {
> - if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) ==
> - (IFF_UP | IFF_RUNNING))
> - bnxt_iff(sc);
> error = 0;
> + if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) ==
> + (IFF_UP | IFF_RUNNING)) {
> + bnxt_down(sc);
> + error = bnxt_up(sc);
> + }
> }
>
> splx(s);
> @@ -1650,6 +1743,15 @@ bnxt_intr(void *xq)
> if (ISSET(ifp->if_flags, IFF_RUNNING))
> bnxt_txeof(sc, tx, &txfree, cmpl);
> break;
> + case CMPL_BASE_TYPE_RX_TPA_START:
> + if (ISSET(ifp->if_flags, IFF_RUNNING))
> + bnxt_rx_tpa_start(sc, rx, cpr, cmpl);
> + break;
> + case CMPL_BASE_TYPE_RX_TPA_END:
> + if (ISSET(ifp->if_flags, IFF_RUNNING))
> + rollback = bnxt_rx_tpa(sc, rx, cpr, &ml,
> + &rxfree, &agfree, cmpl);
> + break;
> default:
> printf("%s: unexpected completion type %u\n",
> DEVNAME(sc), type);
> @@ -1673,15 +1775,12 @@ bnxt_intr(void *xq)
>
> if (rxfree != 0) {
> int livelocked = 0;
> + int ag_needs_filling = 0;
>
> rx->rx_cons += rxfree;
> if (rx->rx_cons >= rx->rx_ring.ring_size)
> rx->rx_cons -= rx->rx_ring.ring_size;
>
> - rx->rx_ag_cons += agfree;
> - if (rx->rx_ag_cons >= rx->rx_ag_ring.ring_size)
> - rx->rx_ag_cons -= rx->rx_ag_ring.ring_size;
> -
> if_rxr_put(&rx->rxr[0], rxfree);
> if_rxr_put(&rx->rxr[1], agfree);
>
> @@ -1696,9 +1795,13 @@ bnxt_intr(void *xq)
> }
>
> bnxt_rx_fill(q);
> - bnxt_rx_fill_ag(q);
> - if ((rx->rx_cons == rx->rx_prod) ||
> - (rx->rx_ag_cons == rx->rx_ag_prod))
> + if (if_rxr_needrefill(&rx->rxr[1]))
> + ag_needs_filling = 1;
> +
> + if (ag_needs_filling && bnxt_rx_fill_ag(q) == 0)
> + ag_needs_filling = 0;
> +
> + if (rx->rx_cons == rx->rx_prod || ag_needs_filling)
> timeout_add(&rx->rx_refill, 0);
> }
> if (txfree != 0) {
> @@ -2211,10 +2314,30 @@ bnxt_write_tx_doorbell(struct bnxt_softc *sc, struct b
> htole32(val));
> }
>
> +int
> +bnxt_next_free_ag_slot(uint8_t *freemap, uint32_t mapsize, uint p)
> +{
> + int idx;
> + uint32_t n = 0;
> + uint i = p / 8;
> +
> + for (n = 0; n < mapsize; n++) {
> + idx = ffs(freemap[i]);
> + if (idx > 0)
> + return ((idx - 1) + (i * 8));
> + i++;
> + if (i >= mapsize)
> + i = 0;
> + }
> +
> + return -1;
> +}
> +
> u_int
> -bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_ring *ring, void *ring_mem,
> - struct bnxt_slot *slots, uint *prod, int bufsize, uint16_t bdtype,
> - u_int nslots)
> +bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_ring *ring,
> + void *ring_mem, struct bnxt_slot *slots, uint *prod,
> + uint8_t *freemap, uint32_t mapsize,
> + int bufsize, uint16_t bdtype, u_int nslots)
> {
> struct rx_prod_pkt_bd *rxring;
> struct bnxt_slot *bs;
> @@ -2224,6 +2347,12 @@ bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_
> rxring = (struct rx_prod_pkt_bd *)ring_mem;
> p = *prod;
> for (fills = 0; fills < nslots; fills++) {
> + if (freemap != NULL && isclr(freemap, p)) {
> + p = bnxt_next_free_ag_slot(freemap, mapsize, p);
> + if (p == -1)
> + return (nslots - fills);
> + }
> +
> bs = &slots[p];
> m = MCLGETL(NULL, M_DONTWAIT, bufsize);
> if (m == NULL)
> @@ -2242,6 +2371,9 @@ bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_
> rxring[p].opaque = p;
> rxring[p].addr = htole64(bs->bs_map->dm_segs[0].ds_addr);
>
> + if (freemap != NULL)
> + clrbit(freemap, p);
> +
> if (++p >= ring->ring_size)
> p = 0;
> }
> @@ -2264,8 +2396,8 @@ bnxt_rx_fill(struct bnxt_queue *q)
> slots = if_rxr_get(&rx->rxr[0], rx->rx_ring.ring_size);
> if (slots > 0) {
> slots = bnxt_rx_fill_slots(sc, &rx->rx_ring,
> - BNXT_DMA_KVA(rx->rx_ring_mem), rx->rx_slots,
> - &rx->rx_prod, MCLBYTES,
> + BNXT_DMA_KVA(rx->rx_ring_mem), rx->rx_slots, &rx->rx_prod,
> + NULL, 0, MCLBYTES,
> RX_PROD_PKT_BD_TYPE_RX_PROD_PKT, slots);
> if_rxr_put(&rx->rxr[0], slots);
> } else
> @@ -2279,19 +2411,23 @@ bnxt_rx_fill_ag(struct bnxt_queue *q)
> {
> struct bnxt_rx_queue *rx = &q->q_rx;
> struct bnxt_softc *sc = q->q_sc;
> - u_int slots;
> + u_int slots, unfilled_slots;
> int rv = 0;
>
> slots = if_rxr_get(&rx->rxr[1], rx->rx_ag_ring.ring_size);
> if (slots > 0) {
> - slots = bnxt_rx_fill_slots(sc, &rx->rx_ag_ring,
> + unfilled_slots = bnxt_rx_fill_slots(sc, &rx->rx_ag_ring,
> BNXT_DMA_KVA(rx->rx_ring_mem) + PAGE_SIZE,
> rx->rx_ag_slots, &rx->rx_ag_prod,
> + rx->rx_ag_freemap, rx->rx_ag_ring.ring_size / 8,
> BNXT_AG_BUFFER_SIZE,
> RX_PROD_AGG_BD_TYPE_RX_PROD_AGG, slots);
> - if_rxr_put(&rx->rxr[1], slots);
> - } else
> + if_rxr_put(&rx->rxr[1], unfilled_slots);
> + if (unfilled_slots == slots)
> + rv = 1;
> + } else {
> rv = 1;
> + }
>
> return (rv);
> }
> @@ -2301,15 +2437,18 @@ bnxt_refill(void *xq)
> {
> struct bnxt_queue *q = xq;
> struct bnxt_rx_queue *rx = &q->q_rx;
> + int ag_needs_filling = 0;
>
> if (rx->rx_cons == rx->rx_prod)
> bnxt_rx_fill(q);
>
> - if (rx->rx_ag_cons == rx->rx_ag_prod)
> - bnxt_rx_fill_ag(q);
> + if (if_rxr_needrefill(&rx->rxr[1]))
> + ag_needs_filling = 1;
>
> - if ((rx->rx_cons == rx->rx_prod) ||
> - (rx->rx_ag_cons == rx->rx_ag_prod))
> + if (ag_needs_filling && bnxt_rx_fill_ag(q) == 0)
> + ag_needs_filling = 0;
> +
> + if (rx->rx_cons == rx->rx_prod || ag_needs_filling)
> timeout_add(&rx->rx_refill, 1);
> }
>
> @@ -2318,29 +2457,25 @@ bnxt_rx(struct bnxt_softc *sc, struct bnxt_rx_queue *r
> struct bnxt_cp_ring *cpr, struct mbuf_list *ml, struct mbuf_list *mltcp,
> int *slots, int *agslots, struct cmpl_base *cmpl)
> {
> - struct mbuf *m, *am;
> + struct mbuf *m, *am, *aprev = NULL;
> struct bnxt_slot *bs;
> struct rx_pkt_cmpl *rxlo = (struct rx_pkt_cmpl *)cmpl;
> struct rx_pkt_cmpl_hi *rxhi;
> - struct rx_abuf_cmpl *ag;
> + struct rx_abuf_cmpl *ag = NULL;
> uint32_t flags;
> uint16_t errors;
> + uint8_t num_agg;
> + int i;
>
> + num_agg = (rxlo->agg_bufs_v1 & RX_TPA_END_CMPL_AGG_BUFS_MASK) >>
> + RX_TPA_END_CMPL_AGG_BUFS_SFT;
> +
> /* second part of the rx completion */
> rxhi = (struct rx_pkt_cmpl_hi *)bnxt_cpr_next_cmpl(sc, cpr);
> if (rxhi == NULL) {
> return (1);
> }
>
> - /* packets over 2k in size use an aggregation buffer completion too */
> - ag = NULL;
> - if ((rxlo->agg_bufs_v1 >> RX_PKT_CMPL_AGG_BUFS_SFT) != 0) {
> - ag = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
> - if (ag == NULL) {
> - return (1);
> - }
> - }
> -
This changes handling of the case where we get an rx interrupt before all three
entries for the packet are available on the completion queue, which I did see
when I was adding use of the aggregation ring to receive large packets.
Returning 0 in this case means we'll consume the rx entries (rx_pkt_cmpl,
rx_pkt_cmpl_hi) and pass a truncated packet to ifiq_input(), then drop the
rx_abuf_cmpl entry next time an rx interrupt arrives. From what I remember,
no one likes it when a network driver does this.
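Roughly what I mean, as an untested sketch (bnxt_cpr_cmpl_avail() is made up
here; we'd need some way of counting the valid entries pending on the
completion queue without consuming them):

	/*
	 * This packet needs rx_pkt_cmpl_hi plus num_agg aggregation
	 * completions.  If they haven't all arrived yet, return 1 so
	 * bnxt_intr() rolls the consumer index back and we retry the
	 * whole packet on the next interrupt.
	 */
	if (bnxt_cpr_cmpl_avail(sc, cpr) < 1 + num_agg)
		return (1);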
> bs = &rx->rx_slots[rxlo->opaque];
> bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0, bs->bs_map->dm_mapsize,
> BUS_DMASYNC_POSTREAD);
> @@ -2376,22 +2511,34 @@ bnxt_rx(struct bnxt_softc *sc, struct bnxt_rx_queue *r
> m->m_pkthdr.csum_flags |= M_FLOWID;
> }
>
> - if (ag != NULL) {
> + for (i = 0; i < num_agg; i++) {
> + ag = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
> + if (ag == NULL)
> + break;
> + if (isset(rx->rx_ag_freemap, ag->opaque))
> + continue;
> bs = &rx->rx_ag_slots[ag->opaque];
> bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0,
> bs->bs_map->dm_mapsize, BUS_DMASYNC_POSTREAD);
> bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
>
> + setbit(rx->rx_ag_freemap, ag->opaque);
> +
> am = bs->bs_m;
> bs->bs_m = NULL;
> am->m_len = letoh16(ag->len);
> - m->m_next = am;
> + if (aprev == NULL)
> + m->m_next = am;
> + else
> + aprev->m_next = am;
> + aprev = am;
> m->m_pkthdr.len += am->m_len;
> (*agslots)++;
> }
>
> #ifndef SMALL_KERNEL
> - if (ISSET(sc->sc_ac.ac_if.if_xflags, IFXF_LRO) &&
> + if ((sc->sc_flags & BNXT_FLAG_TPA) == 0 &&
> + ISSET(sc->sc_ac.ac_if.if_xflags, IFXF_LRO) &&
> ((lemtoh16(&rxlo->flags_type) & RX_PKT_CMPL_FLAGS_ITYPE_TCP) ==
> RX_PKT_CMPL_FLAGS_ITYPE_TCP))
> tcp_softlro_glue(mltcp, m, &sc->sc_ac.ac_if);
> @@ -2403,6 +2550,147 @@ bnxt_rx(struct bnxt_softc *sc, struct bnxt_rx_queue *r
> }
>
> void
> +bnxt_rx_tpa_start(struct bnxt_softc *sc, struct bnxt_rx_queue *rx,
> + struct bnxt_cp_ring *cpr, struct cmpl_base *cmpl)
> +{
> + struct rx_tpa_start_cmpl *rtpa = (struct rx_tpa_start_cmpl *)cmpl;
> + struct rx_tpa_start_cmpl_hi *rtpa_hi;
> + uint8_t agg_id;
> +
> + /* second part of the rx tpa start completion */
> + rtpa_hi = (struct rx_tpa_start_cmpl_hi *)bnxt_cpr_next_cmpl(sc, cpr);
> + if (rtpa_hi == NULL)
> + return;
I think this should be changed to return int and return 1 like bnxt_rx() and
bnxt_rx_tpa() do, so it doesn't consume the first rx completion entry if the
second isn't available.
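i.e. something along these lines, with the prototype updated to match:

	int
	bnxt_rx_tpa_start(struct bnxt_softc *sc, struct bnxt_rx_queue *rx,
	    struct bnxt_cp_ring *cpr, struct cmpl_base *cmpl)
	{
		...
		rtpa_hi = (struct rx_tpa_start_cmpl_hi *)
		    bnxt_cpr_next_cmpl(sc, cpr);
		if (rtpa_hi == NULL)
			return (1);	/* roll back, retry on next interrupt */
		...
		return (0);
	}

with bnxt_intr() assigning the result to rollback, as it already does for
bnxt_rx() and bnxt_rx_tpa().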
> +
> + agg_id = (rtpa->agg_id & RX_TPA_START_CMPL_AGG_ID_MASK) >>
> + RX_TPA_START_CMPL_AGG_ID_SFT;
> +
> + memcpy(&rx->rx_tpas[agg_id].low, rtpa, sizeof(*rtpa));
> + memcpy(&rx->rx_tpas[agg_id].high, rtpa_hi, sizeof(*rtpa_hi));
> +}
> +
> +int
> +bnxt_rx_tpa(struct bnxt_softc *sc, struct bnxt_rx_queue *rx,
> + struct bnxt_cp_ring *cpr, struct mbuf_list *ml, int *slots, int *agslots,
> + struct cmpl_base *cmpl)
> +{
> + struct mbuf *m, *am, *aprev = NULL;
> + struct bnxt_slot *bs;
> + struct rx_tpa_end_cmpl *rxt = (struct rx_tpa_end_cmpl *)cmpl;
> + struct rx_tpa_end_cmpl_hi *rxthi;
> + struct bnxt_full_tpa_start *tpas;
> + struct rx_abuf_cmpl *ag;
> + uint32_t flags;
> + uint8_t i, num_agg;
> + uint8_t agg_id;
> +
> + /* second part of the rx completion */
> + rxthi = (struct rx_tpa_end_cmpl_hi *)bnxt_cpr_next_cmpl(sc, cpr);
> + if (rxthi == NULL)
> + return (1);
> +
> + agg_id = (rxt->agg_id & RX_TPA_END_CMPL_AGG_ID_MASK) >>
> + RX_TPA_END_CMPL_AGG_ID_SFT;
> + tpas = &rx->rx_tpas[agg_id];
> +
> + num_agg = (rxt->agg_bufs_v1 & RX_TPA_END_CMPL_AGG_BUFS_MASK) >>
> + RX_TPA_END_CMPL_AGG_BUFS_SFT;
> +
> + bs = &rx->rx_slots[tpas->low.opaque];
> + bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0, bs->bs_map->dm_mapsize,
> + BUS_DMASYNC_POSTREAD);
> + bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
> +
> + m = bs->bs_m;
> + bs->bs_m = NULL;
> + m->m_pkthdr.len = m->m_len = letoh16(tpas->low.len);
> + (*slots)++;
> +
> + /* checksum flags */
> + flags = lemtoh32(&tpas->high.flags2);
> + if (flags & RX_TPA_START_CMPL_FLAGS2_IP_CS_CALC)
> + m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK;
> +
> + if (flags & RX_TPA_START_CMPL_FLAGS2_L4_CS_CALC) {
> + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
> + M_UDP_CSUM_IN_OK;
> + }
> +#if NVLAN > 0
> + if ((flags & RX_TPA_START_CMPL_FLAGS2_META_FORMAT_MASK) ==
> + RX_TPA_START_CMPL_FLAGS2_META_FORMAT_VLAN) {
> + m->m_pkthdr.ether_vtag = lemtoh16(&tpas->high.metadata);
> + m->m_flags |= M_VLANTAG;
> + }
> +#endif
> +
> + if (lemtoh16(&tpas->low.flags_type) &
> + RX_TPA_START_CMPL_FLAGS_RSS_VALID) {
> + m->m_pkthdr.ph_flowid = lemtoh32(&tpas->low.rss_hash);
> + m->m_pkthdr.csum_flags |= M_FLOWID;
> + }
> +
> + for (i = 0; i < num_agg; i++) {
> + ag = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
> + if (ag == NULL)
> + break;
I think we'll need to find a way to return 1 and avoid consuming the completion
queue entries if the full aggregation isn't available yet, which means not
touching the free map or chaining the mbufs together until we've got all the
completion entries. We could put an array in bnxt_queue for holding pointers
to rx_abuf_cmpls that we're not sure we can consume yet, perhaps.
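One possible shape for that, as an untested sketch (the gathering would also
have to happen before the first rx buffer's dma map is unloaded):

	struct rx_abuf_cmpl *aggs[BNXT_MAX_RX_SEGS];

	/* collect all agg completions before touching freemap or mbufs */
	for (i = 0; i < num_agg; i++) {
		aggs[i] = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
		if (aggs[i] == NULL)
			return (1);	/* roll back; nothing consumed yet */
	}
	/* all entries present; safe to unload dma maps and chain mbufs */

A per-queue array as described above would avoid re-reading the entries on
retry, but rolling back and starting over is probably simpler.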
> + if (isset(rx->rx_ag_freemap, ag->opaque))
> + continue;
> +
> + bs = &rx->rx_ag_slots[ag->opaque];
> + bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0,
> + bs->bs_map->dm_mapsize, BUS_DMASYNC_POSTREAD);
> + bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
> +
> + setbit(rx->rx_ag_freemap, ag->opaque);
> +
> + am = bs->bs_m;
> + bs->bs_m = NULL;
> + am->m_len = letoh16(ag->len);
> + if (aprev == NULL)
> + m->m_next = am;
> + else
> + aprev->m_next = am;
> + aprev = am;
> + m->m_pkthdr.len += am->m_len;
> + (*agslots)++;
> + }
> +
> + bs = &rx->rx_slots[rxt->opaque];
> + bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0, bs->bs_map->dm_mapsize,
> + BUS_DMASYNC_POSTREAD);
> + bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
> +
> + am = bs->bs_m;
> + bs->bs_m = NULL;
> + am->m_len = letoh16(rxt->len);
> + if (aprev == NULL)
> + m->m_next = am;
> + else
> + aprev->m_next = am;
> + m->m_pkthdr.len += am->m_len;
> + (*slots)++;
> +
> + flags = lemtoh16(&tpas->low.flags_type);
> + if ((flags & RX_TPA_START_CMPL_FLAGS_ITYPE_TCP) ==
> + RX_TPA_START_CMPL_FLAGS_ITYPE_TCP) {
> + uint16_t payload_offset = rxt->payload_offset;
> +
> + if (payload_offset == 0)
> + payload_offset = 256;
> +
> + if (m->m_pkthdr.len > payload_offset)
> + m->m_pkthdr.ph_mss = m->m_pkthdr.len - payload_offset;
> +
> + tcpstat_add(tcps_inhwlro, i + 1);
> + tcpstat_inc(tcps_inpktlro);
> +
> + }
> +
> + ml_enqueue(ml, m);
> + return (0);
> +}
> +
> +void
> bnxt_txeof(struct bnxt_softc *sc, struct bnxt_tx_queue *tx, int *txfree,
> struct cmpl_base *cmpl)
> {
> @@ -3346,64 +3634,48 @@ bnxt_cfg_async_cr(struct bnxt_softc *softc, struct bnx
> return rc;
> }
>
> -#if 0
> -
> -void
> -bnxt_validate_hw_lro_settings(struct bnxt_softc *softc)
> -{
> - softc->hw_lro.enable = min(softc->hw_lro.enable, 1);
> -
> - softc->hw_lro.is_mode_gro = min(softc->hw_lro.is_mode_gro, 1);
> -
> - softc->hw_lro.max_agg_segs = min(softc->hw_lro.max_agg_segs,
> - HWRM_VNIC_TPA_CFG_INPUT_MAX_AGG_SEGS_MAX);
> -
> - softc->hw_lro.max_aggs = min(softc->hw_lro.max_aggs,
> - HWRM_VNIC_TPA_CFG_INPUT_MAX_AGGS_MAX);
> -
> - softc->hw_lro.min_agg_len = min(softc->hw_lro.min_agg_len, BNXT_MAX_MTU);
> -}
> -
> int
> bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *softc)
> {
> + struct ifnet *ifp = &softc->sc_ac.ac_if;
> struct hwrm_vnic_tpa_cfg_input req = {0};
> uint32_t flags;
>
> - if (softc->vnic_info.id == (uint16_t) HWRM_NA_SIGNATURE) {
> + if (softc->sc_vnic.id == (uint16_t) HWRM_NA_SIGNATURE ||
> + (softc->sc_flags & BNXT_FLAG_TPA) == 0)
> return 0;
> - }
>
> bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_TPA_CFG);
>
> - if (softc->hw_lro.enable) {
> + if (ifp->if_xflags & IFXF_LRO) {
> flags = HWRM_VNIC_TPA_CFG_INPUT_FLAGS_TPA |
> HWRM_VNIC_TPA_CFG_INPUT_FLAGS_ENCAP_TPA |
> HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_ECN |
> HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_SAME_GRE_SEQ;
>
> - if (softc->hw_lro.is_mode_gro)
> + if (softc->sc_hw_lro.is_mode_gro)
> flags |= HWRM_VNIC_TPA_CFG_INPUT_FLAGS_GRO;
> else
> flags |= HWRM_VNIC_TPA_CFG_INPUT_FLAGS_RSC_WND_UPDATE;
Since we're not providing a mechanism to switch between these modes, we should
just pick one.
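e.g. always use RSC mode:

	flags = HWRM_VNIC_TPA_CFG_INPUT_FLAGS_TPA |
	    HWRM_VNIC_TPA_CFG_INPUT_FLAGS_ENCAP_TPA |
	    HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_ECN |
	    HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_SAME_GRE_SEQ |
	    HWRM_VNIC_TPA_CFG_INPUT_FLAGS_RSC_WND_UPDATE;

and then is_mode_gro can go away together with the rest of sc_hw_lro.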
>
> req.flags = htole32(flags);
>
> - req.enables = htole32(HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGG_SEGS |
> - HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGGS |
> - HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MIN_AGG_LEN);
> + req.enables = htole32(
> + HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGG_SEGS |
> + HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGGS |
> + HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MIN_AGG_LEN);
>
> - req.max_agg_segs = htole16(softc->hw_lro.max_agg_segs);
> - req.max_aggs = htole16(softc->hw_lro.max_aggs);
> - req.min_agg_len = htole32(softc->hw_lro.min_agg_len);
> + req.max_agg_segs = htole16(softc->sc_hw_lro.max_agg_segs);
> + req.max_aggs = htole16(softc->sc_hw_lro.max_aggs);
> + req.min_agg_len = htole32(softc->sc_hw_lro.min_agg_len);
> }
>
> - req.vnic_id = htole16(softc->vnic_info.id);
> + req.vnic_id = htole16(softc->sc_vnic.id);
>
> return hwrm_send_message(softc, &req, sizeof(req));
> }
>
> -
> +#if 0
> int
> bnxt_hwrm_fw_reset(struct bnxt_softc *softc, uint8_t processor,
> uint8_t *selfreset)
>