From: Stefan Sperling <stsp@stsp.name>
Subject: Re: hardware LRO support for bnxt(4)
To: Jonathan Matthew <jonathan@d14n.org>
Cc: tech@openbsd.org
Date: Wed, 5 Nov 2025 18:51:38 +0100

On Sun, Nov 02, 2025 at 12:05:11AM +1000, Jonathan Matthew wrote:
> On Fri, Oct 31, 2025 at 11:09:55AM +0100, Stefan Sperling wrote:
> > On Fri, Oct 31, 2025 at 01:11:08PM +1000, Jonathan Matthew wrote:
> > Putting more buffers on the ring is needed for performance. This is what
> > makes tcpbench go from 500 Mbps to 9999 Mbps if TPA is enabled.
> 
> I guess the rxring mechanism isn't able to increase the number of active slots
> enough to get up to speed?  It works properly in ix(4) for example, which starts
> with just enough slots to cover two full size packets.  I'll see if I can
> figure out what's going wrong here.

Investigating some more, it seems the situation is even worse.

By now I suspect rxr is fundamentally incompatible with how TPA works
on these devices, at least as far as the aggregation ring is concerned.

I have already prevented driver-side processing of "incomplete" events
on the completion ring, including cross-checks between TPA start and
end events (each of which uses two slots on the completion ring).

Even so, sometimes a TPA start opaque value in the softc reads back as zero
during the TPA end event, even though it was not zero when it was saved to
the softc during the TPA start event. I don't yet understand why that
happens, but it is a strong indication that a bug remains somewhere in my code.

Even with just a single queue via #define BNXT_MAX_QUEUES 1 I can reproduce
use-after-frees on mcl8k clusters by running several TCP streams through the
driver (several instances of tcpbench).
I am not certain yet, but could this mean the device is doing DMA to
aggregation ring buffers even after we have taken them off the ring?

As far as I can tell from reading the FreeBSD bnxt_en and iflib code,
the vendor driver always fills an entire Rx ring as soon as packets
start being received in the Rx interrupt handler. I don't see them
using rxr-style ring scaling like we do.

And indeed, crashes become harder to trigger with higher rxr watermark
thresholds. Best results are at ring size - 1, though it will still crash.
Setting the threshold to the entire ring size results in 0 Mbit/s.
I suspect the device sees a completely full ring as if it were empty.
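
That would match the usual producer/consumer ring convention: with only
prod and cons indices, a completely full ring is indistinguishable from
an empty one, so one slot must always stay unused. A minimal userland
sketch of that invariant (names are mine, not from the driver):

	#include <stdio.h>

	/*
	 * With just prod/cons indices, occupancy is (prod - cons) mod size,
	 * so prod wrapping all the way around onto cons reads as "empty".
	 * Hence size - 1 is the practical maximum fill level.
	 */
	static unsigned int
	ring_used(unsigned int prod, unsigned int cons, unsigned int size)
	{
		return (prod - cons + size) % size;
	}

	int
	main(void)
	{
		unsigned int size = 8, cons = 3, prod;

		prod = (cons + size) % size;	/* fill all 8 slots */
		printf("%u\n", ring_used(prod, cons, size)); /* 0: "empty" */

		prod = (cons + size - 1) % size; /* fill size - 1 slots */
		printf("%u\n", ring_used(prod, cons, size)); /* 7 */
		return 0;
	}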

I haven't quite worked out yet how the apparent "out of order" completion
of aggregation buffers relates to the slot value we send to the device
doorbell after refilling the ring. A single-slot doorbell mechanism like
this only makes sense for linear ring buffers. The device tells us which
TPA slots were freed and I guess we need to immediately refill them,
regardless of whether they fall inside or outside our rxr cons/prod window.
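
In the diff this is handled with a per-slot free bitmap (rx_ag_freemap,
set bit = free slot) scanned with ffs() in bnxt_next_free_ag_slot().
Restated as a standalone userland sketch (slightly simplified, with a
defensive wrap of the start index that the driver version omits):

	#include <stdint.h>
	#include <strings.h>	/* ffs() */

	/*
	 * Scan the free bitmap byte by byte, starting at the byte that
	 * contains slot 'start', and return the first free slot (first
	 * set bit), wrapping around; -1 if no slot is free.
	 */
	static int
	next_free_slot(const uint8_t *freemap, uint32_t mapsize,
	    uint32_t start)
	{
		uint32_t n, i = (start / 8) % mapsize;
		int bit;

		for (n = 0; n < mapsize; n++) {
			bit = ffs(freemap[i]);
			if (bit > 0)
				return ((bit - 1) + (i * 8));
			if (++i >= mapsize)
				i = 0;
		}

		return (-1);
	}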

My current diff can be found below, for what it is worth...

There is also this worrying check in FreeBSD which I still have to
cross-check against the ring allocations done by our driver. Were you
already aware of these ring size ratio requirements?

nrxd[0] is the completion ring
nrxd[1] is the regular Rx ring
nrxd[2] is the Rx aggregation ring

	if (scctx->isc_nrxd[0] <
	    ((scctx->isc_nrxd[1] * 4) + scctx->isc_nrxd[2]))
		device_printf(softc->dev,
		    "WARNING: nrxd0 (%d) should be at least 4 * nrxd1 (%d) + nrxd2 (%d).  Driver may be unstable\n",
		    scctx->isc_nrxd[0], scctx->isc_nrxd[1], scctx->isc_nrxd[2]);
	if (scctx->isc_ntxd[0] < scctx->isc_ntxd[1] * 2)
		device_printf(softc->dev,
		    "WARNING: ntxd0 (%d) should be at least 2 * ntxd1 (%d).  Driver may be unstable\n",
		    scctx->isc_ntxd[0], scctx->isc_ntxd[1]);
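
For what it's worth, a rough cross-check against the sizes our driver
uses (my own arithmetic, assuming 4 KB pages and the 16-byte cmpl_base
and rx_prod_pkt_bd descriptors from the hsi headers, so take it with a
grain of salt):

	#include <stdio.h>

	/*
	 * cp ring = BNXT_CP_PAGES * PAGE_SIZE / sizeof(cmpl_base)
	 * rx ring = PAGE_SIZE / sizeof(rx_prod_pkt_bd)
	 * ag ring = PAGE_SIZE / sizeof(rx_prod_pkt_bd)
	 */
	int
	main(void)
	{
		int nrxd0 = (4 * 4096) / 16;	/* cp ring: 1024 */
		int nrxd1 = 4096 / 16;		/* rx ring: 256 */
		int nrxd2 = 4096 / 16;		/* ag ring: 256 */

		if (nrxd0 < (nrxd1 * 4) + nrxd2)	/* 1024 < 1280 */
			printf("FreeBSD's check would warn here\n");
		return 0;
	}

If those assumptions hold, our completion ring would be too small by
FreeBSD's rule.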



M  sys/dev/pci/if_bnxt.c  |  443+  81-

1 file changed, 443 insertions(+), 81 deletions(-)

commit - 67f539dd03ce615ef3cff41d2e9d99890edc19af
commit + 1c446c402e461a60140bdc4651f1e38e2d47d738
blob - 50cc0010beca4a0d691645e6e7be12d6d97d0c10
blob + 6074507d2ade465a55ce0b652cd1add31f1bb124
--- sys/dev/pci/if_bnxt.c
+++ sys/dev/pci/if_bnxt.c
@@ -82,7 +82,7 @@
 #define BNXT_HWRM_BAR		0x10
 #define BNXT_DOORBELL_BAR	0x18
 
-#define BNXT_MAX_QUEUES		8
+#define BNXT_MAX_QUEUES		1
 
 #define BNXT_CP_RING_ID_BASE	0
 #define BNXT_RX_RING_ID_BASE	(BNXT_MAX_QUEUES + 1)
@@ -91,12 +91,15 @@
 
 #define BNXT_MAX_MTU		9500
 #define BNXT_AG_BUFFER_SIZE	8192
-
+#define BNXT_MAX_AG_BUFFERS	(RX_TPA_END_CMPL_AGG_BUFS_MASK >> \
+				    RX_TPA_END_CMPL_AGG_BUFS_SFT)
 #define BNXT_CP_PAGES		4
 
 #define BNXT_MAX_TX_SEGS	31
 #define BNXT_TX_SLOTS(bs)	(bs->bs_map->dm_nsegs + 1)
 
+#define BNXT_MAX_RX_SEGS	32
+
 #define BNXT_HWRM_SHORT_REQ_LEN	sizeof(struct hwrm_short_input)
 
 #define BNXT_HWRM_LOCK_INIT(_sc, _name)	\
@@ -111,6 +114,7 @@
 #define BNXT_FLAG_WOL_CAP       0x0004
 #define BNXT_FLAG_SHORT_CMD     0x0008
 #define BNXT_FLAG_MSIX          0x0010
+#define BNXT_FLAG_TPA           0x0020 /* hardware LRO */
 
 /* NVRam stuff has a five minute timeout */
 #define BNXT_NVM_TIMEO	(5 * 60 * 1000)
@@ -185,6 +189,12 @@ struct bnxt_dmamem {
 #define BNXT_DMA_DVA(_bdm)	((u_int64_t)(_bdm)->bdm_map->dm_segs[0].ds_addr)
 #define BNXT_DMA_KVA(_bdm)	((void *)(_bdm)->bdm_kva)
 
+struct bnxt_full_tpa_start {
+	uint32_t completed;
+	struct rx_tpa_start_cmpl low;
+	struct rx_tpa_start_cmpl_hi high;
+};
+
 struct bnxt_rx_queue {
 	struct bnxt_softc	*rx_softc;
 	struct ifiqueue		*rx_ifiq;
@@ -198,7 +208,9 @@ struct bnxt_rx_queue {
 	int			rx_cons;
 	int			rx_ag_prod;
 	int			rx_ag_cons;
+	uint8_t			*rx_ag_freemap;
 	struct timeout		rx_refill;
+	struct bnxt_full_tpa_start *rx_tpas;
 };
 
 struct bnxt_tx_queue {
@@ -263,6 +275,10 @@ struct bnxt_softc {
 	int			sc_nqueues;
 	struct intrmap		*sc_intrmap;
 	struct bnxt_queue	sc_queues[BNXT_MAX_QUEUES];
+
+	uint16_t		sc_max_agg_segs;
+	uint16_t		sc_max_aggs;
+	uint32_t		sc_min_agg_len;
 };
 #define DEVNAME(_sc)	((_sc)->sc_dev.dv_xname)
 
@@ -317,12 +333,18 @@ void		bnxt_write_tx_doorbell(struct bnxt_softc *, stru
 
 int		bnxt_rx_fill(struct bnxt_queue *);
 int		bnxt_rx_fill_ag(struct bnxt_queue *);
-u_int		bnxt_rx_fill_slots(struct bnxt_softc *, struct bnxt_ring *, void *,
-		    struct bnxt_slot *, uint *, int, uint16_t, u_int);
+u_int		bnxt_rx_fill_slots(struct bnxt_softc *, struct bnxt_ring *,
+		    void *, struct bnxt_slot *, uint, uint *, uint8_t *,
+		    uint32_t, int, uint16_t, u_int);
 void		bnxt_refill(void *);
 int		bnxt_rx(struct bnxt_softc *, struct bnxt_rx_queue *,
 		    struct bnxt_cp_ring *, struct mbuf_list *, struct mbuf_list *,
 		    int *, int *, struct cmpl_base *);
+int		bnxt_rx_tpa_start(struct bnxt_softc *, struct bnxt_rx_queue *,
+		    struct bnxt_cp_ring *, struct cmpl_base *);
+int		bnxt_rx_tpa(struct bnxt_softc *, struct bnxt_rx_queue *,
+		    struct bnxt_cp_ring *, struct mbuf_list *, int *, int *,
+		    struct cmpl_base *);
 
 void		bnxt_txeof(struct bnxt_softc *, struct bnxt_tx_queue *, int *,
 		    struct cmpl_base *);
@@ -372,6 +394,7 @@ int		bnxt_hwrm_free_filter(struct bnxt_softc *,
 int		bnxt_hwrm_vnic_rss_cfg(struct bnxt_softc *,
 		    struct bnxt_vnic_info *, uint32_t, daddr_t, daddr_t);
 int		bnxt_cfg_async_cr(struct bnxt_softc *, struct bnxt_cp_ring *);
+int		bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *);
 int		bnxt_hwrm_nvm_get_dev_info(struct bnxt_softc *, uint16_t *,
 		    uint16_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *);
 int		bnxt_hwrm_port_phy_qcfg(struct bnxt_softc *,
@@ -385,9 +408,6 @@ int bnxt_hwrm_func_drv_unrgtr(struct bnxt_softc *softc
 
 int bnxt_hwrm_port_qstats(struct bnxt_softc *softc);
 
-
-int bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *softc);
-void bnxt_validate_hw_lro_settings(struct bnxt_softc *softc);
 int bnxt_hwrm_fw_reset(struct bnxt_softc *softc, uint8_t processor,
     uint8_t *selfreset);
 int bnxt_hwrm_fw_qstatus(struct bnxt_softc *softc, uint8_t type,
@@ -653,6 +673,33 @@ bnxt_attach(struct device *parent, struct device *self
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_xflags |= IFXF_LRO;
 
+	switch (PCI_PRODUCT(pa->pa_id)) {
+	case PCI_PRODUCT_BROADCOM_BCM57301:
+	case PCI_PRODUCT_BROADCOM_BCM57302:
+	case PCI_PRODUCT_BROADCOM_BCM57304:
+	case PCI_PRODUCT_BROADCOM_BCM57311:
+	case PCI_PRODUCT_BROADCOM_BCM57312:
+	case PCI_PRODUCT_BROADCOM_BCM57314:
+	case PCI_PRODUCT_BROADCOM_BCM57402:
+	case PCI_PRODUCT_BROADCOM_BCM57404:
+	case PCI_PRODUCT_BROADCOM_BCM57406:
+	case PCI_PRODUCT_BROADCOM_BCM57407:
+	case PCI_PRODUCT_BROADCOM_BCM57412:
+	case PCI_PRODUCT_BROADCOM_BCM57414:
+	case PCI_PRODUCT_BROADCOM_BCM57416:
+	case PCI_PRODUCT_BROADCOM_BCM57416_SFP:
+	case PCI_PRODUCT_BROADCOM_BCM57417:
+	case PCI_PRODUCT_BROADCOM_BCM57417_SFP:
+		sc->sc_flags |= BNXT_FLAG_TPA;
+		break;
+	default:
+		break;
+	}
+
+	sc->sc_max_agg_segs = 5; /* 2^5 = 32 segs = BNXT_MAX_RX_SEGS */
+	sc->sc_max_aggs = HWRM_VNIC_TPA_CFG_INPUT_MAX_AGGS_1;
+	sc->sc_min_agg_len = 512;
+
 	ifq_init_maxlen(&ifp->if_snd, 1024);	/* ? */
 
 	ifmedia_init(&sc->sc_media, IFM_IMASK, bnxt_media_change,
@@ -687,6 +734,7 @@ bnxt_attach(struct device *parent, struct device *self
 			cp->stats_ctx_id = HWRM_NA_SIGNATURE;
 			cp->ring.phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 			cp->ring.id = i + 1;	/* first cp ring is async only */
+			printf("%s: cp ring id %u\n", __func__, i + 1);
 			cp->softc = sc;
 			cp->ring.doorbell = bq->q_cp.ring.id * 0x80;
 			cp->ring.ring_size = (PAGE_SIZE * BNXT_CP_PAGES) /
@@ -792,6 +840,7 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
 	struct bnxt_tx_queue *tx = &bq->q_tx;
 	struct bnxt_grp_info *rg = &bq->q_rg;
 	struct bnxt_slot *bs;
+	size_t num_tpa_agg_ids = 0;
 	int i;
 
 	tx->tx_ring_mem = bnxt_dmamem_alloc(sc, PAGE_SIZE);
@@ -859,6 +908,7 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
 
 	rx->rx_ring.phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 	rx->rx_ring.id = BNXT_RX_RING_ID_BASE + bq->q_index;
+	printf("%s: rx ring id %u\n", __func__, rx->rx_ring.id);
 	rx->rx_ring.doorbell = rx->rx_ring.id * 0x80;
 	rx->rx_ring.ring_size = PAGE_SIZE / sizeof(struct rx_prod_pkt_bd);
 	rx->rx_ring.vaddr = BNXT_DMA_KVA(rx->rx_ring_mem);
@@ -873,6 +923,7 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
 
 	rx->rx_ag_ring.phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 	rx->rx_ag_ring.id = BNXT_AG_RING_ID_BASE + bq->q_index;
+	printf("%s: rx ag ring id %u\n", __func__, rx->rx_ag_ring.id);
 	rx->rx_ag_ring.doorbell = rx->rx_ag_ring.id * 0x80;
 	rx->rx_ag_ring.ring_size = PAGE_SIZE / sizeof(struct rx_prod_pkt_bd);
 	rx->rx_ag_ring.vaddr = BNXT_DMA_KVA(rx->rx_ring_mem) + PAGE_SIZE;
@@ -921,6 +972,14 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
 		goto destroy_rx_slots;
 	}
 
+	rx->rx_ag_freemap = malloc(rx->rx_ag_ring.ring_size / 8,
+	    M_DEVBUF, M_WAITOK | M_ZERO);
+	if (rx->rx_ag_freemap == NULL) {
+		printf("%s: failed to allocate rx ag freemap\n", DEVNAME(sc));
+		goto destroy_rx_ag_slots;
+	}
+	memset(rx->rx_ag_freemap, 0xff, rx->rx_ag_ring.ring_size / 8);
+
 	for (i = 0; i < rx->rx_ag_ring.ring_size; i++) {
 		bs = &rx->rx_ag_slots[i];
 		if (bus_dmamap_create(sc->sc_dmat, BNXT_AG_BUFFER_SIZE, 1,
@@ -929,15 +988,26 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
 		    &bs->bs_map) != 0) {
 			printf("%s: failed to allocate rx ag dma maps\n",
 			    DEVNAME(sc));
-			goto destroy_rx_ag_slots;
+			goto destroy_rx_ag_freemap;
 		}
 	}
 
+	if ((sc->sc_flags & BNXT_FLAG_TPA) && (ifp->if_xflags & IFXF_LRO)) {
+		num_tpa_agg_ids = (RX_TPA_START_CMPL_AGG_ID_MASK >>
+		    RX_TPA_START_CMPL_AGG_ID_SFT),
+		rx->rx_tpas = mallocarray(sizeof(rx->rx_tpas[0]),
+		    num_tpa_agg_ids, M_DEVBUF, M_WAITOK | M_ZERO);
+		if (rx->rx_tpas == NULL) {
+			printf("%s: failed to allocate TPA\n", DEVNAME(sc));
+			goto destroy_rx_ag_freemap;
+		}
+	}
+
 	tx->tx_slots = mallocarray(sizeof(*bs), tx->tx_ring.ring_size,
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 	if (tx->tx_slots == NULL) {
 		printf("%s: failed to allocate tx slots\n", DEVNAME(sc));
-		goto destroy_rx_ag_slots;
+		goto destroy_rx_tpas;
 	}
 
 	for (i = 0; i < tx->tx_ring.ring_size; i++) {
@@ -960,7 +1030,8 @@ bnxt_queue_up(struct bnxt_softc *sc, struct bnxt_queue
 	 * to 2 or so slots, but we currently don't have a way of doing that.
 	 */
 	if_rxr_init(&rx->rxr[0], 32, rx->rx_ring.ring_size - 1);
-	if_rxr_init(&rx->rxr[1], 32, rx->rx_ag_ring.ring_size - 1);
+	if_rxr_init(&rx->rxr[1], rx->rx_ag_ring.ring_size - 1,
+	    rx->rx_ag_ring.ring_size - 1);
 	rx->rx_prod = 0;
 	rx->rx_cons = 0;
 	rx->rx_ag_prod = 0;
@@ -981,6 +1052,12 @@ destroy_tx_slots:
 	tx->tx_slots = NULL;
 
 	i = rx->rx_ag_ring.ring_size;
+destroy_rx_tpas:
+	free(rx->rx_tpas, M_DEVBUF, num_tpa_agg_ids * sizeof(rx->rx_tpas[0]));
+	rx->rx_tpas = NULL;
+destroy_rx_ag_freemap:
+	free(rx->rx_ag_freemap, M_DEVBUF, rx->rx_ag_ring.ring_size / 8);
+	rx->rx_ag_freemap = NULL;
 destroy_rx_ag_slots:
 	bnxt_free_slots(sc, rx->rx_ag_slots, i, rx->rx_ag_ring.ring_size);
 	rx->rx_ag_slots = NULL;
@@ -1032,6 +1109,9 @@ bnxt_queue_down(struct bnxt_softc *sc, struct bnxt_que
 	    tx->tx_ring.ring_size);
 	tx->tx_slots = NULL;
 
+	free(rx->rx_ag_freemap, M_DEVBUF, rx->rx_ag_ring.ring_size / 8);
+	rx->rx_ag_freemap = NULL;
+
 	bnxt_free_slots(sc, rx->rx_ag_slots, rx->rx_ag_ring.ring_size,
 	    rx->rx_ag_ring.ring_size);
 	rx->rx_ag_slots = NULL;
@@ -1162,6 +1242,13 @@ bnxt_up(struct bnxt_softc *sc)
 		}
 	}
 
+	if (bnxt_hwrm_vnic_tpa_cfg(sc) != 0) {
+		printf("%s: failed to set hardware LRO config\n",
+		    DEVNAME(sc));
+		ret = EIO;
+		goto dealloc_vnic;
+	}
+
 	bnxt_iff(sc);
 	SET(ifp->if_flags, IFF_RUNNING);
 
@@ -1302,10 +1389,12 @@ bnxt_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data
 	}
 
 	if (error == ENETRESET) {
-		if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) ==
-		    (IFF_UP | IFF_RUNNING))
-			bnxt_iff(sc);
 		error = 0;
+		if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) ==
+		    (IFF_UP | IFF_RUNNING)) {
+			bnxt_down(sc);
+			error = bnxt_up(sc);
+		}
 	}
 
 	splx(s);
@@ -1651,6 +1740,15 @@ bnxt_intr(void *xq)
 			if (ISSET(ifp->if_flags, IFF_RUNNING))
 				bnxt_txeof(sc, tx, &txfree, cmpl);
 			break;
+		case CMPL_BASE_TYPE_RX_TPA_START:
+			if (ISSET(ifp->if_flags, IFF_RUNNING))
+				rollback = bnxt_rx_tpa_start(sc, rx, cpr, cmpl);
+			break;
+		case CMPL_BASE_TYPE_RX_TPA_END:
+			if (ISSET(ifp->if_flags, IFF_RUNNING))
+				rollback = bnxt_rx_tpa(sc, rx, cpr, &ml,
+				    &rxfree, &agfree, cmpl);
+			break;
 		default:
 			printf("%s: unexpected completion type %u\n",
 			    DEVNAME(sc), type);
@@ -1674,6 +1772,7 @@ bnxt_intr(void *xq)
 
 	if (rxfree != 0) {
 		int livelocked = 0;
+		int ag_needs_filling = 0;
 
 		rx->rx_cons += rxfree;
 		if (rx->rx_cons >= rx->rx_ring.ring_size)
@@ -1697,9 +1796,13 @@ bnxt_intr(void *xq)
 		}
 
 		bnxt_rx_fill(q);
-		bnxt_rx_fill_ag(q);
-		if ((rx->rx_cons == rx->rx_prod) ||
-		    (rx->rx_ag_cons == rx->rx_ag_prod))
+		if (if_rxr_needrefill(&rx->rxr[1]))
+			ag_needs_filling = 1;
+
+		if (ag_needs_filling && bnxt_rx_fill_ag(q) == 0)
+			ag_needs_filling = 0;
+
+		if (rx->rx_cons == rx->rx_prod || ag_needs_filling)
 			timeout_add(&rx->rx_refill, 0);
 	}
 	if (txfree != 0) {
@@ -2212,10 +2315,30 @@ bnxt_write_tx_doorbell(struct bnxt_softc *sc, struct b
 	    htole32(val));
 }
 
+int
+bnxt_next_free_ag_slot(uint8_t *freemap, uint32_t mapsize, uint p)
+{
+	int idx;
+	uint32_t n = 0;
+	uint i = p / 8;
+
+	for (n = 0; n < mapsize; n++) {
+		idx = ffs(freemap[i]);
+		if (idx > 0)
+			return ((idx - 1) + (i * 8));
+		i++;
+		if (i >= mapsize)
+			i = 0;
+	}
+
+	return -1;
+}
+
 u_int
-bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_ring *ring, void *ring_mem,
-    struct bnxt_slot *slots, uint *prod, int bufsize, uint16_t bdtype,
-    u_int nslots)
+bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_ring *ring,
+    void *ring_mem, struct bnxt_slot *slots, uint cons, uint *prod,
+    uint8_t *freemap, uint32_t mapsize,
+    int bufsize, uint16_t bdtype, u_int nslots)
 {
 	struct rx_prod_pkt_bd *rxring;
 	struct bnxt_slot *bs;
@@ -2225,6 +2348,17 @@ bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_
 	rxring = (struct rx_prod_pkt_bd *)ring_mem;
 	p = *prod;
 	for (fills = 0; fills < nslots; fills++) {
+		if (freemap != NULL) {
+			/*
+			 * Aggregation ring buffers can complete out-of-order.
+			 * Make sure to refill free slots located between our
+			 * previous doorbell value and future doorbell value.
+			 */
+			p = bnxt_next_free_ag_slot(freemap, mapsize, cons);
+			if (p == -1)
+				return (nslots - fills);
+		}
+
 		bs = &slots[p];
 		m = MCLGETL(NULL, M_DONTWAIT, bufsize);
 		if (m == NULL)
@@ -2240,9 +2374,13 @@ bnxt_rx_fill_slots(struct bnxt_softc *sc, struct bnxt_
 
 		rxring[p].flags_type = htole16(bdtype);
 		rxring[p].len = htole16(bufsize);
-		rxring[p].opaque = p;
+		rxring[p].opaque = (((uint32_t)p) & PAGE_MASK) |
+		    (((uint32_t)ring->id) << PAGE_SHIFT);
 		rxring[p].addr = htole64(bs->bs_map->dm_segs[0].ds_addr);
 
+		if (freemap != NULL)
+			clrbit(freemap, p);
+
 		if (++p >= ring->ring_size)
 			p = 0;
 	}
@@ -2266,7 +2404,8 @@ bnxt_rx_fill(struct bnxt_queue *q)
 	if (slots > 0) {
 		slots = bnxt_rx_fill_slots(sc, &rx->rx_ring,
 		    BNXT_DMA_KVA(rx->rx_ring_mem), rx->rx_slots,
-		    &rx->rx_prod, MCLBYTES,
+		    rx->rx_cons, &rx->rx_prod,
+		    NULL, 0, MCLBYTES,
 		    RX_PROD_PKT_BD_TYPE_RX_PROD_PKT, slots);
 		if_rxr_put(&rx->rxr[0], slots);
 	} else
@@ -2280,19 +2419,23 @@ bnxt_rx_fill_ag(struct bnxt_queue *q)
 {
 	struct bnxt_rx_queue *rx = &q->q_rx;
 	struct bnxt_softc *sc = q->q_sc;
-	u_int slots;
+	u_int slots, unfilled_slots;
 	int rv = 0;
 
-	slots = if_rxr_get(&rx->rxr[1],  rx->rx_ag_ring.ring_size);
+	slots = if_rxr_get(&rx->rxr[1], rx->rx_ring.ring_size);
 	if (slots > 0) {
-		slots = bnxt_rx_fill_slots(sc, &rx->rx_ag_ring,
+		unfilled_slots = bnxt_rx_fill_slots(sc, &rx->rx_ag_ring,
 		    BNXT_DMA_KVA(rx->rx_ring_mem) + PAGE_SIZE,
-		    rx->rx_ag_slots, &rx->rx_ag_prod,
+		    rx->rx_ag_slots, rx->rx_ag_cons, &rx->rx_ag_prod,
+		    rx->rx_ag_freemap, rx->rx_ag_ring.ring_size / 8,
 		    BNXT_AG_BUFFER_SIZE,
 		    RX_PROD_AGG_BD_TYPE_RX_PROD_AGG, slots);
-		if_rxr_put(&rx->rxr[1], slots);
-	} else
+		if_rxr_put(&rx->rxr[1], unfilled_slots);
+		if (unfilled_slots > 0)
+			rv = 1;
+	} else {
 		rv = 1;
+	}
 
 	return (rv);
 }
@@ -2302,15 +2445,18 @@ bnxt_refill(void *xq)
 {
 	struct bnxt_queue *q = xq;
 	struct bnxt_rx_queue *rx = &q->q_rx;
+	int ag_needs_filling = 0;
 
 	if (rx->rx_cons == rx->rx_prod)
 		bnxt_rx_fill(q);
 
-	if (rx->rx_ag_cons == rx->rx_ag_prod)
-		bnxt_rx_fill_ag(q);
+	if (if_rxr_needrefill(&rx->rxr[1]))
+		ag_needs_filling = 1;
 
-	if ((rx->rx_cons == rx->rx_prod) ||
-	    (rx->rx_ag_cons == rx->rx_ag_prod))
+	if (ag_needs_filling && bnxt_rx_fill_ag(q) == 0)
+		ag_needs_filling = 0;
+
+	if (rx->rx_cons == rx->rx_prod || ag_needs_filling)
 		timeout_add(&rx->rx_refill, 1);
 }
 
@@ -2319,30 +2465,40 @@ bnxt_rx(struct bnxt_softc *sc, struct bnxt_rx_queue *r
     struct bnxt_cp_ring *cpr, struct mbuf_list *ml, struct mbuf_list *mltcp,
     int *slots, int *agslots, struct cmpl_base *cmpl)
 {
-	struct mbuf *m, *am;
+	struct mbuf *m, *am, *aprev = NULL;
 	struct bnxt_slot *bs;
 	struct rx_pkt_cmpl *rxlo = (struct rx_pkt_cmpl *)cmpl;
 	struct rx_pkt_cmpl_hi *rxhi;
-	struct rx_abuf_cmpl *ag;
+	struct rx_abuf_cmpl *ag[BNXT_MAX_AG_BUFFERS];
 	uint32_t flags;
 	uint16_t errors;
+	uint8_t num_agg;
+	int i;
+	uint slot, ring_id;
 
+	memset(ag, 0, sizeof(ag));
+	num_agg = (rxlo->agg_bufs_v1 & RX_TPA_END_CMPL_AGG_BUFS_MASK) >>
+	    RX_TPA_END_CMPL_AGG_BUFS_SFT;
+
 	/* second part of the rx completion */
 	rxhi = (struct rx_pkt_cmpl_hi *)bnxt_cpr_next_cmpl(sc, cpr);
 	if (rxhi == NULL) {
 		return (1);
 	}
 
-	/* packets over 2k in size use an aggregation buffer completion too */
-	ag = NULL;
-	if ((rxlo->agg_bufs_v1 >> RX_PKT_CMPL_AGG_BUFS_SFT) != 0) {
-		ag = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
-		if (ag == NULL) {
-			return (1);
-		}
+	for (i = 0; i < num_agg; i++) {
+		ag[i] = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
+		if (ag[i] == NULL)
+			break;
 	}
+	if (i < num_agg)
+		return 1;
 
-	bs = &rx->rx_slots[rxlo->opaque];
+	slot = rxlo->opaque & PAGE_MASK;
+	ring_id = rxlo->opaque >> PAGE_SHIFT;
+	KASSERT(ring_id == rx->rx_ring.id);
+
+	bs = &rx->rx_slots[slot];
 	bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0, bs->bs_map->dm_mapsize,
 	    BUS_DMASYNC_POSTREAD);
 	bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
@@ -2377,22 +2533,41 @@ bnxt_rx(struct bnxt_softc *sc, struct bnxt_rx_queue *r
 		m->m_pkthdr.csum_flags |= M_FLOWID;
 	}
 
-	if (ag != NULL) {
-		bs = &rx->rx_ag_slots[ag->opaque];
+	for (i = 0; i < num_agg; i++) {
+		slot = ag[i]->opaque & PAGE_MASK;
+		ring_id = ag[i]->opaque >> PAGE_SHIFT;
+		KASSERT(ring_id == rx->rx_ag_ring.id);
+
+		if (isset(rx->rx_ag_freemap, slot)) {
+			printf("%s: ag buf %d of %d on ring %d slot %d "
+			    "already free\n", __func__, i, num_agg,
+			    ring_id, slot);
+			KASSERT(isclr(rx->rx_ag_freemap, slot));
+			continue; /* should not happen */
+		}
+
+		bs = &rx->rx_ag_slots[slot];
 		bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0,
 		    bs->bs_map->dm_mapsize, BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
 
+		setbit(rx->rx_ag_freemap, slot);
+
 		am = bs->bs_m;
 		bs->bs_m = NULL;
-		am->m_len = letoh16(ag->len);
-		m->m_next = am;
+		am->m_len = letoh16(ag[i]->len);
+		if (aprev == NULL)
+			m->m_next = am;
+		else
+			aprev->m_next = am;
+		aprev = am;
 		m->m_pkthdr.len += am->m_len;
 		(*agslots)++;
 	}
 
 #ifndef SMALL_KERNEL
-	if (ISSET(sc->sc_ac.ac_if.if_xflags, IFXF_LRO) &&
+	if ((sc->sc_flags & BNXT_FLAG_TPA) == 0 &&
+	    ISSET(sc->sc_ac.ac_if.if_xflags, IFXF_LRO) &&
 	    ((lemtoh16(&rxlo->flags_type) & RX_PKT_CMPL_FLAGS_ITYPE_TCP) ==
 	    RX_PKT_CMPL_FLAGS_ITYPE_TCP))
 		tcp_softlro_glue(mltcp, m, &sc->sc_ac.ac_if);
@@ -2403,6 +2578,213 @@ bnxt_rx(struct bnxt_softc *sc, struct bnxt_rx_queue *r
 	return (0);
 }
 
+int
+bnxt_rx_tpa_start(struct bnxt_softc *sc, struct bnxt_rx_queue *rx,
+    struct bnxt_cp_ring *cpr, struct cmpl_base *cmpl)
+{
+	struct rx_tpa_start_cmpl *rtpa = (struct rx_tpa_start_cmpl *)cmpl;
+	struct rx_tpa_start_cmpl_hi *rtpa_hi;
+	uint8_t agg_id;
+	uint slot, ring_id;
+
+	/* second part of the rx tpa start completion */
+	rtpa_hi = (struct rx_tpa_start_cmpl_hi *)bnxt_cpr_next_cmpl(sc, cpr);
+	if (rtpa_hi == NULL)
+		return 1;
+
+	agg_id = (rtpa->agg_id & RX_TPA_START_CMPL_AGG_ID_MASK) >>
+	    RX_TPA_START_CMPL_AGG_ID_SFT;
+
+	slot = rtpa->opaque & PAGE_MASK;
+	ring_id = rtpa->opaque >> PAGE_SHIFT;
+
+	if (rtpa->opaque == 0) {
+		printf("%s: TPA start for agg %u ring %u slot %u opaque 0x%x\n", __func__, agg_id, ring_id, slot, rtpa->opaque);
+	}
+	KASSERT(rtpa->opaque);
+
+	memcpy(&rx->rx_tpas[agg_id].low, rtpa, sizeof(*rtpa));
+	memcpy(&rx->rx_tpas[agg_id].high, rtpa_hi, sizeof(*rtpa_hi));
+	rx->rx_tpas[agg_id].completed = 1;
+
+	return 0;
+}
+
+int
+bnxt_rx_tpa(struct bnxt_softc *sc, struct bnxt_rx_queue *rx,
+    struct bnxt_cp_ring *cpr, struct mbuf_list *ml, int *slots, int *agslots,
+    struct cmpl_base *cmpl)
+{
+	struct mbuf *m, *am, *aprev = NULL;
+	struct bnxt_slot *bs;
+	struct rx_tpa_end_cmpl *rxt = (struct rx_tpa_end_cmpl *)cmpl;
+	struct rx_tpa_end_cmpl_hi *rxthi;
+	struct bnxt_full_tpa_start *tpas;
+	struct rx_abuf_cmpl *ag[BNXT_MAX_AG_BUFFERS];
+	uint32_t flags;
+	uint8_t	i, num_agg;
+	uint8_t agg_id;
+	u_int slot, ring_id;
+
+	/* TPA start completion */
+	agg_id = (rxt->agg_id & RX_TPA_END_CMPL_AGG_ID_MASK) >>
+	    RX_TPA_END_CMPL_AGG_ID_SFT;
+	tpas = &rx->rx_tpas[agg_id];
+	if (tpas->completed == 0)
+		return (1);
+
+	/* second part of the rx completion */
+	rxthi = (struct rx_tpa_end_cmpl_hi *)bnxt_cpr_next_cmpl(sc, cpr);
+	if (rxthi == NULL)
+		return (1);
+
+	//printf("%s: TPA end for agg %u\n", __func__, agg_id);
+
+	memset(ag, 0, sizeof(ag));
+	num_agg = (rxt->agg_bufs_v1 & RX_TPA_END_CMPL_AGG_BUFS_MASK) >>
+	    RX_TPA_END_CMPL_AGG_BUFS_SFT;
+
+	/* Avoid processing packets which have not yet been fully completed. */
+	for (i = 0; i < num_agg; i++) {
+		ag[i] = (struct rx_abuf_cmpl *)bnxt_cpr_next_cmpl(sc, cpr);
+		if (ag[i] == NULL)
+			break;
+	}
+	if (i < num_agg)
+		return 1;
+#if 0
+	if (tpas->low.opaque != rxt->opaque) {
+		printf("%s: TPA start opaque 0x%x != TPA end opaque 0x%x\n",
+		    __func__, tpas->low.opaque, rxt->opaque);
+	}
+#endif
+
+	slot = tpas->low.opaque & PAGE_MASK;
+	ring_id = tpas->low.opaque >> PAGE_SHIFT;
+	#if 0
+	if (ring_id != rx->rx_ring.id) {
+		printf("%s: tpa start on rx ring %u, expected %u, opaque=0x%d\n", __func__, ring_id, rx->rx_ring.id, tpas->low.opaque);
+	}
+	KASSERT(ring_id == rx->rx_ring.id);
+	#endif
+
+	bs = &rx->rx_slots[slot];
+	bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0, bs->bs_map->dm_mapsize,
+	    BUS_DMASYNC_POSTREAD);
+	bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
+
+	if (bs->bs_m == NULL || tpas == NULL) {
+		printf("%s: invalid rx slot %u via TPA start opaque 0x%x\n",
+		    __func__, slot, tpas ? tpas->low.opaque : 0xdeadbeef);
+	}
+	KASSERT(bs->bs_m);
+	KASSERT(tpas);
+	m = bs->bs_m;
+	bs->bs_m = NULL;
+	m->m_pkthdr.len = m->m_len = letoh16(tpas->low.len);
+	(*slots)++;
+
+	/* checksum flags */
+	flags = lemtoh32(&tpas->high.flags2);
+	if (flags & RX_TPA_START_CMPL_FLAGS2_IP_CS_CALC)
+		m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK;
+
+	if (flags & RX_TPA_START_CMPL_FLAGS2_L4_CS_CALC) {
+		m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
+		    M_UDP_CSUM_IN_OK;
+	}
+#if NVLAN > 0
+	if ((flags & RX_TPA_START_CMPL_FLAGS2_META_FORMAT_MASK) ==
+	    RX_TPA_START_CMPL_FLAGS2_META_FORMAT_VLAN) {
+		m->m_pkthdr.ether_vtag = lemtoh16(&tpas->high.metadata);
+		m->m_flags |= M_VLANTAG;
+	}
+#endif
+
+	if (lemtoh16(&tpas->low.flags_type) &
+	    RX_TPA_START_CMPL_FLAGS_RSS_VALID) {
+		m->m_pkthdr.ph_flowid = lemtoh32(&tpas->low.rss_hash);
+		m->m_pkthdr.csum_flags |= M_FLOWID;
+	}
+
+	for (i = 0; i < num_agg; i++) {
+		slot = ag[i]->opaque & PAGE_MASK;
+		ring_id = ag[i]->opaque >> PAGE_SHIFT;
+		KASSERT(ring_id == rx->rx_ag_ring.id);
+
+		if (isset(rx->rx_ag_freemap, slot)) {
+			printf("%s: ag buf %d of %d on ring %d slot %d "
+			    "already free\n", __func__, i, num_agg,
+			    ring_id, slot);
+			KASSERT(isclr(rx->rx_ag_freemap, slot));
+			continue; /* should not happen */
+		}
+
+		bs = &rx->rx_ag_slots[slot];
+		bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0,
+		    bs->bs_map->dm_mapsize, BUS_DMASYNC_POSTREAD);
+		bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
+
+		setbit(rx->rx_ag_freemap, slot);
+
+		am = bs->bs_m;
+		bs->bs_m = NULL;
+		if (letoh16(ag[i]->len) > 0) {
+			am->m_len = letoh16(ag[i]->len);
+			if (aprev == NULL)
+				m->m_next = am;
+			else
+				aprev->m_next = am;
+			aprev = am;
+			m->m_pkthdr.len += am->m_len;
+		} else
+			m_freem(am);
+		(*agslots)++;
+	}
+
+	slot = rxt->opaque & PAGE_MASK;
+	ring_id = rxt->opaque >> PAGE_SHIFT;
+	KASSERT(ring_id == rx->rx_ring.id);
+
+	bs = &rx->rx_slots[slot];
+	bus_dmamap_sync(sc->sc_dmat, bs->bs_map, 0, bs->bs_map->dm_mapsize,
+	    BUS_DMASYNC_POSTREAD);
+	bus_dmamap_unload(sc->sc_dmat, bs->bs_map);
+
+	am = bs->bs_m;
+	bs->bs_m = NULL;
+	if (letoh16(rxt->len) > 0) {
+		am->m_len = letoh16(rxt->len);
+		if (aprev == NULL)
+			m->m_next = am;
+		else
+			aprev->m_next = am;
+		m->m_pkthdr.len += am->m_len;
+	} else
+		m_freem(am);
+	(*slots)++;
+
+	flags = lemtoh16(&tpas->low.flags_type);
+	if ((flags & RX_TPA_START_CMPL_FLAGS_ITYPE_TCP) ==
+	    RX_TPA_START_CMPL_FLAGS_ITYPE_TCP) {
+		uint16_t payload_offset = rxt->payload_offset;
+
+		if (payload_offset == 0)
+			payload_offset = 256;
+
+		if (m->m_pkthdr.len > payload_offset)
+			m->m_pkthdr.ph_mss = m->m_pkthdr.len - payload_offset;
+
+		tcpstat_add(tcps_inhwlro, i + 1);
+		tcpstat_inc(tcps_inpktlro);
+
+	}
+
+	tpas->completed = 0;
+	ml_enqueue(ml, m);
+	return (0);
+}
+
 void
 bnxt_txeof(struct bnxt_softc *sc, struct bnxt_tx_queue *tx, int *txfree,
     struct cmpl_base *cmpl)
@@ -3347,64 +3729,44 @@ bnxt_cfg_async_cr(struct bnxt_softc *softc, struct bnx
 	return rc;
 }
 
-#if 0
-
-void
-bnxt_validate_hw_lro_settings(struct bnxt_softc *softc)
-{
-	softc->hw_lro.enable = min(softc->hw_lro.enable, 1);
-
-        softc->hw_lro.is_mode_gro = min(softc->hw_lro.is_mode_gro, 1);
-
-	softc->hw_lro.max_agg_segs = min(softc->hw_lro.max_agg_segs,
-		HWRM_VNIC_TPA_CFG_INPUT_MAX_AGG_SEGS_MAX);
-
-	softc->hw_lro.max_aggs = min(softc->hw_lro.max_aggs,
-		HWRM_VNIC_TPA_CFG_INPUT_MAX_AGGS_MAX);
-
-	softc->hw_lro.min_agg_len = min(softc->hw_lro.min_agg_len, BNXT_MAX_MTU);
-}
-
 int
 bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *softc)
 {
+	struct ifnet *ifp = &softc->sc_ac.ac_if;
 	struct hwrm_vnic_tpa_cfg_input req = {0};
 	uint32_t flags;
 
-	if (softc->vnic_info.id == (uint16_t) HWRM_NA_SIGNATURE) {
+	if (softc->sc_vnic.id == (uint16_t) HWRM_NA_SIGNATURE ||
+	    (softc->sc_flags & BNXT_FLAG_TPA) == 0)
 		return 0;
-	}
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_TPA_CFG);
 
-	if (softc->hw_lro.enable) {
+	if (ifp->if_xflags & IFXF_LRO) {
 		flags = HWRM_VNIC_TPA_CFG_INPUT_FLAGS_TPA |
 			HWRM_VNIC_TPA_CFG_INPUT_FLAGS_ENCAP_TPA |
 			HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_ECN |
-			HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_SAME_GRE_SEQ;
-		
-        	if (softc->hw_lro.is_mode_gro)
-			flags |= HWRM_VNIC_TPA_CFG_INPUT_FLAGS_GRO;
-		else
-			flags |= HWRM_VNIC_TPA_CFG_INPUT_FLAGS_RSC_WND_UPDATE;
+			HWRM_VNIC_TPA_CFG_INPUT_FLAGS_AGG_WITH_SAME_GRE_SEQ |
+			HWRM_VNIC_TPA_CFG_INPUT_FLAGS_GRO;
 			
 		req.flags = htole32(flags);
 
-		req.enables = htole32(HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGG_SEGS |
-				HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGGS |
-				HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MIN_AGG_LEN);
+		req.enables = htole32(
+		    HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGG_SEGS |
+		    HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGGS |
+		    HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MIN_AGG_LEN);
 
-		req.max_agg_segs = htole16(softc->hw_lro.max_agg_segs);
-		req.max_aggs = htole16(softc->hw_lro.max_aggs);
-		req.min_agg_len = htole32(softc->hw_lro.min_agg_len);
+		req.max_agg_segs = htole16(softc->sc_max_agg_segs);
+		req.max_aggs = htole16(softc->sc_max_aggs);
+		req.min_agg_len = htole32(softc->sc_min_agg_len);
 	}
 
-	req.vnic_id = htole16(softc->vnic_info.id);
+	req.vnic_id = htole16(softc->sc_vnic.id);
 
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
-
+#if 0
 int
 bnxt_hwrm_fw_reset(struct bnxt_softc *softc, uint8_t processor,
     uint8_t *selfreset)