From: Yuichiro NAITO Subject: Re: iavf patch [4/4]: Set flowid To: tech@openbsd.org Date: Thu, 27 Feb 2025 18:21:27 +0900 My previous idea of setting if_txmit=1 was not so good. David told me a technique of setting the flowid in each mbuf to share the same softnet task queue between the packet forwarding task and the sending task. I see that ph_flowid is used to select the softnet. If the same softnet is selected for a single packet stream (that ipgen generates), output packets are queued in the ifq and will be sent after all packets in the rx queue are forwarded. For multiple streams, the received packets are hashed by the NIC hardware, which chooses which CPU core to interrupt. Each CPU core has one softnet. This means a softnet is chosen by the RSS hash. So, a good hash is important for this mechanism; I also added an RSS key and LUT for iavf(4). This patch improves packet forwarding performance and the performance becomes stable. The ipgen results are shown as follows. ``` rfc2544 tolerable error rate: 0.0000% rfc2544 trial duration: 10 sec rfc2544 pps resolution: 0.0000% rfc2544 interval: 0 sec rfc2544 warming duration: 1 sec framesize|0G 1G 2G 3G 4G 5G 6G 7G 8G 9G 10Gbps ---------+----+----+----+----+----+----+----+----+----+----+ 64 |### 535.56Mbps, 1046012/14880952pps, 7.03% 128 |###### 1057.79Mbps, 1032998/ 8445945pps, 12.23% 512 |###################### 4285.72Mbps, 1046319/ 2349624pps, 44.53% 1024 |########################################### 8514.45Mbps, 1039361/ 1197318pps, 86.81% 1280 |################################################# 9647.95Mbps, 942183/ 961538pps, 97.99% 1408 |################################################## 9820.52Mbps, 871850/ 875350pps, 99.60% 1518 |################################################## 9860.31Mbps, 811949/ 812743pps, 99.90% framesize|0 |1m |2m |3m |4m |5m |6m |7m |8m |9m |10m |11m |12m |13m |14m |15m pps ---------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ 64 |##### 1046012/14880952pps, 7.03% 128 |#####
1032998/ 8445945pps, 12.23% 512 |##### 1046319/ 2349624pps, 44.53% 1024 |##### 1039361/ 1197318pps, 86.81% 1280 |#### 942183/ 961538pps, 97.99% 1408 |#### 871850/ 875350pps, 99.60% 1518 |#### 811949/ 812743pps, 99.90% ``` Please apply this patch after the following 3 patches. https://marc.info/?l=openbsd-tech&m=173891264325358&w=2 https://marc.info/?l=openbsd-tech&m=173891291725560&w=2 https://marc.info/?l=openbsd-tech&m=173891294425572&w=2 diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c index 204dbfc2637..cd4f53c30b2 100644 --- a/sys/dev/pci/if_iavf.c +++ b/sys/dev/pci/if_iavf.c @@ -75,6 +75,7 @@ #include #endif +#include #include #include #include @@ -158,6 +159,8 @@ struct iavf_aq_desc { #define IAVF_VC_OP_CONFIG_PROMISC 14 #define IAVF_VC_OP_GET_STATS 15 #define IAVF_VC_OP_EVENT 17 +#define IAVF_VC_OP_CONFIG_RSS_KEY 23 +#define IAVF_VC_OP_CONFIG_RSS_LUT 24 #define IAVF_VC_OP_GET_RSS_HENA_CAPS 25 #define IAVF_VC_OP_SET_RSS_HENA 26 @@ -325,6 +328,22 @@ struct iavf_vc_pf_event { uint32_t severity; } __packed; +struct iavf_vc_rss_key { + uint16_t vsi_id; + uint16_t key_len; + uint8_t key[1]; + uint8_t pad[1]; +} __packed; + +struct iavf_vc_rss_lut { + uint16_t vsi_id; + uint16_t lut_entries; + uint8_t lut[1]; + uint8_t pad[1]; +}__packed; + +#define IAVF_RSS_VSI_LUT_ENTRY_MASK 0x3F + /* aq response codes */ #define IAVF_AQ_RC_OK 0 /* success */ #define IAVF_AQ_RC_EPERM 1 /* Operation not permitted */ @@ -420,7 +439,9 @@ struct iavf_rx_rd_desc_32 { } __packed __aligned(16); struct iavf_rx_wb_desc_16 { - uint64_t qword0; + uint16_t _reserved1; + uint16_t l2tag1; + uint32_t filter_status; #define IAVF_RX_DESC_L2TAG1_SHIFT 16 #define IAVF_RX_DESC_L2TAG1_MASK (0xffff << IAVF_RX_DESC_L2TAG1_SHIFT) uint64_t qword1; @@ -461,6 +482,8 @@ struct iavf_rx_wb_desc_16 { #define IAVF_RX_DESC_PLEN_MASK (0x3fffULL << IAVF_RX_DESC_PLEN_SHIFT) #define IAVF_RX_DESC_HLEN_SHIFT 42 #define IAVF_RX_DESC_HLEN_MASK (0x7ffULL << IAVF_RX_DESC_HLEN_SHIFT) + uint64_t qword2; + 
uint64_t qword3; } __packed __aligned(16); struct iavf_rx_wb_desc_32 { @@ -623,6 +646,8 @@ struct iavf_softc { uint16_t sc_vsi_id; uint16_t sc_qset_handle; unsigned int sc_base_queue; + uint32_t sc_rss_key_size; + uint32_t sc_rss_lut_size; struct cond sc_admin_cond; int sc_admin_result; @@ -681,6 +706,8 @@ static void iavf_init_admin_queue(struct iavf_softc *); static enum i40e_mac_type iavf_mactype(pci_product_id_t); static int iavf_get_version(struct iavf_softc *); static int iavf_get_vf_resources(struct iavf_softc *); +static int iavf_config_rss_key(struct iavf_softc *); +static int iavf_config_rss_lut(struct iavf_softc *); static int iavf_config_irq_map(struct iavf_softc *); static int iavf_add_del_addr(struct iavf_softc *, uint8_t *, int); @@ -1247,16 +1274,16 @@ iavf_config_vsi_queues(struct iavf_softc *sc) txq = &config->qpair[i].txq; txq->vsi_id = htole16(sc->sc_vsi_id); - txq->queue_id = htole16(i); - txq->ring_len = sc->sc_tx_ring_ndescs; + txq->queue_id = htole16(txr->txr_qid); + txq->ring_len = htole16(sc->sc_tx_ring_ndescs); txq->headwb_ena = 0; htolem64(&txq->dma_ring_addr, IAVF_DMA_DVA(&txr->txr_mem)); txq->dma_headwb_addr = 0; rxq = &config->qpair[i].rxq; rxq->vsi_id = htole16(sc->sc_vsi_id); - rxq->queue_id = htole16(i); - rxq->ring_len = sc->sc_rx_ring_ndescs; + rxq->queue_id = htole16(rxr->rxr_qid); + rxq->ring_len = htole16(sc->sc_rx_ring_ndescs); rxq->splithdr_ena = 0; rxq->databuf_size = htole32(MCLBYTES); rxq->max_pkt_size = htole32(IAVF_HARDMTU); @@ -1303,7 +1330,72 @@ iavf_config_hena(struct iavf_softc *sc) return (1); } - caps = IAVF_DMA_KVA(&sc->sc_scratch); + return (0); +} + +static int +iavf_config_rss_key(struct iavf_softc *sc) +{ + struct iavf_aq_desc iaq; + struct iavf_vc_rss_key *rss_key; + uint32_t key_len = sc->sc_rss_key_size; + int rv; + + memset(&iaq, 0, sizeof(iaq)); + iaq.iaq_flags = htole16(IAVF_AQ_BUF | IAVF_AQ_RD); + iaq.iaq_opcode = htole16(IAVF_AQ_OP_SEND_TO_PF); + iaq.iaq_vc_opcode = htole32(IAVF_VC_OP_CONFIG_RSS_KEY); 
+ iaq.iaq_datalen = htole16(sizeof(*rss_key) - sizeof(rss_key->pad) + + (sizeof(rss_key->key[0]) * key_len)); + iavf_aq_dva(&iaq, IAVF_DMA_DVA(&sc->sc_scratch)); + + rss_key = IAVF_DMA_KVA(&sc->sc_scratch); + rss_key->vsi_id = htole16(sc->sc_vsi_id); + stoeplitz_to_key(&rss_key->key, key_len); + rss_key->key_len = htole16(key_len); + rss_key->key[key_len] = 0; + + iavf_atq_post(sc, &iaq); + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); + if (rv != IAVF_VC_RC_SUCCESS) { + printf("%s: CONFIG_RSS_KEY failed: %d\n", DEVNAME(sc), rv); + return (1); + } + return (0); +} + +static int +iavf_config_rss_lut(struct iavf_softc *sc) +{ + struct iavf_aq_desc iaq; + struct iavf_vc_rss_lut *rss_lut; + uint8_t *lut; + uint32_t lut_size = sc->sc_rss_lut_size; + int i, rv; + + memset(&iaq, 0, sizeof(iaq)); + iaq.iaq_flags = htole16(IAVF_AQ_BUF | IAVF_AQ_RD); + iaq.iaq_opcode = htole16(IAVF_AQ_OP_SEND_TO_PF); + iaq.iaq_vc_opcode = htole32(IAVF_VC_OP_CONFIG_RSS_LUT); + iaq.iaq_datalen = htole16(sizeof(*rss_lut) - sizeof(rss_lut->pad) + + (sizeof(rss_lut->lut[0]) * lut_size)); + iavf_aq_dva(&iaq, IAVF_DMA_DVA(&sc->sc_scratch)); + + rss_lut = IAVF_DMA_KVA(&sc->sc_scratch); + rss_lut->vsi_id = htole16(sc->sc_vsi_id); + rss_lut->lut_entries = htole16(lut_size); + + lut = rss_lut->lut; + for (i = 0; i < lut_size; i++) + lut[i] = (i % iavf_nqueues(sc)) & IAVF_RSS_VSI_LUT_ENTRY_MASK; + rss_lut->lut[i] = 0; + + iavf_atq_post(sc, &iaq); + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); + if (rv != IAVF_VC_RC_SUCCESS) { + printf("%s: CONFIG_RSS_LUT failed: %d\n", DEVNAME(sc), rv); + return (1); + } return (0); } @@ -1382,7 +1474,9 @@ iavf_up(struct iavf_softc *sc) if (iavf_config_vsi_queues(sc) != 0) goto down; - if (iavf_config_hena(sc) != 0) + if (iavf_config_hena(sc) != 0 || + iavf_config_rss_key(sc) != 0 || + iavf_config_rss_lut(sc) != 0) goto down; if (iavf_queue_select(sc, IAVF_VC_OP_ENABLE_QUEUES) != 0) @@ -2233,14 +2327,13 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) { struct 
iavf_rx_ring *rxr = ifiq->ifiq_softc; struct ifnet *ifp = &sc->sc_ac.ac_if; - struct iavf_rx_wb_desc_32 *ring, *rxd; + struct iavf_rx_wb_desc_16 *ring, *rxd; struct iavf_rx_map *rxm; bus_dmamap_t map; unsigned int cons, prod; struct mbuf_list ml = MBUF_LIST_INITIALIZER(); struct mbuf *m; uint64_t word; - uint16_t vlan; unsigned int len; unsigned int mask; int done = 0; @@ -2276,7 +2369,7 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->sc_dmat, map); - + m = rxm->rxm_m; rxm->rxm_m = NULL; @@ -2292,17 +2385,22 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) m->m_pkthdr.len += len; if (ISSET(word, IAVF_RX_DESC_EOP)) { + if (!ISSET(word, + IAVF_RX_DESC_RXE | IAVF_RX_DESC_OVERSIZE)) { + if ((word & IAVF_RX_DESC_FLTSTAT_MASK) == + IAVF_RX_DESC_FLTSTAT_RSS) { + m->m_pkthdr.ph_flowid = + lemtoh32(&rxd->filter_status); + m->m_pkthdr.csum_flags |= M_FLOWID; + } + #if NVLAN > 0 - if (ISSET(word, IAVF_RX_DESC_L2TAG1P)) { - vlan = (lemtoh64(&rxd->qword0) & - IAVF_RX_DESC_L2TAG1_MASK) - >> IAVF_RX_DESC_L2TAG1_SHIFT; - m->m_pkthdr.ether_vtag = vlan; - m->m_flags |= M_VLANTAG; - } + if (ISSET(word, IAVF_RX_DESC_L2TAG1P)) { + m->m_pkthdr.ether_vtag = + lemtoh16(&rxd->l2tag1); + m->m_flags |= M_VLANTAG; + } #endif - if (!ISSET(word, - IAVF_RX_DESC_RXE | IAVF_RX_DESC_OVERSIZE)) { iavf_rx_checksum(m, word); ml_enqueue(&ml, m); } else { @@ -2488,6 +2586,9 @@ iavf_process_vf_resources(struct iavf_softc *sc, struct iavf_aq_desc *desc, return; } + sc->sc_rss_key_size = vf_res->rss_key_size; + sc->sc_rss_lut_size = vf_res->rss_lut_size; + mtu = letoh16(vf_res->max_mtu); if (mtu != 0) ifp->if_hardmtu = MIN(IAVF_HARDMTU, mtu); @@ -2653,6 +2754,8 @@ iavf_process_arq(struct iavf_softc *sc, int fill) case IAVF_VC_OP_ADD_ETH_ADDR: case IAVF_VC_OP_DEL_ETH_ADDR: case IAVF_VC_OP_CONFIG_PROMISC: + case IAVF_VC_OP_CONFIG_RSS_KEY: + case IAVF_VC_OP_CONFIG_RSS_LUT: 
sc->sc_admin_result = letoh32(iaq->iaq_vc_retval); cond_signal(&sc->sc_admin_cond); break; -- Yuichiro NAITO (naito.yuichiro@gmail.com)