From: Jan Klemkow Subject: Re: iavf(4): multi-queue support To: Yuichiro NAITO Cc: tech@openbsd.org Date: Thu, 21 Nov 2024 10:44:40 +0100 On Thu, Nov 21, 2024 at 10:31:02AM GMT, Yuichiro NAITO wrote: > From: Yuichiro NAITO > Subject: Re: iavf(4): multi-queue support > Date: Wed, 04 Sep 2024 17:22:21 +0900 (JST) > > > Hi. Suppose you are interested in iavf(4) multi-queue. Try the following > > complete patch which enables multi-queue, checksum offloads, and TSO. > > I confirmed it works on my ESXi 8.0 and Linux qemu/kvm. Iperf3 results in > > 9.41 Gbps transmit speed and 6.87 Gbps receive speed of my OpenBSD guest > > with MTU size 1500 on ESXi 8.0. > > Hi, I had some reports that my patch doesn't work on ESXi while attaching > an iavf device. The reporter said the following error messages are shown > in the dmesg. > > ``` > iavf0: SET_RSS_HENA failed: -1 > iavf0: queue op 9 failed: -1 > ``` > > Both errors had an error code '-1', meaning the response from the PF driver > timed out. The `SET_RSS_HENA` request sends a packet classifier value for > the RSS hash filter which currently sends 0. Some PF driver version of ESXi > ignores the 0 value. So, I added the default value referring to the NetBSD > driver. The value definition is the same as the ixl(4). I split the > definitions to the 'if_iavfvars.h' file to share the code. > > The `queue op 9 failed` message happened in the 'iavf_queue_select' function. > This seems really timed out. I extended the time-out value to 3000 ms. This > value is also taken from NetBSD. > > I merged my code that handles a PCI bus error case in my previous mail. > > https://marc.info/?l=openbsd-tech&m=172723210819245&w=2 > > I also merged Jan's code that has VLAN #ifdef. The checksum offload code is > the same as Jan's. If you see the diff from Jan's code, you will see my code > only. > > https://marc.info/?l=openbsd-tech&m=173040636900369&w=2 > > OK? I tested your diff on my KVM setup. Works for me there. 
I had no time for ESXi tests yet. Could you split your diff in checksum offload, TSO and Multi-Queue. Thus, it's easier to review and to see where the problems are. Thanks, Jan > diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c > index d573d6725f4..ea047064b0d 100644 > --- a/sys/dev/pci/if_iavf.c > +++ b/sys/dev/pci/if_iavf.c > @@ -49,6 +49,7 @@ > */ > > #include "bpfilter.h" > +#include "vlan.h" > > #include > #include > @@ -62,6 +63,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -75,11 +77,19 @@ > > #include > #include > +#include > > #include > #include > #include > > +#ifndef CACHE_LINE_SIZE > +#define CACHE_LINE_SIZE 64 > +#endif > + > +#define IAVF_MAX_VECTORS 4 > +#define IAVF_MAX_DMA_SEG_SIZE ((16 * 1024) - 1) > + > #define I40E_MASK(mask, shift) ((mask) << (shift)) > #define I40E_AQ_LARGE_BUF 512 > > @@ -89,7 +99,10 @@ > #define IAVF_VFR_COMPLETED 1 > #define IAVF_VFR_VFACTIVE 2 > > +#define IAVF_EXEC_TIMEOUT 3000 > + > #include > +#include > > struct iavf_aq_desc { > uint16_t iaq_flags; > @@ -386,6 +399,10 @@ struct iavf_tx_desc { > #define IAVF_TX_DESC_BSIZE_MASK \ > (IAVF_TX_DESC_BSIZE_MAX << IAVF_TX_DESC_BSIZE_SHIFT) > > +#define IAVF_TX_CTX_DESC_CMD_TSO 0x10 > +#define IAVF_TX_CTX_DESC_TLEN_SHIFT 30 > +#define IAVF_TX_CTX_DESC_MSS_SHIFT 50 > + > #define IAVF_TX_DESC_L2TAG1_SHIFT 48 > #define IAVF_TX_DESC_L2TAG1_MASK (0xffff << IAVF_TX_DESC_L2TAG1_SHIFT) > } __packed __aligned(16); > @@ -460,6 +477,7 @@ struct iavf_rx_wb_desc_32 { > #define IAVF_TX_PKT_DESCS 8 > #define IAVF_TX_QUEUE_ALIGN 128 > #define IAVF_RX_QUEUE_ALIGN 128 > +#define IAVF_TX_PKT_MAXSIZE (MCLBYTES * IAVF_TX_PKT_DESCS) > > #define IAVF_HARDMTU 9712 /* 9726 - ETHER_HDR_LEN */ > > @@ -526,6 +544,7 @@ struct iavf_tx_map { > struct iavf_tx_ring { > unsigned int txr_prod; > unsigned int txr_cons; > + struct ifqueue *txr_ifq; > > struct iavf_tx_map *txr_maps; > struct iavf_dmamem txr_mem; > @@ -541,6 +560,7 @@ struct iavf_rx_map { > > struct 
iavf_rx_ring { > struct iavf_softc *rxr_sc; > + struct ifiqueue *rxr_ifiq; > > struct if_rxring rxr_acct; > struct timeout rxr_refill; > @@ -558,17 +578,36 @@ struct iavf_rx_ring { > unsigned int rxr_qid; > }; > > +struct iavf_vector { > + struct iavf_softc *iv_sc; > + struct iavf_rx_ring *iv_rxr; > + struct iavf_tx_ring *iv_txr; > + int iv_qid; > + void *iv_ihc; > + char iv_name[16]; > +} __aligned(CACHE_LINE_SIZE); > + > +enum i40e_mac_type { > + I40E_MAC_XL710, > + I40E_MAC_X722, > + I40E_MAC_X722_VF, > + I40E_MAC_VF, > + I40E_MAC_GENERIC > +}; > + > struct iavf_softc { > struct device sc_dev; > struct arpcom sc_ac; > struct ifmedia sc_media; > uint64_t sc_media_status; > uint64_t sc_media_active; > + enum i40e_mac_type sc_mac_type; > > pci_chipset_tag_t sc_pc; > pci_intr_handle_t sc_ih; > void *sc_ihc; > pcitag_t sc_tag; > + struct intrmap *sc_intrmap; > > bus_dma_tag_t sc_dmat; > bus_space_tag_t sc_memt; > @@ -578,6 +617,7 @@ struct iavf_softc { > uint32_t sc_major_ver; > uint32_t sc_minor_ver; > > + int sc_if_attached; > int sc_got_vf_resources; > int sc_got_irq_map; > uint32_t sc_vf_id; > @@ -612,6 +652,9 @@ struct iavf_softc { > unsigned int sc_tx_ring_ndescs; > unsigned int sc_rx_ring_ndescs; > unsigned int sc_nqueues; /* 1 << sc_nqueues */ > + unsigned int sc_nintrs; > + > + struct iavf_vector *sc_vectors; > > struct rwlock sc_cfg_lock; > unsigned int sc_dead; > @@ -636,6 +679,7 @@ static void iavf_atq_done(struct iavf_softc *); > > static void iavf_init_admin_queue(struct iavf_softc *); > > +static enum i40e_mac_type iavf_mactype(pci_product_id_t); > static int iavf_get_version(struct iavf_softc *); > static int iavf_get_vf_resources(struct iavf_softc *); > static int iavf_config_irq_map(struct iavf_softc *); > @@ -644,6 +688,7 @@ static int iavf_add_del_addr(struct iavf_softc *, uint8_t *, int); > static int iavf_process_arq(struct iavf_softc *, int); > > static int iavf_match(struct device *, void *, void *); > +static int iavf_setup_interrupts(struct 
iavf_softc *, struct pci_attach_args *); > static void iavf_attach(struct device *, struct device *, void *); > > static int iavf_media_change(struct ifnet *); > @@ -652,6 +697,7 @@ static void iavf_watchdog(struct ifnet *); > static int iavf_ioctl(struct ifnet *, u_long, caddr_t); > static void iavf_start(struct ifqueue *); > static int iavf_intr(void *); > +static int iavf_intr_vector(void *); > static int iavf_up(struct iavf_softc *); > static int iavf_down(struct iavf_softc *); > static int iavf_iff(struct iavf_softc *); > @@ -715,9 +761,17 @@ static const struct iavf_aq_regs iavf_aq_regs = { > I40E_VFINT_DYN_CTL0_CLEARPBA_MASK | \ > (IAVF_NOITR << I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT)); \ > iavf_wr((_s), I40E_VFINT_ICR0_ENA1, I40E_VFINT_ICR0_ENA1_ADMINQ_MASK) > +#define iavf_queue_intr_enable(_s, _q) \ > + iavf_wr((_s), I40E_VFINT_DYN_CTLN1((_q)), \ > + I40E_VFINT_DYN_CTLN1_INTENA_MASK | \ > + I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK | \ > + (IAVF_NOITR << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT)) > +#define iavf_queue_intr_disable(_s, _q) \ > + iavf_wr((_s), I40E_VFINT_DYN_CTLN1((_q)), \ > + (IAVF_NOITR << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT)) > > #define iavf_nqueues(_sc) (1 << (_sc)->sc_nqueues) > -#define iavf_allqueues(_sc) ((1 << ((_sc)->sc_nqueues+1)) - 1) > +#define iavf_allqueues(_sc) ((1 << (iavf_nqueues(_sc))) - 1) > > #ifdef __LP64__ > #define iavf_dmamem_hi(_ixm) (uint32_t)(IAVF_DMA_DVA(_ixm) >> 32) > @@ -757,6 +811,107 @@ iavf_match(struct device *parent, void *match, void *aux) > return (pci_matchbyid(aux, iavf_devices, nitems(iavf_devices))); > } > > +static enum i40e_mac_type > +iavf_mactype(pci_product_id_t id) > +{ > + > + switch (id) { > + case PCI_PRODUCT_INTEL_XL710_VF: > + case PCI_PRODUCT_INTEL_XL710_VF_HV: > + return I40E_MAC_VF; > + case PCI_PRODUCT_INTEL_X722_VF: > + return I40E_MAC_X722_VF; > + } > + > + return I40E_MAC_GENERIC; > +} > + > +static int > +iavf_intr_vector(void *v) > +{ > + struct iavf_vector *iv = v; > + struct iavf_softc *sc = 
iv->iv_sc; > + > + struct ifnet *ifp = &sc->sc_ac.ac_if; > + int rv = 0; > + > + if (ISSET(ifp->if_flags, IFF_RUNNING)) { > + rv |= iavf_rxeof(sc, iv->iv_rxr->rxr_ifiq); > + rv |= iavf_txeof(sc, iv->iv_txr->txr_ifq); > + } > + > + iavf_queue_intr_enable(sc, iv->iv_qid); > + > + return rv; > +} > + > +static int > +iavf_setup_interrupts(struct iavf_softc *sc, struct pci_attach_args *pa) > +{ > + unsigned int i, v, nqueues = iavf_nqueues(sc); > + struct iavf_vector *iv; > + pci_intr_handle_t ih; > + > + sc->sc_ihc = pci_intr_establish(sc->sc_pc, sc->sc_ih, > + IPL_NET | IPL_MPSAFE, iavf_intr, sc, DEVNAME(sc)); > + if (sc->sc_ihc == NULL) { > + printf("%s: unable to establish interrupt handler\n", > + DEVNAME(sc)); > + return -1; > + } > + > + sc->sc_vectors = mallocarray(sizeof(*sc->sc_vectors), nqueues, > + M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); > + if (sc->sc_vectors == NULL) { > + printf("%s: unable to allocate vectors\n", DEVNAME(sc)); > + return -1; > + } > + > + for (i = 0; i < nqueues; i++) { > + iv = &sc->sc_vectors[i]; > + iv->iv_sc = sc; > + iv->iv_qid = i; > + snprintf(iv->iv_name, sizeof(iv->iv_name), "%s:%u", > + DEVNAME(sc), i); > + } > + > + if (sc->sc_intrmap) { > + for (i = 0; i < nqueues; i++) { > + iv = &sc->sc_vectors[i]; > + v = i + 1; /* 0 is used for adminq */ > + > + if (pci_intr_map_msix(pa, v, &ih)) { > + printf("%s: unable to map msi-x vector %d\n", > + DEVNAME(sc), v); > + goto free_vectors; > + } > + > + iv->iv_ihc = pci_intr_establish_cpu(sc->sc_pc, ih, > + IPL_NET | IPL_MPSAFE, > + intrmap_cpu(sc->sc_intrmap, i), > + iavf_intr_vector, iv, iv->iv_name); > + if (iv->iv_ihc == NULL) { > + printf("%s: unable to establish interrupt %d\n", > + DEVNAME(sc), v); > + goto free_vectors; > + } > + } > + } > + > + sc->sc_nintrs = nqueues + 1; > + return 0; > +free_vectors: > + if (sc->sc_intrmap != NULL) { > + for (i = 0; i < nqueues; i++) { > + struct iavf_vector *iv = &sc->sc_vectors[i]; > + if (iv->iv_ihc != NULL) > + 
pci_intr_disestablish(sc->sc_pc, iv->iv_ihc); > + } > + } > + free(sc->sc_vectors, M_DEVBUF, nqueues * sizeof(*sc->sc_vectors)); > + return -1; > +} > + > void > iavf_attach(struct device *parent, struct device *self, void *aux) > { > @@ -764,7 +919,13 @@ iavf_attach(struct device *parent, struct device *self, void *aux) > struct ifnet *ifp = &sc->sc_ac.ac_if; > struct pci_attach_args *pa = aux; > pcireg_t memtype; > - int tries; > + int nmsix, tries; > + unsigned int nqueues; > + > + if ((pa->pa_flags & PCI_FLAGS_MSI_ENABLED) == 0) { > + printf(" msix disabled!\n"); > + return; > + } > > rw_init(&sc->sc_cfg_lock, "iavfcfg"); > > @@ -773,6 +934,8 @@ iavf_attach(struct device *parent, struct device *self, void *aux) > sc->sc_dmat = pa->pa_dmat; > sc->sc_aq_regs = &iavf_aq_regs; > > + sc->sc_mac_type = iavf_mactype(PCI_PRODUCT(pa->pa_id)); > + > sc->sc_nqueues = 0; /* 1 << 0 is 1 queue */ > sc->sc_tx_ring_ndescs = 1024; > sc->sc_rx_ring_ndescs = 1024; > @@ -852,13 +1015,20 @@ iavf_attach(struct device *parent, struct device *self, void *aux) > goto free_scratch; > } > > - if (iavf_config_irq_map(sc) != 0) { > - printf(", timeout waiting for IRQ map response"); > - goto free_scratch; > - } > - > /* msix only? 
*/ > - if (pci_intr_map_msix(pa, 0, &sc->sc_ih) != 0) { > + if (pci_intr_map_msix(pa, 0, &sc->sc_ih) == 0) { > + nmsix = pci_intr_msix_count(pa); > + if (nmsix > 1) { /* we used 1 (the 0th) for the adminq */ > + nmsix--; > + > + sc->sc_intrmap = intrmap_create(&sc->sc_dev, > + nmsix, IAVF_MAX_VECTORS, INTRMAP_POWEROF2); > + nqueues = intrmap_count(sc->sc_intrmap); > + KASSERT(nqueues > 0); > + KASSERT(powerof2(nqueues)); > + sc->sc_nqueues = fls(nqueues) - 1; > + } > + } else { > printf(", unable to map interrupt\n"); > goto free_scratch; > } > @@ -868,17 +1038,23 @@ iavf_attach(struct device *parent, struct device *self, void *aux) > if (memcmp(sc->sc_ac.ac_enaddr, etheranyaddr, ETHER_ADDR_LEN) == 0) > ether_fakeaddr(ifp); > > - printf(", %s, address %s\n", pci_intr_string(sc->sc_pc, sc->sc_ih), > - ether_sprintf(sc->sc_ac.ac_enaddr)); > + nqueues = iavf_nqueues(sc); > + printf(", %s, %d queue%s, address %s\n", > + pci_intr_string(sc->sc_pc, sc->sc_ih), > + nqueues, (nqueues > 1 ? "s" : ""), > + ether_sprintf(sc->sc_ac.ac_enaddr)); > > - sc->sc_ihc = pci_intr_establish(sc->sc_pc, sc->sc_ih, > - IPL_NET | IPL_MPSAFE, iavf_intr, sc, DEVNAME(sc)); > - if (sc->sc_ihc == NULL) { > + if (iavf_setup_interrupts(sc, pa) != 0) { > printf("%s: unable to establish interrupt handler\n", > DEVNAME(sc)); > goto free_scratch; > } > > + if (iavf_config_irq_map(sc) != 0) { > + printf(", timeout waiting for IRQ map response"); > + goto free_scratch; > + } > + > ifp->if_softc = sc; > ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; > ifp->if_xflags = IFXF_MPSAFE; > @@ -890,11 +1066,14 @@ iavf_attach(struct device *parent, struct device *self, void *aux) > strlcpy(ifp->if_xname, DEVNAME(sc), IFNAMSIZ); > ifq_init_maxlen(&ifp->if_snd, sc->sc_tx_ring_ndescs); > > - ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; > -#if 0 > - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | > - IFCAP_CSUM_UDPv4; > + ifp->if_capabilities = IFCAP_VLAN_MTU; > +#if NVLAN 
> 0 > + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; > #endif > + ifp->if_capabilities |= IFCAP_CSUM_IPv4 | > + IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | > + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; > + ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; > > ifmedia_init(&sc->sc_media, 0, iavf_media_change, iavf_media_status); > > @@ -906,6 +1085,7 @@ iavf_attach(struct device *parent, struct device *self, void *aux) > > if_attach_queues(ifp, iavf_nqueues(sc)); > if_attach_iqueues(ifp, iavf_nqueues(sc)); > + sc->sc_if_attached++; > > iavf_intr_enable(sc); > > @@ -1095,7 +1275,7 @@ iavf_config_vsi_queues(struct iavf_softc *sc) > BUS_DMASYNC_PREREAD); > > iavf_atq_post(sc, &iaq); > - rv = iavf_arq_wait(sc, 250); > + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); > if (rv != IAVF_VC_RC_SUCCESS) { > printf("%s: CONFIG_VSI_QUEUES failed: %d\n", DEVNAME(sc), rv); > return (1); > @@ -1119,10 +1299,11 @@ iavf_config_hena(struct iavf_softc *sc) > iavf_aq_dva(&iaq, IAVF_DMA_DVA(&sc->sc_scratch)); > > caps = IAVF_DMA_KVA(&sc->sc_scratch); > - *caps = 0; > + *caps = (sc->sc_mac_type == I40E_MAC_X722_VF) ? 
IXL_RSS_HENA_BASE_722 : > + IXL_RSS_HENA_BASE_710; > > iavf_atq_post(sc, &iaq); > - rv = iavf_arq_wait(sc, 250); > + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); > if (rv != IAVF_VC_RC_SUCCESS) { > printf("%s: SET_RSS_HENA failed: %d\n", DEVNAME(sc), rv); > return (1); > @@ -1157,7 +1338,7 @@ iavf_queue_select(struct iavf_softc *sc, int opcode) > BUS_DMASYNC_PREREAD); > > iavf_atq_post(sc, &iaq); > - rv = iavf_arq_wait(sc, 250); > + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); > if (rv != IAVF_VC_RC_SUCCESS) { > printf("%s: queue op %d failed: %d\n", DEVNAME(sc), opcode, rv); > return (1); > @@ -1170,13 +1351,13 @@ static int > iavf_up(struct iavf_softc *sc) > { > struct ifnet *ifp = &sc->sc_ac.ac_if; > + struct iavf_vector *iv; > struct iavf_rx_ring *rxr; > struct iavf_tx_ring *txr; > unsigned int nqueues, i; > int rv = ENOMEM; > > nqueues = iavf_nqueues(sc); > - KASSERT(nqueues == 1); /* XXX */ > > rw_enter_write(&sc->sc_cfg_lock); > if (sc->sc_dead) { > @@ -1195,8 +1376,11 @@ iavf_up(struct iavf_softc *sc) > goto free; > } > > - ifp->if_iqs[i]->ifiq_softc = rxr; > - ifp->if_ifqs[i]->ifq_softc = txr; > + iv = &sc->sc_vectors[i]; > + iv->iv_rxr = ifp->if_iqs[i]->ifiq_softc = rxr; > + iv->iv_txr = ifp->if_ifqs[i]->ifq_softc = txr; > + rxr->rxr_ifiq = ifp->if_iqs[i]; > + txr->txr_ifq = ifp->if_ifqs[i]; > > iavf_rxfill(sc, rxr); > } > @@ -1210,6 +1394,9 @@ iavf_up(struct iavf_softc *sc) > if (iavf_queue_select(sc, IAVF_VC_OP_ENABLE_QUEUES) != 0) > goto down; > > + for (i = 0; i < nqueues; i++) > + iavf_queue_intr_enable(sc, i); > + > SET(ifp->if_flags, IFF_RUNNING); > > iavf_wr(sc, I40E_VFINT_ITR01(0), 0x7a); > @@ -1235,6 +1422,9 @@ free: > > iavf_txr_free(sc, txr); > iavf_rxr_free(sc, rxr); > + iv = &sc->sc_vectors[i]; > + iv->iv_rxr = ifp->if_iqs[i]->ifiq_softc = NULL; > + iv->iv_txr = ifp->if_ifqs[i]->ifq_softc = NULL; > } > rw_exit_write(&sc->sc_cfg_lock); > return (rv); > @@ -1273,7 +1463,7 @@ iavf_config_promisc_mode(struct iavf_softc *sc, int unicast, int 
multicast) > BUS_DMASYNC_PREREAD); > > iavf_atq_post(sc, &iaq); > - rv = iavf_arq_wait(sc, 250); > + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); > if (rv != IAVF_VC_RC_SUCCESS) { > printf("%s: CONFIG_PROMISC_MODE failed: %d\n", DEVNAME(sc), rv); > return (1); > @@ -1312,7 +1502,7 @@ iavf_add_del_addr(struct iavf_softc *sc, uint8_t *addr, int add) > BUS_DMASYNC_PREREAD); > > iavf_atq_post(sc, &iaq); > - rv = iavf_arq_wait(sc, 250); > + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); > if (rv != IAVF_VC_RC_SUCCESS) { > printf("%s: ADD/DEL_ETH_ADDR failed: %d\n", DEVNAME(sc), rv); > return (1); > @@ -1357,6 +1547,7 @@ static int > iavf_down(struct iavf_softc *sc) > { > struct ifnet *ifp = &sc->sc_ac.ac_if; > + struct iavf_vector *iv; > struct iavf_rx_ring *rxr; > struct iavf_tx_ring *txr; > unsigned int nqueues, i; > @@ -1386,6 +1577,8 @@ iavf_down(struct iavf_softc *sc) > /* make sure no hw generated work is still in flight */ > intr_barrier(sc->sc_ihc); > for (i = 0; i < nqueues; i++) { > + iavf_queue_intr_disable(sc, i); > + > rxr = ifp->if_iqs[i]->ifiq_softc; > txr = ifp->if_ifqs[i]->ifq_softc; > > @@ -1395,8 +1588,9 @@ iavf_down(struct iavf_softc *sc) > } > > for (i = 0; i < nqueues; i++) { > - rxr = ifp->if_iqs[i]->ifiq_softc; > - txr = ifp->if_ifqs[i]->ifq_softc; > + iv = &sc->sc_vectors[i]; > + txr = iv->iv_txr; > + rxr = iv->iv_rxr; > > iavf_txr_clean(sc, txr); > iavf_rxr_clean(sc, rxr); > @@ -1404,8 +1598,8 @@ iavf_down(struct iavf_softc *sc) > iavf_txr_free(sc, txr); > iavf_rxr_free(sc, rxr); > > - ifp->if_iqs[i]->ifiq_softc = NULL; > - ifp->if_ifqs[i]->ifq_softc = NULL; > + iv->iv_rxr = ifp->if_iqs[i]->ifiq_softc = NULL; > + iv->iv_txr = ifp->if_ifqs[i]->ifq_softc = NULL; > } > > /* unmask */ > @@ -1437,7 +1631,8 @@ iavf_reset(void *xsc) > link_state = ifp->if_link_state; > if (ifp->if_link_state != LINK_STATE_DOWN) { > ifp->if_link_state = LINK_STATE_DOWN; > - if_link_state_change(ifp); > + if (sc->sc_if_attached) > + if_link_state_change(ifp); > } > > up = 0; 
> @@ -1561,7 +1756,7 @@ iavf_txr_alloc(struct iavf_softc *sc, unsigned int qid) > txm = &maps[i]; > > if (bus_dmamap_create(sc->sc_dmat, > - IAVF_HARDMTU, IAVF_TX_PKT_DESCS, IAVF_HARDMTU, 0, > + MAXMCLBYTES, IAVF_TX_PKT_DESCS, IAVF_MAX_DMA_SEG_SIZE, 0, > BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT, > &txm->txm_map) != 0) > goto uncreate; > @@ -1656,6 +1851,83 @@ iavf_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m) > BUS_DMA_STREAMING | BUS_DMA_NOWAIT)); > } > > +static uint64_t > +iavf_tx_offload(struct mbuf *m, struct iavf_tx_ring *txr, unsigned int prod) > +{ > + struct ether_extracted ext; > + uint64_t hlen; > + uint64_t offload = 0; > + > +#if NVLAN > 0 > + if (ISSET(m->m_flags, M_VLANTAG)) { > + uint64_t vtag = m->m_pkthdr.ether_vtag; > + offload |= IAVF_TX_DESC_CMD_IL2TAG1; > + offload |= vtag << IAVF_TX_DESC_L2TAG1_SHIFT; > + } > +#endif > + > + if (!ISSET(m->m_pkthdr.csum_flags, > + M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO)) > + return (offload); > + > + ether_extract_headers(m, &ext); > + > + if (ext.ip4) { > + offload |= ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ? 
> + IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM : > + IAVF_TX_DESC_CMD_IIPT_IPV4; > +#ifdef INET6 > + } else if (ext.ip6) { > + offload |= IAVF_TX_DESC_CMD_IIPT_IPV6; > +#endif > + } else { > + panic("CSUM_OUT set for non-IP packet"); > + /* NOTREACHED */ > + } > + hlen = ext.iphlen; > + > + offload |= (ETHER_HDR_LEN >> 1) << IAVF_TX_DESC_MACLEN_SHIFT; > + offload |= (hlen >> 2) << IAVF_TX_DESC_IPLEN_SHIFT; > + > + if (ext.tcp && ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) { > + offload |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP; > + offload |= (uint64_t)(ext.tcphlen >> 2) > + << IAVF_TX_DESC_L4LEN_SHIFT; > + } else if (ext.udp && ISSET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) { > + offload |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP; > + offload |= (uint64_t)(sizeof(*ext.udp) >> 2) > + << IAVF_TX_DESC_L4LEN_SHIFT; > + } > + > + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) { > + if (ext.tcp && m->m_pkthdr.ph_mss > 0) { > + struct iavf_tx_desc *ring, *txd; > + uint64_t cmd = 0, paylen, outlen; > + > + hlen += ext.tcphlen; > + > + /* > + * The MSS should not be set to a lower value than 64 > + * or larger than 9668 bytes. 
> + */ > + outlen = MIN(9668, MAX(64, m->m_pkthdr.ph_mss)); > + paylen = m->m_pkthdr.len - ETHER_HDR_LEN - hlen; > + ring = IAVF_DMA_KVA(&txr->txr_mem); > + txd = &ring[prod]; > + > + cmd |= IAVF_TX_DESC_DTYPE_CONTEXT; > + cmd |= IAVF_TX_CTX_DESC_CMD_TSO; > + cmd |= paylen << IAVF_TX_CTX_DESC_TLEN_SHIFT; > + cmd |= outlen << IAVF_TX_CTX_DESC_MSS_SHIFT; > + > + htolem64(&txd->addr, 0); > + htolem64(&txd->cmd, cmd); > + } > + } > + > + return offload; > +} > + > static void > iavf_start(struct ifqueue *ifq) > { > @@ -1668,6 +1940,7 @@ iavf_start(struct ifqueue *ifq) > struct mbuf *m; > uint64_t cmd; > uint64_t vlan_cmd; > + uint64_t offload; > unsigned int prod, free, last, i; > unsigned int mask; > int post = 0; > @@ -1693,7 +1966,8 @@ iavf_start(struct ifqueue *ifq) > mask = sc->sc_tx_ring_ndescs - 1; > > for (;;) { > - if (free <= IAVF_TX_PKT_DESCS) { > + /* We need one extra descriptor for TSO packets. */ > + if (free <= (IAVF_TX_PKT_DESCS + 1)) { > ifq_set_oactive(ifq); > break; > } > @@ -1702,9 +1976,17 @@ iavf_start(struct ifqueue *ifq) > if (m == NULL) > break; > > + offload = iavf_tx_offload(m, txr, prod); > + > txm = &txr->txr_maps[prod]; > map = txm->txm_map; > > + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) { > + prod++; > + prod &= mask; > + free--; > + } > + > if (iavf_load_mbuf(sc->sc_dmat, map, m) != 0) { > ifq->ifq_errors++; > m_freem(m); > @@ -1727,7 +2009,7 @@ iavf_start(struct ifqueue *ifq) > cmd = (uint64_t)map->dm_segs[i].ds_len << > IAVF_TX_DESC_BSIZE_SHIFT; > cmd |= IAVF_TX_DESC_DTYPE_DATA | IAVF_TX_DESC_CMD_ICRC | > - vlan_cmd; > + vlan_cmd | offload; > > htolem64(&txd->addr, map->dm_segs[i].ds_addr); > htolem64(&txd->cmd, cmd); > @@ -1938,6 +2220,24 @@ iavf_rxr_free(struct iavf_softc *sc, struct iavf_rx_ring *rxr) > free(rxr, M_DEVBUF, sizeof(*rxr)); > } > > +static void > +iavf_rx_checksum(struct mbuf *m, uint64_t word) > +{ > + if (!ISSET(word, IAVF_RX_DESC_L3L4P)) > + return; > + > + if (ISSET(word, IAVF_RX_DESC_IPE)) > + return; > + > 
+ m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK; > + > + if (ISSET(word, IAVF_RX_DESC_L4E)) > + return; > + > + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK; > +} > + > + > static int > iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) > { > @@ -2002,6 +2302,7 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) > m->m_pkthdr.len += len; > > if (ISSET(word, IAVF_RX_DESC_EOP)) { > +#if NVLAN > 0 > if (ISSET(word, IAVF_RX_DESC_L2TAG1P)) { > vlan = (lemtoh64(&rxd->qword0) & > IAVF_RX_DESC_L2TAG1_MASK) > @@ -2009,8 +2310,10 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) > m->m_pkthdr.ether_vtag = vlan; > m->m_flags |= M_VLANTAG; > } > +#endif > if (!ISSET(word, > IAVF_RX_DESC_RXE | IAVF_RX_DESC_OVERSIZE)) { > + iavf_rx_checksum(m, word); > ml_enqueue(&ml, m); > } else { > ifp->if_ierrors++; /* XXX */ > @@ -2555,25 +2858,45 @@ iavf_config_irq_map(struct iavf_softc *sc) > struct iavf_aq_desc iaq; > struct iavf_vc_vector_map *vec; > struct iavf_vc_irq_map_info *map; > + struct iavf_vector *iv; > + unsigned int num_vec = 0; > int tries; > > memset(&iaq, 0, sizeof(iaq)); > iaq.iaq_flags = htole16(IAVF_AQ_BUF | IAVF_AQ_RD); > iaq.iaq_opcode = htole16(IAVF_AQ_OP_SEND_TO_PF); > iaq.iaq_vc_opcode = htole32(IAVF_VC_OP_CONFIG_IRQ_MAP); > - iaq.iaq_datalen = htole16(sizeof(*map) + sizeof(*vec)); > + iaq.iaq_datalen = htole16(sizeof(*map) + sizeof(*vec) * sc->sc_nintrs); > iavf_aq_dva(&iaq, IAVF_DMA_DVA(&sc->sc_scratch)); > > map = IAVF_DMA_KVA(&sc->sc_scratch); > - map->num_vectors = htole16(1); > > vec = map->vecmap; > - vec[0].vsi_id = htole16(sc->sc_vsi_id); > - vec[0].vector_id = 0; > - vec[0].rxq_map = htole16(iavf_allqueues(sc)); > - vec[0].txq_map = htole16(iavf_allqueues(sc)); > - vec[0].rxitr_idx = htole16(IAVF_NOITR); > - vec[0].txitr_idx = htole16(IAVF_NOITR); > + if (sc->sc_nintrs == 1) { > + vec[num_vec].vsi_id = htole16(sc->sc_vsi_id); > + vec[num_vec].vector_id = htole16(num_vec); > + vec[num_vec].rxq_map = 
htole16(iavf_allqueues(sc)); > + vec[num_vec].txq_map = htole16(iavf_allqueues(sc)); > + vec[num_vec].rxitr_idx = htole16(IAVF_NOITR); > + vec[num_vec].txitr_idx = htole16(IAVF_NOITR); > + num_vec++; > + } else if (sc->sc_nintrs > 1) { > + for (; num_vec < sc->sc_nintrs - 1; num_vec++) { > + iv = &sc->sc_vectors[num_vec]; > + vec[num_vec].vsi_id = htole16(sc->sc_vsi_id); > + vec[num_vec].vector_id = htole16(num_vec + 1); > + vec[num_vec].rxq_map = htole16(1 << iv->iv_qid); > + vec[num_vec].txq_map = htole16(1 << iv->iv_qid); > + vec[num_vec].rxitr_idx = htole16(IAVF_ITR0); > + vec[num_vec].txitr_idx = htole16(IAVF_ITR1); > + } > + vec[num_vec].vsi_id = htole16(sc->sc_vsi_id); > + vec[num_vec].vector_id = htole16(0); > + vec[num_vec].rxq_map = htole16(0); > + vec[num_vec].txq_map = htole16(0); > + num_vec++; > + } > + map->num_vectors = htole16(num_vec); > > bus_dmamap_sync(sc->sc_dmat, IAVF_DMA_MAP(&sc->sc_scratch), 0, IAVF_DMA_LEN(&sc->sc_scratch), > BUS_DMASYNC_PREREAD); > diff --git a/sys/dev/pci/if_ixl.c b/sys/dev/pci/if_ixl.c > index 12c84ba2c79..66bbf2415ed 100644 > --- a/sys/dev/pci/if_ixl.c > +++ b/sys/dev/pci/if_ixl.c > @@ -923,53 +923,7 @@ CTASSERT(MAXMCLBYTES < IXL_TSO_SIZE); > #define IXL_AQ_ALIGN 64 /* lol */ > #define IXL_AQ_BUFLEN 4096 > > -/* Packet Classifier Types for filters */ > -/* bits 0-28 are reserved for future use */ > -#define IXL_PCT_NONF_IPV4_UDP_UCAST (1ULL << 29) /* 722 */ > -#define IXL_PCT_NONF_IPV4_UDP_MCAST (1ULL << 30) /* 722 */ > -#define IXL_PCT_NONF_IPV4_UDP (1ULL << 31) > -#define IXL_PCT_NONF_IPV4_TCP_SYN_NOACK (1ULL << 32) /* 722 */ > -#define IXL_PCT_NONF_IPV4_TCP (1ULL << 33) > -#define IXL_PCT_NONF_IPV4_SCTP (1ULL << 34) > -#define IXL_PCT_NONF_IPV4_OTHER (1ULL << 35) > -#define IXL_PCT_FRAG_IPV4 (1ULL << 36) > -/* bits 37-38 are reserved for future use */ > -#define IXL_PCT_NONF_IPV6_UDP_UCAST (1ULL << 39) /* 722 */ > -#define IXL_PCT_NONF_IPV6_UDP_MCAST (1ULL << 40) /* 722 */ > -#define IXL_PCT_NONF_IPV6_UDP (1ULL << 
41) > -#define IXL_PCT_NONF_IPV6_TCP_SYN_NOACK (1ULL << 42) /* 722 */ > -#define IXL_PCT_NONF_IPV6_TCP (1ULL << 43) > -#define IXL_PCT_NONF_IPV6_SCTP (1ULL << 44) > -#define IXL_PCT_NONF_IPV6_OTHER (1ULL << 45) > -#define IXL_PCT_FRAG_IPV6 (1ULL << 46) > -/* bit 47 is reserved for future use */ > -#define IXL_PCT_FCOE_OX (1ULL << 48) > -#define IXL_PCT_FCOE_RX (1ULL << 49) > -#define IXL_PCT_FCOE_OTHER (1ULL << 50) > -/* bits 51-62 are reserved for future use */ > -#define IXL_PCT_L2_PAYLOAD (1ULL << 63) > - > -#define IXL_RSS_HENA_BASE_DEFAULT \ > - IXL_PCT_NONF_IPV4_UDP | \ > - IXL_PCT_NONF_IPV4_TCP | \ > - IXL_PCT_NONF_IPV4_SCTP | \ > - IXL_PCT_NONF_IPV4_OTHER | \ > - IXL_PCT_FRAG_IPV4 | \ > - IXL_PCT_NONF_IPV6_UDP | \ > - IXL_PCT_NONF_IPV6_TCP | \ > - IXL_PCT_NONF_IPV6_SCTP | \ > - IXL_PCT_NONF_IPV6_OTHER | \ > - IXL_PCT_FRAG_IPV6 | \ > - IXL_PCT_L2_PAYLOAD > - > -#define IXL_RSS_HENA_BASE_710 IXL_RSS_HENA_BASE_DEFAULT > -#define IXL_RSS_HENA_BASE_722 IXL_RSS_HENA_BASE_DEFAULT | \ > - IXL_PCT_NONF_IPV4_UDP_UCAST | \ > - IXL_PCT_NONF_IPV4_UDP_MCAST | \ > - IXL_PCT_NONF_IPV6_UDP_UCAST | \ > - IXL_PCT_NONF_IPV6_UDP_MCAST | \ > - IXL_PCT_NONF_IPV4_TCP_SYN_NOACK | \ > - IXL_PCT_NONF_IPV6_TCP_SYN_NOACK > +#include > > #define IXL_HMC_ROUNDUP 512 > #define IXL_HMC_PGSIZE 4096 > diff --git a/sys/dev/pci/if_ixlvar.h b/sys/dev/pci/if_ixlvar.h > new file mode 100644 > index 00000000000..7361be66bd4 > --- /dev/null > +++ b/sys/dev/pci/if_ixlvar.h > @@ -0,0 +1,102 @@ > +/* $Id$ */ > + > +/* > + * Copyright (c) 2013-2015, Intel Corporation > + * All rights reserved. > + > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions are met: > + * > + * 1. Redistributions of source code must retain the above copyright notice, > + * this list of conditions and the following disclaimer. > + * > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * 3. Neither the name of the Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived from > + * this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" > + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE > + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR > + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF > + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS > + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN > + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) > + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE > + * POSSIBILITY OF SUCH DAMAGE. > + */ > + > +/* > + * Copyright (c) 2016,2017 David Gwynne > + * > + * Permission to use, copy, modify, and distribute this software for any > + * purpose with or without fee is hereby granted, provided that the above > + * copyright notice and this permission notice appear in all copies. > + * > + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > + */ > + > +#ifndef _IXL_VAR_H_ > +#define _IXL_VAR_H_ > + > +/* Packet Classifier Types for filters */ > +/* bits 0-28 are reserved for future use */ > +#define IXL_PCT_NONF_IPV4_UDP_UCAST (1ULL << 29) /* 722 */ > +#define IXL_PCT_NONF_IPV4_UDP_MCAST (1ULL << 30) /* 722 */ > +#define IXL_PCT_NONF_IPV4_UDP (1ULL << 31) > +#define IXL_PCT_NONF_IPV4_TCP_SYN_NOACK (1ULL << 32) /* 722 */ > +#define IXL_PCT_NONF_IPV4_TCP (1ULL << 33) > +#define IXL_PCT_NONF_IPV4_SCTP (1ULL << 34) > +#define IXL_PCT_NONF_IPV4_OTHER (1ULL << 35) > +#define IXL_PCT_FRAG_IPV4 (1ULL << 36) > +/* bits 37-38 are reserved for future use */ > +#define IXL_PCT_NONF_IPV6_UDP_UCAST (1ULL << 39) /* 722 */ > +#define IXL_PCT_NONF_IPV6_UDP_MCAST (1ULL << 40) /* 722 */ > +#define IXL_PCT_NONF_IPV6_UDP (1ULL << 41) > +#define IXL_PCT_NONF_IPV6_TCP_SYN_NOACK (1ULL << 42) /* 722 */ > +#define IXL_PCT_NONF_IPV6_TCP (1ULL << 43) > +#define IXL_PCT_NONF_IPV6_SCTP (1ULL << 44) > +#define IXL_PCT_NONF_IPV6_OTHER (1ULL << 45) > +#define IXL_PCT_FRAG_IPV6 (1ULL << 46) > +/* bit 47 is reserved for future use */ > +#define IXL_PCT_FCOE_OX (1ULL << 48) > +#define IXL_PCT_FCOE_RX (1ULL << 49) > +#define IXL_PCT_FCOE_OTHER (1ULL << 50) > +/* bits 51-62 are reserved for future use */ > +#define IXL_PCT_L2_PAYLOAD (1ULL << 63) > + > +#define IXL_RSS_HENA_BASE_DEFAULT \ > + IXL_PCT_NONF_IPV4_UDP | \ > + IXL_PCT_NONF_IPV4_TCP | \ > + IXL_PCT_NONF_IPV4_SCTP | \ > + IXL_PCT_NONF_IPV4_OTHER | \ > + IXL_PCT_FRAG_IPV4 | \ > + IXL_PCT_NONF_IPV6_UDP | \ > + IXL_PCT_NONF_IPV6_TCP | \ > + IXL_PCT_NONF_IPV6_SCTP | \ > + IXL_PCT_NONF_IPV6_OTHER | \ > + IXL_PCT_FRAG_IPV6 | \ > + 
IXL_PCT_L2_PAYLOAD > + > +#define IXL_RSS_HENA_BASE_710 IXL_RSS_HENA_BASE_DEFAULT > +#define IXL_RSS_HENA_BASE_722 IXL_RSS_HENA_BASE_DEFAULT | \ > + IXL_PCT_NONF_IPV4_UDP_UCAST | \ > + IXL_PCT_NONF_IPV4_UDP_MCAST | \ > + IXL_PCT_NONF_IPV6_UDP_UCAST | \ > + IXL_PCT_NONF_IPV6_UDP_MCAST | \ > + IXL_PCT_NONF_IPV4_TCP_SYN_NOACK | \ > + IXL_PCT_NONF_IPV6_TCP_SYN_NOACK > + > +#endif /* _IXL_VAR_H_ */ > + > > -- > Yuichiro NAITO (naito.yuichiro@gmail.com) > >