From: Yuichiro NAITO Subject: Re: iavf(4): multi-queue support To: j.klemkow@wemelug.de Cc: tech@openbsd.org Date: Fri, 22 Nov 2024 16:57:32 +0900 From: Jan Klemkow Subject: Re: iavf(4): multi-queue support Date: Thu, 21 Nov 2024 10:44:40 +0100 > On Thu, Nov 21, 2024 at 10:31:02AM GMT, Yuichiro NAITO wrote: >> From: Yuichiro NAITO >> Subject: Re: iavf(4): multi-queue support >> Date: Wed, 04 Sep 2024 17:22:21 +0900 (JST) >> >> > Hi. Suppose you are interested in iavf(4) multi-queue. Try the following >> > complete patch which enables multi-queue, checksum offloads, and TSO. >> > I confirmed it works on my ESXi 8.0 and Linux qemu/kvm. Iperf3 results in >> > 9.41 Gbps transmit speed and 6.87 Gbps receive speed of my OpenBSD guest >> > with MTU size 1500 on ESXi 8.0. >> >> Hi, I had some reports that my patch doesn't work on ESXi while attaching >> an iavf device. The reporter said the following error messages are shown >> in the dmesg. >> >> ``` >> iavf0: SET_RSS_HENA failed: -1 >> iavf0: queue op 9 failed: -1 >> ``` >> >> Both errors had an error code '-1', meaning the response from the PF driver >> timed out. The `SET_RSS_HENA` request sends a packet classifier value for >> the RSS hash filter which currently sends 0. Some PF driver version of ESXi >> ignores the 0 value. So, I added the default value referring to the NetBSD >> driver. The value definition is the same as the ixl(4). I split the >> definitions to the 'if_iavfvars.h' file to share the code. >> >> The `queue op 9 failed` message happened in the 'iavf_queue_select' function. >> This seems really timed out. I extended the time-out value to 3000 ms. This >> value is also taken from NetBSD. >> >> I merged my code that handles a PCI bus error case in my previous mail. >> >> https://marc.info/?l=openbsd-tech&m=172723210819245&w=2 >> >> I also merged Jan's code that has VLAN #ifdef. The checksum offload code is >> the same as Jan's. If you see the diff from Jan's code, you will see my code >> only. 
>> >> https://marc.info/?l=openbsd-tech&m=173040636900369&w=2 >> >> OK? > > I tested your diff on my KVM setup. Works for me there. I had no time > for ESXi tests yet. > > Could you split your diff in checksum offload, TSO and Multi-Queue. > Thus, its easier to review and to see where the problems are. Sure. I split my patch into the following 4 patches. 1. checksum offloading 2. TSO support 3. Multi-queue support 4. PCI bus error handling Please apply them in this order. Here is the checksum offloading patch, which you originally wrote. I changed the 'ixl_rx_checksum' function name to 'iavf_rx_checksum'. It looks like a simple mistake. No functional change is intended. diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c index d573d6725f4..aac22b8f378 100644 --- a/sys/dev/pci/if_iavf.c +++ b/sys/dev/pci/if_iavf.c @@ -49,6 +49,7 @@ */ #include "bpfilter.h" +#include "vlan.h" #include #include @@ -75,6 +76,7 @@ #include #include +#include #include #include @@ -890,11 +892,13 @@ iavf_attach(struct device *parent, struct device *self, void *aux) strlcpy(ifp->if_xname, DEVNAME(sc), IFNAMSIZ); ifq_init_maxlen(&ifp->if_snd, sc->sc_tx_ring_ndescs); - ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; -#if 0 - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | - IFCAP_CSUM_UDPv4; + ifp->if_capabilities = IFCAP_VLAN_MTU; +#if NVLAN > 0 + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; #endif + ifp->if_capabilities |= IFCAP_CSUM_IPv4 | + IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; ifmedia_init(&sc->sc_media, 0, iavf_media_change, iavf_media_status); @@ -1656,6 +1660,57 @@ iavf_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m) BUS_DMA_STREAMING | BUS_DMA_NOWAIT)); } +static uint64_t +iavf_tx_offload(struct mbuf *m) +{ + struct ether_extracted ext; + uint64_t hlen; + uint64_t offload = 0; + +#if NVLAN > 0 + if (ISSET(m->m_flags, M_VLANTAG)) { + uint64_t vtag = m->m_pkthdr.ether_vtag; + offload |= 
IAVF_TX_DESC_CMD_IL2TAG1; + offload |= vtag << IAVF_TX_DESC_L2TAG1_SHIFT; + } +#endif + + if (!ISSET(m->m_pkthdr.csum_flags, + M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) + return (offload); + + ether_extract_headers(m, &ext); + + if (ext.ip4) { + offload |= ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ? + IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM : + IAVF_TX_DESC_CMD_IIPT_IPV4; +#ifdef INET6 + } else if (ext.ip6) { + offload |= IAVF_TX_DESC_CMD_IIPT_IPV6; +#endif + } else { + panic("CSUM_OUT set for non-IP packet"); + /* NOTREACHED */ + } + hlen = ext.iphlen; + + offload |= (ETHER_HDR_LEN >> 1) << IAVF_TX_DESC_MACLEN_SHIFT; + offload |= (hlen >> 2) << IAVF_TX_DESC_IPLEN_SHIFT; + + if (ext.tcp && ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) { + offload |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP; + offload |= (uint64_t)(ext.tcphlen >> 2) + << IAVF_TX_DESC_L4LEN_SHIFT; + } else if (ext.udp && ISSET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) { + offload |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP; + offload |= (uint64_t)(sizeof(*ext.udp) >> 2) + << IAVF_TX_DESC_L4LEN_SHIFT; + } + + return offload; +} + static void iavf_start(struct ifqueue *ifq) { @@ -1667,7 +1722,7 @@ iavf_start(struct ifqueue *ifq) bus_dmamap_t map; struct mbuf *m; uint64_t cmd; - uint64_t vlan_cmd; + uint64_t offload; unsigned int prod, free, last, i; unsigned int mask; int post = 0; @@ -1702,6 +1757,8 @@ iavf_start(struct ifqueue *ifq) if (m == NULL) break; + offload = iavf_tx_offload(m); + txm = &txr->txr_maps[prod]; map = txm->txm_map; @@ -1714,20 +1771,13 @@ iavf_start(struct ifqueue *ifq) bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize, BUS_DMASYNC_PREWRITE); - vlan_cmd = 0; - if (m->m_flags & M_VLANTAG) { - vlan_cmd = IAVF_TX_DESC_CMD_IL2TAG1 | - (((uint64_t)m->m_pkthdr.ether_vtag) << - IAVF_TX_DESC_L2TAG1_SHIFT); - } - for (i = 0; i < map->dm_nsegs; i++) { txd = &ring[prod]; cmd = (uint64_t)map->dm_segs[i].ds_len << IAVF_TX_DESC_BSIZE_SHIFT; - cmd |= IAVF_TX_DESC_DTYPE_DATA | IAVF_TX_DESC_CMD_ICRC | - vlan_cmd; 
+ cmd |= IAVF_TX_DESC_DTYPE_DATA | IAVF_TX_DESC_CMD_ICRC; + cmd |= offload; htolem64(&txd->addr, map->dm_segs[i].ds_addr); htolem64(&txd->cmd, cmd); @@ -1938,6 +1988,24 @@ iavf_rxr_free(struct iavf_softc *sc, struct iavf_rx_ring *rxr) free(rxr, M_DEVBUF, sizeof(*rxr)); } +static void +iavf_rx_checksum(struct mbuf *m, uint64_t word) +{ + if (!ISSET(word, IAVF_RX_DESC_L3L4P)) + return; + + if (ISSET(word, IAVF_RX_DESC_IPE)) + return; + + m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK; + + if (ISSET(word, IAVF_RX_DESC_L4E)) + return; + + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK; +} + + static int iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) { @@ -2002,6 +2070,7 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) m->m_pkthdr.len += len; if (ISSET(word, IAVF_RX_DESC_EOP)) { +#if NVLAN > 0 if (ISSET(word, IAVF_RX_DESC_L2TAG1P)) { vlan = (lemtoh64(&rxd->qword0) & IAVF_RX_DESC_L2TAG1_MASK) @@ -2009,8 +2078,10 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq) m->m_pkthdr.ether_vtag = vlan; m->m_flags |= M_VLANTAG; } +#endif if (!ISSET(word, IAVF_RX_DESC_RXE | IAVF_RX_DESC_OVERSIZE)) { + iavf_rx_checksum(m, word); ml_enqueue(&ml, m); } else { ifp->if_ierrors++; /* XXX */ Here is the TSO support patch. This is almost the same as ixl(4). 
diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c index aac22b8f378..5f39b36dc20 100644 --- a/sys/dev/pci/if_iavf.c +++ b/sys/dev/pci/if_iavf.c @@ -82,6 +82,8 @@ #include #include +#define IAVF_MAX_DMA_SEG_SIZE ((16 * 1024) - 1) + #define I40E_MASK(mask, shift) ((mask) << (shift)) #define I40E_AQ_LARGE_BUF 512 @@ -388,6 +390,10 @@ struct iavf_tx_desc { #define IAVF_TX_DESC_BSIZE_MASK \ (IAVF_TX_DESC_BSIZE_MAX << IAVF_TX_DESC_BSIZE_SHIFT) +#define IAVF_TX_CTX_DESC_CMD_TSO 0x10 +#define IAVF_TX_CTX_DESC_TLEN_SHIFT 30 +#define IAVF_TX_CTX_DESC_MSS_SHIFT 50 + #define IAVF_TX_DESC_L2TAG1_SHIFT 48 #define IAVF_TX_DESC_L2TAG1_MASK (0xffff << IAVF_TX_DESC_L2TAG1_SHIFT) } __packed __aligned(16); @@ -899,6 +905,7 @@ iavf_attach(struct device *parent, struct device *self, void *aux) ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; + ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; ifmedia_init(&sc->sc_media, 0, iavf_media_change, iavf_media_status); @@ -1565,7 +1572,7 @@ iavf_txr_alloc(struct iavf_softc *sc, unsigned int qid) txm = &maps[i]; if (bus_dmamap_create(sc->sc_dmat, - IAVF_HARDMTU, IAVF_TX_PKT_DESCS, IAVF_HARDMTU, 0, + MAXMCLBYTES, IAVF_TX_PKT_DESCS, IAVF_MAX_DMA_SEG_SIZE, 0, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT, &txm->txm_map) != 0) goto uncreate; @@ -1661,7 +1668,7 @@ iavf_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m) } static uint64_t -iavf_tx_offload(struct mbuf *m) +iavf_tx_offload(struct mbuf *m, struct iavf_tx_ring *txr, unsigned int prod) { struct ether_extracted ext; uint64_t hlen; @@ -1676,7 +1683,7 @@ iavf_tx_offload(struct mbuf *m) #endif if (!ISSET(m->m_pkthdr.csum_flags, - M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) + M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO)) return (offload); ether_extract_headers(m, &ext); @@ -1708,6 +1715,32 @@ iavf_tx_offload(struct mbuf *m) << IAVF_TX_DESC_L4LEN_SHIFT; } + if 
(ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) { + if (ext.tcp && m->m_pkthdr.ph_mss > 0) { + struct iavf_tx_desc *ring, *txd; + uint64_t cmd = 0, paylen, outlen; + + hlen += ext.tcphlen; + + /* + * The MSS should not be set to a lower value than 64 + * or larger than 9668 bytes. + */ + outlen = MIN(9668, MAX(64, m->m_pkthdr.ph_mss)); + paylen = m->m_pkthdr.len - ETHER_HDR_LEN - hlen; + ring = IAVF_DMA_KVA(&txr->txr_mem); + txd = &ring[prod]; + + cmd |= IAVF_TX_DESC_DTYPE_CONTEXT; + cmd |= IAVF_TX_CTX_DESC_CMD_TSO; + cmd |= paylen << IAVF_TX_CTX_DESC_TLEN_SHIFT; + cmd |= outlen << IAVF_TX_CTX_DESC_MSS_SHIFT; + + htolem64(&txd->addr, 0); + htolem64(&txd->cmd, cmd); + } + } + return offload; } @@ -1748,7 +1781,8 @@ iavf_start(struct ifqueue *ifq) mask = sc->sc_tx_ring_ndescs - 1; for (;;) { - if (free <= IAVF_TX_PKT_DESCS) { + /* We need one extra descriptor for TSO packets. */ + if (free <= (IAVF_TX_PKT_DESCS + 1)) { ifq_set_oactive(ifq); break; } @@ -1757,11 +1791,17 @@ iavf_start(struct ifqueue *ifq) if (m == NULL) break; - offload = iavf_tx_offload(m); + offload = iavf_tx_offload(m, txr, prod); txm = &txr->txr_maps[prod]; map = txm->txm_map; + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) { + prod++; + prod &= mask; + free--; + } + if (iavf_load_mbuf(sc->sc_dmat, map, m) != 0) { ifq->ifq_errors++; m_freem(m); Here is the multi-queue support patch. With this patch applied, iavf(4) works on ESXi; it includes the fixes for the 'SET_RSS_HENA' failure and the PF response timeout problem. 
diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c index 5f39b36dc20..1226b953821 100644 --- a/sys/dev/pci/if_iavf.c +++ b/sys/dev/pci/if_iavf.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,11 @@ #include #include +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 64 +#endif + +#define IAVF_MAX_VECTORS 4 #define IAVF_MAX_DMA_SEG_SIZE ((16 * 1024) - 1) #define I40E_MASK(mask, shift) ((mask) << (shift)) @@ -93,7 +99,10 @@ #define IAVF_VFR_COMPLETED 1 #define IAVF_VFR_VFACTIVE 2 +#define IAVF_EXEC_TIMEOUT 3000 + #include +#include struct iavf_aq_desc { uint16_t iaq_flags; @@ -534,6 +543,7 @@ struct iavf_tx_map { struct iavf_tx_ring { unsigned int txr_prod; unsigned int txr_cons; + struct ifqueue *txr_ifq; struct iavf_tx_map *txr_maps; struct iavf_dmamem txr_mem; @@ -549,6 +559,7 @@ struct iavf_rx_map { struct iavf_rx_ring { struct iavf_softc *rxr_sc; + struct ifiqueue *rxr_ifiq; struct if_rxring rxr_acct; struct timeout rxr_refill; @@ -566,17 +577,36 @@ struct iavf_rx_ring { unsigned int rxr_qid; }; +struct iavf_vector { + struct iavf_softc *iv_sc; + struct iavf_rx_ring *iv_rxr; + struct iavf_tx_ring *iv_txr; + int iv_qid; + void *iv_ihc; + char iv_name[16]; +} __aligned(CACHE_LINE_SIZE); + +enum i40e_mac_type { + I40E_MAC_XL710, + I40E_MAC_X722, + I40E_MAC_X722_VF, + I40E_MAC_VF, + I40E_MAC_GENERIC +}; + struct iavf_softc { struct device sc_dev; struct arpcom sc_ac; struct ifmedia sc_media; uint64_t sc_media_status; uint64_t sc_media_active; + enum i40e_mac_type sc_mac_type; pci_chipset_tag_t sc_pc; pci_intr_handle_t sc_ih; void *sc_ihc; pcitag_t sc_tag; + struct intrmap *sc_intrmap; bus_dma_tag_t sc_dmat; bus_space_tag_t sc_memt; @@ -620,6 +650,9 @@ struct iavf_softc { unsigned int sc_tx_ring_ndescs; unsigned int sc_rx_ring_ndescs; unsigned int sc_nqueues; /* 1 << sc_nqueues */ + unsigned int sc_nintrs; + + struct iavf_vector *sc_vectors; struct rwlock sc_cfg_lock; unsigned int sc_dead; @@ -644,6 +677,7 @@ static void 
iavf_atq_done(struct iavf_softc *); static void iavf_init_admin_queue(struct iavf_softc *); +static enum i40e_mac_type iavf_mactype(pci_product_id_t); static int iavf_get_version(struct iavf_softc *); static int iavf_get_vf_resources(struct iavf_softc *); static int iavf_config_irq_map(struct iavf_softc *); @@ -652,6 +686,7 @@ static int iavf_add_del_addr(struct iavf_softc *, uint8_t *, int); static int iavf_process_arq(struct iavf_softc *, int); static int iavf_match(struct device *, void *, void *); +static int iavf_setup_interrupts(struct iavf_softc *, struct pci_attach_args *); static void iavf_attach(struct device *, struct device *, void *); static int iavf_media_change(struct ifnet *); @@ -660,6 +695,7 @@ static void iavf_watchdog(struct ifnet *); static int iavf_ioctl(struct ifnet *, u_long, caddr_t); static void iavf_start(struct ifqueue *); static int iavf_intr(void *); +static int iavf_intr_vector(void *); static int iavf_up(struct iavf_softc *); static int iavf_down(struct iavf_softc *); static int iavf_iff(struct iavf_softc *); @@ -723,9 +759,17 @@ static const struct iavf_aq_regs iavf_aq_regs = { I40E_VFINT_DYN_CTL0_CLEARPBA_MASK | \ (IAVF_NOITR << I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT)); \ iavf_wr((_s), I40E_VFINT_ICR0_ENA1, I40E_VFINT_ICR0_ENA1_ADMINQ_MASK) +#define iavf_queue_intr_enable(_s, _q) \ + iavf_wr((_s), I40E_VFINT_DYN_CTLN1((_q)), \ + I40E_VFINT_DYN_CTLN1_INTENA_MASK | \ + I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK | \ + (IAVF_NOITR << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT)) +#define iavf_queue_intr_disable(_s, _q) \ + iavf_wr((_s), I40E_VFINT_DYN_CTLN1((_q)), \ + (IAVF_NOITR << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT)) #define iavf_nqueues(_sc) (1 << (_sc)->sc_nqueues) -#define iavf_allqueues(_sc) ((1 << ((_sc)->sc_nqueues+1)) - 1) +#define iavf_allqueues(_sc) ((1 << (iavf_nqueues(_sc))) - 1) #ifdef __LP64__ #define iavf_dmamem_hi(_ixm) (uint32_t)(IAVF_DMA_DVA(_ixm) >> 32) @@ -765,6 +809,107 @@ iavf_match(struct device *parent, void *match, void *aux) 
return (pci_matchbyid(aux, iavf_devices, nitems(iavf_devices))); } +static enum i40e_mac_type +iavf_mactype(pci_product_id_t id) +{ + + switch (id) { + case PCI_PRODUCT_INTEL_XL710_VF: + case PCI_PRODUCT_INTEL_XL710_VF_HV: + return I40E_MAC_VF; + case PCI_PRODUCT_INTEL_X722_VF: + return I40E_MAC_X722_VF; + } + + return I40E_MAC_GENERIC; +} + +static int +iavf_intr_vector(void *v) +{ + struct iavf_vector *iv = v; + struct iavf_softc *sc = iv->iv_sc; + + struct ifnet *ifp = &sc->sc_ac.ac_if; + int rv = 0; + + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + rv |= iavf_rxeof(sc, iv->iv_rxr->rxr_ifiq); + rv |= iavf_txeof(sc, iv->iv_txr->txr_ifq); + } + + iavf_queue_intr_enable(sc, iv->iv_qid); + + return rv; +} + +static int +iavf_setup_interrupts(struct iavf_softc *sc, struct pci_attach_args *pa) +{ + unsigned int i, v, nqueues = iavf_nqueues(sc); + struct iavf_vector *iv; + pci_intr_handle_t ih; + + sc->sc_ihc = pci_intr_establish(sc->sc_pc, sc->sc_ih, + IPL_NET | IPL_MPSAFE, iavf_intr, sc, DEVNAME(sc)); + if (sc->sc_ihc == NULL) { + printf("%s: unable to establish interrupt handler\n", + DEVNAME(sc)); + return -1; + } + + sc->sc_vectors = mallocarray(sizeof(*sc->sc_vectors), nqueues, + M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc->sc_vectors == NULL) { + printf("%s: unable to allocate vectors\n", DEVNAME(sc)); + return -1; + } + + for (i = 0; i < nqueues; i++) { + iv = &sc->sc_vectors[i]; + iv->iv_sc = sc; + iv->iv_qid = i; + snprintf(iv->iv_name, sizeof(iv->iv_name), "%s:%u", + DEVNAME(sc), i); + } + + if (sc->sc_intrmap) { + for (i = 0; i < nqueues; i++) { + iv = &sc->sc_vectors[i]; + v = i + 1; /* 0 is used for adminq */ + + if (pci_intr_map_msix(pa, v, &ih)) { + printf("%s: unable to map msi-x vector %d\n", + DEVNAME(sc), v); + goto free_vectors; + } + + iv->iv_ihc = pci_intr_establish_cpu(sc->sc_pc, ih, + IPL_NET | IPL_MPSAFE, + intrmap_cpu(sc->sc_intrmap, i), + iavf_intr_vector, iv, iv->iv_name); + if (iv->iv_ihc == NULL) { + printf("%s: unable to establish 
interrupt %d\n", + DEVNAME(sc), v); + goto free_vectors; + } + } + } + + sc->sc_nintrs = nqueues + 1; + return 0; +free_vectors: + if (sc->sc_intrmap != NULL) { + for (i = 0; i < nqueues; i++) { + struct iavf_vector *iv = &sc->sc_vectors[i]; + if (iv->iv_ihc != NULL) + pci_intr_disestablish(sc->sc_pc, iv->iv_ihc); + } + } + free(sc->sc_vectors, M_DEVBUF, nqueues * sizeof(*sc->sc_vectors)); + return -1; +} + void iavf_attach(struct device *parent, struct device *self, void *aux) { @@ -772,7 +917,8 @@ iavf_attach(struct device *parent, struct device *self, void *aux) struct ifnet *ifp = &sc->sc_ac.ac_if; struct pci_attach_args *pa = aux; pcireg_t memtype; - int tries; + int nmsix, tries; + unsigned int nqueues; rw_init(&sc->sc_cfg_lock, "iavfcfg"); @@ -781,6 +927,8 @@ iavf_attach(struct device *parent, struct device *self, void *aux) sc->sc_dmat = pa->pa_dmat; sc->sc_aq_regs = &iavf_aq_regs; + sc->sc_mac_type = iavf_mactype(PCI_PRODUCT(pa->pa_id)); + sc->sc_nqueues = 0; /* 1 << 0 is 1 queue */ sc->sc_tx_ring_ndescs = 1024; sc->sc_rx_ring_ndescs = 1024; @@ -860,13 +1008,20 @@ iavf_attach(struct device *parent, struct device *self, void *aux) goto free_scratch; } - if (iavf_config_irq_map(sc) != 0) { - printf(", timeout waiting for IRQ map response"); - goto free_scratch; - } - /* msix only? 
*/ - if (pci_intr_map_msix(pa, 0, &sc->sc_ih) != 0) { + if (pci_intr_map_msix(pa, 0, &sc->sc_ih) == 0) { + nmsix = pci_intr_msix_count(pa); + if (nmsix > 1) { /* we used 1 (the 0th) for the adminq */ + nmsix--; + + sc->sc_intrmap = intrmap_create(&sc->sc_dev, + nmsix, IAVF_MAX_VECTORS, INTRMAP_POWEROF2); + nqueues = intrmap_count(sc->sc_intrmap); + KASSERT(nqueues > 0); + KASSERT(powerof2(nqueues)); + sc->sc_nqueues = fls(nqueues) - 1; + } + } else { printf(", unable to map interrupt\n"); goto free_scratch; } @@ -876,17 +1031,23 @@ iavf_attach(struct device *parent, struct device *self, void *aux) if (memcmp(sc->sc_ac.ac_enaddr, etheranyaddr, ETHER_ADDR_LEN) == 0) ether_fakeaddr(ifp); - printf(", %s, address %s\n", pci_intr_string(sc->sc_pc, sc->sc_ih), - ether_sprintf(sc->sc_ac.ac_enaddr)); + nqueues = iavf_nqueues(sc); + printf(", %s, %d queue%s, address %s\n", + pci_intr_string(sc->sc_pc, sc->sc_ih), + nqueues, (nqueues > 1 ? "s" : ""), + ether_sprintf(sc->sc_ac.ac_enaddr)); - sc->sc_ihc = pci_intr_establish(sc->sc_pc, sc->sc_ih, - IPL_NET | IPL_MPSAFE, iavf_intr, sc, DEVNAME(sc)); - if (sc->sc_ihc == NULL) { + if (iavf_setup_interrupts(sc, pa) != 0) { printf("%s: unable to establish interrupt handler\n", DEVNAME(sc)); goto free_scratch; } + if (iavf_config_irq_map(sc) != 0) { + printf(", timeout waiting for IRQ map response"); + goto free_scratch; + } + ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_xflags = IFXF_MPSAFE; @@ -1106,7 +1267,7 @@ iavf_config_vsi_queues(struct iavf_softc *sc) BUS_DMASYNC_PREREAD); iavf_atq_post(sc, &iaq); - rv = iavf_arq_wait(sc, 250); + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); if (rv != IAVF_VC_RC_SUCCESS) { printf("%s: CONFIG_VSI_QUEUES failed: %d\n", DEVNAME(sc), rv); return (1); @@ -1130,10 +1291,11 @@ iavf_config_hena(struct iavf_softc *sc) iavf_aq_dva(&iaq, IAVF_DMA_DVA(&sc->sc_scratch)); caps = IAVF_DMA_KVA(&sc->sc_scratch); - *caps = 0; + *caps = (sc->sc_mac_type == 
I40E_MAC_X722_VF) ? IXL_RSS_HENA_BASE_722 : + IXL_RSS_HENA_BASE_710; iavf_atq_post(sc, &iaq); - rv = iavf_arq_wait(sc, 250); + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); if (rv != IAVF_VC_RC_SUCCESS) { printf("%s: SET_RSS_HENA failed: %d\n", DEVNAME(sc), rv); return (1); @@ -1168,7 +1330,7 @@ iavf_queue_select(struct iavf_softc *sc, int opcode) BUS_DMASYNC_PREREAD); iavf_atq_post(sc, &iaq); - rv = iavf_arq_wait(sc, 250); + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); if (rv != IAVF_VC_RC_SUCCESS) { printf("%s: queue op %d failed: %d\n", DEVNAME(sc), opcode, rv); return (1); @@ -1181,13 +1343,13 @@ static int iavf_up(struct iavf_softc *sc) { struct ifnet *ifp = &sc->sc_ac.ac_if; + struct iavf_vector *iv; struct iavf_rx_ring *rxr; struct iavf_tx_ring *txr; unsigned int nqueues, i; int rv = ENOMEM; nqueues = iavf_nqueues(sc); - KASSERT(nqueues == 1); /* XXX */ rw_enter_write(&sc->sc_cfg_lock); if (sc->sc_dead) { @@ -1206,8 +1368,11 @@ iavf_up(struct iavf_softc *sc) goto free; } - ifp->if_iqs[i]->ifiq_softc = rxr; - ifp->if_ifqs[i]->ifq_softc = txr; + iv = &sc->sc_vectors[i]; + iv->iv_rxr = ifp->if_iqs[i]->ifiq_softc = rxr; + iv->iv_txr = ifp->if_ifqs[i]->ifq_softc = txr; + rxr->rxr_ifiq = ifp->if_iqs[i]; + txr->txr_ifq = ifp->if_ifqs[i]; iavf_rxfill(sc, rxr); } @@ -1221,6 +1386,9 @@ iavf_up(struct iavf_softc *sc) if (iavf_queue_select(sc, IAVF_VC_OP_ENABLE_QUEUES) != 0) goto down; + for (i = 0; i < nqueues; i++) + iavf_queue_intr_enable(sc, i); + SET(ifp->if_flags, IFF_RUNNING); iavf_wr(sc, I40E_VFINT_ITR01(0), 0x7a); @@ -1246,6 +1414,9 @@ free: iavf_txr_free(sc, txr); iavf_rxr_free(sc, rxr); + iv = &sc->sc_vectors[i]; + iv->iv_rxr = ifp->if_iqs[i]->ifiq_softc = NULL; + iv->iv_txr = ifp->if_ifqs[i]->ifq_softc = NULL; } rw_exit_write(&sc->sc_cfg_lock); return (rv); @@ -1284,7 +1455,7 @@ iavf_config_promisc_mode(struct iavf_softc *sc, int unicast, int multicast) BUS_DMASYNC_PREREAD); iavf_atq_post(sc, &iaq); - rv = iavf_arq_wait(sc, 250); + rv = iavf_arq_wait(sc, 
IAVF_EXEC_TIMEOUT); if (rv != IAVF_VC_RC_SUCCESS) { printf("%s: CONFIG_PROMISC_MODE failed: %d\n", DEVNAME(sc), rv); return (1); @@ -1323,7 +1494,7 @@ iavf_add_del_addr(struct iavf_softc *sc, uint8_t *addr, int add) BUS_DMASYNC_PREREAD); iavf_atq_post(sc, &iaq); - rv = iavf_arq_wait(sc, 250); + rv = iavf_arq_wait(sc, IAVF_EXEC_TIMEOUT); if (rv != IAVF_VC_RC_SUCCESS) { printf("%s: ADD/DEL_ETH_ADDR failed: %d\n", DEVNAME(sc), rv); return (1); @@ -1368,6 +1539,7 @@ static int iavf_down(struct iavf_softc *sc) { struct ifnet *ifp = &sc->sc_ac.ac_if; + struct iavf_vector *iv; struct iavf_rx_ring *rxr; struct iavf_tx_ring *txr; unsigned int nqueues, i; @@ -1397,6 +1569,8 @@ iavf_down(struct iavf_softc *sc) /* make sure no hw generated work is still in flight */ intr_barrier(sc->sc_ihc); for (i = 0; i < nqueues; i++) { + iavf_queue_intr_disable(sc, i); + rxr = ifp->if_iqs[i]->ifiq_softc; txr = ifp->if_ifqs[i]->ifq_softc; @@ -1406,8 +1580,9 @@ iavf_down(struct iavf_softc *sc) } for (i = 0; i < nqueues; i++) { - rxr = ifp->if_iqs[i]->ifiq_softc; - txr = ifp->if_ifqs[i]->ifq_softc; + iv = &sc->sc_vectors[i]; + txr = iv->iv_txr; + rxr = iv->iv_rxr; iavf_txr_clean(sc, txr); iavf_rxr_clean(sc, rxr); @@ -1415,8 +1590,8 @@ iavf_down(struct iavf_softc *sc) iavf_txr_free(sc, txr); iavf_rxr_free(sc, rxr); - ifp->if_iqs[i]->ifiq_softc = NULL; - ifp->if_ifqs[i]->ifq_softc = NULL; + iv->iv_rxr = ifp->if_iqs[i]->ifiq_softc = NULL; + iv->iv_txr = ifp->if_ifqs[i]->ifq_softc = NULL; } /* unmask */ @@ -2666,25 +2841,45 @@ iavf_config_irq_map(struct iavf_softc *sc) struct iavf_aq_desc iaq; struct iavf_vc_vector_map *vec; struct iavf_vc_irq_map_info *map; + struct iavf_vector *iv; + unsigned int num_vec = 0; int tries; memset(&iaq, 0, sizeof(iaq)); iaq.iaq_flags = htole16(IAVF_AQ_BUF | IAVF_AQ_RD); iaq.iaq_opcode = htole16(IAVF_AQ_OP_SEND_TO_PF); iaq.iaq_vc_opcode = htole32(IAVF_VC_OP_CONFIG_IRQ_MAP); - iaq.iaq_datalen = htole16(sizeof(*map) + sizeof(*vec)); + iaq.iaq_datalen = 
htole16(sizeof(*map) + sizeof(*vec) * sc->sc_nintrs); iavf_aq_dva(&iaq, IAVF_DMA_DVA(&sc->sc_scratch)); map = IAVF_DMA_KVA(&sc->sc_scratch); - map->num_vectors = htole16(1); vec = map->vecmap; - vec[0].vsi_id = htole16(sc->sc_vsi_id); - vec[0].vector_id = 0; - vec[0].rxq_map = htole16(iavf_allqueues(sc)); - vec[0].txq_map = htole16(iavf_allqueues(sc)); - vec[0].rxitr_idx = htole16(IAVF_NOITR); - vec[0].txitr_idx = htole16(IAVF_NOITR); + if (sc->sc_nintrs == 1) { + vec[num_vec].vsi_id = htole16(sc->sc_vsi_id); + vec[num_vec].vector_id = htole16(num_vec); + vec[num_vec].rxq_map = htole16(iavf_allqueues(sc)); + vec[num_vec].txq_map = htole16(iavf_allqueues(sc)); + vec[num_vec].rxitr_idx = htole16(IAVF_NOITR); + vec[num_vec].txitr_idx = htole16(IAVF_NOITR); + num_vec++; + } else if (sc->sc_nintrs > 1) { + for (; num_vec < sc->sc_nintrs - 1; num_vec++) { + iv = &sc->sc_vectors[num_vec]; + vec[num_vec].vsi_id = htole16(sc->sc_vsi_id); + vec[num_vec].vector_id = htole16(num_vec + 1); + vec[num_vec].rxq_map = htole16(1 << iv->iv_qid); + vec[num_vec].txq_map = htole16(1 << iv->iv_qid); + vec[num_vec].rxitr_idx = htole16(IAVF_ITR0); + vec[num_vec].txitr_idx = htole16(IAVF_ITR1); + } + vec[num_vec].vsi_id = htole16(sc->sc_vsi_id); + vec[num_vec].vector_id = htole16(0); + vec[num_vec].rxq_map = htole16(0); + vec[num_vec].txq_map = htole16(0); + num_vec++; + } + map->num_vectors = htole16(num_vec); bus_dmamap_sync(sc->sc_dmat, IAVF_DMA_MAP(&sc->sc_scratch), 0, IAVF_DMA_LEN(&sc->sc_scratch), BUS_DMASYNC_PREREAD); diff --git a/sys/dev/pci/if_ixl.c b/sys/dev/pci/if_ixl.c index 12c84ba2c79..66bbf2415ed 100644 --- a/sys/dev/pci/if_ixl.c +++ b/sys/dev/pci/if_ixl.c @@ -923,53 +923,7 @@ CTASSERT(MAXMCLBYTES < IXL_TSO_SIZE); #define IXL_AQ_ALIGN 64 /* lol */ #define IXL_AQ_BUFLEN 4096 -/* Packet Classifier Types for filters */ -/* bits 0-28 are reserved for future use */ -#define IXL_PCT_NONF_IPV4_UDP_UCAST (1ULL << 29) /* 722 */ -#define IXL_PCT_NONF_IPV4_UDP_MCAST (1ULL << 30) /* 722 
*/ -#define IXL_PCT_NONF_IPV4_UDP (1ULL << 31) -#define IXL_PCT_NONF_IPV4_TCP_SYN_NOACK (1ULL << 32) /* 722 */ -#define IXL_PCT_NONF_IPV4_TCP (1ULL << 33) -#define IXL_PCT_NONF_IPV4_SCTP (1ULL << 34) -#define IXL_PCT_NONF_IPV4_OTHER (1ULL << 35) -#define IXL_PCT_FRAG_IPV4 (1ULL << 36) -/* bits 37-38 are reserved for future use */ -#define IXL_PCT_NONF_IPV6_UDP_UCAST (1ULL << 39) /* 722 */ -#define IXL_PCT_NONF_IPV6_UDP_MCAST (1ULL << 40) /* 722 */ -#define IXL_PCT_NONF_IPV6_UDP (1ULL << 41) -#define IXL_PCT_NONF_IPV6_TCP_SYN_NOACK (1ULL << 42) /* 722 */ -#define IXL_PCT_NONF_IPV6_TCP (1ULL << 43) -#define IXL_PCT_NONF_IPV6_SCTP (1ULL << 44) -#define IXL_PCT_NONF_IPV6_OTHER (1ULL << 45) -#define IXL_PCT_FRAG_IPV6 (1ULL << 46) -/* bit 47 is reserved for future use */ -#define IXL_PCT_FCOE_OX (1ULL << 48) -#define IXL_PCT_FCOE_RX (1ULL << 49) -#define IXL_PCT_FCOE_OTHER (1ULL << 50) -/* bits 51-62 are reserved for future use */ -#define IXL_PCT_L2_PAYLOAD (1ULL << 63) - -#define IXL_RSS_HENA_BASE_DEFAULT \ - IXL_PCT_NONF_IPV4_UDP | \ - IXL_PCT_NONF_IPV4_TCP | \ - IXL_PCT_NONF_IPV4_SCTP | \ - IXL_PCT_NONF_IPV4_OTHER | \ - IXL_PCT_FRAG_IPV4 | \ - IXL_PCT_NONF_IPV6_UDP | \ - IXL_PCT_NONF_IPV6_TCP | \ - IXL_PCT_NONF_IPV6_SCTP | \ - IXL_PCT_NONF_IPV6_OTHER | \ - IXL_PCT_FRAG_IPV6 | \ - IXL_PCT_L2_PAYLOAD - -#define IXL_RSS_HENA_BASE_710 IXL_RSS_HENA_BASE_DEFAULT -#define IXL_RSS_HENA_BASE_722 IXL_RSS_HENA_BASE_DEFAULT | \ - IXL_PCT_NONF_IPV4_UDP_UCAST | \ - IXL_PCT_NONF_IPV4_UDP_MCAST | \ - IXL_PCT_NONF_IPV6_UDP_UCAST | \ - IXL_PCT_NONF_IPV6_UDP_MCAST | \ - IXL_PCT_NONF_IPV4_TCP_SYN_NOACK | \ - IXL_PCT_NONF_IPV6_TCP_SYN_NOACK +#include #define IXL_HMC_ROUNDUP 512 #define IXL_HMC_PGSIZE 4096 diff --git a/sys/dev/pci/if_ixlvar.h b/sys/dev/pci/if_ixlvar.h new file mode 100644 index 00000000000..7361be66bd4 --- /dev/null +++ b/sys/dev/pci/if_ixlvar.h @@ -0,0 +1,102 @@ +/* $Id$ */ + +/* + * Copyright (c) 2013-2015, Intel Corporation + * All rights reserved. 
+ + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016,2017 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _IXL_VAR_H_ +#define _IXL_VAR_H_ + +/* Packet Classifier Types for filters */ +/* bits 0-28 are reserved for future use */ +#define IXL_PCT_NONF_IPV4_UDP_UCAST (1ULL << 29) /* 722 */ +#define IXL_PCT_NONF_IPV4_UDP_MCAST (1ULL << 30) /* 722 */ +#define IXL_PCT_NONF_IPV4_UDP (1ULL << 31) +#define IXL_PCT_NONF_IPV4_TCP_SYN_NOACK (1ULL << 32) /* 722 */ +#define IXL_PCT_NONF_IPV4_TCP (1ULL << 33) +#define IXL_PCT_NONF_IPV4_SCTP (1ULL << 34) +#define IXL_PCT_NONF_IPV4_OTHER (1ULL << 35) +#define IXL_PCT_FRAG_IPV4 (1ULL << 36) +/* bits 37-38 are reserved for future use */ +#define IXL_PCT_NONF_IPV6_UDP_UCAST (1ULL << 39) /* 722 */ +#define IXL_PCT_NONF_IPV6_UDP_MCAST (1ULL << 40) /* 722 */ +#define IXL_PCT_NONF_IPV6_UDP (1ULL << 41) +#define IXL_PCT_NONF_IPV6_TCP_SYN_NOACK (1ULL << 42) /* 722 */ +#define IXL_PCT_NONF_IPV6_TCP (1ULL << 43) +#define IXL_PCT_NONF_IPV6_SCTP (1ULL << 44) +#define IXL_PCT_NONF_IPV6_OTHER (1ULL << 45) +#define IXL_PCT_FRAG_IPV6 (1ULL << 46) +/* bit 47 is reserved for future use */ +#define IXL_PCT_FCOE_OX (1ULL << 48) +#define IXL_PCT_FCOE_RX (1ULL << 49) +#define IXL_PCT_FCOE_OTHER (1ULL << 50) +/* bits 51-62 are reserved for future use */ +#define IXL_PCT_L2_PAYLOAD (1ULL << 63) + +#define IXL_RSS_HENA_BASE_DEFAULT \ + IXL_PCT_NONF_IPV4_UDP | \ + IXL_PCT_NONF_IPV4_TCP | \ + IXL_PCT_NONF_IPV4_SCTP | \ + IXL_PCT_NONF_IPV4_OTHER | \ + IXL_PCT_FRAG_IPV4 | \ + IXL_PCT_NONF_IPV6_UDP | \ + IXL_PCT_NONF_IPV6_TCP | \ + IXL_PCT_NONF_IPV6_SCTP | \ + IXL_PCT_NONF_IPV6_OTHER | \ + IXL_PCT_FRAG_IPV6 | \ + IXL_PCT_L2_PAYLOAD + +#define IXL_RSS_HENA_BASE_710 IXL_RSS_HENA_BASE_DEFAULT +#define IXL_RSS_HENA_BASE_722 
IXL_RSS_HENA_BASE_DEFAULT | \ + IXL_PCT_NONF_IPV4_UDP_UCAST | \ + IXL_PCT_NONF_IPV4_UDP_MCAST | \ + IXL_PCT_NONF_IPV6_UDP_UCAST | \ + IXL_PCT_NONF_IPV6_UDP_MCAST | \ + IXL_PCT_NONF_IPV4_TCP_SYN_NOACK | \ + IXL_PCT_NONF_IPV6_TCP_SYN_NOACK + +#endif /* _IXL_VAR_H_ */ + Here is the PCI bus error handling patch that I proposed in the following mail. https://marc.info/?l=openbsd-tech&m=172723210819245&w=2 diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c index 1226b953821..f5077ff0b45 100644 --- a/sys/dev/pci/if_iavf.c +++ b/sys/dev/pci/if_iavf.c @@ -616,6 +616,7 @@ struct iavf_softc { uint32_t sc_major_ver; uint32_t sc_minor_ver; + int sc_if_attached; int sc_got_vf_resources; int sc_got_irq_map; uint32_t sc_vf_id; @@ -1078,6 +1079,7 @@ iavf_attach(struct device *parent, struct device *self, void *aux) if_attach_queues(ifp, iavf_nqueues(sc)); if_attach_iqueues(ifp, iavf_nqueues(sc)); + sc->sc_if_attached++; iavf_intr_enable(sc); @@ -1623,7 +1625,8 @@ iavf_reset(void *xsc) link_state = ifp->if_link_state; if (ifp->if_link_state != LINK_STATE_DOWN) { ifp->if_link_state = LINK_STATE_DOWN; - if_link_state_change(ifp); + if (sc->sc_if_attached) + if_link_state_change(ifp); } up = 0; -- Yuichiro NAITO (naito.yuichiro@gmail.com)