Download raw body.
iavf(4): multi-queue support
Hello,
On Fri, 22 Nov 2024 16:48:40 +0100
Jan Klemkow <jan@openbsd.org> wrote:
> On Fri, Nov 22, 2024 at 04:57:32PM GMT, Yuichiro NAITO wrote:
>> From: Jan Klemkow <j.klemkow@wemelug.de>
>> Subject: Re: iavf(4): multi-queue support
>> Date: Thu, 21 Nov 2024 10:44:40 +0100
>>
>> > On Thu, Nov 21, 2024 at 10:31:02AM GMT, Yuichiro NAITO wrote:
>> >> From: Yuichiro NAITO <naito.yuichiro@gmail.com>
>> >> Subject: Re: iavf(4): multi-queue support
>> >> Date: Wed, 04 Sep 2024 17:22:21 +0900 (JST)
>> >>
>> >> > Hi. Suppose you are interested in iavf(4) multi-queue. Try the following
>> >> > complete patch which enables multi-queue, checksum offloads, and TSO.
>> >> > I confirmed it works on my ESXi 8.0 and Linux qemu/kvm. Iperf3 results in
>> >> > 9.41 Gbps transmit speed and 6.87 Gbps receive speed of my OpenBSD guest
>> >> > with MTU size 1500 on ESXi 8.0.
>> >>
>> >> Hi, I had some reports that my patch doesn't work on ESXi while attaching
>> >> an iavf device. The reporter said the following error messages are shown
>> >> in the dmesg.
>> >>
>> >> ```
>> >> iavf0: SET_RSS_HENA failed: -1
>> >> iavf0: queue op 9 failed: -1
>> >> ```
>> >>
>> >> Both errors had an error code '-1', meaning the response from the PF driver
>> >> timed out. The `SET_RSS_HENA` request sends a packet classifier value for
>> >> the RSS hash filter which currently sends 0. Some PF driver version of ESXi
>> >> ignores the 0 value. So, I added the default value referring to the NetBSD
>> >> driver. The value definition is the same as the ixl(4). I split the
>> >> definitions to the 'if_iavfvars.h' file to share the code.
>> >>
>> >> The `queue op 9 failed` message happened in the 'iavf_queue_select' function.
>> >> This seems really timed out. I extended the time-out value to 3000 ms. This
>> >> value is also taken from NetBSD.
>> >>
>> >> I merged my code that handles a PCI bus error case in my previous mail.
>> >>
>> >> https://marc.info/?l=openbsd-tech&m=172723210819245&w=2
>> >>
>> >> I also merged Jan's code that has VLAN #ifdef. The checksum offload code is
>> >> the same as Jan's. If you see the diff from Jan's code, you will see my code
>> >> only.
>> >>
>> >> https://marc.info/?l=openbsd-tech&m=173040636900369&w=2
>> >>
>> >> OK?
>> >
>> > I tested your diff on my KVM setup. Works for me there. I had no time
>> > for ESXi tests yet.
>> >
>> > Could you split your diff in checksum offload, TSO and Multi-Queue.
>> > Thus, its easier to review and to see where the problems are.
>>
>> Sure. I split my patch into the following 4 patches.
>>
>> 1. check-sum offloading
>> 2. TSO support
>> 3. Multi-queue support
>> 4. PCI bus error handling
>>
>> Please apply by this order.
>>
>> Here is the check-sum offloading patch, originally you wrote it.
>> I changed the 'ixl_rx_checksum' function name to 'iavf_rx_checksum'.
>> It looks like a simple mistake. No functional change is intended.
>
> I reviewed and tested the checksum your checksum diff below.
>
> ok jan@
Naito doesn't have an account on cvs. The first diff seems correct
for me too. I'll commit it later.
Also my company IIJ tested the diff which includes the other diffs.
>> diff --git a/sys/dev/pci/if_iavf.c b/sys/dev/pci/if_iavf.c
>> index d573d6725f4..aac22b8f378 100644
>> --- a/sys/dev/pci/if_iavf.c
>> +++ b/sys/dev/pci/if_iavf.c
>> @@ -49,6 +49,7 @@
>> */
>>
>> #include "bpfilter.h"
>> +#include "vlan.h"
>>
>> #include <sys/param.h>
>> #include <sys/systm.h>
>> @@ -75,6 +76,7 @@
>>
>> #include <netinet/in.h>
>> #include <netinet/if_ether.h>
>> +#include <netinet/udp.h>
>>
>> #include <dev/pci/pcireg.h>
>> #include <dev/pci/pcivar.h>
>> @@ -890,11 +892,13 @@ iavf_attach(struct device *parent, struct device *self, void *aux)
>> strlcpy(ifp->if_xname, DEVNAME(sc), IFNAMSIZ);
>> ifq_init_maxlen(&ifp->if_snd, sc->sc_tx_ring_ndescs);
>>
>> - ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
>> -#if 0
>> - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 |
>> - IFCAP_CSUM_UDPv4;
>> + ifp->if_capabilities = IFCAP_VLAN_MTU;
>> +#if NVLAN > 0
>> + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
>> #endif
>> + ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
>> + IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
>> + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
>>
>> ifmedia_init(&sc->sc_media, 0, iavf_media_change, iavf_media_status);
>>
>> @@ -1656,6 +1660,57 @@ iavf_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m)
>> BUS_DMA_STREAMING | BUS_DMA_NOWAIT));
>> }
>>
>> +static uint64_t
>> +iavf_tx_offload(struct mbuf *m)
>> +{
>> + struct ether_extracted ext;
>> + uint64_t hlen;
>> + uint64_t offload = 0;
>> +
>> +#if NVLAN > 0
>> + if (ISSET(m->m_flags, M_VLANTAG)) {
>> + uint64_t vtag = m->m_pkthdr.ether_vtag;
>> + offload |= IAVF_TX_DESC_CMD_IL2TAG1;
>> + offload |= vtag << IAVF_TX_DESC_L2TAG1_SHIFT;
>> + }
>> +#endif
>> +
>> + if (!ISSET(m->m_pkthdr.csum_flags,
>> + M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
>> + return (offload);
>> +
>> + ether_extract_headers(m, &ext);
>> +
>> + if (ext.ip4) {
>> + offload |= ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ?
>> + IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM :
>> + IAVF_TX_DESC_CMD_IIPT_IPV4;
>> +#ifdef INET6
>> + } else if (ext.ip6) {
>> + offload |= IAVF_TX_DESC_CMD_IIPT_IPV6;
>> +#endif
>> + } else {
>> + panic("CSUM_OUT set for non-IP packet");
>> + /* NOTREACHED */
>> + }
>> + hlen = ext.iphlen;
>> +
>> + offload |= (ETHER_HDR_LEN >> 1) << IAVF_TX_DESC_MACLEN_SHIFT;
>> + offload |= (hlen >> 2) << IAVF_TX_DESC_IPLEN_SHIFT;
>> +
>> + if (ext.tcp && ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
>> + offload |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
>> + offload |= (uint64_t)(ext.tcphlen >> 2)
>> + << IAVF_TX_DESC_L4LEN_SHIFT;
>> + } else if (ext.udp && ISSET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
>> + offload |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
>> + offload |= (uint64_t)(sizeof(*ext.udp) >> 2)
>> + << IAVF_TX_DESC_L4LEN_SHIFT;
>> + }
>> +
>> + return offload;
>> +}
>> +
>> static void
>> iavf_start(struct ifqueue *ifq)
>> {
>> @@ -1667,7 +1722,7 @@ iavf_start(struct ifqueue *ifq)
>> bus_dmamap_t map;
>> struct mbuf *m;
>> uint64_t cmd;
>> - uint64_t vlan_cmd;
>> + uint64_t offload;
>> unsigned int prod, free, last, i;
>> unsigned int mask;
>> int post = 0;
>> @@ -1702,6 +1757,8 @@ iavf_start(struct ifqueue *ifq)
>> if (m == NULL)
>> break;
>>
>> + offload = iavf_tx_offload(m);
>> +
>> txm = &txr->txr_maps[prod];
>> map = txm->txm_map;
>>
>> @@ -1714,20 +1771,13 @@ iavf_start(struct ifqueue *ifq)
>> bus_dmamap_sync(sc->sc_dmat, map, 0,
>> map->dm_mapsize, BUS_DMASYNC_PREWRITE);
>>
>> - vlan_cmd = 0;
>> - if (m->m_flags & M_VLANTAG) {
>> - vlan_cmd = IAVF_TX_DESC_CMD_IL2TAG1 |
>> - (((uint64_t)m->m_pkthdr.ether_vtag) <<
>> - IAVF_TX_DESC_L2TAG1_SHIFT);
>> - }
>> -
>> for (i = 0; i < map->dm_nsegs; i++) {
>> txd = &ring[prod];
>>
>> cmd = (uint64_t)map->dm_segs[i].ds_len <<
>> IAVF_TX_DESC_BSIZE_SHIFT;
>> - cmd |= IAVF_TX_DESC_DTYPE_DATA | IAVF_TX_DESC_CMD_ICRC |
>> - vlan_cmd;
>> + cmd |= IAVF_TX_DESC_DTYPE_DATA | IAVF_TX_DESC_CMD_ICRC;
>> + cmd |= offload;
>>
>> htolem64(&txd->addr, map->dm_segs[i].ds_addr);
>> htolem64(&txd->cmd, cmd);
>> @@ -1938,6 +1988,24 @@ iavf_rxr_free(struct iavf_softc *sc, struct iavf_rx_ring *rxr)
>> free(rxr, M_DEVBUF, sizeof(*rxr));
>> }
>>
>> +static void
>> +iavf_rx_checksum(struct mbuf *m, uint64_t word)
>> +{
>> + if (!ISSET(word, IAVF_RX_DESC_L3L4P))
>> + return;
>> +
>> + if (ISSET(word, IAVF_RX_DESC_IPE))
>> + return;
>> +
>> + m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK;
>> +
>> + if (ISSET(word, IAVF_RX_DESC_L4E))
>> + return;
>> +
>> + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK;
>> +}
>> +
>> +
>> static int
>> iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq)
>> {
>> @@ -2002,6 +2070,7 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq)
>> m->m_pkthdr.len += len;
>>
>> if (ISSET(word, IAVF_RX_DESC_EOP)) {
>> +#if NVLAN > 0
>> if (ISSET(word, IAVF_RX_DESC_L2TAG1P)) {
>> vlan = (lemtoh64(&rxd->qword0) &
>> IAVF_RX_DESC_L2TAG1_MASK)
>> @@ -2009,8 +2078,10 @@ iavf_rxeof(struct iavf_softc *sc, struct ifiqueue *ifiq)
>> m->m_pkthdr.ether_vtag = vlan;
>> m->m_flags |= M_VLANTAG;
>> }
>> +#endif
>> if (!ISSET(word,
>> IAVF_RX_DESC_RXE | IAVF_RX_DESC_OVERSIZE)) {
>> + iavf_rx_checksum(m, word);
>> ml_enqueue(&ml, m);
>> } else {
>> ifp->if_ierrors++; /* XXX */
>
>
iavf(4): multi-queue support