Index | Thread | Search

From:
Stefan Fritsch <sf@openbsd.org>
Subject:
Re: vio(4) multi-queue V7
To:
tech@openbsd.org
Date:
Mon, 23 Sep 2024 22:14:34 +0200

Download raw body.

Thread
  • Stefan Fritsch:

    vio(4) multi-queue V6

    • Stefan Fritsch:

      vio(4) multi-queue V7

  • Hi,
    
    below is V7 of the multi-queue diff, rebased so that it applies to -current again, with some
    minor tweaks such as removing an ancient QEMU bug workaround.
    
    Cheers,
    Stefan
    
    
    diff --git a/share/man/man4/vio.4 b/share/man/man4/vio.4
    index e3c713941a0..ce4ba88241d 100644
    --- a/share/man/man4/vio.4
    +++ b/share/man/man4/vio.4
    @@ -34,11 +34,6 @@ Setting the bit 0x2 in the flags disables the RingEventIndex feature.
     This can be tried as a workaround for possible bugs in host implementations of
     .Nm
     at the cost of slightly reduced performance.
    -.Pp
    -Setting the bit 0x100 in the flags forces the interface to be always in
    -promiscuous mode.
    -This can be used as a workaround for a bug in QEMU before version 1.7.2 that
    -prevents packets with a VLAN tag from being sent to the guest.
     .Sh SEE ALSO
     .Xr intro 4 ,
     .Xr virtio 4
    diff --git a/sys/dev/fdt/virtio_mmio.c b/sys/dev/fdt/virtio_mmio.c
    index 604ffcab570..5a22c7f4823 100644
    --- a/sys/dev/fdt/virtio_mmio.c
    +++ b/sys/dev/fdt/virtio_mmio.c
    @@ -103,6 +103,9 @@ void		virtio_mmio_set_status(struct virtio_softc *, int);
     int		virtio_mmio_negotiate_features(struct virtio_softc *,
         const struct virtio_feature_name *);
     int		virtio_mmio_intr(void *);
    +int             virtio_mmio_intr_establish(struct virtio_softc *, struct virtio_attach_args *,
    +    int, struct cpu_info *, int (*)(void *), void *);
    +
     
     struct virtio_mmio_softc {
     	struct virtio_softc	sc_sc;
    @@ -151,6 +154,7 @@ const struct virtio_ops virtio_mmio_ops = {
     	virtio_mmio_set_status,
     	virtio_mmio_negotiate_features,
     	virtio_mmio_intr,
    +	virtio_mmio_intr_establish,
     };
     
     uint16_t
    @@ -522,3 +526,11 @@ virtio_mmio_kick(struct virtio_softc *vsc, uint16_t idx)
     	bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_NOTIFY,
     	    idx);
     }
    +
    +int
    +virtio_mmio_intr_establish(struct virtio_softc *vsc,
    +    struct virtio_attach_args *va, int vec, struct cpu_info *ci,
    +    int (*func)(void *), void *arg)
    +{
    +	return ENXIO;
    +}
    diff --git a/sys/dev/pci/virtio_pci.c b/sys/dev/pci/virtio_pci.c
    index f9c8801ceb7..7c0d6a8451e 100644
    --- a/sys/dev/pci/virtio_pci.c
    +++ b/sys/dev/pci/virtio_pci.c
    @@ -50,7 +50,7 @@
      * XXX: PCI-endian while the device specific registers are native endian.
      */
     
    -#define MAX_MSIX_VECS	8
    +#define MAX_MSIX_VECS	16
     
     struct virtio_pci_softc;
     struct virtio_pci_attach_args;
    @@ -62,7 +62,7 @@ int		virtio_pci_attach_10(struct virtio_pci_softc *sc, struct pci_attach_args *p
     int		virtio_pci_detach(struct device *, int);
     
     void		virtio_pci_kick(struct virtio_softc *, uint16_t);
    -int		virtio_pci_adjust_config_region(struct virtio_pci_softc *);
    +int		virtio_pci_adjust_config_region(struct virtio_pci_softc *, int offset);
     uint8_t		virtio_pci_read_device_config_1(struct virtio_softc *, int);
     uint16_t	virtio_pci_read_device_config_2(struct virtio_softc *, int);
     uint32_t	virtio_pci_read_device_config_4(struct virtio_softc *, int);
    @@ -80,8 +80,9 @@ int		virtio_pci_negotiate_features(struct virtio_softc *, const struct virtio_fe
     int		virtio_pci_negotiate_features_10(struct virtio_softc *, const struct virtio_feature_name *);
     void		virtio_pci_set_msix_queue_vector(struct virtio_pci_softc *, uint32_t, uint16_t);
     void		virtio_pci_set_msix_config_vector(struct virtio_pci_softc *, uint16_t);
    -int		virtio_pci_msix_establish(struct virtio_pci_softc *, struct virtio_pci_attach_args *, int, int (*)(void *), void *);
    +int		virtio_pci_msix_establish(struct virtio_pci_softc *, struct virtio_pci_attach_args *, int, struct cpu_info *, int (*)(void *), void *);
     int		virtio_pci_setup_msix(struct virtio_pci_softc *, struct virtio_pci_attach_args *, int);
    +int		virtio_pci_intr_establish(struct virtio_softc *, struct virtio_attach_args *, int, struct cpu_info *, int (*)(void *), void *);
     void		virtio_pci_free_irqs(struct virtio_pci_softc *);
     int		virtio_pci_poll_intr(void *);
     int		virtio_pci_legacy_intr(void *);
    @@ -98,6 +99,7 @@ enum irq_type {
     	IRQ_NO_MSIX,
     	IRQ_MSIX_SHARED, /* vec 0: config irq, vec 1 shared by all vqs */
     	IRQ_MSIX_PER_VQ, /* vec 0: config irq, vec n: irq of vq[n-1] */
    +	IRQ_MSIX_CHILD,  /* assigned by child driver */
     };
     
     struct virtio_pci_intr {
    @@ -175,6 +177,7 @@ const struct virtio_ops virtio_pci_ops = {
     	virtio_pci_set_status,
     	virtio_pci_negotiate_features,
     	virtio_pci_poll_intr,
    +	virtio_pci_intr_establish,
     };
     
     static inline uint64_t
    @@ -355,8 +358,7 @@ virtio_pci_match(struct device *parent, void *match, void *aux)
     		return 1;
     	/* virtio 1.0 */
     	if (PCI_PRODUCT(pa->pa_id) >= 0x1040 &&
    -	    PCI_PRODUCT(pa->pa_id) <= 0x107f &&
    -	    PCI_REVISION(pa->pa_class) == 1)
    +	    PCI_PRODUCT(pa->pa_id) <= 0x107f)
     		return 1;
     	return 0;
     }
    @@ -588,23 +590,22 @@ virtio_pci_attach(struct device *parent, struct device *self, void *aux)
     	struct pci_attach_args *pa = (struct pci_attach_args *)aux;
     	pci_chipset_tag_t pc = pa->pa_pc;
     	pcitag_t tag = pa->pa_tag;
    -	int revision, ret = ENODEV;
    +	int revision, product, ret = ENODEV;
     	pcireg_t id;
     	char const *intrstr;
     	pci_intr_handle_t ih;
     	struct virtio_pci_attach_args vpa = { { 0 }, pa };
     
     	revision = PCI_REVISION(pa->pa_class);
    -	switch (revision) {
    -	case 0:
    +	product = PCI_PRODUCT(pa->pa_id);
    +	if (revision == 0) {
     		/* subsystem ID shows what I am */
     		id = PCI_PRODUCT(pci_conf_read(pc, tag, PCI_SUBSYS_ID_REG));
    -		break;
    -	case 1:
    -		id = PCI_PRODUCT(pa->pa_id) - 0x1040;
    -		break;
    -	default:
    -		printf("unknown revision 0x%02x; giving up\n", revision);
    +	} else if (product >= 0x1040 && product <= 0x107f) {
    +		id = product - 0x1040;
    +	} else {
    +		printf("unknown device prod 0x%04x rev 0x%02x; giving up\n",
    +		    product, revision);
     		return;
     	}
     
    @@ -633,7 +634,7 @@ virtio_pci_attach(struct device *parent, struct device *self, void *aux)
     
     	vsc->sc_ops = &virtio_pci_ops;
     	if ((vsc->sc_dev.dv_cfdata->cf_flags & VIRTIO_CF_NO_VERSION_1) == 0 &&
    -	    (revision == 1 ||
    +	    (revision >= 1 ||
     	     (vsc->sc_dev.dv_cfdata->cf_flags & VIRTIO_CF_PREFER_VERSION_1))) {
     		ret = virtio_pci_attach_10(sc, pa);
     	}
    @@ -646,9 +647,8 @@ virtio_pci_attach(struct device *parent, struct device *self, void *aux)
     		goto fail_0;
     	}
     
    -	sc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI;
     	sc->sc_irq_type = IRQ_NO_MSIX;
    -	if (virtio_pci_adjust_config_region(sc) != 0)
    +	if (virtio_pci_adjust_config_region(sc, VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI) != 0)
     		goto fail_0;
     
     	virtio_device_reset(vsc);
    @@ -670,7 +670,9 @@ virtio_pci_attach(struct device *parent, struct device *self, void *aux)
     		goto fail_1;
     	}
     
    -	if (virtio_pci_setup_msix(sc, &vpa, 0) == 0) {
    +	if (sc->sc_irq_type == IRQ_MSIX_CHILD) {
    +		intrstr = "msix";
    +	} else if (virtio_pci_setup_msix(sc, &vpa, 0) == 0) {
     		sc->sc_irq_type = IRQ_MSIX_PER_VQ;
     		intrstr = "msix per-VQ";
     	} else if (virtio_pci_setup_msix(sc, &vpa, 1) == 0) {
    @@ -738,11 +740,14 @@ virtio_pci_detach(struct device *self, int flags)
     }
     
     int
    -virtio_pci_adjust_config_region(struct virtio_pci_softc *sc)
    +virtio_pci_adjust_config_region(struct virtio_pci_softc *sc, int offset)
     {
     	if (sc->sc_sc.sc_version_1)
     		return 0;
    -	sc->sc_devcfg_iosize = sc->sc_iosize - sc->sc_devcfg_offset;
    +	if (sc->sc_devcfg_offset == offset)
    +		return 0;
    +	sc->sc_devcfg_offset = offset;
    +	sc->sc_devcfg_iosize = sc->sc_iosize - offset;
     	sc->sc_devcfg_iot = sc->sc_iot;
     	if (bus_space_subregion(sc->sc_iot, sc->sc_ioh, sc->sc_devcfg_offset,
     	    sc->sc_devcfg_iosize, &sc->sc_devcfg_ioh) != 0) {
    @@ -821,6 +826,8 @@ virtio_pci_negotiate_features_10(struct virtio_softc *vsc,
     	uint64_t host, negotiated;
     
     	vsc->sc_driver_features |= VIRTIO_F_VERSION_1;
    +	/* we always use the standard pci busdma_tag */
    +	vsc->sc_driver_features |= VIRTIO_F_ACCESS_PLATFORM;
     	/* notify on empty is 0.9 only */
     	vsc->sc_driver_features &= ~VIRTIO_F_NOTIFY_ON_EMPTY;
     	CWRITE(sc, device_feature_select, 0);
    @@ -937,30 +944,33 @@ virtio_pci_write_device_config_8(struct virtio_softc *vsc,
     
     int
     virtio_pci_msix_establish(struct virtio_pci_softc *sc,
    -    struct virtio_pci_attach_args *vpa, int idx,
    +    struct virtio_pci_attach_args *vpa, int idx, struct cpu_info *ci,
         int (*handler)(void *), void *ih_arg)
     {
     	struct virtio_softc *vsc = &sc->sc_sc;
     	pci_intr_handle_t ih;
    +	int r;
     
     	KASSERT(idx < sc->sc_nintr);
     
    -	if (pci_intr_map_msix(vpa->vpa_pa, idx, &ih) != 0) {
    +	r = pci_intr_map_msix(vpa->vpa_pa, idx, &ih);
    +	if (r != 0) {
     #if VIRTIO_DEBUG
     		printf("%s[%d]: pci_intr_map_msix failed\n",
     		    vsc->sc_dev.dv_xname, idx);
     #endif
    -		return 1;
    +		return r;
     	}
     	snprintf(sc->sc_intr[idx].name, sizeof(sc->sc_intr[idx].name), "%s:%d",
     	    vsc->sc_child->dv_xname, idx);
    -	sc->sc_intr[idx].ih = pci_intr_establish(sc->sc_pc, ih, vsc->sc_ipl,
    -	    handler, ih_arg, sc->sc_intr[idx].name);
    +	sc->sc_intr[idx].ih = pci_intr_establish_cpu(sc->sc_pc, ih, vsc->sc_ipl,
    +	    ci, handler, ih_arg, sc->sc_intr[idx].name);
     	if (sc->sc_intr[idx].ih == NULL) {
     		printf("%s[%d]: couldn't establish msix interrupt\n",
    -		    vsc->sc_dev.dv_xname, idx);
    -		return 1;
    +		    vsc->sc_child->dv_xname, idx);
    +		return ENOMEM;
     	}
    +	virtio_pci_adjust_config_region(sc, VIRTIO_CONFIG_DEVICE_CONFIG_MSI);
     	return 0;
     }
     
    @@ -1010,8 +1020,8 @@ virtio_pci_free_irqs(struct virtio_pci_softc *sc)
     		}
     	}
     
    -	sc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI;
    -	virtio_pci_adjust_config_region(sc);
    +	/* XXX msix_delroute does not unset PCI_MSIX_MC_MSIXE -> leave alone? */
    +	virtio_pci_adjust_config_region(sc, VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI);
     }
     
     int
    @@ -1019,34 +1029,33 @@ virtio_pci_setup_msix(struct virtio_pci_softc *sc,
         struct virtio_pci_attach_args *vpa, int shared)
     {
     	struct virtio_softc *vsc = &sc->sc_sc;
    -	int i;
    +	int i, r = 0;
     
     	/* Shared needs config + queue */
     	if (shared && vpa->vpa_va.va_nintr < 1 + 1)
    -		return 1;
    +		return ERANGE;
     	/* Per VQ needs config + N * queue */
     	if (!shared && vpa->vpa_va.va_nintr < 1 + vsc->sc_nvqs)
    -		return 1;
    +		return ERANGE;
     
    -	if (virtio_pci_msix_establish(sc, vpa, 0, virtio_pci_config_intr, vsc))
    -		return 1;
    -	sc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_MSI;
    -	virtio_pci_adjust_config_region(sc);
    +	r = virtio_pci_msix_establish(sc, vpa, 0, NULL, virtio_pci_config_intr, vsc);
    +	if (r != 0)
    +		return r;
     
     	if (shared) {
    -		if (virtio_pci_msix_establish(sc, vpa, 1,
    -		    virtio_pci_shared_queue_intr, vsc)) {
    +		r = virtio_pci_msix_establish(sc, vpa, 1, NULL,
    +		    virtio_pci_shared_queue_intr, vsc);
    +		if (r != 0)
     			goto fail;
    -		}
     
     		for (i = 0; i < vsc->sc_nvqs; i++)
     			vsc->sc_vqs[i].vq_intr_vec = 1;
     	} else {
     		for (i = 0; i < vsc->sc_nvqs; i++) {
    -			if (virtio_pci_msix_establish(sc, vpa, i + 1,
    -			    virtio_pci_queue_intr, &vsc->sc_vqs[i])) {
    +			r = virtio_pci_msix_establish(sc, vpa, i + 1, NULL,
    +			    virtio_pci_queue_intr, &vsc->sc_vqs[i]);
    +			if (r != 0)
     				goto fail;
    -			}
     			vsc->sc_vqs[i].vq_intr_vec = i + 1;
     		}
     	}
    @@ -1054,7 +1063,28 @@ virtio_pci_setup_msix(struct virtio_pci_softc *sc,
     	return 0;
     fail:
     	virtio_pci_free_irqs(sc);
    -	return 1;
    +	return r;
    +}
    +
    +int
    +virtio_pci_intr_establish(struct virtio_softc *vsc,
    +    struct virtio_attach_args *va, int vec, struct cpu_info *ci,
    +    int (*func)(void *), void *arg)
    +{
    +	struct virtio_pci_attach_args *vpa;
    +	struct virtio_pci_softc *sc;
    +
    +	if (vsc->sc_ops != &virtio_pci_ops)
    +		return ENXIO;
    +
    +	vpa = (struct virtio_pci_attach_args *)va;
    +	sc = (struct virtio_pci_softc *)vsc;
    +
    +	if (vec >= sc->sc_nintr || sc->sc_nintr <= 1)
    +		return ERANGE;
    +
    +	sc->sc_irq_type = IRQ_MSIX_CHILD;
    +	return virtio_pci_msix_establish(sc, vpa, vec, ci, func, arg);
     }
     
     /*
    diff --git a/sys/dev/pv/if_vio.c b/sys/dev/pv/if_vio.c
    index 9b6a5452d1b..918fb2288f4 100644
    --- a/sys/dev/pv/if_vio.c
    +++ b/sys/dev/pv/if_vio.c
    @@ -32,7 +32,9 @@
     #include <sys/param.h>
     #include <sys/systm.h>
     #include <sys/device.h>
    +#include <sys/intrmap.h>
     #include <sys/mbuf.h>
    +#include <sys/mutex.h>
     #include <sys/sockio.h>
     #include <sys/timeout.h>
     
    @@ -63,8 +65,15 @@
      * if_vioreg.h:
      */
     /* Configuration registers */
    -#define VIRTIO_NET_CONFIG_MAC		0 /* 8bit x 6byte */
    -#define VIRTIO_NET_CONFIG_STATUS	6 /* 16bit */
    +#define VIRTIO_NET_CONFIG_MAC		 0 /*  8 bit x 6 byte */
    +#define VIRTIO_NET_CONFIG_STATUS	 6 /* 16 bit */
    +#define VIRTIO_NET_CONFIG_MAX_QUEUES	 8 /* 16 bit */
    +#define VIRTIO_NET_CONFIG_MTU		10 /* 16 bit */
    +#define VIRTIO_NET_CONFIG_SPEED		12 /* 32 bit */
    +#define VIRTIO_NET_CONFIG_DUPLEX	16 /*  8 bit */
    +#define VIRTIO_NET_CONFIG_RSS_SIZE	17 /*  8 bit */
    +#define VIRTIO_NET_CONFIG_RSS_LEN	18 /* 16 bit */
    +#define VIRTIO_NET_CONFIG_HASH_TYPES	20 /* 16 bit */
     
     /* Feature bits */
     #define VIRTIO_NET_F_CSUM			(1ULL<<0)
    @@ -97,12 +106,6 @@
     #define VIRTIO_NET_F_RSC_EXT			(1ULL<<61)
     #define VIRTIO_NET_F_STANDBY			(1ULL<<62)
     #define VIRTIO_NET_F_SPEED_DUPLEX		(1ULL<<63)
    -/*
    - * Config(8) flags. The lowest byte is reserved for generic virtio stuff.
    - */
    -
    -/* Workaround for vlan related bug in qemu < version 2.0 */
    -#define CONFFLAG_QEMU_VLAN_BUG		(1<<8)
     
     static const struct virtio_feature_name virtio_net_feature_names[] = {
     #if VIRTIO_DEBUG
    @@ -182,6 +185,11 @@ struct virtio_net_ctrl_cmd {
     # define VIRTIO_NET_CTRL_VLAN_ADD	0
     # define VIRTIO_NET_CTRL_VLAN_DEL	1
     
    +#define VIRTIO_NET_CTRL_MQ		4
    +# define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET	0
    +# define VIRTIO_NET_CTRL_MQ_RSS_CONFIG		1
    +# define VIRTIO_NET_CTRL_MQ_HASH_CONFIG		2
    +
     #define VIRTIO_NET_CTRL_GUEST_OFFLOADS	5
     # define VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET	0
     
    @@ -195,6 +203,12 @@ struct virtio_net_ctrl_rx {
     	uint8_t	onoff;
     } __packed;
     
    +struct virtio_net_ctrl_mq_pairs_set {
    +	uint16_t virtqueue_pairs;
    +};
    +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN	1
    +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX	0x8000
    +
     struct virtio_net_ctrl_guest_offloads {
     	uint64_t offloads;
     } __packed;
    @@ -224,9 +238,13 @@ struct vio_queue {
     	struct mbuf		**viq_rxmbufs;
     	struct mbuf		**viq_txmbufs;
     	struct if_rxring	  viq_rxring;
    +	struct ifiqueue		 *viq_ifiq;
    +	struct ifqueue		 *viq_ifq;
     	struct virtqueue	 *viq_rxvq;
     	struct virtqueue	 *viq_txvq;
    -};
    +	struct mutex		  viq_txmtx, viq_rxmtx;
    +	int			  viq_txfree_slots;
    +} __aligned(64);
     
     struct vio_softc {
     	struct device		sc_dev;
    @@ -246,16 +264,20 @@ struct vio_softc {
     	caddr_t			sc_dma_kva;
     
     	int			sc_hdr_size;
    -	struct virtio_net_ctrl_cmd *sc_ctrl_cmd;
    -	struct virtio_net_ctrl_status *sc_ctrl_status;
    -	struct virtio_net_ctrl_rx *sc_ctrl_rx;
    -	struct virtio_net_ctrl_guest_offloads *sc_ctrl_guest_offloads;
    -	struct virtio_net_ctrl_mac_tbl *sc_ctrl_mac_tbl_uc;
    +	struct virtio_net_ctrl_cmd		*sc_ctrl_cmd;
    +	struct virtio_net_ctrl_status		*sc_ctrl_status;
    +	struct virtio_net_ctrl_rx		*sc_ctrl_rx;
    +	struct virtio_net_ctrl_mq_pairs_set	*sc_ctrl_mq_pairs;
    +	struct virtio_net_ctrl_guest_offloads	*sc_ctrl_guest_offloads;
    +	struct virtio_net_ctrl_mac_tbl		*sc_ctrl_mac_tbl_uc;
     #define sc_ctrl_mac_info sc_ctrl_mac_tbl_uc
    -	struct virtio_net_ctrl_mac_tbl *sc_ctrl_mac_tbl_mc;
    +	struct virtio_net_ctrl_mac_tbl		*sc_ctrl_mac_tbl_mc;
     
    +	struct intrmap		*sc_intrmap;
     	struct vio_queue	*sc_q;
     	uint16_t		sc_nqueues;
    +	int			sc_tx_slots_per_req;
    +	int			sc_rx_mbuf_size;
     
     	enum vio_ctrl_state	sc_ctrl_inuse;
     
    @@ -285,7 +307,7 @@ void	vio_attach(struct device *, struct device *, void *);
     /* ifnet interface functions */
     int	vio_init(struct ifnet *);
     void	vio_stop(struct ifnet *, int);
    -void	vio_start(struct ifnet *);
    +void	vio_start(struct ifqueue *);
     int	vio_ioctl(struct ifnet *, u_long, caddr_t);
     void	vio_get_lladdr(struct arpcom *ac, struct virtio_softc *vsc);
     void	vio_put_lladdr(struct arpcom *ac, struct virtio_softc *vsc);
    @@ -301,15 +323,21 @@ void	vio_rxtick(void *);
     
     /* tx */
     int	vio_tx_intr(struct virtqueue *);
    +int	vio_tx_dequeue(struct virtqueue *);
     int	vio_txeof(struct virtqueue *);
     void	vio_tx_drain(struct vio_softc *);
     int	vio_encap(struct vio_queue *, int, struct mbuf *);
     void	vio_txtick(void *);
     
    +int	vio_queue_intr(void *);
    +int	vio_config_intr(void *);
    +int	vio_ctrl_intr(void *);
    +
     /* other control */
     void	vio_link_state(struct ifnet *);
     int	vio_config_change(struct virtio_softc *);
     int	vio_ctrl_rx(struct vio_softc *, int, int);
    +int	vio_ctrl_mq(struct vio_softc *);
     int	vio_ctrl_guest_offloads(struct vio_softc *, uint64_t);
     int	vio_set_rx_filter(struct vio_softc *);
     void	vio_iff(struct vio_softc *);
    @@ -397,6 +425,8 @@ vio_free_dmamem(struct vio_softc *sc)
      *   sc_ctrl_status:	 return value for a command via ctrl vq (READ)
      *   sc_ctrl_rx:	 parameter for a VIRTIO_NET_CTRL_RX class command
      *			 (WRITE)
    + *   sc_ctrl_mq_pairs: set number of rx/tx queue pairs (WRITE)
    + *   sc_ctrl_guest_offloads: configure offload features (WRITE)
      *   sc_ctrl_mac_tbl_uc: unicast MAC address filter for a VIRTIO_NET_CTRL_MAC
      *			 class command (WRITE)
      *   sc_ctrl_mac_tbl_mc: multicast MAC address filter for a VIRTIO_NET_CTRL_MAC
    @@ -438,6 +468,7 @@ vio_alloc_mem(struct vio_softc *sc, int tx_max_segments)
     		allocsize += sizeof(struct virtio_net_ctrl_cmd) * 1;
     		allocsize += sizeof(struct virtio_net_ctrl_status) * 1;
     		allocsize += sizeof(struct virtio_net_ctrl_rx) * 1;
    +		allocsize += sizeof(struct virtio_net_ctrl_mq_pairs_set) * 1;
     		allocsize += sizeof(struct virtio_net_ctrl_guest_offloads) * 1;
     		allocsize += VIO_CTRL_MAC_INFO_SIZE;
     	}
    @@ -463,6 +494,8 @@ vio_alloc_mem(struct vio_softc *sc, int tx_max_segments)
     		offset += sizeof(*sc->sc_ctrl_status);
     		sc->sc_ctrl_rx = (void *)(kva + offset);
     		offset += sizeof(*sc->sc_ctrl_rx);
    +		sc->sc_ctrl_mq_pairs = (void *)(kva + offset);
    +		offset += sizeof(*sc->sc_ctrl_mq_pairs);
     		sc->sc_ctrl_guest_offloads = (void *)(kva + offset);
     		offset += sizeof(*sc->sc_ctrl_guest_offloads);
     		sc->sc_ctrl_mac_tbl_uc = (void *)(kva + offset);
    @@ -492,8 +525,9 @@ vio_alloc_mem(struct vio_softc *sc, int tx_max_segments)
     		vioq->viq_txmbufs = vioq->viq_rxmbufs + rxqsize;
     
     		for (i = 0; i < rxqsize; i++) {
    -			r = bus_dmamap_create(vsc->sc_dmat, MAXMCLBYTES,
    -			    MAXMCLBYTES/PAGE_SIZE + 1, MCLBYTES, 0,
    +			r = bus_dmamap_create(vsc->sc_dmat,
    +			    sc->sc_rx_mbuf_size + sc->sc_hdr_size, 2,
    +			    sc->sc_rx_mbuf_size, 0,
     			    BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW,
     			    &vioq->viq_rxdmamaps[i]);
     			if (r != 0)
    @@ -585,7 +619,8 @@ vio_attach(struct device *parent, struct device *self, void *aux)
     {
     	struct vio_softc *sc = (struct vio_softc *)self;
     	struct virtio_softc *vsc = (struct virtio_softc *)parent;
    -	int i, tx_max_segments;
    +	struct virtio_attach_args *va = aux;
    +	int i, r, tx_max_segments;
     	struct ifnet *ifp = &sc->sc_ac.ac_if;
     
     	if (vsc->sc_child != NULL) {
    @@ -597,13 +632,16 @@ vio_attach(struct device *parent, struct device *self, void *aux)
     	sc->sc_virtio = vsc;
     
     	vsc->sc_child = self;
    -	vsc->sc_ipl = IPL_NET;
    +	vsc->sc_ipl = IPL_NET | IPL_MPSAFE;
     	vsc->sc_config_change = NULL;
     	vsc->sc_driver_features = VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS |
     	    VIRTIO_NET_F_CTRL_VQ | VIRTIO_NET_F_CTRL_RX |
     	    VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_CSUM |
     	    VIRTIO_F_RING_EVENT_IDX | VIRTIO_NET_F_GUEST_CSUM;
     
    +	if (va->va_nintr > 3)
    +		vsc->sc_driver_features |= VIRTIO_NET_F_MQ;
    +
     	vsc->sc_driver_features |= VIRTIO_NET_F_HOST_TSO4;
     	vsc->sc_driver_features |= VIRTIO_NET_F_HOST_TSO6;
     
    @@ -613,10 +651,23 @@ vio_attach(struct device *parent, struct device *self, void *aux)
     
     	virtio_negotiate_features(vsc, virtio_net_feature_names);
     
    -	sc->sc_nqueues = 1;
    -	vsc->sc_nvqs = 2 * sc->sc_nqueues;
    -	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ))
    -		vsc->sc_nvqs++;
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_MQ)) {
    +		i = virtio_read_device_config_2(vsc,
    +		    VIRTIO_NET_CONFIG_MAX_QUEUES);
    +		vsc->sc_nvqs = 2 * i + 1;
    +		i = MIN(i, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX);
    +		sc->sc_intrmap = intrmap_create(&sc->sc_dev, i,
    +		    va->va_nintr - 2, 0);
    +		sc->sc_nqueues = intrmap_count(sc->sc_intrmap);
    +		printf(": %u queue%s", sc->sc_nqueues,
    +		    sc->sc_nqueues > 1 ? "s"  : "");
    +	} else {
    +		sc->sc_nqueues = 1;
    +		printf(": 1 queue");
    +		vsc->sc_nvqs = 2;
    +		if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ))
    +			vsc->sc_nvqs++;
    +	}
     
     	vsc->sc_vqs = mallocarray(vsc->sc_nvqs, sizeof(*vsc->sc_vqs), M_DEVBUF,
     	    M_WAITOK|M_ZERO);
    @@ -644,10 +695,36 @@ vio_attach(struct device *parent, struct device *self, void *aux)
     	} else {
     		sc->sc_hdr_size = offsetof(struct virtio_net_hdr, num_buffers);
     	}
    +
    +	ifp->if_capabilities = 0;
    +	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    +	ifp->if_xflags = IFXF_MPSAFE;
    +#if NVLAN > 0
    +	ifp->if_capabilities |= IFCAP_VLAN_MTU;
    +	ifp->if_capabilities |= IFCAP_VLAN_HWOFFLOAD;
    +#endif
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_CSUM))
    +		ifp->if_capabilities |= IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4|
    +		    IFCAP_CSUM_TCPv6|IFCAP_CSUM_UDPv6;
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_HOST_TSO4))
    +		ifp->if_capabilities |= IFCAP_TSOv4;
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_HOST_TSO6))
    +		ifp->if_capabilities |= IFCAP_TSOv6;
    +
    +	sc->sc_rx_mbuf_size = MCLBYTES;
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
    +	    (virtio_has_feature(vsc, VIRTIO_NET_F_GUEST_TSO4) ||
    +	     virtio_has_feature(vsc, VIRTIO_NET_F_GUEST_TSO6))) {
    +		ifp->if_xflags |= IFXF_LRO;
    +		ifp->if_capabilities |= IFCAP_LRO;
    +		sc->sc_rx_mbuf_size = 4 * 1024;
    +	}
    +
     	if (virtio_has_feature(vsc, VIRTIO_NET_F_MRG_RXBUF))
     		ifp->if_hardmtu = MAXMCLBYTES;
     	else
    -		ifp->if_hardmtu = MCLBYTES - sc->sc_hdr_size - ETHER_HDR_LEN;
    +		ifp->if_hardmtu = sc->sc_rx_mbuf_size - sc->sc_hdr_size -
    +		    ETHER_HDR_LEN;
     
     	/* defrag for longer mbuf chains */
     	tx_max_segments = 16;
    @@ -660,11 +737,18 @@ vio_attach(struct device *parent, struct device *self, void *aux)
     		tx_max_segments = 32;
     	}
     
    +	if (virtio_has_feature(vsc, VIRTIO_F_RING_INDIRECT_DESC))
    +		sc->sc_tx_slots_per_req = 1;
    +	else
    +		sc->sc_tx_slots_per_req = tx_max_segments + 1;
    +
     	for (i = 0; i < sc->sc_nqueues; i++) {
     		int vqidx = 2 * i;
     		struct vio_queue *vioq = &sc->sc_q[i];
     
     		vioq->viq_rxvq = &vsc->sc_vqs[vqidx];
    +		mtx_init(&vioq->viq_txmtx, IPL_NET);
    +		mtx_init(&vioq->viq_rxmtx, IPL_NET);
     		vioq->viq_sc = sc;
     		if (virtio_alloc_vq(vsc, vioq->viq_rxvq, vqidx, 2, "rx") != 0)
     			goto err;
    @@ -682,57 +766,102 @@ vio_attach(struct device *parent, struct device *self, void *aux)
     			virtio_postpone_intr_far(vioq->viq_txvq);
     		else
     			virtio_stop_vq_intr(vsc, vioq->viq_txvq);
    +		vioq->viq_txfree_slots = vioq->viq_txvq->vq_num - 1;
    +		KASSERT(vioq->viq_txfree_slots > sc->sc_tx_slots_per_req);
    +		if (vioq->viq_txvq->vq_num != sc->sc_q[0].viq_txvq->vq_num) {
    +			printf("inequal tx queue size %d: %d != %d\n", i,
    +			    vioq->viq_txvq->vq_num,
    +			    sc->sc_q[0].viq_txvq->vq_num);
    +			goto err;
    +		}
    +		DPRINTF("%d: q %p rx %p tx %p\n", i, vioq, vioq->viq_rxvq,
    +		    vioq->viq_txvq);
    +
    +		if (sc->sc_intrmap != NULL) {
    +			vioq->viq_rxvq->vq_intr_vec = i + 2;
    +			vioq->viq_txvq->vq_intr_vec = i + 2;
    +		}
     	}
     
     	/* control queue */
     	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ)) {
    -		sc->sc_ctl_vq = &vsc->sc_vqs[2];
    -		if (virtio_alloc_vq(vsc, sc->sc_ctl_vq, 2, 1,
    -		    "control") != 0)
    +		i = 2;
    +		if (virtio_has_feature(vsc, VIRTIO_NET_F_MQ)) {
    +			i = 2 * virtio_read_device_config_2(vsc,
    +			    VIRTIO_NET_CONFIG_MAX_QUEUES);
    +		}
    +		sc->sc_ctl_vq =  &vsc->sc_vqs[i];
    +		if (virtio_alloc_vq(vsc, sc->sc_ctl_vq, i, 1, "control") != 0)
     			goto err;
     		sc->sc_ctl_vq->vq_done = vio_ctrleof;
    +		if (sc->sc_intrmap != NULL)
    +			sc->sc_ctl_vq->vq_intr_vec = 1;
     		virtio_start_vq_intr(vsc, sc->sc_ctl_vq);
     	}
     
    +	if (sc->sc_intrmap) {
    +		r = virtio_intr_establish(vsc, va, 0, NULL, vio_config_intr,
    +		    vsc);
    +		if (r != 0) {
    +			printf("%s: cannot alloc config intr: %d\n",
    +			    sc->sc_dev.dv_xname, r);
    +			goto err;
    +		}
    +		r = virtio_intr_establish(vsc, va, 1, NULL, vio_ctrl_intr,
    +		    sc->sc_ctl_vq);
    +		if (r != 0) {
    +			printf("%s: cannot alloc ctrl intr: %d\n",
    +			    sc->sc_dev.dv_xname, r);
    +			goto err;
    +		}
    +		for (i = 0; i < sc->sc_nqueues; i++) {
    +			struct cpu_info *ci = NULL;
    +			ci = intrmap_cpu(sc->sc_intrmap, i);
    +			r = virtio_intr_establish(vsc, va, i + 2, ci,
    +			    vio_queue_intr, &sc->sc_q[i]);
    +			if (r != 0) {
    +				printf("%s: cannot alloc q%d intr: %d\n",
    +				    sc->sc_dev.dv_xname, i, r);
    +				goto err;
    +			}
    +		}
    +	}
    +
     	if (vio_alloc_mem(sc, tx_max_segments) < 0)
     		goto err;
     
     	strlcpy(ifp->if_xname, self->dv_xname, IFNAMSIZ);
     	ifp->if_softc = sc;
    -	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    -	ifp->if_start = vio_start;
    +	ifp->if_qstart = vio_start;
     	ifp->if_ioctl = vio_ioctl;
    -	ifp->if_capabilities = 0;
    -#if NVLAN > 0
    -	ifp->if_capabilities |= IFCAP_VLAN_MTU;
    -	ifp->if_capabilities |= IFCAP_VLAN_HWOFFLOAD;
    -#endif
    -	if (virtio_has_feature(vsc, VIRTIO_NET_F_CSUM))
    -		ifp->if_capabilities |= IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4|
    -		    IFCAP_CSUM_TCPv6|IFCAP_CSUM_UDPv6;
    -	if (virtio_has_feature(vsc, VIRTIO_NET_F_HOST_TSO4))
    -		ifp->if_capabilities |= IFCAP_TSOv4;
    -	if (virtio_has_feature(vsc, VIRTIO_NET_F_HOST_TSO6))
    -		ifp->if_capabilities |= IFCAP_TSOv6;
    -
    -	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
    -	    (virtio_has_feature(vsc, VIRTIO_NET_F_GUEST_TSO4) ||
    -	     virtio_has_feature(vsc, VIRTIO_NET_F_GUEST_TSO6))) {
    -		ifp->if_xflags |= IFXF_LRO;
    -		ifp->if_capabilities |= IFCAP_LRO;
    -	}
     
     	ifq_init_maxlen(&ifp->if_snd, vsc->sc_vqs[1].vq_num - 1);
     	ifmedia_init(&sc->sc_media, 0, vio_media_change, vio_media_status);
     	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
     	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
     	vsc->sc_config_change = vio_config_change;
    -	timeout_set(&sc->sc_txtick, vio_txtick, sc->sc_q[0].viq_txvq);
    -	timeout_set(&sc->sc_rxtick, vio_rxtick, sc->sc_q[0].viq_rxvq);
    +	timeout_set(&sc->sc_txtick, vio_txtick, sc);
    +	timeout_set(&sc->sc_rxtick, vio_rxtick, sc);
     
     	virtio_set_status(vsc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK);
    +
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_MQ)) {
    +		/* ctrl queue works only after DRIVER_OK */
    +		vio_ctrl_mq(sc);
    +	}
    +
     	if_attach(ifp);
     	ether_ifattach(ifp);
    +	vio_link_state(ifp);
    +
    +	if_attach_queues(ifp, sc->sc_nqueues);
    +	if_attach_iqueues(ifp, sc->sc_nqueues);
    +
    +	for (i = 0; i < sc->sc_nqueues; i++) {
    +		ifp->if_ifqs[i]->ifq_softc = &sc->sc_q[i];
    +		sc->sc_q[i].viq_ifq = ifp->if_ifqs[i];
    +		sc->sc_q[i].viq_ifiq = ifp->if_iqs[i];
    +	}
     
     	return;
     
    @@ -766,12 +895,41 @@ vio_link_state(struct ifnet *ifp)
     	}
     }
     
    +/* interrupt handlers for multi-queue */
    +int
    +vio_queue_intr(void *arg)
    +{
    +	struct vio_queue *vioq = arg;
    +	struct virtio_softc *vsc = vioq->viq_sc->sc_virtio;
    +	int r;
    +	r = virtio_check_vq(vsc, vioq->viq_txvq);
    +	r |= virtio_check_vq(vsc, vioq->viq_rxvq);
    +	return r;
    +}
    +
    +int
    +vio_config_intr(void *arg)
    +{
    +	struct virtio_softc *vsc = arg;
    +	return vio_config_change(vsc);
    +}
    +
    +int
    +vio_ctrl_intr(void *arg)
    +{
    +	struct virtqueue *vq = arg;
    +	return virtio_check_vq(vq->vq_owner, vq);
    +}
    +
    +
     int
     vio_config_change(struct virtio_softc *vsc)
     {
     	struct vio_softc *sc = (struct vio_softc *)vsc->sc_child;
    +	KERNEL_LOCK();
     	vio_link_state(&sc->sc_ac.ac_if);
     	vio_needs_reset(sc);
    +	KERNEL_UNLOCK();
     	return 1;
     }
     
    @@ -807,12 +965,14 @@ vio_init(struct ifnet *ifp)
     	for (qidx = 0; qidx < sc->sc_nqueues; qidx++) {
     		struct vio_queue *vioq = &sc->sc_q[qidx];
     
    +		mtx_enter(&vioq->viq_rxmtx);
     		if_rxr_init(&vioq->viq_rxring,
    -		    2 * ((ifp->if_hardmtu / MCLBYTES) + 1),
    +		    2 * ((ifp->if_hardmtu / sc->sc_rx_mbuf_size) + 1),
     		    vioq->viq_rxvq->vq_num);
     		vio_populate_rx_mbufs(sc, vioq);
    +		ifq_clr_oactive(vioq->viq_ifq);
    +		mtx_leave(&vioq->viq_rxmtx);
     	}
    -	ifq_clr_oactive(&ifp->if_snd);
     	vio_iff(sc);
     	vio_link_state(ifp);
     
    @@ -847,11 +1007,13 @@ vio_stop(struct ifnet *ifp, int disable)
     	CLR(ifp->if_flags, IFF_RUNNING);
     	timeout_del(&sc->sc_txtick);
     	timeout_del(&sc->sc_rxtick);
    -	ifq_clr_oactive(&ifp->if_snd);
     	/* only way to stop I/O and DMA is resetting... */
     	virtio_reset(vsc);
    -	for (i = 0; i < sc->sc_nqueues; i++)
    +	for (i = 0; i < sc->sc_nqueues; i++) {
    +		mtx_enter(&sc->sc_q[i].viq_rxmtx);
     		vio_rxeof(&sc->sc_q[i]);
    +		mtx_leave(&sc->sc_q[i].viq_rxmtx);
    +	}
     
     	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ))
     		vio_ctrl_wakeup(sc, RESET);
    @@ -867,6 +1029,8 @@ vio_stop(struct ifnet *ifp, int disable)
     	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ))
     		virtio_start_vq_intr(vsc, sc->sc_ctl_vq);
     	virtio_reinit_end(vsc);
    +	if (virtio_has_feature(vsc, VIRTIO_NET_F_MQ))
    +		vio_ctrl_mq(sc);
     	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ))
     		vio_ctrl_wakeup(sc, FREE);
     }
    @@ -953,35 +1117,42 @@ vio_tx_offload(struct virtio_net_hdr *hdr, struct mbuf *m)
     }
     
     void
    -vio_start(struct ifnet *ifp)
    +vio_start(struct ifqueue *viq_ifq)
     {
    +	struct ifnet *ifp = viq_ifq->ifq_if;
    +	struct vio_queue *vioq = viq_ifq->ifq_softc;
     	struct vio_softc *sc = ifp->if_softc;
     	struct virtio_softc *vsc = sc->sc_virtio;
    -	struct vio_queue *vioq = &sc->sc_q[0];
     	struct virtqueue *vq = vioq->viq_txvq;
     	struct mbuf *m;
    -	int queued = 0;
    +	int queued = 0, free_slots, used_slots;
     
    -	vio_txeof(vq);
    +	mtx_enter(&vioq->viq_txmtx);
    +	vio_tx_dequeue(vq);
     
    -	if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(&ifp->if_snd))
    -		return;
    -	if (ifq_empty(&ifp->if_snd))
    -		return;
     
     again:
    +	free_slots = vioq->viq_txfree_slots;
    +	KASSERT(free_slots >= 0);
    +	used_slots = 0;
     	for (;;) {
     		int slot, r;
     		struct virtio_net_hdr *hdr;
     
    -		m = ifq_deq_begin(&ifp->if_snd);
    +		if (free_slots - used_slots < sc->sc_tx_slots_per_req) {
    +			ifq_set_oactive(viq_ifq);
    +			break;
    +		}
    +
    +		m = ifq_dequeue(viq_ifq);
     		if (m == NULL)
     			break;
     
     		r = virtio_enqueue_prep(vq, &slot);
     		if (r == EAGAIN) {
    -			ifq_deq_rollback(&ifp->if_snd, m);
    -			ifq_set_oactive(&ifp->if_snd);
    +			printf("%s: virtio_enqueue_prep failed?\n", __func__);
    +			m_freem(m);
    +			viq_ifq->ifq_errors++;
     			break;
     		}
     		if (r != 0)
    @@ -995,22 +1166,27 @@ again:
     		r = vio_encap(vioq, slot, m);
     		if (r != 0) {
     			virtio_enqueue_abort(vq, slot);
    -			ifq_deq_commit(&ifp->if_snd, m);
     			m_freem(m);
    -			ifp->if_oerrors++;
    +			viq_ifq->ifq_errors++;
     			continue;
     		}
     		r = virtio_enqueue_reserve(vq, slot,
     		    vioq->viq_txdmamaps[slot]->dm_nsegs + 1);
     		if (r != 0) {
    +			printf("%s: virtio_enqueue_reserve failed?\n",
    +			    __func__);
    +			m_freem(m);
    +			viq_ifq->ifq_errors++;
     			bus_dmamap_unload(vsc->sc_dmat,
     			    vioq->viq_txdmamaps[slot]);
    -			ifq_deq_rollback(&ifp->if_snd, m);
     			vioq->viq_txmbufs[slot] = NULL;
    -			ifq_set_oactive(&ifp->if_snd);
     			break;
     		}
    -		ifq_deq_commit(&ifp->if_snd, m);
    +		if (sc->sc_tx_slots_per_req == 1)
    +			used_slots++;
    +		else
    +			used_slots += vioq->viq_txdmamaps[slot]->dm_nsegs + 1;
    +
     
     		bus_dmamap_sync(vsc->sc_dmat, vioq->viq_txdmamaps[slot], 0,
     		    vioq->viq_txdmamaps[slot]->dm_mapsize,
    @@ -1024,14 +1200,22 @@ again:
     			bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
     #endif
     	}
    -	if (ifq_is_oactive(&ifp->if_snd)) {
    +	 if (used_slots > 0) {
    +		if (used_slots > vioq->viq_txfree_slots)
    +			printf("%s: used_slots %d viq_txfree_slots %d "
    +			    "free_slots %d\n", __func__, used_slots,
    +			    vioq->viq_txfree_slots, free_slots);
    +		vioq->viq_txfree_slots -= used_slots;
    +		KASSERT(vioq->viq_txfree_slots >= 0);
    +	}
    +	if (ifq_is_oactive(viq_ifq)) {
     		int r;
     		if (virtio_has_feature(vsc, VIRTIO_F_RING_EVENT_IDX))
    -			r = virtio_postpone_intr_smart(vioq->viq_txvq);
    +			r = virtio_postpone_intr_smart(vq);
     		else
    -			r = virtio_start_vq_intr(vsc, vioq->viq_txvq);
    +			r = virtio_start_vq_intr(vsc, vq);
     		if (r) {
    -			vio_txeof(vq);
    +			vio_tx_dequeue(vq);
     			goto again;
     		}
     	}
    @@ -1040,6 +1224,7 @@ again:
     		virtio_notify(vsc, vq);
     		timeout_add_sec(&sc->sc_txtick, 1);
     	}
    +	mtx_leave(&vioq->viq_txmtx);
     }
     
     #if VIRTIO_DEBUG
    @@ -1047,19 +1232,20 @@ void
     vio_dump(struct vio_softc *sc)
     {
     	struct ifnet *ifp = &sc->sc_ac.ac_if;
    -	struct virtio_softc *vsc = sc->sc_virtio;
     	int i;
     
     	printf("%s status dump:\n", ifp->if_xname);
     	printf("tx tick active: %d\n", !timeout_triggered(&sc->sc_txtick));
    +	printf("max tx slots per req %d\n", sc->sc_tx_slots_per_req);
     	printf("rx tick active: %d\n", !timeout_triggered(&sc->sc_rxtick));
     	for (i = 0; i < sc->sc_nqueues; i++) {
     		printf("%d: TX virtqueue:\n", i);
    +		printf("  tx free slots %d\n", sc->sc_q[i].viq_txfree_slots);
     		virtio_vq_dump(sc->sc_q[i].viq_txvq);
     		printf("%d: RX virtqueue:\n", i);
     		virtio_vq_dump(sc->sc_q[i].viq_rxvq);
     	}
    -	if (virtio_has_feature(vsc, VIRTIO_NET_F_CTRL_VQ)) {
    +	if (sc->sc_ctl_vq != NULL) {
     		printf("CTL virtqueue:\n");
     		virtio_vq_dump(sc->sc_ctl_vq);
     		printf("ctrl_inuse: %d\n", sc->sc_ctrl_inuse);
    @@ -1067,6 +1253,33 @@ vio_dump(struct vio_softc *sc)
     }
     #endif
     
    +static int
    +vio_rxr_info(struct vio_softc *sc, struct if_rxrinfo *ifri)
    +{
    +	struct if_rxring_info *ifrs, *ifr;
    +	int error;
    +	unsigned int i;
    +
    +	ifrs = mallocarray(sc->sc_nqueues, sizeof(*ifrs),
    +	    M_TEMP, M_WAITOK|M_ZERO|M_CANFAIL);
    +	if (ifrs == NULL)
    +		return (ENOMEM);
    +
    +	for (i = 0; i < sc->sc_nqueues; i++) {
    +		ifr = &ifrs[i];
    +
    +		ifr->ifr_size = sc->sc_rx_mbuf_size;
    +		snprintf(ifr->ifr_name, sizeof(ifr->ifr_name), "%u", i);
    +		ifr->ifr_info = sc->sc_q[i].viq_rxring;
    +	}
    +
    +	error = if_rxr_info_ioctl(ifri, i, ifrs);
    +
    +	free(ifrs, M_TEMP, i * sizeof(*ifrs));
    +
    +	return (error);
    +}
    +
     int
     vio_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
     {
    @@ -1101,8 +1314,7 @@ vio_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
     		r = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
     		break;
     	case SIOCGIFRXR:
    -		r = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data,
    -		    NULL, MCLBYTES, &sc->sc_q[0].viq_rxring);
    +		r = vio_rxr_info(sc, (struct if_rxrinfo *)ifr->ifr_data);
     		break;
     	default:
     		r = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
    @@ -1127,7 +1339,7 @@ vio_add_rx_mbuf(struct vio_softc *sc, struct vio_queue *vioq, int i)
     	struct mbuf *m;
     	int r;
     
    -	m = MCLGETL(NULL, M_DONTWAIT, MCLBYTES);
    +	m = MCLGETL(NULL, M_DONTWAIT, sc->sc_rx_mbuf_size);
     	if (m == NULL)
     		return ENOBUFS;
     	vioq->viq_rxmbufs[i] = m;
    @@ -1163,6 +1375,7 @@ vio_populate_rx_mbufs(struct vio_softc *sc, struct vio_queue *vioq)
     	struct virtqueue *vq = vioq->viq_rxvq;
     	int mrg_rxbuf = VIO_HAVE_MRG_RXBUF(sc);
     
    +	MUTEX_ASSERT_LOCKED(&vioq->viq_rxmtx);
     	for (slots = if_rxr_get(&vioq->viq_rxring, vq->vq_num);
     	    slots > 0; slots--) {
     		int slot;
    @@ -1199,7 +1412,8 @@ vio_populate_rx_mbufs(struct vio_softc *sc, struct vio_queue *vioq)
     			virtio_enqueue_p(vq, slot, vioq->viq_rxdmamaps[slot],
     			    0, sc->sc_hdr_size, 0);
     			virtio_enqueue_p(vq, slot, vioq->viq_rxdmamaps[slot],
    -			    sc->sc_hdr_size, MCLBYTES - sc->sc_hdr_size, 0);
    +			    sc->sc_hdr_size,
    +			    sc->sc_rx_mbuf_size - sc->sc_hdr_size, 0);
     		}
     		virtio_enqueue_commit(vsc, vq, slot, 0);
     		done = 1;
    @@ -1264,6 +1478,7 @@ vio_rxeof(struct vio_queue *vioq)
     	int slot, len, bufs_left;
     	struct virtio_net_hdr *hdr;
     
    +	MUTEX_ASSERT_LOCKED(&vioq->viq_rxmtx);
     	while (virtio_dequeue(vsc, vioq->viq_rxvq, &slot, &len) == 0) {
     		r = 1;
     		bus_dmamap_sync(vsc->sc_dmat, vioq->viq_rxdmamaps[slot], 0,
    @@ -1307,7 +1522,7 @@ vio_rxeof(struct vio_queue *vioq)
     		m_freem(m0);
     	}
     
    -	if (ifiq_input(&ifp->if_rcv, &ml))
    +	if (ifiq_input(vioq->viq_ifiq, &ml))
     		if_rxr_livelocked(&vioq->viq_rxring);
     
     	return r;
    @@ -1322,6 +1537,7 @@ vio_rx_intr(struct virtqueue *vq)
     	struct vio_queue *vioq = &sc->sc_q[vq->vq_index/2];
     	int r, sum = 0;
     
    +	mtx_enter(&vioq->viq_rxmtx);
     again:
     	r = vio_rxeof(vioq);
     	sum += r;
    @@ -1334,24 +1550,21 @@ again:
     		}
     	}
     
    +	mtx_leave(&vioq->viq_rxmtx);
     	return sum;
     }
     
     void
     vio_rxtick(void *arg)
     {
    -	struct virtqueue *vq = arg;
    -	struct virtio_softc *vsc = vq->vq_owner;
    -	struct vio_softc *sc = (struct vio_softc *)vsc->sc_child;
    -	struct vio_queue *vioq;
    -	int s, qidx;
    +	struct vio_softc *sc = arg;
    +	int i;
     
    -	s = splnet();
    -	for (qidx = 0; qidx < sc->sc_nqueues; qidx++) {
    -		vioq = &sc->sc_q[qidx];
    -		vio_populate_rx_mbufs(sc, vioq);
    +	for (i = 0; i < sc->sc_nqueues; i++) {
    +		mtx_enter(&sc->sc_q[i].viq_rxmtx);
    +		vio_populate_rx_mbufs(sc, &sc->sc_q[i]);
    +		mtx_leave(&sc->sc_q[i].viq_rxmtx);
     	}
    -	splx(s);
     }
     
     /* free all the mbufs; called from if_stop(disable) */
    @@ -1386,25 +1599,26 @@ vio_tx_intr(struct virtqueue *vq)
     {
     	struct virtio_softc *vsc = vq->vq_owner;
     	struct vio_softc *sc = (struct vio_softc *)vsc->sc_child;
    -	struct ifnet *ifp = &sc->sc_ac.ac_if;
    +	struct vio_queue *vioq = &sc->sc_q[vq->vq_index/2];
     	int r;
     
     	r = vio_txeof(vq);
    -	vio_start(ifp);
    +	vio_start(vioq->viq_ifq);
     	return r;
     }
     
     void
     vio_txtick(void *arg)
     {
    -	struct virtqueue *vq = arg;
    -	int s = splnet();
    -	virtio_check_vq(vq->vq_owner, vq);
    -	splx(s);
    +	struct vio_softc *sc = arg;
    +	int i;
    +
    +	for (i = 0; i < sc->sc_nqueues; i++)
    +		virtio_check_vq(sc->sc_virtio, sc->sc_q[i].viq_txvq);
     }
     
     int
    -vio_txeof(struct virtqueue *vq)
    +vio_tx_dequeue(struct virtqueue *vq)
     {
     	struct virtio_softc *vsc = vq->vq_owner;
     	struct vio_softc *sc = (struct vio_softc *)vsc->sc_child;
    @@ -1413,8 +1627,9 @@ vio_txeof(struct virtqueue *vq)
     	struct ifnet *ifp = &sc->sc_ac.ac_if;
     	struct mbuf *m;
     	int r = 0;
    -	int slot, len;
    +	int slot, len, freed = 0;
     
    +	MUTEX_ASSERT_LOCKED(&vioq->viq_txmtx);
     	if (!ISSET(ifp->if_flags, IFF_RUNNING))
     		return 0;
     
    @@ -1429,13 +1644,34 @@ vio_txeof(struct virtqueue *vq)
     		m = vioq->viq_txmbufs[slot];
     		bus_dmamap_unload(vsc->sc_dmat, vioq->viq_txdmamaps[slot]);
     		vioq->viq_txmbufs[slot] = NULL;
    -		virtio_dequeue_commit(vq, slot);
    +		freed += virtio_dequeue_commit(vq, slot);
     		m_freem(m);
     	}
    +	KASSERT(vioq->viq_txfree_slots >= 0);
    +	vioq->viq_txfree_slots += freed;
    +	return r;
    +}
    +
    +
    +int
    +vio_txeof(struct virtqueue *vq)
    +{
    +	struct virtio_softc *vsc = vq->vq_owner;
    +	struct vio_softc *sc = (struct vio_softc *)vsc->sc_child;
    +	struct vio_queue *vioq = &sc->sc_q[vq->vq_index/2];
    +	int r;
    +
    +	mtx_enter(&vioq->viq_txmtx);
    +	r = vio_tx_dequeue(vq);
    +	mtx_leave(&vioq->viq_txmtx);
     
     	if (r) {
    -		ifq_clr_oactive(&ifp->if_snd);
    -		virtio_stop_vq_intr(vsc, vioq->viq_txvq);
    +		if (ifq_is_oactive(vioq->viq_ifq)) {
    +			mtx_enter(&vioq->viq_txmtx);
    +			virtio_stop_vq_intr(vsc, vq);
    +			mtx_leave(&vioq->viq_txmtx);
    +			ifq_restart(vioq->viq_ifq);
    +		}
     	}
     	if (vq->vq_used_idx == vq->vq_avail_idx)
     		timeout_del(&sc->sc_txtick);
    @@ -1480,6 +1716,8 @@ vio_tx_drain(struct vio_softc *sc)
     
     	for (q = 0; q < sc->sc_nqueues; q++) {
     		vioq = &sc->sc_q[q];
    +		ifq_barrier(vioq->viq_ifq);
    +		mtx_enter(&vioq->viq_txmtx);
     		for (i = 0; i < vioq->viq_txvq->vq_num; i++) {
     			if (vioq->viq_txmbufs[i] == NULL)
     				continue;
    @@ -1488,6 +1726,10 @@ vio_tx_drain(struct vio_softc *sc)
     			m_freem(vioq->viq_txmbufs[i]);
     			vioq->viq_txmbufs[i] = NULL;
     		}
    +		ifq_purge(vioq->viq_ifq);
    +		ifq_clr_oactive(vioq->viq_ifq);
    +		vioq->viq_txfree_slots = vioq->viq_txvq->vq_num - 1;
    +		mtx_leave(&vioq->viq_txmtx);
     	}
     }
     
    @@ -1571,6 +1813,8 @@ vio_ctrl_submit(struct vio_softc *sc, int slot)
     			vio_ctrl_wakeup(sc, RESET);
     			return ENXIO;
     		}
    +		if (cold)
    +			virtio_check_vq(sc->sc_virtio, sc->sc_ctl_vq);
     	}
     
     	VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_cmd,
    @@ -1628,6 +1872,41 @@ vio_ctrl_rx(struct vio_softc *sc, int cmd, int onoff)
     	return r;
     }
     
    +/* issue a VIRTIO_NET_CTRL_MQ class command and wait for completion */
    +int
    +vio_ctrl_mq(struct vio_softc *sc)
    +{
    +	struct virtio_softc *vsc = sc->sc_virtio;
    +	struct virtqueue *vq = sc->sc_ctl_vq;
    +	int r, slot;
    +
    +
    +	r = vio_ctrl_start(sc, VIRTIO_NET_CTRL_MQ,
    +	    VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, 1, &slot);
    +	if (r != 0)
    +		return r;
    +
    +	sc->sc_ctrl_mq_pairs->virtqueue_pairs = sc->sc_nqueues;
    +
    +	vio_dmamem_enqueue(vsc, sc, vq, slot, sc->sc_ctrl_mq_pairs,
    +	    sizeof(*sc->sc_ctrl_mq_pairs), 1);
    +
    +	r = vio_ctrl_submit(sc, slot);
    +
    +	VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_mq_pairs,
    +	    sizeof(*sc->sc_ctrl_mq_pairs), BUS_DMASYNC_POSTWRITE);
    +
    +	if (r != 0)
    +		printf("%s: ctrl cmd %d failed\n", sc->sc_dev.dv_xname,
    +		    VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET);
    +
    +	DPRINTF("%s: cmd %d %d: %d\n", __func__,
    +	    VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, sc->sc_nqueues, r);
    +
    +	vio_ctrl_finish(sc);
    +	return r;
    +}
    +
     int
     vio_ctrl_guest_offloads(struct vio_softc *sc, uint64_t features)
     {
    @@ -1673,18 +1952,23 @@ vio_ctrleof(struct virtqueue *vq)
     {
     	struct virtio_softc *vsc = vq->vq_owner;
     	struct vio_softc *sc = (struct vio_softc *)vsc->sc_child;
    -	int r = 0, ret, slot;
    +	int r = 0, ret, slot, s;
     
    +	KERNEL_LOCK();
    +	s = splnet();
     again:
     	ret = virtio_dequeue(vsc, vq, &slot, NULL);
     	if (ret == ENOENT)
    -		return r;
    +		goto out;
     	virtio_dequeue_commit(vq, slot);
     	r++;
     	vio_ctrl_wakeup(sc, DONE);
     	if (virtio_start_vq_intr(vsc, vq))
     		goto again;
     
    +out:
    +	splx(s);
    +	KERNEL_UNLOCK();
     	return r;
     }
     
    @@ -1750,9 +2034,6 @@ vio_iff(struct vio_softc *sc)
     		return;
     	}
     
    -	if (sc->sc_dev.dv_cfdata->cf_flags & CONFFLAG_QEMU_VLAN_BUG)
    -		ifp->if_flags |= IFF_PROMISC;
    -
     	if (ifp->if_flags & IFF_PROMISC || ac->ac_multirangecnt > 0 ||
     	    ac->ac_multicnt >= VIRTIO_NET_CTRL_MAC_MC_ENTRIES) {
     		ifp->if_flags |= IFF_ALLMULTI;
    diff --git a/sys/dev/pv/virtio.c b/sys/dev/pv/virtio.c
    index 6d9fe06d645..11527cff6a8 100644
    --- a/sys/dev/pv/virtio.c
    +++ b/sys/dev/pv/virtio.c
    @@ -165,9 +165,9 @@ virtio_reinit_start(struct virtio_softc *sc)
     	for (i = 0; i < sc->sc_nvqs; i++) {
     		int n;
     		struct virtqueue *vq = &sc->sc_vqs[i];
    -		n = virtio_read_queue_size(sc, vq->vq_index);
    -		if (n == 0)	/* vq disappeared */
    +		if (vq->vq_num == 0)	/* not used */
     			continue;
    +		n = virtio_read_queue_size(sc, vq->vq_index);
     		if (n != vq->vq_num) {
     			panic("%s: virtqueue size changed, vq index %d",
     			    sc->sc_dev.dv_xname, vq->vq_index);
    @@ -255,8 +255,11 @@ virtio_check_vqs(struct virtio_softc *sc)
     	int i, r = 0;
     
     	/* going backwards is better for if_vio */
    -	for (i = sc->sc_nvqs - 1; i >= 0; i--)
    +	for (i = sc->sc_nvqs - 1; i >= 0; i--) {
    +		if (sc->sc_vqs[i].vq_num == 0)	/* not used */
    +			continue;
     		r |= virtio_check_vq(sc, &sc->sc_vqs[i]);
    +	}
     
     	return r;
     }
    @@ -450,6 +453,11 @@ virtio_free_vq(struct virtio_softc *sc, struct virtqueue *vq)
     	struct vq_entry *qe;
     	int i = 0;
     
    +	if (vq->vq_num == 0) {
    +		/* virtio_alloc_vq() was never called */
    +		return 0;
    +	}
    +
     	/* device must be already deactivated */
     	/* confirm the vq is empty */
     	SLIST_FOREACH(qe, &vq->vq_freelist, qe_list) {
    @@ -848,22 +856,25 @@ virtio_dequeue(struct virtio_softc *sc, struct virtqueue *vq,
      *
      *                 Don't call this if you use statically allocated slots
      *                 and virtio_enqueue_trim().
    + *
    + *                 returns the number of freed slots.
      */
     int
     virtio_dequeue_commit(struct virtqueue *vq, int slot)
     {
     	struct vq_entry *qe = &vq->vq_entries[slot];
     	struct vring_desc *vd = &vq->vq_desc[0];
    -	int s = slot;
    +	int s = slot, r = 1;
     
     	while (vd[s].flags & VRING_DESC_F_NEXT) {
     		s = vd[s].next;
     		vq_free_entry(vq, qe);
     		qe = &vq->vq_entries[s];
    +		r++;
     	}
     	vq_free_entry(vq, qe);
     
    -	return 0;
    +	return r;
     }
     
     /*
    @@ -871,6 +882,10 @@ virtio_dequeue_commit(struct virtqueue *vq, int slot)
      * Returns 0 on success; returns 1 if the used ring has already advanced
      * too far, and the caller must process the queue again (otherwise, no
      * more interrupts will happen).
    + *
    + * The next nslots entries in the used ring will receive no interrupt.
    + * The next interrupt will be triggered when nslots+1 slots have been
    + * used.
      */
     int
     virtio_postpone_intr(struct virtqueue *vq, uint16_t nslots)
    @@ -909,13 +924,16 @@ virtio_postpone_intr_smart(struct virtqueue *vq)
     /*
      * Postpone interrupt until all of the available descriptors have been
      * consumed.
    + *
    + * If there is no descriptor available right now, the interrupt will be
    + * disabled.
      */
     int
     virtio_postpone_intr_far(struct virtqueue *vq)
     {
     	uint16_t	nslots;
     
    -	nslots = (uint16_t)(vq->vq_avail->idx - vq->vq_used_idx);
    +	nslots = (uint16_t)(vq->vq_avail->idx - vq->vq_used_idx - 1);
     
     	return virtio_postpone_intr(vq, nslots);
     }
    @@ -944,6 +962,11 @@ virtio_stop_vq_intr(struct virtio_softc *sc, struct virtqueue *vq)
     	vq->vq_queued++;
     }
     
    +/*
    + * For event_idx, there will be an interrupt for the next used descriptor,
    + * regardless of whether that descriptor is in the available ring now or
    + * the available ring is now empty and a descriptor is put there later.
    + */
     int
     virtio_start_vq_intr(struct virtio_softc *sc, struct virtqueue *vq)
     {
    diff --git a/sys/dev/pv/virtiovar.h b/sys/dev/pv/virtiovar.h
    index 63a4eb4b14c..d1820c74737 100644
    --- a/sys/dev/pv/virtiovar.h
    +++ b/sys/dev/pv/virtiovar.h
    @@ -103,7 +103,8 @@ struct vq_entry {
     
     struct virtqueue {
     	struct virtio_softc	*vq_owner;
    -	unsigned int		vq_num;  /* queue size (# of entries) */
    +	unsigned int		vq_num;  /* queue size (# of entries),
    +					  * 0 if unused/non-existent */
     	unsigned int		vq_mask; /* (1 << vq_num - 1) */
     	int			vq_index; /* queue number (0, 1, ...) */
     
    @@ -162,6 +163,8 @@ struct virtio_ops {
     	void		(*set_status)(struct virtio_softc *, int);
     	int		(*neg_features)(struct virtio_softc *, const struct virtio_feature_name *);
     	int		(*poll_intr)(void *);
    +	int		(*intr_establish)(struct virtio_softc *, struct virtio_attach_args *,
    +			    int, struct cpu_info *, int (*)(void *), void *);
     };
     
     #define VIRTIO_CHILD_ERROR	((void*)1)
    @@ -178,7 +181,7 @@ struct virtio_softc {
     	int			 sc_indirect;
     	int			 sc_version_1;
     
    -	int			 sc_nvqs;	/* set by child */
    +	int			 sc_nvqs;	/* size of sc_vqs, set by child */
     	struct virtqueue	*sc_vqs;	/* set by child */
     
     	struct device		*sc_child;	/* set by child,
    @@ -204,6 +207,14 @@ struct virtio_softc {
     #define	virtio_get_status(sc)			(sc)->sc_ops->get_status(sc)
     #define	virtio_set_status(sc, i)		(sc)->sc_ops->set_status(sc, i)
     
    +/*
    + * virtio_intr_establish() only works if va_nintr > 1. If it is called by a
    + * child driver, the transport driver will skip automatic intr allocation and
    + * the child driver must allocate all required interrupts itself. Vector 0 is
    + * always used for the config change interrupt.
    + */
    +#define	virtio_intr_establish(sc, va, v, ci, fn, a)	(sc)->sc_ops->intr_establish(sc, va, v, ci, fn, a)
    +
     /* only for transport drivers */
     #define	virtio_device_reset(sc)			virtio_set_status((sc), 0)
     
    
    
    
  • Stefan Fritsch:

    vio(4) multi-queue V6