From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: [EXT] Re: SEV: busdma bounce buffer
To: Hans-Jörg Höxer <Hans-Joerg_Hoexer@genua.de>
Cc: Mark Kettenis <mark.kettenis@xs4all.nl>, tech@openbsd.org
Date: Wed, 14 Aug 2024 13:43:25 +0200

On Tue, Aug 13, 2024 at 03:47:54PM +0200, Hans-Jörg Höxer wrote:
> > > > @@ -551,7 +551,11 @@ typedef struct bus_dmamap		*bus_dmamap_t;
> > > >   */
> > > >  struct bus_dma_segment {
> > > >  	bus_addr_t	ds_addr;	/* DMA address */
> > > > +	bus_addr_t	ds_addr2;	/* replacement store */
> > > 
> > > A more descriptive name would be better.  Also, because this isn't
> > > part of the bus_dma API, it should probably start with an underscore.
> > > So perhaps _ds_bounce_addr?
> > 
> > ok, I will change this.
> > 
> > > >  	bus_size_t	ds_len;		/* length of transfer */
> > > > +	vaddr_t		ds_va;		/* mapped loaded data */
> > > > +	vaddr_t		ds_bounce;	/* mapped bounced data */
> > > > +
> > > 
> > > Same here.  For consistency maybe _ds_va and _ds_bounce_va?
> 
> Updated diff below

I have tested the new diff.  As expected there is no fallout on
non-SEV machines.  Under heavy load with bounce buffers enforced,
I see network traffic stall with vio(4) interfaces.

kettenis: Is this diff good enough to commit?

bluhm

> ------------------------------------------------------------------------
> commit 5a9b3f80d9a3d9d0e86a20b1d0b449c562e197be
> Author: Hans-Joerg Hoexer <hshoexer@genua.de>
> Date:   Fri Jul 12 10:56:19 2024 +0200
> 
>     busdma: Bounce buffering for SEV
>     
>     When running as SEV guest (indicated by cpu_sev_guestmode != 0),
>     we allocate additional pages for each segment on dma map creation.
>     These pages are mapped with the PMAP_NOCRYPT attribute, i.e. the
>     crypt bit is not set in the PTE.  Thus, these pages are shared with
>     the hypervisor.
>     
>     When the map is loaded with actual pages, the address in the
>     descriptor is replaced with the corresponding bounce buffer.  Using
>     bus_dmamap_sync(), data is copied from the encrypted pages used by
>     guest-drivers to the unencrypted bounce buffers shared with the
>     hypervisor.  And vice versa.
>     
>     When the kernel is running in non-SEV guest mode (i.e. normal host,
>     or non-SEV guest), no bounce buffers are used.
> 
> diff --git a/sys/arch/amd64/amd64/bus_dma.c b/sys/arch/amd64/amd64/bus_dma.c
> index 19a366115f9..9d4705b36f3 100644
> --- a/sys/arch/amd64/amd64/bus_dma.c
> +++ b/sys/arch/amd64/amd64/bus_dma.c
> @@ -108,8 +108,13 @@ _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
>      bus_size_t maxsegsz, bus_size_t boundary, int flags, bus_dmamap_t *dmamp)
>  {
>  	struct bus_dmamap *map;
> +	struct pglist mlist;
> +	struct vm_page **pg, *pgnext;
> +	size_t mapsize, sz, ssize;
> +	vaddr_t va, sva;
>  	void *mapstore;
> -	size_t mapsize;
> +	int npages, error;
> +	const struct kmem_dyn_mode *kd;
>  
>  	/*
>  	 * Allocate and initialize the DMA map.  The end of the map
> @@ -125,6 +130,16 @@ _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
>  	 */
>  	mapsize = sizeof(struct bus_dmamap) +
>  	    (sizeof(bus_dma_segment_t) * (nsegments - 1));
> +
> +	/* allocate and use bounce buffers when running as SEV guest */
> +	if (cpu_sev_guestmode) {
> +		/* this many pages plus one in case we get split */
> +		npages = round_page(size) / PAGE_SIZE + 1;
> +		if (npages < nsegments)
> +			npages = nsegments;
> +		mapsize += sizeof(struct vm_page *) * npages;
> +	}
> +
>  	if ((mapstore = malloc(mapsize, M_DEVBUF,
>  	    (flags & BUS_DMA_NOWAIT) ?
>  	        (M_NOWAIT|M_ZERO) : (M_WAITOK|M_ZERO))) == NULL)
> @@ -135,8 +150,59 @@ _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
>  	map->_dm_segcnt = nsegments;
>  	map->_dm_maxsegsz = maxsegsz;
>  	map->_dm_boundary = boundary;
> +	if (cpu_sev_guestmode) {
> +		map->_dm_pages = (void *)&map->dm_segs[nsegments];
> +		map->_dm_npages = npages;
> +	}
>  	map->_dm_flags = flags & ~(BUS_DMA_WAITOK|BUS_DMA_NOWAIT);
>  
> +	if (!cpu_sev_guestmode) {
> +		*dmamp = map;
> +		return (0);
> +	}
> +
> +	sz = npages << PGSHIFT;
> +	kd = flags & BUS_DMA_NOWAIT ? &kd_trylock : &kd_waitok;
> +	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, kd);
> +	if (va == 0) {
> +		map->_dm_npages = 0;
> +		free(map, M_DEVBUF, mapsize);
> +		return (ENOMEM);
> +	}
> +
> +	TAILQ_INIT(&mlist);
> +	error = uvm_pglistalloc(sz, 0, -1, PAGE_SIZE, 0, &mlist, nsegments,
> +	    (flags & BUS_DMA_NOWAIT) ? UVM_PLA_NOWAIT : UVM_PLA_WAITOK);
> +	if (error) {
> +		map->_dm_npages = 0;
> +		km_free((void *)va, sz, &kv_any, &kp_none);
> +		free(map, M_DEVBUF, mapsize);
> +		return (ENOMEM);
> +	}
> +
> +	sva = va;
> +	ssize = sz;
> +	pgnext = TAILQ_FIRST(&mlist);
> +	for (pg = map->_dm_pages; npages--; va += PAGE_SIZE, pg++) {
> +		*pg = pgnext;
> +		error = pmap_enter(pmap_kernel(), va, VM_PAGE_TO_PHYS(*pg),
> +		    PROT_READ | PROT_WRITE,
> +		    PROT_READ | PROT_WRITE | PMAP_WIRED |
> +		    PMAP_CANFAIL | PMAP_NOCRYPT);
> +		if (error) {
> +			pmap_update(pmap_kernel());
> +			map->_dm_npages = 0;
> +			km_free((void *)sva, ssize, &kv_any, &kp_none);
> +			free(map, M_DEVBUF, mapsize);
> +			uvm_pglistfree(&mlist);
> +			return (ENOMEM);
> +		}
> +		pgnext = TAILQ_NEXT(*pg, pageq);
> +		bzero((void *)va, PAGE_SIZE);
> +	}
> +	pmap_update(pmap_kernel());
> +	map->_dm_pgva = sva;
> +
>  	*dmamp = map;
>  	return (0);
>  }
> @@ -149,7 +215,25 @@ void
>  _bus_dmamap_destroy(bus_dma_tag_t t, bus_dmamap_t map)
>  {
>  	size_t mapsize;
> -	
> +	struct vm_page **pg;
> +	struct pglist mlist;
> +	vaddr_t va;
> +
> +	if (map->_dm_pgva) {
> +		km_free((void *)map->_dm_pgva, map->_dm_npages << PGSHIFT,
> +		    &kv_any, &kp_none);
> +	}
> +
> +	if (map->_dm_pages) {
> +		TAILQ_INIT(&mlist);
> +		for (pg = map->_dm_pages, va = map->_dm_pgva;
> +		    map->_dm_npages--; va += PAGE_SIZE, pg++) {
> +			pmap_remove(pmap_kernel(), va, va + PAGE_SIZE);
> +			TAILQ_INSERT_TAIL(&mlist, *pg, pageq);
> +		}
> +		uvm_pglistfree(&mlist);
> +	}
> +
>  	mapsize = sizeof(struct bus_dmamap) +
>  		(sizeof(bus_dma_segment_t) * (map->_dm_segcnt - 1));
>  	free(map, M_DEVBUF, mapsize);
> @@ -383,6 +467,7 @@ _bus_dmamap_unload(bus_dma_tag_t t, bus_dmamap_t map)
>  	 */
>  	map->dm_mapsize = 0;
>  	map->dm_nsegs = 0;
> +	map->_dm_nused = 0;
>  }
>  
>  /*
> @@ -393,7 +478,40 @@ void
>  _bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t addr,
>      bus_size_t size, int op)
>  {
> -	/* Nothing to do here. */
> +	bus_dma_segment_t *sg;
> +	int i, off = addr;
> +	bus_size_t l;
> +
> +	if (!cpu_sev_guestmode)
> +		return;
> +
> +	for (i = map->_dm_segcnt, sg = map->dm_segs; size && i--; sg++) {
> +		if (off >= sg->ds_len) {
> +			off -= sg->ds_len;
> +			continue;
> +		}
> +
> +		l = sg->ds_len - off;
> +		if (l > size)
> +			l = size;
> +		size -= l;
> +
> +		/* PREREAD and POSTWRITE are no-ops. */
> +
> +		/* READ: device -> memory */
> +		if (op & BUS_DMASYNC_POSTREAD) {
> +			bcopy((void *)(sg->_ds_bounce_va + off),
> +			    (void *)(sg->_ds_va + off), l);
> +		}
> +
> +		/* WRITE: memory -> device */
> +		if (op & BUS_DMASYNC_PREWRITE) {
> +			bcopy((void *)(sg->_ds_va + off),
> +			    (void *)(sg->_ds_bounce_va + off), l);
> +		}
> +
> +		off = 0;
> +	}
>  }
>  
>  /*
> @@ -565,10 +683,11 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
>      int first)
>  {
>  	bus_size_t sgsize;
> -	bus_addr_t curaddr, lastaddr, baddr, bmask;
> -	vaddr_t vaddr = (vaddr_t)buf;
> -	int seg;
> +	bus_addr_t curaddr, lastaddr, baddr, bmask, oaddr = -1;
> +	vaddr_t pgva = -1, vaddr = (vaddr_t)buf;
> +	int seg, page, off;
>  	pmap_t pmap;
> +	struct vm_page *pg;
>  
>  	if (p != NULL)
>  		pmap = p->p_vmspace->vm_map.pmap;
> @@ -589,6 +708,19 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
>  			panic("Non dma-reachable buffer at curaddr %#lx(raw)",
>  			    curaddr);
>  
> +		if (cpu_sev_guestmode) {
> +			/* use bounce buffer */
> +			if (map->_dm_nused + 1 >= map->_dm_npages)
> +				return (ENOMEM);
> +
> +			off = vaddr & PAGE_MASK;
> +			pg = map->_dm_pages[page = map->_dm_nused++];
> +			oaddr = curaddr;
> +			curaddr = VM_PAGE_TO_PHYS(pg) + off;
> +
> +			pgva = map->_dm_pgva + (page << PGSHIFT) + off;
> +		}
> +
>  		/*
>  		 * Compute the segment size, and adjust counts.
>  		 */
> @@ -611,7 +743,10 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
>  		 */
>  		if (first) {
>  			map->dm_segs[seg].ds_addr = curaddr;
> +			map->dm_segs[seg]._ds_bounce_addr = oaddr;
>  			map->dm_segs[seg].ds_len = sgsize;
> +			map->dm_segs[seg]._ds_va = vaddr;
> +			map->dm_segs[seg]._ds_bounce_va = pgva;
>  			first = 0;
>  		} else {
>  			if (curaddr == lastaddr &&
> @@ -625,7 +760,10 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
>  				if (++seg >= map->_dm_segcnt)
>  					break;
>  				map->dm_segs[seg].ds_addr = curaddr;
> +				map->dm_segs[seg]._ds_bounce_addr = oaddr;
>  				map->dm_segs[seg].ds_len = sgsize;
> +				map->dm_segs[seg]._ds_va = vaddr;
> +				map->dm_segs[seg]._ds_bounce_va = pgva;
>  			}
>  		}
>  
> diff --git a/sys/arch/amd64/include/bus.h b/sys/arch/amd64/include/bus.h
> index 33d6cd6eaeb..d78735ede6f 100644
> --- a/sys/arch/amd64/include/bus.h
> +++ b/sys/arch/amd64/include/bus.h
> @@ -550,8 +550,12 @@ typedef struct bus_dmamap		*bus_dmamap_t;
>   *	are suitable for programming into DMA registers.
>   */
>  struct bus_dma_segment {
> -	bus_addr_t	ds_addr;	/* DMA address */
> -	bus_size_t	ds_len;		/* length of transfer */
> +	bus_addr_t	ds_addr;		/* DMA address */
> +	bus_addr_t	_ds_bounce_addr;	/* bounce buffer */
> +	bus_size_t	ds_len;			/* length of transfer */
> +	vaddr_t		_ds_va;			/* mapped loaded data */
> +	vaddr_t		_ds_bounce_va;		/* mapped bounced data */
> +
>  	/*
>  	 * Ugh. need this so can pass alignment down from bus_dmamem_alloc
>  	 * to scatter gather maps. only the first one is used so the rest is
> @@ -655,6 +659,11 @@ struct bus_dmamap {
>  
>  	void		*_dm_cookie;	/* cookie for bus-specific functions */
>  
> +	struct vm_page **_dm_pages;	/* replacement pages */
> +	vaddr_t		_dm_pgva;	/* those above -- mapped */
> +	int		_dm_npages;	/* number of pages allocated */
> +	int		_dm_nused;	/* number of pages replaced */
> +
>  	/*
>  	 * PUBLIC MEMBERS: these are used by machine-independent code.
>  	 */