From: Hans-Jörg Höxer
Subject: SEV: busdma bounce buffer
To:
Cc:
Date: Wed, 7 Aug 2024 17:50:57 +0200

Hi,

the diff below introduces bounce buffers to busdma(9) when the kernel
is running as a SEV guest.  This is determined by checking
cpu_sev_guestmode (set in locore0).

The goal is to make virtio(4) work for SEV guests.  With the recent
fixes done by sf@ -- thanks! -- the next step is DMA bounce buffering
for SEV-enabled guests.

_bus_dmamap_create():
On map creation we allocate an array of PAGE_SIZE-sized bounce buffers.
These pages are mapped with PMAP_NOCRYPT, which means they are
unencrypted and shared with the hypervisor.  Note that we allocate at
least one page for each DMA segment.  This way we do not have to worry
about a single shared page holding cleartext data from different DMA
segments.  On the other hand, for segments smaller than a page (e.g.
only a few bytes), we still use a full page.

_bus_dmamap_sync():
On sync, data is now copied from the original encrypted pages to the
unencrypted bounce buffers, and vice versa.

_bus_dmamap_load_buffer():
When loading actual pages (these are always encrypted) into the map, we
replace ds_addr in the descriptor with the (guest-physical) address of
the bounce buffer.  The original address is stored in the new member
ds_addr2.

When the kernel is not in SEV guest mode, all this code is bypassed,
except for the check on cpu_sev_guestmode in _bus_dmamap_sync().  Thus
I expect the performance impact in non-SEV mode to be negligible.

Note: Another approach could be to define a dedicated bus_dma_tag_t for
SEV guests, maybe similar to isa_bus_dma_tag.  I have not explored
this.

I think this diff is quite straightforward and in the end rather
simple, and I guess there is plenty of room for optimization.  I also
want to mention that this diff is based on some (ancient) code from
the late mickey@.

For illustration, a small driver-side sketch of the (unchanged)
bus_dma(9) usage follows after the diff.

What do you think?

Take care,
Hans-Joerg

-------------------------------------------------------------------
commit 06d4c480dc94ae8a4ca2e6a6810aecb2c9e8afc7
Author: Hans-Joerg Hoexer
Date:   Fri Jul 12 10:56:19 2024 +0200

    busdma: Bounce buffering for SEV

    When running as a SEV guest (indicated by cpu_sev_guestmode != 0),
    we allocate additional pages for each segment on DMA map creation.
    These pages are mapped with the PMAP_NOCRYPT attribute, i.e. the
    crypt bit is not set in the PTE.  Thus, these pages are shared with
    the hypervisor.

    When the map is loaded with actual pages, the address in the
    descriptor is replaced with the address of the corresponding bounce
    buffer.

    Using bus_dmamap_sync(), data is copied from the encrypted pages
    used by guest drivers to the unencrypted bounce buffers shared with
    the hypervisor, and vice versa.

    When the kernel is not running in SEV guest mode (i.e. normal host
    or non-SEV guest), no bounce buffers are used.

diff --git a/sys/arch/amd64/amd64/bus_dma.c b/sys/arch/amd64/amd64/bus_dma.c
index 19a366115f9..32f21f5166e 100644
--- a/sys/arch/amd64/amd64/bus_dma.c
+++ b/sys/arch/amd64/amd64/bus_dma.c
@@ -108,8 +108,13 @@ _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
     bus_size_t maxsegsz, bus_size_t boundary, int flags, bus_dmamap_t *dmamp)
 {
 	struct bus_dmamap *map;
+	struct pglist mlist;
+	struct vm_page **pg, *pgnext;
+	size_t mapsize, sz, ssize;
+	vaddr_t va, sva;
 	void *mapstore;
-	size_t mapsize;
+	int npages, error;
+	const struct kmem_dyn_mode *kd;
 
 	/*
 	 * Allocate and initialize the DMA map.  The end of the map
@@ -125,6 +130,16 @@ _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
 	 */
 	mapsize = sizeof(struct bus_dmamap) +
 	    (sizeof(bus_dma_segment_t) * (nsegments - 1));
+
+	/* allocate and use bounce buffers when running as SEV guest */
+	if (cpu_sev_guestmode) {
+		/* this many pages plus one in case we get split */
+		npages = round_page(size) / PAGE_SIZE + 1;
+		if (npages < nsegments)
+			npages = nsegments;
+		mapsize += sizeof(struct vm_page *) * npages;
+	}
+
 	if ((mapstore = malloc(mapsize, M_DEVBUF,
 	    (flags & BUS_DMA_NOWAIT) ? (M_NOWAIT|M_ZERO) : (M_WAITOK|M_ZERO))) ==
 	    NULL)
@@ -135,8 +150,59 @@ _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
 	map->_dm_segcnt = nsegments;
 	map->_dm_maxsegsz = maxsegsz;
 	map->_dm_boundary = boundary;
+	if (cpu_sev_guestmode) {
+		map->_dm_pages = (void *)&map->dm_segs[nsegments];
+		map->_dm_npages = npages;
+	}
 	map->_dm_flags = flags & ~(BUS_DMA_WAITOK|BUS_DMA_NOWAIT);
 
+	if (!cpu_sev_guestmode) {
+		*dmamp = map;
+		return (0);
+	}
+
+	sz = npages << PGSHIFT;
+	kd = flags & BUS_DMA_NOWAIT ? &kd_trylock : &kd_waitok;
+	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, kd);
+	if (va == 0) {
+		map->_dm_npages = 0;
+		free(map, M_DEVBUF, mapsize);
+		return (ENOMEM);
+	}
+
+	TAILQ_INIT(&mlist);
+	error = uvm_pglistalloc(sz, 0, -1, PAGE_SIZE, 0, &mlist, nsegments,
+	    (flags & BUS_DMA_NOWAIT) ? UVM_PLA_NOWAIT : UVM_PLA_WAITOK);
+	if (error) {
+		map->_dm_npages = 0;
+		km_free((void *)va, sz, &kv_any, &kp_none);
+		free(map, M_DEVBUF, mapsize);
+		return (ENOMEM);
+	}
+
+	sva = va;
+	ssize = sz;
+	pgnext = TAILQ_FIRST(&mlist);
+	for (pg = map->_dm_pages; npages--; va += PAGE_SIZE, pg++) {
+		*pg = pgnext;
+		error = pmap_enter(pmap_kernel(), va, VM_PAGE_TO_PHYS(*pg),
+		    PROT_READ | PROT_WRITE,
+		    PROT_READ | PROT_WRITE | PMAP_WIRED |
+		    PMAP_CANFAIL | PMAP_NOCRYPT);
+		if (error) {
+			pmap_update(pmap_kernel());
+			map->_dm_npages = 0;
+			km_free((void *)sva, ssize, &kv_any, &kp_none);
+			free(map, M_DEVBUF, mapsize);
+			uvm_pglistfree(&mlist);
+			return (ENOMEM);
+		}
+		pgnext = TAILQ_NEXT(*pg, pageq);
+		bzero((void *)va, PAGE_SIZE);
+	}
+	pmap_update(pmap_kernel());
+	map->_dm_pgva = sva;
+
 	*dmamp = map;
 	return (0);
 }
@@ -149,7 +215,25 @@ void
 _bus_dmamap_destroy(bus_dma_tag_t t, bus_dmamap_t map)
 {
 	size_t mapsize;
-
+	struct vm_page **pg;
+	struct pglist mlist;
+	vaddr_t va;
+
+	if (map->_dm_pgva) {
+		km_free((void *)map->_dm_pgva, map->_dm_npages << PGSHIFT,
+		    &kv_any, &kp_none);
+	}
+
+	if (map->_dm_pages) {
+		TAILQ_INIT(&mlist);
+		for (pg = map->_dm_pages, va = map->_dm_pgva;
+		    map->_dm_npages--; va += PAGE_SIZE, pg++) {
+			pmap_remove(pmap_kernel(), va, va + PAGE_SIZE);
+			TAILQ_INSERT_TAIL(&mlist, *pg, pageq);
+		}
+		uvm_pglistfree(&mlist);
+	}
+
 	mapsize = sizeof(struct bus_dmamap) +
 	    (sizeof(bus_dma_segment_t) * (map->_dm_segcnt - 1));
 	free(map, M_DEVBUF, mapsize);
@@ -383,6 +467,7 @@ _bus_dmamap_unload(bus_dma_tag_t t, bus_dmamap_t map)
 	 */
 	map->dm_mapsize = 0;
 	map->dm_nsegs = 0;
+	map->_dm_nused = 0;
 }
 
 /*
@@ -393,7 +478,40 @@ void
 _bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t addr,
     bus_size_t size, int op)
 {
-	/* Nothing to do here. */
+	bus_dma_segment_t *sg;
+	int i, off = addr;
+	bus_size_t l;
+
+	if (!cpu_sev_guestmode)
+		return;
+
+	for (i = map->_dm_segcnt, sg = map->dm_segs; size && i--; sg++) {
+		if (off >= sg->ds_len) {
+			off -= sg->ds_len;
+			continue;
+		}
+
+		l = sg->ds_len - off;
+		if (l > size)
+			l = size;
+		size -= l;
+
+		/* PREREAD and POSTWRITE are no-ops. */
+
+		/* READ: device -> memory */
+		if (op & BUS_DMASYNC_POSTREAD) {
+			bcopy((void *)(sg->ds_bounce + off),
+			    (void *)(sg->ds_va + off), l);
+		}
+
+		/* WRITE: memory -> device */
+		if (op & BUS_DMASYNC_PREWRITE) {
+			bcopy((void *)(sg->ds_va + off),
+			    (void *)(sg->ds_bounce + off), l);
+		}
+
+		off = 0;
+	}
 }
 
 /*
@@ -565,10 +683,11 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
     int *segp, int first)
 {
 	bus_size_t sgsize;
-	bus_addr_t curaddr, lastaddr, baddr, bmask;
-	vaddr_t vaddr = (vaddr_t)buf;
-	int seg;
+	bus_addr_t curaddr, lastaddr, baddr, bmask, oaddr = -1;
+	vaddr_t pgva = -1, vaddr = (vaddr_t)buf;
+	int seg, page, off;
 	pmap_t pmap;
+	struct vm_page *pg;
 
 	if (p != NULL)
 		pmap = p->p_vmspace->vm_map.pmap;
@@ -589,6 +708,19 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
 			panic("Non dma-reachable buffer at curaddr %#lx(raw)",
 			    curaddr);
 
+		if (cpu_sev_guestmode) {
+			/* use bounce buffer */
+			if (map->_dm_nused + 1 >= map->_dm_npages)
+				return (ENOMEM);
+
+			off = vaddr & PAGE_MASK;
+			pg = map->_dm_pages[page = map->_dm_nused++];
+			oaddr = curaddr;
+			curaddr = VM_PAGE_TO_PHYS(pg) + off;
+
+			pgva = map->_dm_pgva + (page << PGSHIFT) + off;
+		}
+
 		/*
 		 * Compute the segment size, and adjust counts.
 		 */
@@ -611,7 +743,10 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
 		 */
 		if (first) {
 			map->dm_segs[seg].ds_addr = curaddr;
+			map->dm_segs[seg].ds_addr2 = oaddr;
 			map->dm_segs[seg].ds_len = sgsize;
+			map->dm_segs[seg].ds_va = vaddr;
+			map->dm_segs[seg].ds_bounce = pgva;
 			first = 0;
 		} else {
 			if (curaddr == lastaddr &&
@@ -625,7 +760,10 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
 				if (++seg >= map->_dm_segcnt)
 					break;
 				map->dm_segs[seg].ds_addr = curaddr;
+				map->dm_segs[seg].ds_addr2 = oaddr;
 				map->dm_segs[seg].ds_len = sgsize;
+				map->dm_segs[seg].ds_va = vaddr;
+				map->dm_segs[seg].ds_bounce = pgva;
 			}
 		}
 
diff --git a/sys/arch/amd64/include/bus.h b/sys/arch/amd64/include/bus.h
index 33d6cd6eaeb..e3ceaf71ee7 100644
--- a/sys/arch/amd64/include/bus.h
+++ b/sys/arch/amd64/include/bus.h
@@ -551,7 +551,11 @@ typedef struct bus_dmamap *bus_dmamap_t;
  */
 struct bus_dma_segment {
 	bus_addr_t	ds_addr;	/* DMA address */
+	bus_addr_t	ds_addr2;	/* replacement store */
 	bus_size_t	ds_len;		/* length of transfer */
+	vaddr_t		ds_va;		/* mapped loaded data */
+	vaddr_t		ds_bounce;	/* mapped bounced data */
+
 	/*
 	 * Ugh. need this so can pass alignment down from bus_dmamem_alloc
 	 * to scatter gather maps. only the first one is used so the rest is
@@ -655,6 +659,11 @@ struct bus_dmamap {
 	void		*_dm_cookie;	/* cookie for bus-specific functions */
 
+	struct vm_page	**_dm_pages;	/* replacement pages */
+	vaddr_t		_dm_pgva;	/* those above -- mapped */
+	int		_dm_npages;	/* number of pages allocated */
+	int		_dm_nused;	/* number of pages replaced */
+
 	/*
 	 * PUBLIC MEMBERS: these are used by machine-independent code.
 	 */
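
For illustration, here is the driver-side sketch mentioned above: a
minimal example of how the unchanged bus_dma(9) call sequence interacts
with the bounce buffers when cpu_sev_guestmode is set.  This is not
part of the diff; the function name "example_xfer" and the tag/buf/len
arguments are hypothetical placeholders, and error handling is reduced
to the bare minimum.

#include <sys/param.h>
#include <sys/systm.h>
#include <machine/bus.h>

/*
 * Illustrative sketch only: one memory -> device transfer.  With the
 * diff applied, the bouncing is completely transparent at this level.
 */
int
example_xfer(bus_dma_tag_t tag, void *buf, bus_size_t len)
{
	bus_dmamap_t map;
	int error;

	/* In SEV guest mode this also allocates the PMAP_NOCRYPT bounce pages. */
	error = bus_dmamap_create(tag, len, 1, len, 0, BUS_DMA_NOWAIT, &map);
	if (error)
		return (error);

	/*
	 * In SEV guest mode ds_addr now points at the bounce page; the
	 * original (encrypted) address is kept in ds_addr2.
	 */
	error = bus_dmamap_load(tag, map, buf, len, NULL, BUS_DMA_NOWAIT);
	if (error) {
		bus_dmamap_destroy(tag, map);
		return (error);
	}

	/* PREWRITE copies buf into the shared, unencrypted bounce page. */
	bus_dmamap_sync(tag, map, 0, len, BUS_DMASYNC_PREWRITE);

	/* ... hand map->dm_segs[0].ds_addr to the (virtio) device here ... */

	/* For a device -> memory transfer, POSTREAD copies the data back. */
	bus_dmamap_sync(tag, map, 0, len, BUS_DMASYNC_POSTREAD);

	bus_dmamap_unload(tag, map);
	bus_dmamap_destroy(tag, map);
	return (0);
}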