Download raw body.
Please test: parallel fault handling
On Tue, Jun 03, 2025 at 06:21:17PM +0200, Jeremie Courreges-Anglas wrote:
> On Sun, May 25, 2025 at 11:20:46PM +0200, Jeremie Courreges-Anglas wrote:
> > On Thu, May 22, 2025 at 08:19:38PM +0200, Mark Kettenis wrote:
> > > > Date: Thu, 22 May 2025 18:54:08 +0200
> > > > From: Jeremie Courreges-Anglas <jca@wxcvbn.org>
> > [...]
> > > > *Bzzzt*
> > > >
> > > > The same LDOM was busy compiling two devel/llvm copies under dpb(1).
> > > > Input welcome, I'm not sure yet what other ddb commands could help.
> > > >
> > > > login: panic: trap type 0x34 (mem address not aligned): pc=1012f68 npc=1012f6c pstate=820006<PRIV,IE>
> > > > Stopped at db_enter+0x8: nop
> > > > TID PID UID PRFLAGS PFLAGS CPU COMMAND
> > > > 57488 1522 0 0x11 0 1 perl
> > > > 435923 9891 55 0x1000002 0 4 cc1plus
> > > > 135860 36368 55 0x1000002 0 13 cc1plus
> > > > 333743 96489 55 0x1000002 0 0 cc1plus
> > > > 433162 55422 55 0x1000002 0 9 cc1plus
> > > > 171658 49723 55 0x1000002 0 5 cc1plus
> > > > 47127 57536 55 0x1000002 0 10 cc1plus
> > > > 56600 9350 55 0x1000002 0 14 cc1plus
> > > > 159792 13842 55 0x1000002 0 6 cc1plus
> > > > 510019 10312 55 0x1000002 0 8 cc1plus
> > > > 20489 65709 55 0x1000002 0 15 cc1plus
> > > > 337455 42430 55 0x1000002 0 12 cc1plus
> > > > 401407 80906 55 0x1000002 0 11 cc1plus
> > > > 22993 62317 55 0x1000002 0 2 cc1plus
> > > > 114916 17058 55 0x1000002 0 7 cc1plus
> > > > *435412 33034 0 0x14000 0x200 3K pagedaemon
> > > > trap(400fe6b19b0, 34, 1012f68, 820006, 3, 42) at trap+0x334
> > > > Lslowtrap_reenter(40015a58a00, 77b5db2000, deadbeefdeadc0c7, 1d8, 2df0fc468, 468) at Lslowtrap_reenter+0xf8
> > > > pmap_page_protect(40010716ab8, c16, 1cc9860, 193dfa0, 1cc9000, 1cc9000) at pmap_page_protect+0x1fc
> > > > uvm_pagedeactivate(40010716a50, 40015a50d24, 18667a0, 0, 0, 1c8dac0) at uvm_pagedeactivate+0x54
> > > > uvmpd_scan_active(0, 0, 270f2, 18667a0, 0, ffffffffffffffff) at uvmpd_scan_active+0x150
> > > > uvm_pageout(400fe6b1e08, 55555556, 18667a0, 1c83f08, 1c83000, 1c8dc18) at uvm_pageout+0x2dc
> > > > proc_trampoline(0, 0, 0, 0, 0, 0) at proc_trampoline+0x10
> > > > https://www.openbsd.org/ddb.html describes the minimum info required in bug
> > > > reports. Insufficient info makes it difficult to find and fix bugs.
> > >
> > > If there are pmap issues, pmap_page_protect() is certainly the first
> > > place I'd look. I'll start looking, but don't expect to have much
> > > time until after monday.
> >
> > Indeed this crash lies in pmap_page_protect(). llvm-objdump -dlS says
> > it's stopped at l.2499:
> >
> > } else {
> > pv_entry_t firstpv;
> > /* remove mappings */
> >
> > firstpv = pa_to_pvh(pa);
> > mtx_enter(&pg->mdpage.pvmtx);
> >
> > /* First remove the entire list of continuation pv's*/
> > while ((pv = firstpv->pv_next) != NULL) {
> > --> data = pseg_get(pv->pv_pmap, pv->pv_va & PV_VAMASK);
> >
> > /* Save REF/MOD info */
> > firstpv->pv_va |= pmap_tte2flags(data);
> >
> > ; /sys/arch/sparc64/sparc64/pmap.c:2499
> > ; data = pseg_get(pv->pv_pmap, pv->pv_va & PV_VAMASK);
> > 3c10: a7 29 30 0d sllx %g4, 13, %l3
> > 3c14: d2 5c 60 10 ldx [%l1+16], %o1
> > 3c18: d0 5c 60 08 ldx [%l1+8], %o0
> > --> 3c1c: 40 00 00 00 call 0
> > 3c20: 92 0a 40 13 and %o1, %l3, %o1
> >
> > As discussed with miod I suspect the crash actually lies inside
> > pseg_get(), but I can't prove it.
>
> Another similar crash, at the very same offset in pmap_page_protect,
> with:
> - pmap_collect() removed
> - uvm_purge() applied
> - uvm parallel fault applied
To try to reproduce this one, I went back to:
- pmap_collect() applied
- uvm_purge() backed out
- uvm parallel fault applied
- pmap_page_protect() simplification applied
In the parent mail in this thread I only dumped the first pv entry of
the page. Here we can see that the pmap of the second entry in the pv
list appears corrupted.
This is relatively easy for me to reproduce: I just need to build rust
and another big port in parallel. rust is a big user of threads.
ddb{1}> sh panic
*cpu1: kernel data fault: pc=1012f68 addr=140ace698
cpu6: kernel diagnostic assertion "(bp->b_flags & B_BUSY) == 0" failed: file "/sys/kern/vfs_biomem.c", line 60
cpu8: pool_do_put: namei: double pool_put: 0x40111a4a400
cpu11: pool_do_get: amappl10: page empty
cpu15: pool_do_get: amappl10: page empty
ddb{1}> ps /o
TID PID UID PRFLAGS PFLAGS CPU COMMAND
275990 93158 55 0x1000002 0 8 llvm-tblgen
325404 591 55 0x1000002 0x4000000 2 rustc
75703 21571 55 0x1000002 0 13 llvm-tblgen
255026 26724 55 0x1000002 0 11 llvm-tblgen
107507 15517 55 0x1000002 0 15 llvm-tblgen
33864 44824 55 0x1000002 0x4000000 0 rustc
160053 28821 55 0x1000002 0x4000000 4 rustc
433855 21856 55 0x1002802 0x4002000 6 rustc
500815 3711 0 0x13 0 7 perl
419052 55730 115 0x100012 0 3 slaacd
254297 67737 0 0x100000 0 12 slaacd
446399 32364 0 0x14000 0x200 9 reaper
*260624 12034 0 0x14000 0x200 1 pagedaemon
395177 29816 0 0x14000 0x200 5 softnet0
414544 9880 0 0x14000 0x200 14 systqmp
396133 8080 0 0x14000 0x200 10 kmthread
ddb{1}> tr
data_access_fault(400fe6b99b0, 30, 1012f68, 140ace698, 140ace698, 1) at data_access_fault+0x2f0
sun4v_datatrap(40100f543d0, 3ad9f06000, dead4110dead41f8, e8, 140ace698, 698) at sun4v_datatrap+0x200
pmap_page_protect(4000761cb58, 4010a17cb20, 1cc4530, 18ec840, 1cc4000, 1cc4000) at pmap_page_protect+0x478
uvm_pagedeactivate(4000761caf0, 0, 18764a8, 0, 0, 1ccc318) at uvm_pagedeactivate+0x54
uvmpd_scan_active(0, 0, d9, 18764a8, ffffffffffffffff, ffffffffffffffff) at uvmpd_scan_active+0x150
uvm_pageout(400fe6b9e08, 55555556, 18764a8, 1ccc708, 1ccc000, 1ccc470) at uvm_pageout+0x2dc
proc_trampoline(0, 0, 0, 0, 0, 0) at proc_trampoline+0x10
ddb{1}> show page 4000761caf0
PAGE 0x4000761caf0:
flags=4000c<TABLED,CLEAN,ACTIVE>, vers=102, wire_count=0, pa=0x1d55f2000
uobject=0x4001341b810, uanon=0x0, offset=0x82b2000
[page ownership tracking disabled] vm_page_md 0x4000761cb58
ddb{1}> sh struct vm_page_md 4000761cb58
struct vm_page_md at 0x4000761cb58 (40 bytes) {pvmtx = {mtx_owner = (void *)0x403c1ff8000, mtx_wantipl = 12, mtx_oldipl = 12}, pvent = {pv_next = (struct pv_entry *)0x4010a17cb20, pv_pmap = (struct pmap *)0x40100f552d0, pv_va = 995190661123}}
ddb{1}> mach pv 1d55f2000
pv@0x4000761cb68: next=0x4010a17cb20 pmap=0x40100f552d0 va=0xe7b5fc6003
pv@0x4010a17cb20: next=0x4010b1d9bf0 pmap=0x40100f543d0 va=0x3ad9f06000
pv@0x4010b1d9bf0: next=0x0 pmap=0x40100f55a10 va=0x22f1faa000
ddb{1}> mach pmap 40100f552d0
pmap 0x40100f552d0: ctx 164e refs 1 physaddr 18dffe000 psegs 0x18dffe000
seg 0 => 41f912000
seg 512 => 41f5d4000 seg 513 => 5f742000
ddb{1}> mach pmap 40100f543d0
pmap 0x40100f543d0: ctx dead4110 refs -559070960 physaddr dead4110dead4110 psegs 0xdead4110dead4110
seg 0 => 41f912000
seg 512 => 41f5d4000 seg 513 => 5f742000
ddb{1}> mach pmap 40100f55a10
pmap 0x40100f55a10: ctx 16ed refs 1 physaddr 232cc000 psegs 0x232cc000
seg 0 => 41f912000
seg 512 => 41f5d4000 seg 513 => 5f742000
--
jca
Please test: parallel fault handling