From: Jeremie Courreges-Anglas Subject: Re: Please test: parallel fault handling To: Mark Kettenis , tech@openbsd.org Date: Tue, 3 Jun 2025 18:21:17 +0200 On Sun, May 25, 2025 at 11:20:46PM +0200, Jeremie Courreges-Anglas wrote: > On Thu, May 22, 2025 at 08:19:38PM +0200, Mark Kettenis wrote: > > > Date: Thu, 22 May 2025 18:54:08 +0200 > > > From: Jeremie Courreges-Anglas > [...] > > > *Bzzzt* > > > > > > The same LDOM was busy compiling two devel/llvm copies under dpb(1). > > > Input welcome, I'm not sure yet what other ddb commands could help. > > > > > > login: panic: trap type 0x34 (mem address not aligned): pc=1012f68 npc=1012f6c pstate=820006 > > > Stopped at db_enter+0x8: nop > > > TID PID UID PRFLAGS PFLAGS CPU COMMAND > > > 57488 1522 0 0x11 0 1 perl > > > 435923 9891 55 0x1000002 0 4 cc1plus > > > 135860 36368 55 0x1000002 0 13 cc1plus > > > 333743 96489 55 0x1000002 0 0 cc1plus > > > 433162 55422 55 0x1000002 0 9 cc1plus > > > 171658 49723 55 0x1000002 0 5 cc1plus > > > 47127 57536 55 0x1000002 0 10 cc1plus > > > 56600 9350 55 0x1000002 0 14 cc1plus > > > 159792 13842 55 0x1000002 0 6 cc1plus > > > 510019 10312 55 0x1000002 0 8 cc1plus > > > 20489 65709 55 0x1000002 0 15 cc1plus > > > 337455 42430 55 0x1000002 0 12 cc1plus > > > 401407 80906 55 0x1000002 0 11 cc1plus > > > 22993 62317 55 0x1000002 0 2 cc1plus > > > 114916 17058 55 0x1000002 0 7 cc1plus > > > *435412 33034 0 0x14000 0x200 3K pagedaemon > > > trap(400fe6b19b0, 34, 1012f68, 820006, 3, 42) at trap+0x334 > > > Lslowtrap_reenter(40015a58a00, 77b5db2000, deadbeefdeadc0c7, 1d8, 2df0fc468, 468) at Lslowtrap_reenter+0xf8 > > > pmap_page_protect(40010716ab8, c16, 1cc9860, 193dfa0, 1cc9000, 1cc9000) at pmap_page_protect+0x1fc > > > uvm_pagedeactivate(40010716a50, 40015a50d24, 18667a0, 0, 0, 1c8dac0) at uvm_pagedeactivate+0x54 > > > uvmpd_scan_active(0, 0, 270f2, 18667a0, 0, ffffffffffffffff) at uvmpd_scan_active+0x150 > > > uvm_pageout(400fe6b1e08, 55555556, 18667a0, 1c83f08, 1c83000, 1c8dc18) at 
uvm_pageout+0x2dc > > > proc_trampoline(0, 0, 0, 0, 0, 0) at proc_trampoline+0x10 > > > https://www.openbsd.org/ddb.html describes the minimum info required in bug > > > reports. Insufficient info makes it difficult to find and fix bugs. > > > > If there are pmap issues, pmap_page_protect() is certainly the first > > place I'd look. I'll start looking, but don't expect to have much > > time until after monday. > > Indeed this crash lies in pmap_page_protect(). llvm-objdump -dlS says > it's stopped at l.2499: > > } else { > pv_entry_t firstpv; > /* remove mappings */ > > firstpv = pa_to_pvh(pa); > mtx_enter(&pg->mdpage.pvmtx); > > /* First remove the entire list of continuation pv's*/ > while ((pv = firstpv->pv_next) != NULL) { > --> data = pseg_get(pv->pv_pmap, pv->pv_va & PV_VAMASK); > > /* Save REF/MOD info */ > firstpv->pv_va |= pmap_tte2flags(data); > > ; /sys/arch/sparc64/sparc64/pmap.c:2499 > ; data = pseg_get(pv->pv_pmap, pv->pv_va & PV_VAMASK); > 3c10: a7 29 30 0d sllx %g4, 13, %l3 > 3c14: d2 5c 60 10 ldx [%l1+16], %o1 > 3c18: d0 5c 60 08 ldx [%l1+8], %o0 > --> 3c1c: 40 00 00 00 call 0 > 3c20: 92 0a 40 13 and %o1, %l3, %o1 > > As discussed with miod I suspect the crash actually lies inside > pseg_get(), but I can't prove it. 
Another similar crash, at the very same offset in pmap_page_protect, with: - pmap_collect() removed - uvm_purge() applied - uvm parallel fault applied ddb{10}> sh panic cpu7: pmemrange allocation error: allocated 9 pages in 9 segments, but request was 8 pages in 8 segments *cpu10: trap type 0x34 (mem address not aligned): pc=1012f68 npc=1012f6c pstate=820006 cpu12: trap type 0x34 (mem address not aligned): pc=16d99f0 npc=16d9a8c pstate=44820006 cpu14: kernel data fault: pc=128ebd8 addr=0 ddb{10}> tr trap(400fe6b99b0, 34, 1012f68, 820006, 3, 42) at trap+0x334 Lslowtrap_reenter(40015a59d00, f70ad7e000, deadbeefdeadc2c7, 3d8, 3d9424f58, f58) at Lslowtrap_reenter+0xf8 pmap_page_protect(4000a70ead8, 1481, 1c99e88, 1826700, 1c99000, 1c99000) at pmap_page_protect+0x1fc uvm_pagedeactivate(4000a70ea70, 40015a50544, 18227c8, 40106638f60, 0, 1cc6058) at uvm_pagedeactivate+0x54 uvmpd_scan_active(0, 0, 73, 18227c8, 0, ffffffffffffffff) at uvmpd_scan_active+0x150 uvm_pageout(400fe6b9e08, 55555556, 18227c8, 1c93848, 1c93000, 1cc61b0) at uvm_pageout+0x2dc proc_trampoline(0, 0, 0, 0, 0, 0) at proc_trampoline+0x10 ddb{10}> show uvm Current UVM status: pagesize=8192 (0x2000), pagemask=0x1fff, pageshift=13 2057212 VM pages: 1298089 active, 197207 inactive, 1 wired, 87758 free (76926 zero) freemin=68573, free-target=91430, inactive-target=497199, wired-max=685737 faults=616160313, traps=669516640, intrs=148126965, ctxswitch=39682662 fpuswitch=670541 softint=15781509, syscalls=295482978, kmapent=12 fault counts: noram=0, noanon=0, noamap=0, pgwait=2, pgrele=0 relocks=819456(11130), upgrades=393887632(4819) anget(retries)=186971354(175431), amapcopy=36307633 neighbor anon/obj pg=65897919/49829919, gets(lock/unlock)=36304591/655392 cases: anon=118424896, anoncow=68371027, obj=30767834, prcopy=5520933, przero=393075610 daemon and swap counts: woke=15121, revs=14991, scans=36301364, obscans=14283, anscans=19957433 busy=0, freed=324734, reactivate=16327849, deactivate=23119683 
pageouts=2455919, pending=36130, nswget=175435 nswapdev=1 swpages=2130619, swpginuse=12925, swpgonly=11729 paging=0 kernel pointers: objs(kern)=0x1c7ac40 ddb{10}> ps /o TID PID UID PRFLAGS PFLAGS CPU COMMAND 203190 61835 0 0x11 0 6 perl 238524 44552 55 0x1000002 0 9 cc1plus 164719 6199 55 0x1000802 0x2000 11 cc1 292678 85783 55 0x1000802 0x2000 0 cc1 403905 16469 55 0x1000002 0 7 llvm-tblgen 129561 18309 55 0x1000002 0 14 llvm-tblgen 520584 62689 55 0x1000002 0x4000000 5 rustc 307416 62689 55 0x1000002 0x4000000 1 rustc 348685 62689 55 0x1000002 0x4000000 4 rustc 45025 62689 55 0x1000002 0x4000000 13 rustc 27728 62689 55 0x1000002 0x4000000 3 rustc 504117 3083 74 0x1100012 0 15 pflogd 310807 82726 115 0x100012 0 2 slaacd *254454 33715 0 0x14000 0x200 10 pagedaemon 344259 58467 0 0x14000 0x200 8 softnet0 ddb{10}> show proc PROC (pagedaemon) tid=254454 pid=33715 tcnt=1 stat=onproc flags process=14000 proc=200 runpri=4, usrpri=75, slppri=4, nice=20 wchan=0x0, wmesg=, ps_single=0x0 scnt=0 ecnt=0 forw=0xffffffffffffffff, list=0x40015a507e0,0x40015a502b0 process=0x400fe685b60 user=0x400fe6b6000, vmspace=0x1c7c070 estcpu=25, cpticks=0, pctcpu=1.9, user=0, sys=0, intr=0 ddb{10}> ddb{10}> show page 4000a70ea70 PAGE 0x4000a70ea70: flags=4000c, vers=226, wire_count=0, pa=0x283666000 uobject=0x4010686dd60, uanon=0x0, offset=0xa000 [page ownership tracking disabled] vm_page_md 0x4000a70ead8 ddb{10}> show struct vm_page_md 4000a70ead8 struct vm_page_md at 0x4000a70ead8 (40 bytes) {pvmtx = {mtx_owner = (void *)0x403c1f68000, mtx_wantipl = 12, mtx_oldipl = 12}, pvent = {pv_next = (struct pv_entry *)0x4010ba2fac0, pv_pmap = (struct pmap *)0x400ff201200, pv_va = 483741900803}} ddb{10}> mach pmap 400ff201200 pmap 0x400ff201200: ctx 730 refs 1 physaddr 37a8f6000 psegs 0x37a8f6000 seg 0 => 41f912000 seg 512 => 41f5d8000 seg 513 => 5f742000 ddb{10}> Dunno how useful that data is. I moved the machine out of ddb to verify whether I really had the pmap_collect() diff applied (yep). 
Bulk build restarted, hopefully that one won't be too hard to reproduce. -- jca