From: Vitaliy Makkoveev Subject: sysctl_file(): rework FILLSO() macro To: Alexander Bluhm , tech@openbsd.org Date: Sun, 12 Jan 2025 03:58:04 +0300 I don't like how KERN_FILE_BYFILE case of the sysctl_file() delivers sockets data to the userland. It not only takes exclusive netlock around all except divert socket tables walkthrough, but also does copyout() with mutex(9) held. Sounds strange, but the context switch is avoided because userland pages are wired. The table is protected with `inpt_mtx' mutex(9), so the socket lock or netlock should be take outside. Since we have no socket pointer, we can't use solock(). We can't use the only shared netlock because this left socket unprotected against concurrent threads which rely on solock_shared(). Now it is possible to fix this precious gem. We can use reference counting for all socket types, and bluhm@ introduced `inp_sofree_mtx' mutex(9) to protect `inp_socket'. The INP tables have the special iterator to safely release `inpt_mtx' during table walkthrough. The FILLSO() or FILLIT2() macros can't be used unrolled, because we need to push mutexes re-locking and solock() deep within. I made the FILLINPTABLE() macro which is the unrolling, but with the socket related locking dances. FILLIT2() macro is not required anymore and was merged with FILLIT(). Current implementation takes the reference on `inp_socket' and releases `inpt_mtx' mutex(9). Now it's possible to fairly use shared_solock() on socket instead of netlock while performing fill_file(). The copyout() is external for fill_file() and touches nothing required to be protected so it could be made lockless. The KERN_FILE_BYFILE case became mp-safe, but the rest sysctl_file() cases still not, so the sysctl_vslock() dances left as is. Index: sys/kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.460 diff -u -p -r1.460 kern_sysctl.c --- sys/kern/kern_sysctl.c 4 Jan 2025 09:26:01 -0000 1.460 +++ sys/kern/kern_sysctl.c 12 Jan 2025 00:16:41 -0000 @@ -1672,9 +1672,10 @@ sysctl_file(int *name, u_int namelen, ch kf = malloc(sizeof(*kf), M_TEMP, M_WAITOK); -#define FILLIT2(fp, fdp, i, vp, pr, so) do { \ +#define FILLIT(fp, fdp, i, vp, pr) do { \ if (buflen >= elem_size && elem_count > 0) { \ - fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\ + fill_file(kf, fp, fdp, i, vp, pr, p, NULL, \ + show_pointers); \ error = copyout(kf, dp, outsize); \ if (error) \ break; \ @@ -1684,62 +1685,59 @@ sysctl_file(int *name, u_int namelen, ch } \ needed += elem_size; \ } while (0) -#define FILLIT(fp, fdp, i, vp, pr) \ - FILLIT2(fp, fdp, i, vp, pr, NULL) -#define FILLSO(so) \ - FILLIT2(NULL, NULL, 0, NULL, NULL, so) + +#define FILLINPTABLE(table) \ +do { \ + struct inpcb_iterator iter = { .inp_table = NULL }; \ + struct inpcb *inp = NULL; \ + struct socket *so; \ + \ + mtx_enter(&(table)->inpt_mtx); \ + while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { \ + if (buflen >= elem_size && elem_count > 0) { \ + mtx_enter(&inp->inp_sofree_mtx); \ + so = soref(inp->inp_socket); \ + mtx_leave(&inp->inp_sofree_mtx); \ + if (so == NULL) \ + continue; \ + mtx_leave(&(table)->inpt_mtx); \ + solock_shared(so); \ + fill_file(kf, NULL, NULL, 0, NULL, NULL, p, \ + so, show_pointers); \ + sounlock_shared(so); \ + sorele(so); \ + error = copyout(kf, dp, outsize); \ + mtx_enter(&(table)->inpt_mtx); \ + if (error) { \ + in_pcb_iterator_abort((table), inp, \ + &iter); \ + break; \ + } \ + dp += elem_size; \ + buflen -= elem_size; \ + elem_count--; \ + } \ + needed += elem_size; \ + } \ + mtx_leave(&(table)->inpt_mtx); \ +} while (0) switch (op) { case KERN_FILE_BYFILE: /* use the inp-tables to pick up closed connections, too */ if (arg == DTYPE_SOCKET) { - struct inpcb *inp; - - NET_LOCK(); - mtx_enter(&tcbtable.inpt_mtx); - TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) - FILLSO(inp->inp_socket); - mtx_leave(&tcbtable.inpt_mtx); + FILLINPTABLE(&tcbtable); #ifdef INET6 - mtx_enter(&tcb6table.inpt_mtx); - TAILQ_FOREACH(inp, &tcb6table.inpt_queue, inp_queue) - FILLSO(inp->inp_socket); - mtx_leave(&tcb6table.inpt_mtx); + FILLINPTABLE(&tcb6table); #endif - mtx_enter(&udbtable.inpt_mtx); - TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) { - if (in_pcb_is_iterator(inp)) - continue; - FILLSO(inp->inp_socket); - } - mtx_leave(&udbtable.inpt_mtx); + FILLINPTABLE(&udbtable); #ifdef INET6 - mtx_enter(&udb6table.inpt_mtx); - TAILQ_FOREACH(inp, &udb6table.inpt_queue, inp_queue) { - if (in_pcb_is_iterator(inp)) - continue; - FILLSO(inp->inp_socket); - } - mtx_leave(&udb6table.inpt_mtx); + FILLINPTABLE(&udb6table); #endif - mtx_enter(&rawcbtable.inpt_mtx); - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { - if (in_pcb_is_iterator(inp)) - continue; - FILLSO(inp->inp_socket); - } - mtx_leave(&rawcbtable.inpt_mtx); + FILLINPTABLE(&rawcbtable); #ifdef INET6 - mtx_enter(&rawin6pcbtable.inpt_mtx); - TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue, - inp_queue) { - if (in_pcb_is_iterator(inp)) - continue; - FILLSO(inp->inp_socket); - } - mtx_leave(&rawin6pcbtable.inpt_mtx); + FILLINPTABLE(&rawin6pcbtable); #endif - NET_UNLOCK(); } fp = NULL; while ((fp = fd_iterfile(fp, p)) != NULL) {