From: Alexander Bluhm Subject: Re: sysctl_file(): rework FILLSO() macro To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Mon, 13 Jan 2025 14:24:48 +0100 On Sun, Jan 12, 2025 at 03:58:04AM +0300, Vitaliy Makkoveev wrote: > I don't like how KERN_FILE_BYFILE case of the sysctl_file() delivers > sockets data to the userland. It not only takes exclusive netlock > around all except divert socket tables walkthrough, but also does > copyout() with mutex(9) held. Sounds strange, but the context switch > is avoided because userland pages are wired. > > The table is protected with `inpt_mtx' mutex(9), so the socket lock or > netlock should be take outside. Since we have no socket pointer, we > can't use solock(). We can't use the only shared netlock because this > left socket unprotected against concurrent threads which rely on > solock_shared(). > > Now it is possible to fix this precious gem. We can use reference > counting for all socket types, and bluhm@ introduced `inp_sofree_mtx' > mutex(9) to protect `inp_socket'. The INP tables have the special > iterator to safely release `inpt_mtx' during table walkthrough. > > The FILLSO() or FILLIT2() macros can't be used unrolled, because we > need to push mutexes re-locking and solock() deep within. I made > the FILLINPTABLE() macro which is the unrolling, but with the socket > related locking dances. FILLIT2() macro is not required anymore and > was merged with FILLIT(). > > Current implementation takes the reference on `inp_socket' and > releases `inpt_mtx' mutex(9). Now it's possible to fairly use > shared_solock() on socket instead of netlock while performing > fill_file(). The copyout() is external for fill_file() and touches > nothing required to be protected so it could be made lockless. > > The KERN_FILE_BYFILE case became mp-safe, but the rest sysctl_file() > cases still not, so the sysctl_vslock() dances left as is. OK bluhm@ > Index: sys/kern/kern_sysctl.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v > retrieving revision 1.460 > diff -u -p -r1.460 kern_sysctl.c > --- sys/kern/kern_sysctl.c 4 Jan 2025 09:26:01 -0000 1.460 > +++ sys/kern/kern_sysctl.c 12 Jan 2025 00:16:41 -0000 > @@ -1672,9 +1672,10 @@ sysctl_file(int *name, u_int namelen, ch > > kf = malloc(sizeof(*kf), M_TEMP, M_WAITOK); > > -#define FILLIT2(fp, fdp, i, vp, pr, so) do { \ > +#define FILLIT(fp, fdp, i, vp, pr) do { \ > if (buflen >= elem_size && elem_count > 0) { \ > - fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\ > + fill_file(kf, fp, fdp, i, vp, pr, p, NULL, \ > + show_pointers); \ > error = copyout(kf, dp, outsize); \ > if (error) \ > break; \ > @@ -1684,62 +1685,59 @@ sysctl_file(int *name, u_int namelen, ch > } \ > needed += elem_size; \ > } while (0) > -#define FILLIT(fp, fdp, i, vp, pr) \ > - FILLIT2(fp, fdp, i, vp, pr, NULL) > -#define FILLSO(so) \ > - FILLIT2(NULL, NULL, 0, NULL, NULL, so) > + > +#define FILLINPTABLE(table) \ > +do { \ > + struct inpcb_iterator iter = { .inp_table = NULL }; \ > + struct inpcb *inp = NULL; \ > + struct socket *so; \ > + \ > + mtx_enter(&(table)->inpt_mtx); \ > + while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { \ > + if (buflen >= elem_size && elem_count > 0) { \ > + mtx_enter(&inp->inp_sofree_mtx); \ > + so = soref(inp->inp_socket); \ > + mtx_leave(&inp->inp_sofree_mtx); \ > + if (so == NULL) \ > + continue; \ > + mtx_leave(&(table)->inpt_mtx); \ > + solock_shared(so); \ > + fill_file(kf, NULL, NULL, 0, NULL, NULL, p, \ > + so, show_pointers); \ > + sounlock_shared(so); \ > + sorele(so); \ > + error = copyout(kf, dp, outsize); \ > + mtx_enter(&(table)->inpt_mtx); \ > + if (error) { \ > + in_pcb_iterator_abort((table), inp, \ > + &iter); \ > + break; \ > + } \ > + dp += elem_size; \ > + buflen -= elem_size; \ > + elem_count--; \ > + } \ > + needed += elem_size; \ > + } \ > + mtx_leave(&(table)->inpt_mtx); \ > +} while (0) > > switch (op) { > case KERN_FILE_BYFILE: > /* use the inp-tables to pick up closed connections, too */ > if (arg == DTYPE_SOCKET) { > - struct inpcb *inp; > - > - NET_LOCK(); > - mtx_enter(&tcbtable.inpt_mtx); > - TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) > - FILLSO(inp->inp_socket); > - mtx_leave(&tcbtable.inpt_mtx); > + FILLINPTABLE(&tcbtable); > #ifdef INET6 > - mtx_enter(&tcb6table.inpt_mtx); > - TAILQ_FOREACH(inp, &tcb6table.inpt_queue, inp_queue) > - FILLSO(inp->inp_socket); > - mtx_leave(&tcb6table.inpt_mtx); > + FILLINPTABLE(&tcb6table); > #endif > - mtx_enter(&udbtable.inpt_mtx); > - TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) { > - if (in_pcb_is_iterator(inp)) > - continue; > - FILLSO(inp->inp_socket); > - } > - mtx_leave(&udbtable.inpt_mtx); > + FILLINPTABLE(&udbtable); > #ifdef INET6 > - mtx_enter(&udb6table.inpt_mtx); > - TAILQ_FOREACH(inp, &udb6table.inpt_queue, inp_queue) { > - if (in_pcb_is_iterator(inp)) > - continue; > - FILLSO(inp->inp_socket); > - } > - mtx_leave(&udb6table.inpt_mtx); > + FILLINPTABLE(&udb6table); > #endif > - mtx_enter(&rawcbtable.inpt_mtx); > - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { > - if (in_pcb_is_iterator(inp)) > - continue; > - FILLSO(inp->inp_socket); > - } > - mtx_leave(&rawcbtable.inpt_mtx); > + FILLINPTABLE(&rawcbtable); > #ifdef INET6 > - mtx_enter(&rawin6pcbtable.inpt_mtx); > - TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue, > - inp_queue) { > - if (in_pcb_is_iterator(inp)) > - continue; > - FILLSO(inp->inp_socket); > - } > - mtx_leave(&rawin6pcbtable.inpt_mtx); > + FILLINPTABLE(&rawin6pcbtable); > #endif > - NET_UNLOCK(); > } > fp = NULL; > while ((fp = fd_iterfile(fp, p)) != NULL) {