Download raw body.
sysctl_file(): rework FILLSO() macro
On Sun, Jan 12, 2025 at 03:58:04AM +0300, Vitaliy Makkoveev wrote:
> I don't like how KERN_FILE_BYFILE case of the sysctl_file() delivers
> sockets data to the userland. It not only takes exclusive netlock
> around all except divert socket tables walkthrough, but also does
> copyout() with mutex(9) held. Sounds strange, but the context switch
> is avoided because userland pages are wired.
>
> The table is protected with `inpt_mtx' mutex(9), so the socket lock or
> netlock should be take outside. Since we have no socket pointer, we
> can't use solock(). We can't use the only shared netlock because this
> left socket unprotected against concurrent threads which rely on
> solock_shared().
>
> Now it is possible to fix this precious gem. We can use reference
> counting for all socket types, and bluhm@ introduced `inp_sofree_mtx'
> mutex(9) to protect `inp_socket'. The INP tables have the special
> iterator to safely release `inpt_mtx' during table walkthrough.
>
> The FILLSO() or FILLIT2() macros can't be used unrolled, because we
> need to push mutexes re-locking and solock() deep within. I made
> the FILLINPTABLE() macro which is the unrolling, but with the socket
> related locking dances. FILLIT2() macro is not required anymore and
> was merged with FILLIT().
>
> Current implementation takes the reference on `inp_socket' and
> releases `inpt_mtx' mutex(9). Now it's possible to fairly use
> shared_solock() on socket instead of netlock while performing
> fill_file(). The copyout() is external for fill_file() and touches
> nothing required to be protected so it could be made lockless.
>
> The KERN_FILE_BYFILE case became mp-safe, but the rest sysctl_file()
> cases still not, so the sysctl_vslock() dances left as is.
OK bluhm@
> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.460
> diff -u -p -r1.460 kern_sysctl.c
> --- sys/kern/kern_sysctl.c 4 Jan 2025 09:26:01 -0000 1.460
> +++ sys/kern/kern_sysctl.c 12 Jan 2025 00:16:41 -0000
> @@ -1672,9 +1672,10 @@ sysctl_file(int *name, u_int namelen, ch
>
> kf = malloc(sizeof(*kf), M_TEMP, M_WAITOK);
>
> -#define FILLIT2(fp, fdp, i, vp, pr, so) do { \
> +#define FILLIT(fp, fdp, i, vp, pr) do { \
> if (buflen >= elem_size && elem_count > 0) { \
> - fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\
> + fill_file(kf, fp, fdp, i, vp, pr, p, NULL, \
> + show_pointers); \
> error = copyout(kf, dp, outsize); \
> if (error) \
> break; \
> @@ -1684,62 +1685,59 @@ sysctl_file(int *name, u_int namelen, ch
> } \
> needed += elem_size; \
> } while (0)
> -#define FILLIT(fp, fdp, i, vp, pr) \
> - FILLIT2(fp, fdp, i, vp, pr, NULL)
> -#define FILLSO(so) \
> - FILLIT2(NULL, NULL, 0, NULL, NULL, so)
> +
> +#define FILLINPTABLE(table) \
> +do { \
> + struct inpcb_iterator iter = { .inp_table = NULL }; \
> + struct inpcb *inp = NULL; \
> + struct socket *so; \
> + \
> + mtx_enter(&(table)->inpt_mtx); \
> + while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { \
> + if (buflen >= elem_size && elem_count > 0) { \
> + mtx_enter(&inp->inp_sofree_mtx); \
> + so = soref(inp->inp_socket); \
> + mtx_leave(&inp->inp_sofree_mtx); \
> + if (so == NULL) \
> + continue; \
> + mtx_leave(&(table)->inpt_mtx); \
> + solock_shared(so); \
> + fill_file(kf, NULL, NULL, 0, NULL, NULL, p, \
> + so, show_pointers); \
> + sounlock_shared(so); \
> + sorele(so); \
> + error = copyout(kf, dp, outsize); \
> + mtx_enter(&(table)->inpt_mtx); \
> + if (error) { \
> + in_pcb_iterator_abort((table), inp, \
> + &iter); \
> + break; \
> + } \
> + dp += elem_size; \
> + buflen -= elem_size; \
> + elem_count--; \
> + } \
> + needed += elem_size; \
> + } \
> + mtx_leave(&(table)->inpt_mtx); \
> +} while (0)
>
> switch (op) {
> case KERN_FILE_BYFILE:
> /* use the inp-tables to pick up closed connections, too */
> if (arg == DTYPE_SOCKET) {
> - struct inpcb *inp;
> -
> - NET_LOCK();
> - mtx_enter(&tcbtable.inpt_mtx);
> - TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
> - FILLSO(inp->inp_socket);
> - mtx_leave(&tcbtable.inpt_mtx);
> + FILLINPTABLE(&tcbtable);
> #ifdef INET6
> - mtx_enter(&tcb6table.inpt_mtx);
> - TAILQ_FOREACH(inp, &tcb6table.inpt_queue, inp_queue)
> - FILLSO(inp->inp_socket);
> - mtx_leave(&tcb6table.inpt_mtx);
> + FILLINPTABLE(&tcb6table);
> #endif
> - mtx_enter(&udbtable.inpt_mtx);
> - TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
> - if (in_pcb_is_iterator(inp))
> - continue;
> - FILLSO(inp->inp_socket);
> - }
> - mtx_leave(&udbtable.inpt_mtx);
> + FILLINPTABLE(&udbtable);
> #ifdef INET6
> - mtx_enter(&udb6table.inpt_mtx);
> - TAILQ_FOREACH(inp, &udb6table.inpt_queue, inp_queue) {
> - if (in_pcb_is_iterator(inp))
> - continue;
> - FILLSO(inp->inp_socket);
> - }
> - mtx_leave(&udb6table.inpt_mtx);
> + FILLINPTABLE(&udb6table);
> #endif
> - mtx_enter(&rawcbtable.inpt_mtx);
> - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> - if (in_pcb_is_iterator(inp))
> - continue;
> - FILLSO(inp->inp_socket);
> - }
> - mtx_leave(&rawcbtable.inpt_mtx);
> + FILLINPTABLE(&rawcbtable);
> #ifdef INET6
> - mtx_enter(&rawin6pcbtable.inpt_mtx);
> - TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue,
> - inp_queue) {
> - if (in_pcb_is_iterator(inp))
> - continue;
> - FILLSO(inp->inp_socket);
> - }
> - mtx_leave(&rawin6pcbtable.inpt_mtx);
> + FILLINPTABLE(&rawin6pcbtable);
> #endif
> - NET_UNLOCK();
> }
> fp = NULL;
> while ((fp = fd_iterfile(fp, p)) != NULL) {
sysctl_file(): rework FILLSO() macro