Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: sysctl_file(): rework FILLSO() macro
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Mon, 13 Jan 2025 14:24:48 +0100

Download raw body.

Thread
On Sun, Jan 12, 2025 at 03:58:04AM +0300, Vitaliy Makkoveev wrote:
> I don't like how KERN_FILE_BYFILE case of the sysctl_file() delivers
> sockets data to the userland. It not only takes exclusive netlock
> around all except divert socket tables walkthrough, but also does
> copyout() with mutex(9) held. Sounds strange, but the context switch
> is avoided because userland pages are wired.
> 
> The table is protected with `inpt_mtx' mutex(9), so the socket lock or
> netlock should be take outside. Since we have no socket pointer, we
> can't use solock(). We can't use the only shared netlock because this
> left socket unprotected against concurrent threads which rely on
> solock_shared().
> 
> Now it is possible to fix this precious gem. We can use reference
> counting for all socket types, and bluhm@ introduced `inp_sofree_mtx'
> mutex(9) to protect `inp_socket'. The INP tables have the special
> iterator to safely release `inpt_mtx' during table walkthrough. 
> 
> The FILLSO() or FILLIT2() macros can't be used unrolled, because we
> need to push mutexes re-locking and solock() deep within. I made
> the FILLINPTABLE() macro which is the unrolling, but with the socket
> related locking dances. FILLIT2() macro is not required anymore and
> was merged with FILLIT().
> 
> Current implementation takes the reference on `inp_socket' and
> releases  `inpt_mtx' mutex(9). Now it's possible to fairly use
> shared_solock() on socket instead of netlock while performing
> fill_file(). The copyout() is external for fill_file() and touches
> nothing required to be protected so it could be made lockless.
> 
> The KERN_FILE_BYFILE case became mp-safe, but the rest sysctl_file()
> cases still not, so the sysctl_vslock() dances left as is.

OK bluhm@

> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.460
> diff -u -p -r1.460 kern_sysctl.c
> --- sys/kern/kern_sysctl.c	4 Jan 2025 09:26:01 -0000	1.460
> +++ sys/kern/kern_sysctl.c	12 Jan 2025 00:16:41 -0000
> @@ -1672,9 +1672,10 @@ sysctl_file(int *name, u_int namelen, ch
>  
>  	kf = malloc(sizeof(*kf), M_TEMP, M_WAITOK);
>  
> -#define FILLIT2(fp, fdp, i, vp, pr, so) do {				\
> +#define FILLIT(fp, fdp, i, vp, pr) do {					\
>  	if (buflen >= elem_size && elem_count > 0) {			\
> -		fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\
> +		fill_file(kf, fp, fdp, i, vp, pr, p, NULL,		\
> +		    show_pointers);					\
>  		error = copyout(kf, dp, outsize);			\
>  		if (error)						\
>  			break;						\
> @@ -1684,62 +1685,59 @@ sysctl_file(int *name, u_int namelen, ch
>  	}								\
>  	needed += elem_size;						\
>  } while (0)
> -#define FILLIT(fp, fdp, i, vp, pr) \
> -	FILLIT2(fp, fdp, i, vp, pr, NULL)
> -#define FILLSO(so) \
> -	FILLIT2(NULL, NULL, 0, NULL, NULL, so)
> +
> +#define FILLINPTABLE(table)						\
> +do {									\
> +	struct inpcb_iterator iter = { .inp_table = NULL };		\
> +	struct inpcb *inp = NULL;					\
> +	struct socket *so;						\
> +									\
> +	mtx_enter(&(table)->inpt_mtx);					\
> +	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {	\
> +		if (buflen >= elem_size && elem_count > 0) {		\
> +			mtx_enter(&inp->inp_sofree_mtx);		\
> +			so = soref(inp->inp_socket);			\
> +			mtx_leave(&inp->inp_sofree_mtx);		\
> +			if (so == NULL)					\
> +				continue;				\
> +			mtx_leave(&(table)->inpt_mtx);			\
> +			solock_shared(so);				\
> +			fill_file(kf, NULL, NULL, 0, NULL, NULL, p,	\
> +			    so, show_pointers);				\
> +			sounlock_shared(so);				\
> +			sorele(so);					\
> +			error = copyout(kf, dp, outsize);		\
> +			mtx_enter(&(table)->inpt_mtx);			\
> +			if (error) {					\
> +				in_pcb_iterator_abort((table), inp,	\
> +				    &iter);				\
> +				break;					\
> +			}						\
> +			dp += elem_size;				\
> +			buflen -= elem_size;				\
> +			elem_count--;					\
> +		}							\
> +		needed += elem_size;					\
> +	}								\
> +	mtx_leave(&(table)->inpt_mtx);					\
> +} while (0)
>  
>  	switch (op) {
>  	case KERN_FILE_BYFILE:
>  		/* use the inp-tables to pick up closed connections, too */
>  		if (arg == DTYPE_SOCKET) {
> -			struct inpcb *inp;
> -
> -			NET_LOCK();
> -			mtx_enter(&tcbtable.inpt_mtx);
> -			TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
> -				FILLSO(inp->inp_socket);
> -			mtx_leave(&tcbtable.inpt_mtx);
> +			FILLINPTABLE(&tcbtable);
>  #ifdef INET6
> -			mtx_enter(&tcb6table.inpt_mtx);
> -			TAILQ_FOREACH(inp, &tcb6table.inpt_queue, inp_queue)
> -				FILLSO(inp->inp_socket);
> -			mtx_leave(&tcb6table.inpt_mtx);
> +			FILLINPTABLE(&tcb6table);
>  #endif
> -			mtx_enter(&udbtable.inpt_mtx);
> -			TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
> -				if (in_pcb_is_iterator(inp))
> -					continue;
> -				FILLSO(inp->inp_socket);
> -			}
> -			mtx_leave(&udbtable.inpt_mtx);
> +			FILLINPTABLE(&udbtable);
>  #ifdef INET6
> -			mtx_enter(&udb6table.inpt_mtx);
> -			TAILQ_FOREACH(inp, &udb6table.inpt_queue, inp_queue) {
> -				if (in_pcb_is_iterator(inp))
> -					continue;
> -				FILLSO(inp->inp_socket);
> -			}
> -			mtx_leave(&udb6table.inpt_mtx);
> +			FILLINPTABLE(&udb6table);
>  #endif
> -			mtx_enter(&rawcbtable.inpt_mtx);
> -			TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> -				if (in_pcb_is_iterator(inp))
> -					continue;
> -				FILLSO(inp->inp_socket);
> -			}
> -			mtx_leave(&rawcbtable.inpt_mtx);
> +			FILLINPTABLE(&rawcbtable);
>  #ifdef INET6
> -			mtx_enter(&rawin6pcbtable.inpt_mtx);
> -			TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue,
> -			    inp_queue) {
> -				if (in_pcb_is_iterator(inp))
> -					continue;
> -				FILLSO(inp->inp_socket);
> -			}
> -			mtx_leave(&rawin6pcbtable.inpt_mtx);
> +			FILLINPTABLE(&rawin6pcbtable);
>  #endif
> -			NET_UNLOCK();
>  		}
>  		fp = NULL;
>  		while ((fp = fd_iterfile(fp, p)) != NULL) {