Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: Kill `inp_notify' list remains
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Fri, 20 Dec 2024 21:16:09 +0100

Download raw body.

Thread
On Fri, Dec 20, 2024 at 04:08:14AM +0300, Vitaliy Makkoveev wrote:
> This was the list, where PCBs were temporary linked to avoid sleep with 
> `inpt_mtx' mutex(9) held. in_pcbnotifyall() and in6_pcbnotify are the
> last list users, so I propose to switch them to in_pcb_iterator() too,
> moreover they already do in_pcb_is_iterator() check.
> 
> Note, in_pcb_iterator() does in_pcbref() and in_pcbunref() for
> corresponding `inp', unlocked dereference is safe and no extra
> reference release required.
> 
> Outside divert(4) sockets search loop, sysctl_file() is the only
> `inp_queue' foreach loops runner. It takes exclusive netlock and holds
> it while processing all inet PCB tables. I want to switch these loops to
> the iterators and push shared solock into `inp_queue' loops.
> 
> >From this:
> 
> 	NET_LOCK();
> 	mtx_enter(&tcbtable.inpt_mtx);
> 	TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
> 	FILLSO(inp->inp_socket);
> 	mtx_leave(&tcbtable.inpt_mtx);
> 
> to this:
> 
> 	mtx_enter(&tcbtable.inpt_mtx);
> 	while ((inp = in_pcb_iterator(&tcbtable, inp, &iter)) != NULL) {
> 		mtx_leave(&tcbtable.inpt_mtx);
> 		NET_LOCK_SHARED();
> 		if (inp->inp_socket)
> 			FILLSO(inp->inp_socket);
> 		NET_UNLOCK_SHARED();
> 		mtx_enter(&tcbtable.inpt_mtx);
> 	}
> 	mtx_leave(&tcbtable.inpt_mtx);
> 
> In the future, net lock could be replaced with solock() and pushed down
> to FILLSO(), so copyout() could be moved outside net lock too.

I would expect that we need per socket lock for the code above.
Something has to gaurantee that socket does not change while sysctl
is reading its values.  Shared net lock is not enough.

Diff below is OK bluhm@

> Index: sys/netinet/in_pcb.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/in_pcb.c,v
> diff -u -p -r1.305 in_pcb.c
> --- sys/netinet/in_pcb.c	5 Nov 2024 22:44:20 -0000	1.305
> +++ sys/netinet/in_pcb.c	20 Dec 2024 01:01:19 -0000
> @@ -179,7 +179,6 @@ void
>  in_pcbinit(struct inpcbtable *table, int hashsize)
>  {
>  	mtx_init(&table->inpt_mtx, IPL_SOFTNET);
> -	rw_init(&table->inpt_notify, "inpnotify");
>  	TAILQ_INIT(&table->inpt_queue);
>  	table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
>  	    &table->inpt_mask);
> @@ -763,8 +762,8 @@ void
>  in_pcbnotifyall(struct inpcbtable *table, const struct sockaddr_in *dst,
>      u_int rtable, int errno, void (*notify)(struct inpcb *, int))
>  {
> -	SIMPLEQ_HEAD(, inpcb) inpcblist;
> -	struct inpcb *inp;
> +	struct inpcb_iterator iter = { .inp_table = NULL };
> +	struct inpcb *inp = NULL;
>  	u_int rdomain;
>  
>  	if (dst->sin_addr.s_addr == INADDR_ANY)
> @@ -772,39 +771,20 @@ in_pcbnotifyall(struct inpcbtable *table
>  	if (notify == NULL)
>  		return;
>  
> -	/*
> -	 * Use a temporary notify list protected by rwlock to run over
> -	 * selected PCB.  This is necessary as the list of all PCB is
> -	 * protected by a mutex.  Notify may call ip_output() eventually
> -	 * which may sleep as pf lock is a rwlock.  Also the SRP
> -	 * implementation of the routing table might sleep.
> -	 * The same inp_notify list entry and inpt_notify rwlock are
> -	 * used for UDP multicast and raw IP delivery.
> -	 */
> -	SIMPLEQ_INIT(&inpcblist);
>  	rdomain = rtable_l2(rtable);
> -	rw_enter_write(&table->inpt_notify);
>  	mtx_enter(&table->inpt_mtx);
> -	TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
> -		if (in_pcb_is_iterator(inp))
> -			continue;
> +	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
>  		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
>  
>  		if (inp->inp_faddr.s_addr != dst->sin_addr.s_addr ||
>  		    rtable_l2(inp->inp_rtableid) != rdomain) {
>  			continue;
>  		}
> -		in_pcbref(inp);
> -		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
> -	}
> -	mtx_leave(&table->inpt_mtx);
> -
> -	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> -		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> +		mtx_leave(&table->inpt_mtx);
>  		(*notify)(inp, errno);
> -		in_pcbunref(inp);
> +		mtx_enter(&table->inpt_mtx);
>  	}
> -	rw_exit_write(&table->inpt_notify);
> +	mtx_leave(&table->inpt_mtx);
>  }
>  
>  /*
> Index: sys/netinet/in_pcb.h
> ===================================================================
> RCS file: /cvs/src/sys/netinet/in_pcb.h,v
> diff -u -p -r1.159 in_pcb.h
> --- sys/netinet/in_pcb.h	5 Nov 2024 10:49:23 -0000	1.159
> +++ sys/netinet/in_pcb.h	20 Dec 2024 01:01:19 -0000
> @@ -79,7 +79,6 @@
>   *	I	immutable after creation
>   *	N	net lock
>   *	t	inpt_mtx		pcb table mutex
> - *	y	inpt_notify		pcb table rwlock for notify
>   *	L	pf_inp_mtx		link pf to inp mutex
>   *	s	so_lock			socket rwlock
>   */
> @@ -128,7 +127,6 @@ struct inpcb {
>  	LIST_ENTRY(inpcb) inp_hash;		/* [t] local and foreign hash */
>  	LIST_ENTRY(inpcb) inp_lhash;		/* [t] local port hash */
>  	TAILQ_ENTRY(inpcb) inp_queue;		/* [t] inet PCB queue */
> -	SIMPLEQ_ENTRY(inpcb) inp_notify;	/* [y] notify or udp append */
>  	struct	  inpcbtable *inp_table;	/* [I] inet queue/hash table */
>  	union	  inpaddru inp_faddru;		/* [t] Foreign address. */
>  	union	  inpaddru inp_laddru;		/* [t] Local address. */
> @@ -194,7 +192,6 @@ in_pcb_is_iterator(struct inpcb *inp)
>  
>  struct inpcbtable {
>  	struct mutex inpt_mtx;			/* protect queue and hash */
> -	struct rwlock inpt_notify;		/* protect inp_notify list */
>  	TAILQ_HEAD(inpthead, inpcb) inpt_queue;	/* [t] inet PCB queue */
>  	struct	inpcbhead *inpt_hashtbl;	/* [t] local and foreign hash */
>  	struct	inpcbhead *inpt_lhashtbl;	/* [t] local port hash */
> Index: sys/netinet6/in6_pcb.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v
> diff -u -p -r1.145 in6_pcb.c
> --- sys/netinet6/in6_pcb.c	5 Nov 2024 10:49:23 -0000	1.145
> +++ sys/netinet6/in6_pcb.c	20 Dec 2024 01:01:19 -0000
> @@ -427,8 +427,8 @@ in6_pcbnotify(struct inpcbtable *table, 
>      uint fport_arg, const struct sockaddr_in6 *src, uint lport_arg,
>      u_int rtable, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int))
>  {
> -	SIMPLEQ_HEAD(, inpcb) inpcblist;
> -	struct inpcb *inp;
> +	struct inpcb_iterator iter = { .inp_table = NULL };
> +	struct inpcb *inp = NULL;
>  	u_short fport = fport_arg, lport = lport_arg;
>  	struct sockaddr_in6 sa6_src;
>  	int errno;
> @@ -474,13 +474,9 @@ in6_pcbnotify(struct inpcbtable *table, 
>  	if (notify == NULL)
>  		return;
>  
> -	SIMPLEQ_INIT(&inpcblist);
>  	rdomain = rtable_l2(rtable);
> -	rw_enter_write(&table->inpt_notify);
>  	mtx_enter(&table->inpt_mtx);
> -	TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
> -		if (in_pcb_is_iterator(inp))
> -			continue;
> +	while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
>  		KASSERT(ISSET(inp->inp_flags, INP_IPV6));
>  
>  		/*
> @@ -546,17 +542,11 @@ in6_pcbnotify(struct inpcbtable *table, 
>  			continue;
>  		}
>  	  do_notify:
> -		in_pcbref(inp);
> -		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
> -	}
> -	mtx_leave(&table->inpt_mtx);
> -
> -	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> -		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> +		mtx_leave(&table->inpt_mtx);
>  		(*notify)(inp, errno);
> -		in_pcbunref(inp);
> +		mtx_enter(&table->inpt_mtx);
>  	}
> -	rw_exit_write(&table->inpt_notify);
> +	mtx_leave(&table->inpt_mtx);
>  }
>  
>  struct rtentry *