Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: raw IP input loop iterator
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Tue, 5 Nov 2024 18:55:06 +0300

Download raw body.

Thread
On Tue, Nov 05, 2024 at 03:52:36PM +0100, Alexander Bluhm wrote:
> On Tue, Nov 05, 2024 at 05:03:46PM +0300, Vitaliy Makkoveev wrote:
> > On Tue, Nov 05, 2024 at 01:58:07PM +0100, Alexander Bluhm wrote:
> > > Hi,
> > > 
> > > Inspired by mvs@ idea with the iterator in the UDP multicast loop,
> > > I implemented the same for raw IP input delivery.  This removes an
> > > unneccesary rwlock and only uses the table mutex.
> > > 
> > > While there, I found some issues with my UDP implementation.  When
> > > comparing the inp address and port, we must hold some lock.  So
> > > assume that iterator already has the table mutex and hold it while
> > > traversing the list and doing the checks.  We are free to release
> > > the mutex when appending to the the socket buffer and doing the
> > > upcalls.
> > 
> > Do we really need this? They could be rebound only once from 0 to
> > selected source address.
> 
> I would prefer a clear lock for reading values that another thread
> can modify.
> 
> In rip_disconnect() we set inp->inp_faddr.s_addr = INADDR_ANY.  As
> I understand it, soconnect() can call sodisconnect(), so userland
> can connect and disconnect some socket types arbitrarily.  That is
> the reason why I did put all this under table mutex when I unlocked
> connect(2).
> 
> inp_rtableid can be changed by setsockopt(2).
> 
> Doing a locked comparison gives me a cleaner feeling.  Also I think
> we do less unlock/lock dancing.  The common case it to hit one of
> the continue instructions.  Keeping the mutex in this case helps
> to get through the list quickly.
> 
> > > I think the goto bad in udp_input does one in_pcbunref(inp) too
> > > much.  If we exited the loop with in_pcb_iterator_abort() inp is
> > > not NULL, but already unrefed.
> > > 
> > 
> > I could miss something, but everything is fine with references.
> 
> You are right, inp is always NULL, there is no bug.
> 
> I would like to keep the explicit m_freem() and return IPPROTO_DONE
> anyway.  This is what my raw IP code does in the last == NULL case.
> No need to call in_pcbunref(NULL).
> 
> bluhm

Ok from me.

> 
> > We follow "last == NULL" branch if no `inp' was found during the
> > loop. In this case `inp' and `last' are NULL, so in_pcbunref(inp)
> > is just NULL op.
> > 
> > If we exited the loop with in_pcb_iterator_abort() `inp' is not
> > NULL, but `last' is not NULL too, because we assign it to `inp'
> > just before the check. It holds the reference taken by implicit
> > "last = in_pcbref(inp)", by the reference taken by iterator was
> > released by in_pcb_iterator_abort(). The reference taken for
> > `last' will be released just after udp_sbappend().
> > 
> > > In rip_input() move the actual work to rip_sbappend().  This can
> > > be called without mutex during list traversal and for the final
> > > element.
> > > 
> > > ok?
> > > 
> > > bluhm
> > > 
> > > Index: kern/kern_sysctl.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
> > > diff -u -p -r1.452 kern_sysctl.c
> > > --- kern/kern_sysctl.c	5 Nov 2024 10:49:23 -0000	1.452
> > > +++ kern/kern_sysctl.c	5 Nov 2024 12:18:15 -0000
> > > @@ -1705,8 +1705,11 @@ sysctl_file(int *name, u_int namelen, ch
> > >  			mtx_leave(&udb6table.inpt_mtx);
> > >  #endif
> > >  			mtx_enter(&rawcbtable.inpt_mtx);
> > > -			TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue)
> > > +			TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> > > +				if (in_pcb_is_iterator(inp))
> > > +					continue;
> > >  				FILLSO(inp->inp_socket);
> > > +			}
> > >  			mtx_leave(&rawcbtable.inpt_mtx);
> > >  #ifdef INET6
> > >  			mtx_enter(&rawin6pcbtable.inpt_mtx);
> > > Index: netinet/in_pcb.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> > > diff -u -p -r1.304 in_pcb.c
> > > --- netinet/in_pcb.c	5 Nov 2024 10:49:23 -0000	1.304
> > > +++ netinet/in_pcb.c	5 Nov 2024 12:18:15 -0000
> > > @@ -650,7 +650,7 @@ in_pcb_iterator(struct inpcbtable *table
> > >  {
> > >  	struct inpcb *tmp;
> > >  
> > > -	mtx_enter(&table->inpt_mtx);
> > > +	MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
> > >  
> > >  	if (inp)
> > >  		tmp = TAILQ_NEXT((struct inpcb *)iter, inp_queue);
> > > @@ -663,6 +663,7 @@ in_pcb_iterator(struct inpcbtable *table
> > >  	if (inp) {
> > >  		TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter,
> > >  		    inp_queue);
> > > +		in_pcbunref(inp);
> > >  	}
> > >  	if (tmp) {
> > >  		TAILQ_INSERT_AFTER(&table->inpt_queue, tmp,
> > > @@ -670,10 +671,6 @@ in_pcb_iterator(struct inpcbtable *table
> > >  		in_pcbref(tmp);
> > >  	}
> > >  
> > > -	mtx_leave(&table->inpt_mtx);
> > > -
> > > -	in_pcbunref(inp);
> > > -
> > >  	return tmp;
> > >  }
> > >  
> > > @@ -681,16 +678,13 @@ void
> > >  in_pcb_iterator_abort(struct inpcbtable *table, struct inpcb *inp,
> > >      struct inpcb_iterator *iter)
> > >  {
> > > -	mtx_enter(&table->inpt_mtx);
> > > +	MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
> > >  
> > >  	if (inp) {
> > >  		TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter,
> > >  		    inp_queue);
> > > +		in_pcbunref(inp);
> > >  	}
> > > -
> > > -	mtx_leave(&table->inpt_mtx);
> > > -
> > > -	in_pcbunref(inp);
> > >  }
> > >  
> > >  void
> > > Index: netinet/raw_ip.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> > > diff -u -p -r1.160 raw_ip.c
> > > --- netinet/raw_ip.c	12 Jul 2024 19:50:35 -0000	1.160
> > > +++ netinet/raw_ip.c	5 Nov 2024 12:18:15 -0000
> > > @@ -116,6 +116,9 @@ const struct pr_usrreqs rip_usrreqs = {
> > >  	.pru_peeraddr	= in_peeraddr,
> > >  };
> > >  
> > > +void    rip_sbappend(struct inpcb *, struct mbuf *, struct ip *,
> > > +	    struct sockaddr_in *);
> > > +
> > >  /*
> > >   * Initialize raw connection block q.
> > >   */
> > > @@ -130,8 +133,8 @@ rip_input(struct mbuf **mp, int *offp, i
> > >  {
> > >  	struct mbuf *m = *mp;
> > >  	struct ip *ip = mtod(m, struct ip *);
> > > -	struct inpcb *inp;
> > > -	SIMPLEQ_HEAD(, inpcb) inpcblist;
> > > +	struct inpcb_iterator iter = { .inp_table = NULL };
> > > +	struct inpcb *inp, *last;
> > >  	struct in_addr *key;
> > >  	struct counters_ref ref;
> > >  	uint64_t *counters;
> > > @@ -163,10 +166,9 @@ rip_input(struct mbuf **mp, int *offp, i
> > >  		}
> > >  	}
> > >  #endif
> > > -	SIMPLEQ_INIT(&inpcblist);
> > > -	rw_enter_write(&rawcbtable.inpt_notify);
> > >  	mtx_enter(&rawcbtable.inpt_mtx);
> > > -	TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> > > +	last = inp = NULL;
> > > +	while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
> > >  		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
> > >  
> > >  		/*
> > > @@ -190,14 +192,23 @@ rip_input(struct mbuf **mp, int *offp, i
> > >  		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
> > >  			continue;
> > >  
> > > -		in_pcbref(inp);
> > > -		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
> > > +		if (last != NULL) {
> > > +			struct mbuf *n;
> > > +
> > > +			mtx_leave(&rawcbtable.inpt_mtx);
> > > +
> > > +			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > > +			if (n != NULL)
> > > +				rip_sbappend(last, n, ip, &ripsrc);
> > > +			in_pcbunref(last);
> > > +
> > > +			mtx_enter(&rawcbtable.inpt_mtx);
> > > +		}
> > > +		last = in_pcbref(inp);
> > >  	}
> > >  	mtx_leave(&rawcbtable.inpt_mtx);
> > >  
> > > -	if (SIMPLEQ_EMPTY(&inpcblist)) {
> > > -		rw_exit_write(&rawcbtable.inpt_notify);
> > > -
> > > +	if (last == NULL) {
> > >  		if (ip->ip_p != IPPROTO_ICMP)
> > >  			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
> > >  			    0, 0);
> > > @@ -212,42 +223,34 @@ rip_input(struct mbuf **mp, int *offp, i
> > >  		return IPPROTO_DONE;
> > >  	}
> > >  
> > > -	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > -		struct mbuf *n, *opts = NULL;
> > > -
> > > -		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > -		if (SIMPLEQ_EMPTY(&inpcblist))
> > > -			n = m;
> > > -		else
> > > -			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > > -		if (n != NULL) {
> > > -			struct socket *so = inp->inp_socket;
> > > -			int ret = 0;
> > > -
> > > -			if (inp->inp_flags & INP_CONTROLOPTS ||
> > > -			    so->so_options & SO_TIMESTAMP)
> > > -				ip_savecontrol(inp, &opts, ip, n);
> > > -
> > > -			mtx_enter(&so->so_rcv.sb_mtx);
> > > -			if (!ISSET(inp->inp_socket->so_rcv.sb_state,
> > > -			    SS_CANTRCVMORE)) {
> > > -				ret = sbappendaddr(so, &so->so_rcv,
> > > -				    sintosa(&ripsrc), n, opts);
> > > -			}
> > > -			mtx_leave(&so->so_rcv.sb_mtx);
> > > -
> > > -			if (ret == 0) {
> > > -				m_freem(n);
> > > -				m_freem(opts);
> > > -				ipstat_inc(ips_noproto);
> > > -			} else
> > > -				sorwakeup(so);
> > > -		}
> > > -		in_pcbunref(inp);
> > > -	}
> > > -	rw_exit_write(&rawcbtable.inpt_notify);
> > > +	rip_sbappend(last, m, ip, &ripsrc);
> > > +	in_pcbunref(last);
> > >  
> > >  	return IPPROTO_DONE;
> > > +}
> > > +
> > > +void
> > > +rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip,
> > > +    struct sockaddr_in *ripsrc)
> > > +{
> > > +	struct socket *so = inp->inp_socket;
> > > +	struct mbuf *opts = NULL;
> > > +	int ret = 0;
> > > +
> > > +	if (inp->inp_flags & INP_CONTROLOPTS || so->so_options & SO_TIMESTAMP)
> > > +		ip_savecontrol(inp, &opts, ip, m);
> > > +
> > > +	mtx_enter(&so->so_rcv.sb_mtx);
> > > +	if (!ISSET(inp->inp_socket->so_rcv.sb_state, SS_CANTRCVMORE))
> > > +		ret = sbappendaddr(so, &so->so_rcv, sintosa(ripsrc), m, opts);
> > > +	mtx_leave(&so->so_rcv.sb_mtx);
> > > +
> > > +	if (ret == 0) {
> > > +		m_freem(m);
> > > +		m_freem(opts);
> > > +		ipstat_inc(ips_noproto);
> > > +	} else
> > > +		sorwakeup(so);
> > >  }
> > >  
> > >  /*
> > > Index: netinet/udp_usrreq.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
> > > diff -u -p -r1.326 udp_usrreq.c
> > > --- netinet/udp_usrreq.c	5 Nov 2024 10:49:23 -0000	1.326
> > > +++ netinet/udp_usrreq.c	5 Nov 2024 12:18:15 -0000
> > > @@ -409,6 +409,7 @@ udp_input(struct mbuf **mp, int *offp, i
> > >  #endif
> > >  			table = &udbtable;
> > >  
> > > +		mtx_enter(&table->inpt_mtx);
> > >  		last = inp = NULL;
> > >  		while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
> > >  			if (ip6)
> > > @@ -464,12 +465,16 @@ udp_input(struct mbuf **mp, int *offp, i
> > >  			if (last != NULL) {
> > >  				struct mbuf *n;
> > >  
> > > +				mtx_leave(&table->inpt_mtx);
> > > +
> > >  				n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > >  				if (n != NULL) {
> > >  					udp_sbappend(last, n, ip, ip6, iphlen,
> > >  					    uh, &srcsa.sa, 0);
> > >  				}
> > >  				in_pcbunref(last);
> > > +
> > > +				mtx_enter(&table->inpt_mtx);
> > >  			}
> > >  			last = in_pcbref(inp);
> > >  
> > > @@ -487,6 +492,7 @@ udp_input(struct mbuf **mp, int *offp, i
> > >  				break;
> > >  			}
> > >  		}
> > > +		mtx_leave(&table->inpt_mtx);
> > >  
> > >  		if (last == NULL) {
> > >  			/*
> > > @@ -495,7 +501,8 @@ udp_input(struct mbuf **mp, int *offp, i
> > >  			 * for a broadcast or multicast datgram.)
> > >  			 */
> > >  			udpstat_inc(udps_noportbcast);
> > > -			goto bad;
> > > +			m_freem(m);
> > > +			return IPPROTO_DONE;
> > >  		}
> > >  
> > >  		udp_sbappend(last, m, ip, ip6, iphlen, uh, &srcsa.sa, 0);
> > > 
>