Download raw body.
raw IP input loop iterator
On Tue, Nov 05, 2024 at 01:58:07PM +0100, Alexander Bluhm wrote:
> Hi,
>
> Inspired by mvs@ idea with the iterator in the UDP multicast loop,
> I implemented the same for raw IP input delivery. This removes an
> unneccesary rwlock and only uses the table mutex.
>
> While there, I found some issues with my UDP implementation. When
> comparing the inp address and port, we must hold some lock. So
> assume that iterator already has the table mutex and hold it while
> traversing the list and doing the checks. We are free to release
> the mutex when appending to the the socket buffer and doing the
> upcalls.
Do we really need this? They could be rebound only once from 0 to
selected source address.
>
> I think the goto bad in udp_input does one in_pcbunref(inp) too
> much. If we exited the loop with in_pcb_iterator_abort() inp is
> not NULL, but already unrefed.
>
I could miss something, but everything is fine with references.
We follow "last == NULL" branch if no `inp' was found during the
loop. In this case `inp' and `last' are NULL, so in_pcbunref(inp)
is just NULL op.
If we exited the loop with in_pcb_iterator_abort() `inp' is not
NULL, but `last' is not NULL too, because we assign it to `inp'
just before the check. It holds the reference taken by implicit
"last = in_pcbref(inp)", by the reference taken by iterator was
released by in_pcb_iterator_abort(). The reference taken for
`last' will be released just after udp_sbappend().
> In rip_input() move the actual work to rip_sbappend(). This can
> be called without mutex during list traversal and for the final
> element.
>
> ok?
>
> bluhm
>
> Index: kern/kern_sysctl.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
> diff -u -p -r1.452 kern_sysctl.c
> --- kern/kern_sysctl.c 5 Nov 2024 10:49:23 -0000 1.452
> +++ kern/kern_sysctl.c 5 Nov 2024 12:18:15 -0000
> @@ -1705,8 +1705,11 @@ sysctl_file(int *name, u_int namelen, ch
> mtx_leave(&udb6table.inpt_mtx);
> #endif
> mtx_enter(&rawcbtable.inpt_mtx);
> - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue)
> + TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> + if (in_pcb_is_iterator(inp))
> + continue;
> FILLSO(inp->inp_socket);
> + }
> mtx_leave(&rawcbtable.inpt_mtx);
> #ifdef INET6
> mtx_enter(&rawin6pcbtable.inpt_mtx);
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> diff -u -p -r1.304 in_pcb.c
> --- netinet/in_pcb.c 5 Nov 2024 10:49:23 -0000 1.304
> +++ netinet/in_pcb.c 5 Nov 2024 12:18:15 -0000
> @@ -650,7 +650,7 @@ in_pcb_iterator(struct inpcbtable *table
> {
> struct inpcb *tmp;
>
> - mtx_enter(&table->inpt_mtx);
> + MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
>
> if (inp)
> tmp = TAILQ_NEXT((struct inpcb *)iter, inp_queue);
> @@ -663,6 +663,7 @@ in_pcb_iterator(struct inpcbtable *table
> if (inp) {
> TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter,
> inp_queue);
> + in_pcbunref(inp);
> }
> if (tmp) {
> TAILQ_INSERT_AFTER(&table->inpt_queue, tmp,
> @@ -670,10 +671,6 @@ in_pcb_iterator(struct inpcbtable *table
> in_pcbref(tmp);
> }
>
> - mtx_leave(&table->inpt_mtx);
> -
> - in_pcbunref(inp);
> -
> return tmp;
> }
>
> @@ -681,16 +678,13 @@ void
> in_pcb_iterator_abort(struct inpcbtable *table, struct inpcb *inp,
> struct inpcb_iterator *iter)
> {
> - mtx_enter(&table->inpt_mtx);
> + MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
>
> if (inp) {
> TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter,
> inp_queue);
> + in_pcbunref(inp);
> }
> -
> - mtx_leave(&table->inpt_mtx);
> -
> - in_pcbunref(inp);
> }
>
> void
> Index: netinet/raw_ip.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> diff -u -p -r1.160 raw_ip.c
> --- netinet/raw_ip.c 12 Jul 2024 19:50:35 -0000 1.160
> +++ netinet/raw_ip.c 5 Nov 2024 12:18:15 -0000
> @@ -116,6 +116,9 @@ const struct pr_usrreqs rip_usrreqs = {
> .pru_peeraddr = in_peeraddr,
> };
>
> +void rip_sbappend(struct inpcb *, struct mbuf *, struct ip *,
> + struct sockaddr_in *);
> +
> /*
> * Initialize raw connection block q.
> */
> @@ -130,8 +133,8 @@ rip_input(struct mbuf **mp, int *offp, i
> {
> struct mbuf *m = *mp;
> struct ip *ip = mtod(m, struct ip *);
> - struct inpcb *inp;
> - SIMPLEQ_HEAD(, inpcb) inpcblist;
> + struct inpcb_iterator iter = { .inp_table = NULL };
> + struct inpcb *inp, *last;
> struct in_addr *key;
> struct counters_ref ref;
> uint64_t *counters;
> @@ -163,10 +166,9 @@ rip_input(struct mbuf **mp, int *offp, i
> }
> }
> #endif
> - SIMPLEQ_INIT(&inpcblist);
> - rw_enter_write(&rawcbtable.inpt_notify);
> mtx_enter(&rawcbtable.inpt_mtx);
> - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> + last = inp = NULL;
> + while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
> KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
>
> /*
> @@ -190,14 +192,23 @@ rip_input(struct mbuf **mp, int *offp, i
> inp->inp_faddr.s_addr != ip->ip_src.s_addr)
> continue;
>
> - in_pcbref(inp);
> - SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
> + if (last != NULL) {
> + struct mbuf *n;
> +
> + mtx_leave(&rawcbtable.inpt_mtx);
> +
> + n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> + if (n != NULL)
> + rip_sbappend(last, n, ip, &ripsrc);
> + in_pcbunref(last);
> +
> + mtx_enter(&rawcbtable.inpt_mtx);
> + }
> + last = in_pcbref(inp);
> }
> mtx_leave(&rawcbtable.inpt_mtx);
>
> - if (SIMPLEQ_EMPTY(&inpcblist)) {
> - rw_exit_write(&rawcbtable.inpt_notify);
> -
> + if (last == NULL) {
> if (ip->ip_p != IPPROTO_ICMP)
> icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
> 0, 0);
> @@ -212,42 +223,34 @@ rip_input(struct mbuf **mp, int *offp, i
> return IPPROTO_DONE;
> }
>
> - while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> - struct mbuf *n, *opts = NULL;
> -
> - SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> - if (SIMPLEQ_EMPTY(&inpcblist))
> - n = m;
> - else
> - n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> - if (n != NULL) {
> - struct socket *so = inp->inp_socket;
> - int ret = 0;
> -
> - if (inp->inp_flags & INP_CONTROLOPTS ||
> - so->so_options & SO_TIMESTAMP)
> - ip_savecontrol(inp, &opts, ip, n);
> -
> - mtx_enter(&so->so_rcv.sb_mtx);
> - if (!ISSET(inp->inp_socket->so_rcv.sb_state,
> - SS_CANTRCVMORE)) {
> - ret = sbappendaddr(so, &so->so_rcv,
> - sintosa(&ripsrc), n, opts);
> - }
> - mtx_leave(&so->so_rcv.sb_mtx);
> -
> - if (ret == 0) {
> - m_freem(n);
> - m_freem(opts);
> - ipstat_inc(ips_noproto);
> - } else
> - sorwakeup(so);
> - }
> - in_pcbunref(inp);
> - }
> - rw_exit_write(&rawcbtable.inpt_notify);
> + rip_sbappend(last, m, ip, &ripsrc);
> + in_pcbunref(last);
>
> return IPPROTO_DONE;
> +}
> +
> +void
> +rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip,
> + struct sockaddr_in *ripsrc)
> +{
> + struct socket *so = inp->inp_socket;
> + struct mbuf *opts = NULL;
> + int ret = 0;
> +
> + if (inp->inp_flags & INP_CONTROLOPTS || so->so_options & SO_TIMESTAMP)
> + ip_savecontrol(inp, &opts, ip, m);
> +
> + mtx_enter(&so->so_rcv.sb_mtx);
> + if (!ISSET(inp->inp_socket->so_rcv.sb_state, SS_CANTRCVMORE))
> + ret = sbappendaddr(so, &so->so_rcv, sintosa(ripsrc), m, opts);
> + mtx_leave(&so->so_rcv.sb_mtx);
> +
> + if (ret == 0) {
> + m_freem(m);
> + m_freem(opts);
> + ipstat_inc(ips_noproto);
> + } else
> + sorwakeup(so);
> }
>
> /*
> Index: netinet/udp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
> diff -u -p -r1.326 udp_usrreq.c
> --- netinet/udp_usrreq.c 5 Nov 2024 10:49:23 -0000 1.326
> +++ netinet/udp_usrreq.c 5 Nov 2024 12:18:15 -0000
> @@ -409,6 +409,7 @@ udp_input(struct mbuf **mp, int *offp, i
> #endif
> table = &udbtable;
>
> + mtx_enter(&table->inpt_mtx);
> last = inp = NULL;
> while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) {
> if (ip6)
> @@ -464,12 +465,16 @@ udp_input(struct mbuf **mp, int *offp, i
> if (last != NULL) {
> struct mbuf *n;
>
> + mtx_leave(&table->inpt_mtx);
> +
> n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> if (n != NULL) {
> udp_sbappend(last, n, ip, ip6, iphlen,
> uh, &srcsa.sa, 0);
> }
> in_pcbunref(last);
> +
> + mtx_enter(&table->inpt_mtx);
> }
> last = in_pcbref(inp);
>
> @@ -487,6 +492,7 @@ udp_input(struct mbuf **mp, int *offp, i
> break;
> }
> }
> + mtx_leave(&table->inpt_mtx);
>
> if (last == NULL) {
> /*
> @@ -495,7 +501,8 @@ udp_input(struct mbuf **mp, int *offp, i
> * for a broadcast or multicast datgram.)
> */
> udpstat_inc(udps_noportbcast);
> - goto bad;
> + m_freem(m);
> + return IPPROTO_DONE;
> }
>
> udp_sbappend(last, m, ip, ip6, iphlen, uh, &srcsa.sa, 0);
>
raw IP input loop iterator