From: Vitaliy Makkoveev Subject: Re: raw IP input loop iterator To: Alexander Bluhm Cc: tech@openbsd.org Date: Tue, 5 Nov 2024 17:03:46 +0300 On Tue, Nov 05, 2024 at 01:58:07PM +0100, Alexander Bluhm wrote: > Hi, > > Inspired by mvs@ idea with the iterator in the UDP multicast loop, > I implemented the same for raw IP input delivery. This removes an > unneccesary rwlock and only uses the table mutex. > > While there, I found some issues with my UDP implementation. When > comparing the inp address and port, we must hold some lock. So > assume that iterator already has the table mutex and hold it while > traversing the list and doing the checks. We are free to release > the mutex when appending to the the socket buffer and doing the > upcalls. Do we really need this? They could be rebound only once from 0 to selected source address. > > I think the goto bad in udp_input does one in_pcbunref(inp) too > much. If we exited the loop with in_pcb_iterator_abort() inp is > not NULL, but already unrefed. > I could miss something, but everything is fine with references. We follow "last == NULL" branch if no `inp' was found during the loop. In this case `inp' and `last' are NULL, so in_pcbunref(inp) is just NULL op. If we exited the loop with in_pcb_iterator_abort() `inp' is not NULL, but `last' is not NULL too, because we assign it to `inp' just before the check. It holds the reference taken by implicit "last = in_pcbref(inp)", by the reference taken by iterator was released by in_pcb_iterator_abort(). The reference taken for `last' will be released just after udp_sbappend(). > In rip_input() move the actual work to rip_sbappend(). This can > be called without mutex during list traversal and for the final > element. > > ok? > > bluhm > > Index: kern/kern_sysctl.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v > diff -u -p -r1.452 kern_sysctl.c > --- kern/kern_sysctl.c 5 Nov 2024 10:49:23 -0000 1.452 > +++ kern/kern_sysctl.c 5 Nov 2024 12:18:15 -0000 > @@ -1705,8 +1705,11 @@ sysctl_file(int *name, u_int namelen, ch > mtx_leave(&udb6table.inpt_mtx); > #endif > mtx_enter(&rawcbtable.inpt_mtx); > - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) > + TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { > + if (in_pcb_is_iterator(inp)) > + continue; > FILLSO(inp->inp_socket); > + } > mtx_leave(&rawcbtable.inpt_mtx); > #ifdef INET6 > mtx_enter(&rawin6pcbtable.inpt_mtx); > Index: netinet/in_pcb.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v > diff -u -p -r1.304 in_pcb.c > --- netinet/in_pcb.c 5 Nov 2024 10:49:23 -0000 1.304 > +++ netinet/in_pcb.c 5 Nov 2024 12:18:15 -0000 > @@ -650,7 +650,7 @@ in_pcb_iterator(struct inpcbtable *table > { > struct inpcb *tmp; > > - mtx_enter(&table->inpt_mtx); > + MUTEX_ASSERT_LOCKED(&table->inpt_mtx); > > if (inp) > tmp = TAILQ_NEXT((struct inpcb *)iter, inp_queue); > @@ -663,6 +663,7 @@ in_pcb_iterator(struct inpcbtable *table > if (inp) { > TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter, > inp_queue); > + in_pcbunref(inp); > } > if (tmp) { > TAILQ_INSERT_AFTER(&table->inpt_queue, tmp, > @@ -670,10 +671,6 @@ in_pcb_iterator(struct inpcbtable *table > in_pcbref(tmp); > } > > - mtx_leave(&table->inpt_mtx); > - > - in_pcbunref(inp); > - > return tmp; > } > > @@ -681,16 +678,13 @@ void > in_pcb_iterator_abort(struct inpcbtable *table, struct inpcb *inp, > struct inpcb_iterator *iter) > { > - mtx_enter(&table->inpt_mtx); > + MUTEX_ASSERT_LOCKED(&table->inpt_mtx); > > if (inp) { > TAILQ_REMOVE(&table->inpt_queue, (struct inpcb *)iter, > inp_queue); > + in_pcbunref(inp); > } > - > - mtx_leave(&table->inpt_mtx); > - > - in_pcbunref(inp); > } > > void > Index: netinet/raw_ip.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v > diff -u -p -r1.160 raw_ip.c > --- netinet/raw_ip.c 12 Jul 2024 19:50:35 -0000 1.160 > +++ netinet/raw_ip.c 5 Nov 2024 12:18:15 -0000 > @@ -116,6 +116,9 @@ const struct pr_usrreqs rip_usrreqs = { > .pru_peeraddr = in_peeraddr, > }; > > +void rip_sbappend(struct inpcb *, struct mbuf *, struct ip *, > + struct sockaddr_in *); > + > /* > * Initialize raw connection block q. > */ > @@ -130,8 +133,8 @@ rip_input(struct mbuf **mp, int *offp, i > { > struct mbuf *m = *mp; > struct ip *ip = mtod(m, struct ip *); > - struct inpcb *inp; > - SIMPLEQ_HEAD(, inpcb) inpcblist; > + struct inpcb_iterator iter = { .inp_table = NULL }; > + struct inpcb *inp, *last; > struct in_addr *key; > struct counters_ref ref; > uint64_t *counters; > @@ -163,10 +166,9 @@ rip_input(struct mbuf **mp, int *offp, i > } > } > #endif > - SIMPLEQ_INIT(&inpcblist); > - rw_enter_write(&rawcbtable.inpt_notify); > mtx_enter(&rawcbtable.inpt_mtx); > - TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { > + last = inp = NULL; > + while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) { > KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > > /* > @@ -190,14 +192,23 @@ rip_input(struct mbuf **mp, int *offp, i > inp->inp_faddr.s_addr != ip->ip_src.s_addr) > continue; > > - in_pcbref(inp); > - SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); > + if (last != NULL) { > + struct mbuf *n; > + > + mtx_leave(&rawcbtable.inpt_mtx); > + > + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); > + if (n != NULL) > + rip_sbappend(last, n, ip, &ripsrc); > + in_pcbunref(last); > + > + mtx_enter(&rawcbtable.inpt_mtx); > + } > + last = in_pcbref(inp); > } > mtx_leave(&rawcbtable.inpt_mtx); > > - if (SIMPLEQ_EMPTY(&inpcblist)) { > - rw_exit_write(&rawcbtable.inpt_notify); > - > + if (last == NULL) { > if (ip->ip_p != IPPROTO_ICMP) > icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, > 0, 0); > @@ -212,42 +223,34 @@ rip_input(struct mbuf **mp, int *offp, i > return IPPROTO_DONE; > } > > - while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { > - struct mbuf *n, *opts = NULL; > - > - SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); > - if (SIMPLEQ_EMPTY(&inpcblist)) > - n = m; > - else > - n = m_copym(m, 0, M_COPYALL, M_NOWAIT); > - if (n != NULL) { > - struct socket *so = inp->inp_socket; > - int ret = 0; > - > - if (inp->inp_flags & INP_CONTROLOPTS || > - so->so_options & SO_TIMESTAMP) > - ip_savecontrol(inp, &opts, ip, n); > - > - mtx_enter(&so->so_rcv.sb_mtx); > - if (!ISSET(inp->inp_socket->so_rcv.sb_state, > - SS_CANTRCVMORE)) { > - ret = sbappendaddr(so, &so->so_rcv, > - sintosa(&ripsrc), n, opts); > - } > - mtx_leave(&so->so_rcv.sb_mtx); > - > - if (ret == 0) { > - m_freem(n); > - m_freem(opts); > - ipstat_inc(ips_noproto); > - } else > - sorwakeup(so); > - } > - in_pcbunref(inp); > - } > - rw_exit_write(&rawcbtable.inpt_notify); > + rip_sbappend(last, m, ip, &ripsrc); > + in_pcbunref(last); > > return IPPROTO_DONE; > +} > + > +void > +rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip, > + struct sockaddr_in *ripsrc) > +{ > + struct socket *so = inp->inp_socket; > + struct mbuf *opts = NULL; > + int ret = 0; > + > + if (inp->inp_flags & INP_CONTROLOPTS || so->so_options & SO_TIMESTAMP) > + ip_savecontrol(inp, &opts, ip, m); > + > + mtx_enter(&so->so_rcv.sb_mtx); > + if (!ISSET(inp->inp_socket->so_rcv.sb_state, SS_CANTRCVMORE)) > + ret = sbappendaddr(so, &so->so_rcv, sintosa(ripsrc), m, opts); > + mtx_leave(&so->so_rcv.sb_mtx); > + > + if (ret == 0) { > + m_freem(m); > + m_freem(opts); > + ipstat_inc(ips_noproto); > + } else > + sorwakeup(so); > } > > /* > Index: netinet/udp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v > diff -u -p -r1.326 udp_usrreq.c > --- netinet/udp_usrreq.c 5 Nov 2024 10:49:23 -0000 1.326 > +++ netinet/udp_usrreq.c 5 Nov 2024 12:18:15 -0000 > @@ -409,6 +409,7 @@ udp_input(struct mbuf **mp, int *offp, i > #endif > table = &udbtable; > > + mtx_enter(&table->inpt_mtx); > last = inp = NULL; > while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { > if (ip6) > @@ -464,12 +465,16 @@ udp_input(struct mbuf **mp, int *offp, i > if (last != NULL) { > struct mbuf *n; > > + mtx_leave(&table->inpt_mtx); > + > n = m_copym(m, 0, M_COPYALL, M_NOWAIT); > if (n != NULL) { > udp_sbappend(last, n, ip, ip6, iphlen, > uh, &srcsa.sa, 0); > } > in_pcbunref(last); > + > + mtx_enter(&table->inpt_mtx); > } > last = in_pcbref(inp); > > @@ -487,6 +492,7 @@ udp_input(struct mbuf **mp, int *offp, i > break; > } > } > + mtx_leave(&table->inpt_mtx); > > if (last == NULL) { > /* > @@ -495,7 +501,8 @@ udp_input(struct mbuf **mp, int *offp, i > * for a broadcast or multicast datgram.) > */ > udpstat_inc(udps_noportbcast); > - goto bad; > + m_freem(m); > + return IPPROTO_DONE; > } > > udp_sbappend(last, m, ip, ip6, iphlen, uh, &srcsa.sa, 0); >