From: Alexander Bluhm Subject: Re: Kill `inp_notify' list remains To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Fri, 20 Dec 2024 21:16:09 +0100 On Fri, Dec 20, 2024 at 04:08:14AM +0300, Vitaliy Makkoveev wrote: > This was the list, where PCBs were temporary linked to avoid sleep with > `inpt_mtx' mutex(9) held. in_pcbnotifyall() and in6_pcbnotify are the > last list users, so I propose to switch them to in_pcb_iterator() too, > moreover they already do in_pcb_is_iterator() check. > > Note, in_pcb_iterator() does in_pcbref() and in_pcbunref() for > corresponding `inp', unlocked dereference is safe and no extra > reference release required. > > Outside divert(4) sockets search loop, sysctl_file() is the only > `inp_queue' foreach loops runner. It takes exclusive netlock and holds > it while processing all inet PCB tables. I want to switch these loops to > the iterators and push shared solock into `inp_queue' loops. > > >From this: > > NET_LOCK(); > mtx_enter(&tcbtable.inpt_mtx); > TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) > FILLSO(inp->inp_socket); > mtx_leave(&tcbtable.inpt_mtx); > > to this: > > mtx_enter(&tcbtable.inpt_mtx); > while ((inp = in_pcb_iterator(&tcbtable, inp, &iter)) != NULL) { > mtx_leave(&tcbtable.inpt_mtx); > NET_LOCK_SHARED(); > if (inp->inp_socket) > FILLSO(inp->inp_socket); > NET_UNLOCK_SHARED(); > mtx_enter(&tcbtable.inpt_mtx); > } > mtx_leave(&tcbtable.inpt_mtx); > > In the future, net lock could be replaced with solock() and pushed down > to FILLSO(), so copyout() could be moved outside net lock too. I would expect that we need per socket lock for the code above. Something has to gaurantee that socket does not change while sysctl is reading its values. Shared net lock is not enough. Diff below is OK bluhm@ > Index: sys/netinet/in_pcb.c > =================================================================== > RCS file: /cvs/src/sys/netinet/in_pcb.c,v > diff -u -p -r1.305 in_pcb.c > --- sys/netinet/in_pcb.c 5 Nov 2024 22:44:20 -0000 1.305 > +++ sys/netinet/in_pcb.c 20 Dec 2024 01:01:19 -0000 > @@ -179,7 +179,6 @@ void > in_pcbinit(struct inpcbtable *table, int hashsize) > { > mtx_init(&table->inpt_mtx, IPL_SOFTNET); > - rw_init(&table->inpt_notify, "inpnotify"); > TAILQ_INIT(&table->inpt_queue); > table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK, > &table->inpt_mask); > @@ -763,8 +762,8 @@ void > in_pcbnotifyall(struct inpcbtable *table, const struct sockaddr_in *dst, > u_int rtable, int errno, void (*notify)(struct inpcb *, int)) > { > - SIMPLEQ_HEAD(, inpcb) inpcblist; > - struct inpcb *inp; > + struct inpcb_iterator iter = { .inp_table = NULL }; > + struct inpcb *inp = NULL; > u_int rdomain; > > if (dst->sin_addr.s_addr == INADDR_ANY) > @@ -772,39 +771,20 @@ in_pcbnotifyall(struct inpcbtable *table > if (notify == NULL) > return; > > - /* > - * Use a temporary notify list protected by rwlock to run over > - * selected PCB. This is necessary as the list of all PCB is > - * protected by a mutex. Notify may call ip_output() eventually > - * which may sleep as pf lock is a rwlock. Also the SRP > - * implementation of the routing table might sleep. > - * The same inp_notify list entry and inpt_notify rwlock are > - * used for UDP multicast and raw IP delivery. > - */ > - SIMPLEQ_INIT(&inpcblist); > rdomain = rtable_l2(rtable); > - rw_enter_write(&table->inpt_notify); > mtx_enter(&table->inpt_mtx); > - TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { > - if (in_pcb_is_iterator(inp)) > - continue; > + while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { > KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > > if (inp->inp_faddr.s_addr != dst->sin_addr.s_addr || > rtable_l2(inp->inp_rtableid) != rdomain) { > continue; > } > - in_pcbref(inp); > - SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); > - } > - mtx_leave(&table->inpt_mtx); > - > - while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { > - SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); > + mtx_leave(&table->inpt_mtx); > (*notify)(inp, errno); > - in_pcbunref(inp); > + mtx_enter(&table->inpt_mtx); > } > - rw_exit_write(&table->inpt_notify); > + mtx_leave(&table->inpt_mtx); > } > > /* > Index: sys/netinet/in_pcb.h > =================================================================== > RCS file: /cvs/src/sys/netinet/in_pcb.h,v > diff -u -p -r1.159 in_pcb.h > --- sys/netinet/in_pcb.h 5 Nov 2024 10:49:23 -0000 1.159 > +++ sys/netinet/in_pcb.h 20 Dec 2024 01:01:19 -0000 > @@ -79,7 +79,6 @@ > * I immutable after creation > * N net lock > * t inpt_mtx pcb table mutex > - * y inpt_notify pcb table rwlock for notify > * L pf_inp_mtx link pf to inp mutex > * s so_lock socket rwlock > */ > @@ -128,7 +127,6 @@ struct inpcb { > LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */ > LIST_ENTRY(inpcb) inp_lhash; /* [t] local port hash */ > TAILQ_ENTRY(inpcb) inp_queue; /* [t] inet PCB queue */ > - SIMPLEQ_ENTRY(inpcb) inp_notify; /* [y] notify or udp append */ > struct inpcbtable *inp_table; /* [I] inet queue/hash table */ > union inpaddru inp_faddru; /* [t] Foreign address. */ > union inpaddru inp_laddru; /* [t] Local address. */ > @@ -194,7 +192,6 @@ in_pcb_is_iterator(struct inpcb *inp) > > struct inpcbtable { > struct mutex inpt_mtx; /* protect queue and hash */ > - struct rwlock inpt_notify; /* protect inp_notify list */ > TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */ > struct inpcbhead *inpt_hashtbl; /* [t] local and foreign hash */ > struct inpcbhead *inpt_lhashtbl; /* [t] local port hash */ > Index: sys/netinet6/in6_pcb.c > =================================================================== > RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v > diff -u -p -r1.145 in6_pcb.c > --- sys/netinet6/in6_pcb.c 5 Nov 2024 10:49:23 -0000 1.145 > +++ sys/netinet6/in6_pcb.c 20 Dec 2024 01:01:19 -0000 > @@ -427,8 +427,8 @@ in6_pcbnotify(struct inpcbtable *table, > uint fport_arg, const struct sockaddr_in6 *src, uint lport_arg, > u_int rtable, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int)) > { > - SIMPLEQ_HEAD(, inpcb) inpcblist; > - struct inpcb *inp; > + struct inpcb_iterator iter = { .inp_table = NULL }; > + struct inpcb *inp = NULL; > u_short fport = fport_arg, lport = lport_arg; > struct sockaddr_in6 sa6_src; > int errno; > @@ -474,13 +474,9 @@ in6_pcbnotify(struct inpcbtable *table, > if (notify == NULL) > return; > > - SIMPLEQ_INIT(&inpcblist); > rdomain = rtable_l2(rtable); > - rw_enter_write(&table->inpt_notify); > mtx_enter(&table->inpt_mtx); > - TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { > - if (in_pcb_is_iterator(inp)) > - continue; > + while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { > KASSERT(ISSET(inp->inp_flags, INP_IPV6)); > > /* > @@ -546,17 +542,11 @@ in6_pcbnotify(struct inpcbtable *table, > continue; > } > do_notify: > - in_pcbref(inp); > - SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); > - } > - mtx_leave(&table->inpt_mtx); > - > - while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { > - SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); > + mtx_leave(&table->inpt_mtx); > (*notify)(inp, errno); > - in_pcbunref(inp); > + mtx_enter(&table->inpt_mtx); > } > - rw_exit_write(&table->inpt_notify); > + mtx_leave(&table->inpt_mtx); > } > > struct rtentry *