From: Vitaliy Makkoveev Subject: Re: run TCP syn cache timer in parallel To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 1 Jan 2025 18:02:54 +0300 > On 1 Jan 2025, at 16:01, Alexander Bluhm wrote: > > Hi, > > With mvs@'s socket ref counting, I think it is possible to run TCP > SYN cache in parallel. > > Idea is to reset inp_socket pointer when socket is about to be > freed. By protecting this with a mutex, we reliable retrieve the > socket from an inpcb reference. By using the socket refcount we > switch from mutex to socket lock. > > The syn cache timer is an easy place to use this. Goal it to use > in_pcbsolock() also for TCP timer and input. Note that switching > sc_tp to sc_inplisten and refcounting it, is a diff I sent to tech@ > earlier. > > ok? > Hi Alexander, This diff lokks good to me. However, I don’t like the sorele() after so_lock rwlock(9) release. I made this with previous sorele() and this worked perfectly because exclusive solock() should be taken before continue teardown, but now I prefer to keep the reference until we leave this socket alone. This doesn’t bring any extra impact. So I like to rename in_pcbsolock() to in_pcb_solock_ref() and in_pcbsounlock() to in_pcb_sounlock_rele() and rework them as something like below. in_pcb_solock_ref() { mtx_enter(&inp->inp_sofree_mtx); so = soref(inp->inp_socket); mtx_leave(&inp->inp_sofree_mtx); if (so) rw_enter_write(&so->so_lock); return so; } in_pcb_sounlock_rele() { rw_exit_write(&so->so_lock); sorele(so, 1); } > bluhm > > Index: netinet/in_pcb.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v > diff -u -p -r1.307 in_pcb.c > --- netinet/in_pcb.c 24 Dec 2024 16:27:07 -0000 1.307 > +++ netinet/in_pcb.c 1 Jan 2025 12:55:13 -0000 > @@ -584,6 +584,9 @@ in_pcbdetach(struct inpcb *inp) > struct inpcbtable *table = inp->inp_table; > > so->so_pcb = NULL; > + mtx_enter(&inp->inp_sofree_mtx); > + inp->inp_socket = NULL; > + mtx_leave(&inp->inp_sofree_mtx); > /* > * As long as the NET_LOCK() is the default lock for Internet > * sockets, do not release it to not introduce new sleeping > @@ -616,6 +619,32 @@ in_pcbdetach(struct inpcb *inp) > mtx_leave(&table->inpt_mtx); > > in_pcbunref(inp); > +} > + > +struct socket * > +in_pcbsolock(struct inpcb *inp) > +{ > + struct socket *so; > + > + NET_ASSERT_LOCKED(); > + > + mtx_enter(&inp->inp_sofree_mtx); > + so = soref(inp->inp_socket); > + mtx_leave(&inp->inp_sofree_mtx); > + if (so == NULL) > + return NULL; > + > + rw_enter_write(&so->so_lock); > + sorele(so, 1); > + > + return so; > +} > + > +void > +in_pcbsounlock(struct inpcb *inp, struct socket *so) > +{ > + KASSERT(inp->inp_socket == so); > + rw_exit_write(&so->so_lock); > } > > struct inpcb * > Index: netinet/in_pcb.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v > diff -u -p -r1.161 in_pcb.h > --- netinet/in_pcb.h 24 Dec 2024 16:27:07 -0000 1.161 > +++ netinet/in_pcb.h 1 Jan 2025 10:53:17 -0000 > @@ -81,6 +81,7 @@ > * t inpt_mtx pcb table mutex > * L pf_inp_mtx link pf to inp mutex > * s so_lock socket rwlock > + * f inp_sofree_mtx socket detach and lock > */ > > /* > @@ -136,7 +137,8 @@ struct inpcb { > #define inp_laddr6 inp_laddru.iau_addr6 > u_int16_t inp_fport; /* [t] foreign port */ > u_int16_t inp_lport; /* [t] local port */ > - struct socket *inp_socket; /* [I] back pointer to socket */ > + struct socket *inp_socket; /* [f] back pointer to socket */ > + struct mutex inp_sofree_mtx; /* protect socket free */ > caddr_t inp_ppcb; /* pointer to per-protocol pcb */ > struct route inp_route; /* [s] cached route */ > struct refcnt inp_refcnt; /* refcount PCB, delay memory free */ > @@ -309,6 +311,9 @@ int in_pcbaddrisavail(const struct inpc > struct proc *); > int in_pcbconnect(struct inpcb *, struct mbuf *); > void in_pcbdetach(struct inpcb *); > +struct socket * > + in_pcbsolock(struct inpcb *); > +void in_pcbsounlock(struct inpcb *, struct socket *); > struct inpcb * > in_pcbref(struct inpcb *); > void in_pcbunref(struct inpcb *); > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.416 tcp_input.c > --- netinet/tcp_input.c 31 Dec 2024 12:19:46 -0000 1.416 > +++ netinet/tcp_input.c 1 Jan 2025 10:49:40 -0000 > @@ -3173,7 +3173,8 @@ syn_cache_rm(struct syn_cache *sc) > KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); > SET(sc->sc_dynflags, SCF_DEAD); > TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); > - sc->sc_tp = NULL; > + in_pcbunref(sc->sc_inplisten); > + sc->sc_inplisten = NULL; > LIST_REMOVE(sc, sc_tpq); > refcnt_rele(&sc->sc_refcnt); > sc->sc_buckethead->sch_length--; > @@ -3369,6 +3370,8 @@ void > syn_cache_timer(void *arg) > { > struct syn_cache *sc = arg; > + struct inpcb *inp; > + struct socket *so; > uint64_t now; > int lastref; > > @@ -3397,14 +3400,22 @@ syn_cache_timer(void *arg) > TCPTV_REXMTMAX); > if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) > refcnt_take(&sc->sc_refcnt); > + inp = in_pcbref(sc->sc_inplisten); > + if (inp == NULL) > + goto freeit; > mtx_leave(&syn_cache_mtx); > > - NET_LOCK(); > - now = tcp_now(); > - (void) syn_cache_respond(sc, NULL, now); > - tcpstat_inc(tcps_sc_retransmitted); > - NET_UNLOCK(); > + NET_LOCK_SHARED(); > + so = in_pcbsolock(inp); > + if (so != NULL) { > + now = tcp_now(); > + (void) syn_cache_respond(sc, NULL, now); > + tcpstat_inc(tcps_sc_retransmitted); > + in_pcbsounlock(inp, so); > + } > + NET_UNLOCK_SHARED(); > > + in_pcbunref(inp); > syn_cache_put(sc); > return; > > @@ -3434,10 +3445,7 @@ syn_cache_cleanup(struct tcpcb *tp) > > mtx_enter(&syn_cache_mtx); > LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { > -#ifdef DIAGNOSTIC > - if (sc->sc_tp != tp) > - panic("invalid sc_tp in syn_cache_cleanup"); > -#endif > + KASSERT(sc->sc_inplisten == tp->t_inpcb); > syn_cache_rm(sc); > syn_cache_put(sc); > } > @@ -3949,7 +3957,7 @@ syn_cache_add(struct sockaddr *src, stru > if (tb.t_flags & TF_SIGNATURE) > SET(sc->sc_fixflags, SCF_SIGNATURE); > #endif > - sc->sc_tp = tp; > + sc->sc_inplisten = in_pcbref(tp->t_inpcb); > if (syn_cache_respond(sc, m, now) == 0) { > mtx_enter(&syn_cache_mtx); > /* > @@ -3962,6 +3970,7 @@ syn_cache_add(struct sockaddr *src, stru > tcpstat_inc(tcps_sndacks); > tcpstat_inc(tcps_sndtotal); > } else { > + in_pcbunref(sc->sc_inplisten); > syn_cache_put(sc); > tcpstat_inc(tcps_sc_dropped); > } > @@ -4158,7 +4167,7 @@ syn_cache_respond(struct syn_cache *sc, > > /* use IPsec policy and ttl from listening socket, on SYN ACK */ > mtx_enter(&syn_cache_mtx); > - inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL; > + inp = in_pcbref(sc->sc_inplisten); > mtx_leave(&syn_cache_mtx); > > /* > @@ -4189,5 +4198,6 @@ syn_cache_respond(struct syn_cache *sc, > break; > #endif > } > + in_pcbunref(inp); > return (error); > } > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.181 tcp_var.h > --- netinet/tcp_var.h 28 Dec 2024 22:17:09 -0000 1.181 > +++ netinet/tcp_var.h 1 Jan 2025 10:26:36 -0000 > @@ -278,7 +278,7 @@ struct syn_cache { > u_int sc_request_r_scale : 4, /* [I] */ > sc_requested_s_scale : 4; /* [I] */ > > - struct tcpcb *sc_tp; /* [S] tcb for listening socket */ > + struct inpcb *sc_inplisten; /* [S] inpcb for listening socket */ > LIST_ENTRY(syn_cache) sc_tpq; /* [S] list of entries by same tp */ > }; > > Index: sys/socketvar.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v > diff -u -p -r1.135 socketvar.h > --- sys/socketvar.h 30 Dec 2024 12:12:35 -0000 1.135 > +++ sys/socketvar.h 1 Jan 2025 10:56:10 -0000 > @@ -209,10 +209,13 @@ struct socket { > void soassertlocked(struct socket *); > void soassertlocked_readonly(struct socket *); > > -static inline void > +static inline struct socket * > soref(struct socket *so) > { > + if (so == NULL) > + return NULL; > refcnt_take(&so->so_refcnt); > + return so; > } > > /* >