Download raw body.
run TCP syn cache timer in parallel
On Wed, Jan 01, 2025 at 06:02:54PM +0300, Vitaliy Makkoveev wrote:
> > On 1 Jan 2025, at 16:01, Alexander Bluhm <bluhm@openbsd.org> wrote:
> >
> > Hi,
> >
> > With mvs@'s socket ref counting, I think it is possible to run TCP
> > SYN cache in parallel.
> >
> > Idea is to reset inp_socket pointer when socket is about to be
> > freed. By protecting this with a mutex, we reliable retrieve the
> > socket from an inpcb reference. By using the socket refcount we
> > switch from mutex to socket lock.
> >
> > The syn cache timer is an easy place to use this. Goal it to use
> > in_pcbsolock() also for TCP timer and input. Note that switching
> > sc_tp to sc_inplisten and refcounting it, is a diff I sent to tech@
> > earlier.
> >
> > ok?
> >
>
> Hi Alexander,
>
> This diff lokks good to me. However, I don???t like the sorele()
> after so_lock rwlock(9) release. I made this with previous
> sorele() and this worked perfectly because exclusive solock()
> should be taken before continue teardown, but now I prefer to keep
> the reference until we leave this socket alone. This doesn???t bring
> any extra impact.
>
> So I like to rename in_pcbsolock() to in_pcb_solock_ref() and
> in_pcbsounlock() to in_pcb_sounlock_rele() and rework them as
> something like below.
>
> in_pcb_solock_ref()
> {
> mtx_enter(&inp->inp_sofree_mtx);
> so = soref(inp->inp_socket);
> mtx_leave(&inp->inp_sofree_mtx);
>
> if (so)
> rw_enter_write(&so->so_lock);
> return so;
> }
>
> in_pcb_sounlock_rele()
> {
> rw_exit_write(&so->so_lock);
> sorele(so, 1);
> }
I have an experimental diff with in_pcbsolock() that runs tcp_input()
in parallel. It seems that keeping locks and refcounts gets more
complicated than just relying on so_lock alone to avoid destruction
in a different thread.
So I prefer to keep in_pcbsolock() as it is. We can always change
it later.
First step would be to get the sc_inplisten refcount part commited.
bluhm
> > Index: netinet/in_pcb.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> > diff -u -p -r1.307 in_pcb.c
> > --- netinet/in_pcb.c 24 Dec 2024 16:27:07 -0000 1.307
> > +++ netinet/in_pcb.c 1 Jan 2025 12:55:13 -0000
> > @@ -584,6 +584,9 @@ in_pcbdetach(struct inpcb *inp)
> > struct inpcbtable *table = inp->inp_table;
> >
> > so->so_pcb = NULL;
> > + mtx_enter(&inp->inp_sofree_mtx);
> > + inp->inp_socket = NULL;
> > + mtx_leave(&inp->inp_sofree_mtx);
> > /*
> > * As long as the NET_LOCK() is the default lock for Internet
> > * sockets, do not release it to not introduce new sleeping
> > @@ -616,6 +619,32 @@ in_pcbdetach(struct inpcb *inp)
> > mtx_leave(&table->inpt_mtx);
> >
> > in_pcbunref(inp);
> > +}
> > +
> > +struct socket *
> > +in_pcbsolock(struct inpcb *inp)
> > +{
> > + struct socket *so;
> > +
> > + NET_ASSERT_LOCKED();
> > +
> > + mtx_enter(&inp->inp_sofree_mtx);
> > + so = soref(inp->inp_socket);
> > + mtx_leave(&inp->inp_sofree_mtx);
> > + if (so == NULL)
> > + return NULL;
> > +
> > + rw_enter_write(&so->so_lock);
> > + sorele(so, 1);
> > +
> > + return so;
> > +}
> > +
> > +void
> > +in_pcbsounlock(struct inpcb *inp, struct socket *so)
> > +{
> > + KASSERT(inp->inp_socket == so);
> > + rw_exit_write(&so->so_lock);
> > }
> >
> > struct inpcb *
> > Index: netinet/in_pcb.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> > diff -u -p -r1.161 in_pcb.h
> > --- netinet/in_pcb.h 24 Dec 2024 16:27:07 -0000 1.161
> > +++ netinet/in_pcb.h 1 Jan 2025 10:53:17 -0000
> > @@ -81,6 +81,7 @@
> > * t inpt_mtx pcb table mutex
> > * L pf_inp_mtx link pf to inp mutex
> > * s so_lock socket rwlock
> > + * f inp_sofree_mtx socket detach and lock
> > */
> >
> > /*
> > @@ -136,7 +137,8 @@ struct inpcb {
> > #define inp_laddr6 inp_laddru.iau_addr6
> > u_int16_t inp_fport; /* [t] foreign port */
> > u_int16_t inp_lport; /* [t] local port */
> > - struct socket *inp_socket; /* [I] back pointer to socket */
> > + struct socket *inp_socket; /* [f] back pointer to socket */
> > + struct mutex inp_sofree_mtx; /* protect socket free */
> > caddr_t inp_ppcb; /* pointer to per-protocol pcb */
> > struct route inp_route; /* [s] cached route */
> > struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
> > @@ -309,6 +311,9 @@ int in_pcbaddrisavail(const struct inpc
> > struct proc *);
> > int in_pcbconnect(struct inpcb *, struct mbuf *);
> > void in_pcbdetach(struct inpcb *);
> > +struct socket *
> > + in_pcbsolock(struct inpcb *);
> > +void in_pcbsounlock(struct inpcb *, struct socket *);
> > struct inpcb *
> > in_pcbref(struct inpcb *);
> > void in_pcbunref(struct inpcb *);
> > Index: netinet/tcp_input.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> > diff -u -p -r1.416 tcp_input.c
> > --- netinet/tcp_input.c 31 Dec 2024 12:19:46 -0000 1.416
> > +++ netinet/tcp_input.c 1 Jan 2025 10:49:40 -0000
> > @@ -3173,7 +3173,8 @@ syn_cache_rm(struct syn_cache *sc)
> > KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
> > SET(sc->sc_dynflags, SCF_DEAD);
> > TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
> > - sc->sc_tp = NULL;
> > + in_pcbunref(sc->sc_inplisten);
> > + sc->sc_inplisten = NULL;
> > LIST_REMOVE(sc, sc_tpq);
> > refcnt_rele(&sc->sc_refcnt);
> > sc->sc_buckethead->sch_length--;
> > @@ -3369,6 +3370,8 @@ void
> > syn_cache_timer(void *arg)
> > {
> > struct syn_cache *sc = arg;
> > + struct inpcb *inp;
> > + struct socket *so;
> > uint64_t now;
> > int lastref;
> >
> > @@ -3397,14 +3400,22 @@ syn_cache_timer(void *arg)
> > TCPTV_REXMTMAX);
> > if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
> > refcnt_take(&sc->sc_refcnt);
> > + inp = in_pcbref(sc->sc_inplisten);
> > + if (inp == NULL)
> > + goto freeit;
> > mtx_leave(&syn_cache_mtx);
> >
> > - NET_LOCK();
> > - now = tcp_now();
> > - (void) syn_cache_respond(sc, NULL, now);
> > - tcpstat_inc(tcps_sc_retransmitted);
> > - NET_UNLOCK();
> > + NET_LOCK_SHARED();
> > + so = in_pcbsolock(inp);
> > + if (so != NULL) {
> > + now = tcp_now();
> > + (void) syn_cache_respond(sc, NULL, now);
> > + tcpstat_inc(tcps_sc_retransmitted);
> > + in_pcbsounlock(inp, so);
> > + }
> > + NET_UNLOCK_SHARED();
> >
> > + in_pcbunref(inp);
> > syn_cache_put(sc);
> > return;
> >
> > @@ -3434,10 +3445,7 @@ syn_cache_cleanup(struct tcpcb *tp)
> >
> > mtx_enter(&syn_cache_mtx);
> > LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
> > -#ifdef DIAGNOSTIC
> > - if (sc->sc_tp != tp)
> > - panic("invalid sc_tp in syn_cache_cleanup");
> > -#endif
> > + KASSERT(sc->sc_inplisten == tp->t_inpcb);
> > syn_cache_rm(sc);
> > syn_cache_put(sc);
> > }
> > @@ -3949,7 +3957,7 @@ syn_cache_add(struct sockaddr *src, stru
> > if (tb.t_flags & TF_SIGNATURE)
> > SET(sc->sc_fixflags, SCF_SIGNATURE);
> > #endif
> > - sc->sc_tp = tp;
> > + sc->sc_inplisten = in_pcbref(tp->t_inpcb);
> > if (syn_cache_respond(sc, m, now) == 0) {
> > mtx_enter(&syn_cache_mtx);
> > /*
> > @@ -3962,6 +3970,7 @@ syn_cache_add(struct sockaddr *src, stru
> > tcpstat_inc(tcps_sndacks);
> > tcpstat_inc(tcps_sndtotal);
> > } else {
> > + in_pcbunref(sc->sc_inplisten);
> > syn_cache_put(sc);
> > tcpstat_inc(tcps_sc_dropped);
> > }
> > @@ -4158,7 +4167,7 @@ syn_cache_respond(struct syn_cache *sc,
> >
> > /* use IPsec policy and ttl from listening socket, on SYN ACK */
> > mtx_enter(&syn_cache_mtx);
> > - inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
> > + inp = in_pcbref(sc->sc_inplisten);
> > mtx_leave(&syn_cache_mtx);
> >
> > /*
> > @@ -4189,5 +4198,6 @@ syn_cache_respond(struct syn_cache *sc,
> > break;
> > #endif
> > }
> > + in_pcbunref(inp);
> > return (error);
> > }
> > Index: netinet/tcp_var.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> > diff -u -p -r1.181 tcp_var.h
> > --- netinet/tcp_var.h 28 Dec 2024 22:17:09 -0000 1.181
> > +++ netinet/tcp_var.h 1 Jan 2025 10:26:36 -0000
> > @@ -278,7 +278,7 @@ struct syn_cache {
> > u_int sc_request_r_scale : 4, /* [I] */
> > sc_requested_s_scale : 4; /* [I] */
> >
> > - struct tcpcb *sc_tp; /* [S] tcb for listening socket */
> > + struct inpcb *sc_inplisten; /* [S] inpcb for listening socket */
> > LIST_ENTRY(syn_cache) sc_tpq; /* [S] list of entries by same tp */
> > };
> >
> > Index: sys/socketvar.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> > diff -u -p -r1.135 socketvar.h
> > --- sys/socketvar.h 30 Dec 2024 12:12:35 -0000 1.135
> > +++ sys/socketvar.h 1 Jan 2025 10:56:10 -0000
> > @@ -209,10 +209,13 @@ struct socket {
> > void soassertlocked(struct socket *);
> > void soassertlocked_readonly(struct socket *);
> >
> > -static inline void
> > +static inline struct socket *
> > soref(struct socket *so)
> > {
> > + if (so == NULL)
> > + return NULL;
> > refcnt_take(&so->so_refcnt);
> > + return so;
> > }
> >
> > /*
> >
>
run TCP syn cache timer in parallel