From: Vitaliy Makkoveev Subject: Re: run TCP syn cache timer in parallel To: Alexander Bluhm Cc: OpenBSD tech Date: Thu, 2 Jan 2025 08:29:27 +0300 > On 1 Jan 2025, at 21:39, Alexander Bluhm wrote: > > On Wed, Jan 01, 2025 at 06:02:54PM +0300, Vitaliy Makkoveev wrote: >>> On 1 Jan 2025, at 16:01, Alexander Bluhm wrote: >>> >>> Hi, >>> >>> With mvs@'s socket ref counting, I think it is possible to run TCP >>> SYN cache in parallel. >>> >>> Idea is to reset inp_socket pointer when socket is about to be >>> freed. By protecting this with a mutex, we reliable retrieve the >>> socket from an inpcb reference. By using the socket refcount we >>> switch from mutex to socket lock. >>> >>> The syn cache timer is an easy place to use this. Goal it to use >>> in_pcbsolock() also for TCP timer and input. Note that switching >>> sc_tp to sc_inplisten and refcounting it, is a diff I sent to tech@ >>> earlier. >>> >>> ok? >>> >> >> Hi Alexander, >> >> This diff lokks good to me. However, I don???t like the sorele() >> after so_lock rwlock(9) release. I made this with previous >> sorele() and this worked perfectly because exclusive solock() >> should be taken before continue teardown, but now I prefer to keep >> the reference until we leave this socket alone. This doesn???t bring >> any extra impact. >> >> So I like to rename in_pcbsolock() to in_pcb_solock_ref() and >> in_pcbsounlock() to in_pcb_sounlock_rele() and rework them as >> something like below. >> >> in_pcb_solock_ref() >> { >> mtx_enter(&inp->inp_sofree_mtx); >> so = soref(inp->inp_socket); >> mtx_leave(&inp->inp_sofree_mtx); >> >> if (so) >> rw_enter_write(&so->so_lock); >> return so; >> } >> >> in_pcb_sounlock_rele() >> { >> rw_exit_write(&so->so_lock); >> sorele(so, 1); >> } > > I have an experimental diff with in_pcbsolock() that runs tcp_input() > in parallel. It seems that keeping locks and refcounts gets more > complicated than just relying on so_lock alone to avoid destruction > in a different thread. > > So I prefer to keep in_pcbsolock() as it is. We can always change > it later. > Go ahead, we always could correct usage with the new vision. > First step would be to get the sc_inplisten refcount part commited. > > bluhm > I’m working on sockets splice unlocking, and I have the cases, then socket lock was taken and released before reference release. sorele() does nothing that requires solock() to be taken, so I think it’s normal to allow sorele() be called unlocked. We could rename keep_lock arg to something like no_unlock for consistency. As you can see there is nothing complicated: sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR); if (isspliced(so)) { struct socket *sosp = so->so_sp->ssp_socket; int freeing = SOSP_FREEING_READ; if (so == so->so_sp->ssp_socket) freeing |= SOSP_FREEING_WRITE; soref(sosp); sounsplice(so, so->so_sp->ssp_socket, freeing); sorele(sosp, 1); } sbunlock(&so->so_rcv); sounsplice(struct socket *so, struct socket *sosp, int freeing) { sbassertlocked(&so->so_rcv); /* Do unsplicing */ /* Do not wakeup a socket that is about to be freed. */ if ((freeing & SOSP_FREEING_READ) == 0) { solock_shared(so); if (soreadable(so)) sorwakeup(so); sounlock_shared(so); } if ((freeing & SOSP_FREEING_WRITE) == 0) { solock_shared(sosp); if (sowriteable(sosp)) sowwakeup(sosp); sounlock_shared(sosp); } } >>> Index: netinet/in_pcb.c >>> =================================================================== >>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v >>> diff -u -p -r1.307 in_pcb.c >>> --- netinet/in_pcb.c 24 Dec 2024 16:27:07 -0000 1.307 >>> +++ netinet/in_pcb.c 1 Jan 2025 12:55:13 -0000 >>> @@ -584,6 +584,9 @@ in_pcbdetach(struct inpcb *inp) >>> struct inpcbtable *table = inp->inp_table; >>> >>> so->so_pcb = NULL; >>> + mtx_enter(&inp->inp_sofree_mtx); >>> + inp->inp_socket = NULL; >>> + mtx_leave(&inp->inp_sofree_mtx); >>> /* >>> * As long as the NET_LOCK() is the default lock for Internet >>> * sockets, do not release it to not introduce new sleeping >>> @@ -616,6 +619,32 @@ in_pcbdetach(struct inpcb *inp) >>> mtx_leave(&table->inpt_mtx); >>> >>> in_pcbunref(inp); >>> +} >>> + >>> +struct socket * >>> +in_pcbsolock(struct inpcb *inp) >>> +{ >>> + struct socket *so; >>> + >>> + NET_ASSERT_LOCKED(); >>> + >>> + mtx_enter(&inp->inp_sofree_mtx); >>> + so = soref(inp->inp_socket); >>> + mtx_leave(&inp->inp_sofree_mtx); >>> + if (so == NULL) >>> + return NULL; >>> + >>> + rw_enter_write(&so->so_lock); >>> + sorele(so, 1); >>> + >>> + return so; >>> +} >>> + >>> +void >>> +in_pcbsounlock(struct inpcb *inp, struct socket *so) >>> +{ >>> + KASSERT(inp->inp_socket == so); >>> + rw_exit_write(&so->so_lock); >>> } >>> >>> struct inpcb * >>> Index: netinet/in_pcb.h >>> =================================================================== >>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v >>> diff -u -p -r1.161 in_pcb.h >>> --- netinet/in_pcb.h 24 Dec 2024 16:27:07 -0000 1.161 >>> +++ netinet/in_pcb.h 1 Jan 2025 10:53:17 -0000 >>> @@ -81,6 +81,7 @@ >>> * t inpt_mtx pcb table mutex >>> * L pf_inp_mtx link pf to inp mutex >>> * s so_lock socket rwlock >>> + * f inp_sofree_mtx socket detach and lock >>> */ >>> >>> /* >>> @@ -136,7 +137,8 @@ struct inpcb { >>> #define inp_laddr6 inp_laddru.iau_addr6 >>> u_int16_t inp_fport; /* [t] foreign port */ >>> u_int16_t inp_lport; /* [t] local port */ >>> - struct socket *inp_socket; /* [I] back pointer to socket */ >>> + struct socket *inp_socket; /* [f] back pointer to socket */ >>> + struct mutex inp_sofree_mtx; /* protect socket free */ >>> caddr_t inp_ppcb; /* pointer to per-protocol pcb */ >>> struct route inp_route; /* [s] cached route */ >>> struct refcnt inp_refcnt; /* refcount PCB, delay memory free */ >>> @@ -309,6 +311,9 @@ int in_pcbaddrisavail(const struct inpc >>> struct proc *); >>> int in_pcbconnect(struct inpcb *, struct mbuf *); >>> void in_pcbdetach(struct inpcb *); >>> +struct socket * >>> + in_pcbsolock(struct inpcb *); >>> +void in_pcbsounlock(struct inpcb *, struct socket *); >>> struct inpcb * >>> in_pcbref(struct inpcb *); >>> void in_pcbunref(struct inpcb *); >>> Index: netinet/tcp_input.c >>> =================================================================== >>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v >>> diff -u -p -r1.416 tcp_input.c >>> --- netinet/tcp_input.c 31 Dec 2024 12:19:46 -0000 1.416 >>> +++ netinet/tcp_input.c 1 Jan 2025 10:49:40 -0000 >>> @@ -3173,7 +3173,8 @@ syn_cache_rm(struct syn_cache *sc) >>> KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); >>> SET(sc->sc_dynflags, SCF_DEAD); >>> TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); >>> - sc->sc_tp = NULL; >>> + in_pcbunref(sc->sc_inplisten); >>> + sc->sc_inplisten = NULL; >>> LIST_REMOVE(sc, sc_tpq); >>> refcnt_rele(&sc->sc_refcnt); >>> sc->sc_buckethead->sch_length--; >>> @@ -3369,6 +3370,8 @@ void >>> syn_cache_timer(void *arg) >>> { >>> struct syn_cache *sc = arg; >>> + struct inpcb *inp; >>> + struct socket *so; >>> uint64_t now; >>> int lastref; >>> >>> @@ -3397,14 +3400,22 @@ syn_cache_timer(void *arg) >>> TCPTV_REXMTMAX); >>> if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) >>> refcnt_take(&sc->sc_refcnt); >>> + inp = in_pcbref(sc->sc_inplisten); >>> + if (inp == NULL) >>> + goto freeit; >>> mtx_leave(&syn_cache_mtx); >>> >>> - NET_LOCK(); >>> - now = tcp_now(); >>> - (void) syn_cache_respond(sc, NULL, now); >>> - tcpstat_inc(tcps_sc_retransmitted); >>> - NET_UNLOCK(); >>> + NET_LOCK_SHARED(); >>> + so = in_pcbsolock(inp); >>> + if (so != NULL) { >>> + now = tcp_now(); >>> + (void) syn_cache_respond(sc, NULL, now); >>> + tcpstat_inc(tcps_sc_retransmitted); >>> + in_pcbsounlock(inp, so); >>> + } >>> + NET_UNLOCK_SHARED(); >>> >>> + in_pcbunref(inp); >>> syn_cache_put(sc); >>> return; >>> >>> @@ -3434,10 +3445,7 @@ syn_cache_cleanup(struct tcpcb *tp) >>> >>> mtx_enter(&syn_cache_mtx); >>> LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { >>> -#ifdef DIAGNOSTIC >>> - if (sc->sc_tp != tp) >>> - panic("invalid sc_tp in syn_cache_cleanup"); >>> -#endif >>> + KASSERT(sc->sc_inplisten == tp->t_inpcb); >>> syn_cache_rm(sc); >>> syn_cache_put(sc); >>> } >>> @@ -3949,7 +3957,7 @@ syn_cache_add(struct sockaddr *src, stru >>> if (tb.t_flags & TF_SIGNATURE) >>> SET(sc->sc_fixflags, SCF_SIGNATURE); >>> #endif >>> - sc->sc_tp = tp; >>> + sc->sc_inplisten = in_pcbref(tp->t_inpcb); >>> if (syn_cache_respond(sc, m, now) == 0) { >>> mtx_enter(&syn_cache_mtx); >>> /* >>> @@ -3962,6 +3970,7 @@ syn_cache_add(struct sockaddr *src, stru >>> tcpstat_inc(tcps_sndacks); >>> tcpstat_inc(tcps_sndtotal); >>> } else { >>> + in_pcbunref(sc->sc_inplisten); >>> syn_cache_put(sc); >>> tcpstat_inc(tcps_sc_dropped); >>> } >>> @@ -4158,7 +4167,7 @@ syn_cache_respond(struct syn_cache *sc, >>> >>> /* use IPsec policy and ttl from listening socket, on SYN ACK */ >>> mtx_enter(&syn_cache_mtx); >>> - inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL; >>> + inp = in_pcbref(sc->sc_inplisten); >>> mtx_leave(&syn_cache_mtx); >>> >>> /* >>> @@ -4189,5 +4198,6 @@ syn_cache_respond(struct syn_cache *sc, >>> break; >>> #endif >>> } >>> + in_pcbunref(inp); >>> return (error); >>> } >>> Index: netinet/tcp_var.h >>> =================================================================== >>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v >>> diff -u -p -r1.181 tcp_var.h >>> --- netinet/tcp_var.h 28 Dec 2024 22:17:09 -0000 1.181 >>> +++ netinet/tcp_var.h 1 Jan 2025 10:26:36 -0000 >>> @@ -278,7 +278,7 @@ struct syn_cache { >>> u_int sc_request_r_scale : 4, /* [I] */ >>> sc_requested_s_scale : 4; /* [I] */ >>> >>> - struct tcpcb *sc_tp; /* [S] tcb for listening socket */ >>> + struct inpcb *sc_inplisten; /* [S] inpcb for listening socket */ >>> LIST_ENTRY(syn_cache) sc_tpq; /* [S] list of entries by same tp */ >>> }; >>> >>> Index: sys/socketvar.h >>> =================================================================== >>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v >>> diff -u -p -r1.135 socketvar.h >>> --- sys/socketvar.h 30 Dec 2024 12:12:35 -0000 1.135 >>> +++ sys/socketvar.h 1 Jan 2025 10:56:10 -0000 >>> @@ -209,10 +209,13 @@ struct socket { >>> void soassertlocked(struct socket *); >>> void soassertlocked_readonly(struct socket *); >>> >>> -static inline void >>> +static inline struct socket * >>> soref(struct socket *so) >>> { >>> + if (so == NULL) >>> + return NULL; >>> refcnt_take(&so->so_refcnt); >>> + return so; >>> } >>> >>> /* >>> >>