Download raw body.
run TCP syn cache timer in parallel
> On 1 Jan 2025, at 21:39, Alexander Bluhm <alexander.bluhm@gmx.net> wrote:
>
> On Wed, Jan 01, 2025 at 06:02:54PM +0300, Vitaliy Makkoveev wrote:
>>> On 1 Jan 2025, at 16:01, Alexander Bluhm <bluhm@openbsd.org> wrote:
>>>
>>> Hi,
>>>
>>> With mvs@'s socket ref counting, I think it is possible to run TCP
>>> SYN cache in parallel.
>>>
>>> Idea is to reset inp_socket pointer when socket is about to be
>>> freed. By protecting this with a mutex, we reliable retrieve the
>>> socket from an inpcb reference. By using the socket refcount we
>>> switch from mutex to socket lock.
>>>
>>> The syn cache timer is an easy place to use this. Goal it to use
>>> in_pcbsolock() also for TCP timer and input. Note that switching
>>> sc_tp to sc_inplisten and refcounting it, is a diff I sent to tech@
>>> earlier.
>>>
>>> ok?
>>>
>>
>> Hi Alexander,
>>
>> This diff lokks good to me. However, I don???t like the sorele()
>> after so_lock rwlock(9) release. I made this with previous
>> sorele() and this worked perfectly because exclusive solock()
>> should be taken before continue teardown, but now I prefer to keep
>> the reference until we leave this socket alone. This doesn???t bring
>> any extra impact.
>>
>> So I like to rename in_pcbsolock() to in_pcb_solock_ref() and
>> in_pcbsounlock() to in_pcb_sounlock_rele() and rework them as
>> something like below.
>>
>> in_pcb_solock_ref()
>> {
>> mtx_enter(&inp->inp_sofree_mtx);
>> so = soref(inp->inp_socket);
>> mtx_leave(&inp->inp_sofree_mtx);
>>
>> if (so)
>> rw_enter_write(&so->so_lock);
>> return so;
>> }
>>
>> in_pcb_sounlock_rele()
>> {
>> rw_exit_write(&so->so_lock);
>> sorele(so, 1);
>> }
>
> I have an experimental diff with in_pcbsolock() that runs tcp_input()
> in parallel. It seems that keeping locks and refcounts gets more
> complicated than just relying on so_lock alone to avoid destruction
> in a different thread.
>
> So I prefer to keep in_pcbsolock() as it is. We can always change
> it later.
>
Go ahead, we always could correct usage with the new vision.
> First step would be to get the sc_inplisten refcount part commited.
>
> bluhm
>
I’m working on sockets splice unlocking, and I have the cases, then
socket lock was taken and released before reference release.
sorele() does nothing that requires solock() to be taken, so I think
it’s normal to allow sorele() be called unlocked. We could rename
keep_lock arg to something like no_unlock for consistency. As you
can see there is nothing complicated:
sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
if (isspliced(so)) {
struct socket *sosp = so->so_sp->ssp_socket;
int freeing = SOSP_FREEING_READ;
if (so == so->so_sp->ssp_socket)
freeing |= SOSP_FREEING_WRITE;
soref(sosp);
sounsplice(so, so->so_sp->ssp_socket, freeing);
sorele(sosp, 1);
}
sbunlock(&so->so_rcv);
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
sbassertlocked(&so->so_rcv);
/* Do unsplicing */
/* Do not wakeup a socket that is about to be freed. */
if ((freeing & SOSP_FREEING_READ) == 0) {
solock_shared(so);
if (soreadable(so))
sorwakeup(so);
sounlock_shared(so);
}
if ((freeing & SOSP_FREEING_WRITE) == 0) {
solock_shared(sosp);
if (sowriteable(sosp))
sowwakeup(sosp);
sounlock_shared(sosp);
}
}
>>> Index: netinet/in_pcb.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
>>> diff -u -p -r1.307 in_pcb.c
>>> --- netinet/in_pcb.c 24 Dec 2024 16:27:07 -0000 1.307
>>> +++ netinet/in_pcb.c 1 Jan 2025 12:55:13 -0000
>>> @@ -584,6 +584,9 @@ in_pcbdetach(struct inpcb *inp)
>>> struct inpcbtable *table = inp->inp_table;
>>>
>>> so->so_pcb = NULL;
>>> + mtx_enter(&inp->inp_sofree_mtx);
>>> + inp->inp_socket = NULL;
>>> + mtx_leave(&inp->inp_sofree_mtx);
>>> /*
>>> * As long as the NET_LOCK() is the default lock for Internet
>>> * sockets, do not release it to not introduce new sleeping
>>> @@ -616,6 +619,32 @@ in_pcbdetach(struct inpcb *inp)
>>> mtx_leave(&table->inpt_mtx);
>>>
>>> in_pcbunref(inp);
>>> +}
>>> +
>>> +struct socket *
>>> +in_pcbsolock(struct inpcb *inp)
>>> +{
>>> + struct socket *so;
>>> +
>>> + NET_ASSERT_LOCKED();
>>> +
>>> + mtx_enter(&inp->inp_sofree_mtx);
>>> + so = soref(inp->inp_socket);
>>> + mtx_leave(&inp->inp_sofree_mtx);
>>> + if (so == NULL)
>>> + return NULL;
>>> +
>>> + rw_enter_write(&so->so_lock);
>>> + sorele(so, 1);
>>> +
>>> + return so;
>>> +}
>>> +
>>> +void
>>> +in_pcbsounlock(struct inpcb *inp, struct socket *so)
>>> +{
>>> + KASSERT(inp->inp_socket == so);
>>> + rw_exit_write(&so->so_lock);
>>> }
>>>
>>> struct inpcb *
>>> Index: netinet/in_pcb.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
>>> diff -u -p -r1.161 in_pcb.h
>>> --- netinet/in_pcb.h 24 Dec 2024 16:27:07 -0000 1.161
>>> +++ netinet/in_pcb.h 1 Jan 2025 10:53:17 -0000
>>> @@ -81,6 +81,7 @@
>>> * t inpt_mtx pcb table mutex
>>> * L pf_inp_mtx link pf to inp mutex
>>> * s so_lock socket rwlock
>>> + * f inp_sofree_mtx socket detach and lock
>>> */
>>>
>>> /*
>>> @@ -136,7 +137,8 @@ struct inpcb {
>>> #define inp_laddr6 inp_laddru.iau_addr6
>>> u_int16_t inp_fport; /* [t] foreign port */
>>> u_int16_t inp_lport; /* [t] local port */
>>> - struct socket *inp_socket; /* [I] back pointer to socket */
>>> + struct socket *inp_socket; /* [f] back pointer to socket */
>>> + struct mutex inp_sofree_mtx; /* protect socket free */
>>> caddr_t inp_ppcb; /* pointer to per-protocol pcb */
>>> struct route inp_route; /* [s] cached route */
>>> struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
>>> @@ -309,6 +311,9 @@ int in_pcbaddrisavail(const struct inpc
>>> struct proc *);
>>> int in_pcbconnect(struct inpcb *, struct mbuf *);
>>> void in_pcbdetach(struct inpcb *);
>>> +struct socket *
>>> + in_pcbsolock(struct inpcb *);
>>> +void in_pcbsounlock(struct inpcb *, struct socket *);
>>> struct inpcb *
>>> in_pcbref(struct inpcb *);
>>> void in_pcbunref(struct inpcb *);
>>> Index: netinet/tcp_input.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
>>> diff -u -p -r1.416 tcp_input.c
>>> --- netinet/tcp_input.c 31 Dec 2024 12:19:46 -0000 1.416
>>> +++ netinet/tcp_input.c 1 Jan 2025 10:49:40 -0000
>>> @@ -3173,7 +3173,8 @@ syn_cache_rm(struct syn_cache *sc)
>>> KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
>>> SET(sc->sc_dynflags, SCF_DEAD);
>>> TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
>>> - sc->sc_tp = NULL;
>>> + in_pcbunref(sc->sc_inplisten);
>>> + sc->sc_inplisten = NULL;
>>> LIST_REMOVE(sc, sc_tpq);
>>> refcnt_rele(&sc->sc_refcnt);
>>> sc->sc_buckethead->sch_length--;
>>> @@ -3369,6 +3370,8 @@ void
>>> syn_cache_timer(void *arg)
>>> {
>>> struct syn_cache *sc = arg;
>>> + struct inpcb *inp;
>>> + struct socket *so;
>>> uint64_t now;
>>> int lastref;
>>>
>>> @@ -3397,14 +3400,22 @@ syn_cache_timer(void *arg)
>>> TCPTV_REXMTMAX);
>>> if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
>>> refcnt_take(&sc->sc_refcnt);
>>> + inp = in_pcbref(sc->sc_inplisten);
>>> + if (inp == NULL)
>>> + goto freeit;
>>> mtx_leave(&syn_cache_mtx);
>>>
>>> - NET_LOCK();
>>> - now = tcp_now();
>>> - (void) syn_cache_respond(sc, NULL, now);
>>> - tcpstat_inc(tcps_sc_retransmitted);
>>> - NET_UNLOCK();
>>> + NET_LOCK_SHARED();
>>> + so = in_pcbsolock(inp);
>>> + if (so != NULL) {
>>> + now = tcp_now();
>>> + (void) syn_cache_respond(sc, NULL, now);
>>> + tcpstat_inc(tcps_sc_retransmitted);
>>> + in_pcbsounlock(inp, so);
>>> + }
>>> + NET_UNLOCK_SHARED();
>>>
>>> + in_pcbunref(inp);
>>> syn_cache_put(sc);
>>> return;
>>>
>>> @@ -3434,10 +3445,7 @@ syn_cache_cleanup(struct tcpcb *tp)
>>>
>>> mtx_enter(&syn_cache_mtx);
>>> LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
>>> -#ifdef DIAGNOSTIC
>>> - if (sc->sc_tp != tp)
>>> - panic("invalid sc_tp in syn_cache_cleanup");
>>> -#endif
>>> + KASSERT(sc->sc_inplisten == tp->t_inpcb);
>>> syn_cache_rm(sc);
>>> syn_cache_put(sc);
>>> }
>>> @@ -3949,7 +3957,7 @@ syn_cache_add(struct sockaddr *src, stru
>>> if (tb.t_flags & TF_SIGNATURE)
>>> SET(sc->sc_fixflags, SCF_SIGNATURE);
>>> #endif
>>> - sc->sc_tp = tp;
>>> + sc->sc_inplisten = in_pcbref(tp->t_inpcb);
>>> if (syn_cache_respond(sc, m, now) == 0) {
>>> mtx_enter(&syn_cache_mtx);
>>> /*
>>> @@ -3962,6 +3970,7 @@ syn_cache_add(struct sockaddr *src, stru
>>> tcpstat_inc(tcps_sndacks);
>>> tcpstat_inc(tcps_sndtotal);
>>> } else {
>>> + in_pcbunref(sc->sc_inplisten);
>>> syn_cache_put(sc);
>>> tcpstat_inc(tcps_sc_dropped);
>>> }
>>> @@ -4158,7 +4167,7 @@ syn_cache_respond(struct syn_cache *sc,
>>>
>>> /* use IPsec policy and ttl from listening socket, on SYN ACK */
>>> mtx_enter(&syn_cache_mtx);
>>> - inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
>>> + inp = in_pcbref(sc->sc_inplisten);
>>> mtx_leave(&syn_cache_mtx);
>>>
>>> /*
>>> @@ -4189,5 +4198,6 @@ syn_cache_respond(struct syn_cache *sc,
>>> break;
>>> #endif
>>> }
>>> + in_pcbunref(inp);
>>> return (error);
>>> }
>>> Index: netinet/tcp_var.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
>>> diff -u -p -r1.181 tcp_var.h
>>> --- netinet/tcp_var.h 28 Dec 2024 22:17:09 -0000 1.181
>>> +++ netinet/tcp_var.h 1 Jan 2025 10:26:36 -0000
>>> @@ -278,7 +278,7 @@ struct syn_cache {
>>> u_int sc_request_r_scale : 4, /* [I] */
>>> sc_requested_s_scale : 4; /* [I] */
>>>
>>> - struct tcpcb *sc_tp; /* [S] tcb for listening socket */
>>> + struct inpcb *sc_inplisten; /* [S] inpcb for listening socket */
>>> LIST_ENTRY(syn_cache) sc_tpq; /* [S] list of entries by same tp */
>>> };
>>>
>>> Index: sys/socketvar.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
>>> diff -u -p -r1.135 socketvar.h
>>> --- sys/socketvar.h 30 Dec 2024 12:12:35 -0000 1.135
>>> +++ sys/socketvar.h 1 Jan 2025 10:56:10 -0000
>>> @@ -209,10 +209,13 @@ struct socket {
>>> void soassertlocked(struct socket *);
>>> void soassertlocked_readonly(struct socket *);
>>>
>>> -static inline void
>>> +static inline struct socket *
>>> soref(struct socket *so)
>>> {
>>> + if (so == NULL)
>>> + return NULL;
>>> refcnt_take(&so->so_refcnt);
>>> + return so;
>>> }
>>>
>>> /*
>>>
>>
run TCP syn cache timer in parallel