route cache per softnet thread
On Wed, Feb 19, 2025 at 01:00:13AM +0100, Alexander Bluhm wrote:
> On Tue, Feb 18, 2025 at 09:44:44AM +1000, David Gwynne wrote:
> >
> >
> > > On 18 Feb 2025, at 08:53, Alexander Bluhm <bluhm@openbsd.org> wrote:
> > >
> > > On Mon, Feb 17, 2025 at 05:37:01PM +0100, Claudio Jeker wrote:
> > >> On Fri, Feb 14, 2025 at 04:19:36PM +0100, Alexander Bluhm wrote:
> > >>> Hi,
> > >>>
> > >>> When experimenting with parallel TCP, I realized that we need local
> > >>> storage per softnet thread. That allows effective caching.
> > >>>
> > >>> The same is true for the IP forwarding route. In one of my
> > >>> benchmarks, which is forwarding 10 parallel TCP streams over
> > >>> OpenBSD, throughput increases by 30%.
> > >>>
> > >>> As you can see here, route6_mpath() uses 10.5% of the CPU time,
> > >>> mostly in rtable_match().
> > >>> http://bluhm.genua.de/perform/results/2025-02-09T21%3A10%3A25Z/2025-02-09T00%3A00%3A00Z/btrace/ssh_perform%40lt13_iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0346%3A%3A36_-P10_-t10-btrace-kstack.0.svg?s=route6_mpath
> > >>>
> > >>> With the patch below, only 2.6% CPU is used in route6_mpath().
> > >>> http://bluhm.genua.de/perform/results/2025-02-09T21%3A10%3A25Z/patch-sys-forward-route-cache.0/btrace/ssh_perform%40lt13_iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0346%3A%3A36_-P10_-t10-btrace-kstack.0.svg?s=route6_mpath
> > >>>
> > >>> The idea is that every softnet task has its own storage. It is
> > >>> passed down to IP input in the mbuf cookie. If the cookie is set,
> > >>> the route cache in struct softnet is used. Otherwise the route
> > >>> cache in the stack memory of IP input is used. From there the
> > >>> route cache is passed to IP forward and IP output as usual.
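
A minimal sketch of that selection, with illustrative names (sn_route
is not a field from the diff in this mail, which only covers the TCP
socket locks):

/* needs <net/route.h> and <sys/systm.h>; names are illustrative */
struct softnet_sketch {
        struct route    sn_route;       /* survives across packets */
};

struct route *
softnet_route_cache(struct softnet_sketch *sn, struct route *stack_ro)
{
        if (sn != NULL)
                return (&sn->sn_route);
        /* no softnet context: the cache lives only for this packet */
        memset(stack_ro, 0, sizeof(*stack_ro));
        return (stack_ro);
}
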
> > >>>
> > >>> When I sent a similar diff with a cache in per-CPU memory, there
> > >>> were concerns about sleeping and switching CPUs. Per-softnet
> > >>> memory should be safe.
> > >>>
> > >>> There was also the question of when the cache is invalidated. I
> > >>> implemented a route generation number two releases ago. The cache
> > >>> records this number when the route is stored. After a change of
> > >>> the routing table, the cache is filled with a new route during the
> > >>> next lookup.
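
A rough sketch of that check, with illustrative names (the in-tree
field names may differ); a stale entry is simply ignored and refilled
on the next lookup:

/* needs <net/route.h>; bumped whenever the routing table changes */
extern unsigned int rt_generation_sketch;

struct route_cache_sketch {
        struct rtentry  *rc_rt;         /* cached route */
        unsigned int     rc_gen;        /* generation when rc_rt was stored */
};

static inline int
route_cache_valid(const struct route_cache_sketch *rc)
{
        /* a hit only counts if the table has not changed since the store */
        return (rc->rc_rt != NULL && rc->rc_gen == rt_generation_sketch);
}
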
> > >>>
> > >>> ok?
> > >>
> > >> In general I dislike how ph_cookie is used in more and more places.
> > >> That field has no validity checks and so code just dereferences the
> > >> returned value. This is a ticking bomb.
> > >> I would prefer to have a better concept of per softnet / task storage.
> > >> Maybe dlg@ has a great idea on how to do that.
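
The hazard described here, illustrated (not code from the tree):
nothing records what kind of value a producer left in ph_cookie, so a
mismatched consumer happily takes it back as something else.

/* needs <sys/mbuf.h> and <netinet/tcp.h> */
static void
cookie_hazard_sketch(struct mbuf *m)
{
        struct tcphdr *th;

        /* one layer stores a plain integer offset in the cookie ... */
        m->m_pkthdr.ph_cookie = (void *)(long)40;

        /*
         * ... another takes it back as a pointer; dereferencing it
         * later would crash.
         */
        th = m->m_pkthdr.ph_cookie;
        (void)th;
}
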
> > >
> > > I am also not a big fan of ph_cookie. One alternative is to store
> > > the softnet index in the mbuf. But we want to keep the size of the
> > > mbuf small. The interface index is at most 16 bits, so we can steal
> > > some storage there.
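
That is what the diff below ends up doing: ph_ifidx shrinks to 16 bits
and an 8-bit ph_softidx is added, storing the softnet index plus one so
that 0 means "not set". A helper along those lines (the diff open-codes
this instead) might look like:

/* needs <sys/mbuf.h> and <net/if_var.h> (for softnets[]) */
static inline struct softnet *
mbuf_softnet(const struct mbuf *m)
{
        if (m->m_pkthdr.ph_softidx == 0)
                return (NULL);          /* packet has no softnet context */
        return (&softnets[m->m_pkthdr.ph_softidx - 1]);
}
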
> >
> > as a rule of thumb i only use ph_cookie to hold state over a queue when both ends are very well understood or have a narrow scope.
> >
> > i was working on a diff a while ago to provide softnet (or something like it) as an argument throughout the stack. i definitely got it from the if input processing through to ip_output. id prefer this approach since it makes it obvious that the softnet state is part of the code or execution context that the mbuf is being processed in, rather than a property of the mbuf itself. right now it's all too easy for an mbuf to be queued unexpectedly.
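
A sketch of that shape, with illustrative names: the softnet pointer
travels as an explicit function argument instead of riding in the mbuf.

/* illustrative only; ifp_input_sketch stands in for the real hooks */
void    ifp_input_sketch(struct softnet *, struct ifnet *, struct mbuf *);

void
if_input_process_sketch(struct softnet *sn, struct ifnet *ifp,
    struct mbuf_list *ml)
{
        struct mbuf *m;

        /* the execution context follows the call chain explicitly */
        while ((m = ml_dequeue(ml)) != NULL)
                ifp_input_sketch(sn, ifp, m);
}
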
> >
> > if you're ok with waiting a bit, i can try and find the diff and cut it back to just passing a softnet pointer around and we can compare?
>
> I was considering passing struct softnet as an argument. For TCP
> lock caching I need it in the protocol input. That would mean a lot
> of passing it around through all the functions. So I thought an mbuf
> field would be easier.
>
> Below is the diff that uses struct softnet as a cache for socket
> locks in tcp_input().
Here is a new diff without the crash; a minor bug has been fixed.
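
The shape of the lock caching, reduced to a sketch with hypothetical
helpers (the real code is tcp_input_solocked() and tcp_input_mlist()
below): the socket of the previous segment stays locked across
iterations and is only released when the batch switches to a different
socket or ends.

/* hypothetical helpers standing in for the real TCP input path */
void    segment_input_sketch(struct mbuf *, struct socket **);
void    socket_unlock_sketch(struct socket *);

void
drain_batch_sketch(struct mbuf_list *ml)
{
        struct socket *cached = NULL;   /* last socket, still locked */
        struct mbuf *m;

        while ((m = ml_dequeue(ml)) != NULL)
                segment_input_sketch(m, &cached);

        if (cached != NULL)
                socket_unlock_sketch(cached);   /* drop the final lock */
}
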
Index: net/if.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
diff -u -p -r1.726 if.c
--- net/if.c 3 Feb 2025 08:58:52 -0000 1.726
+++ net/if.c 19 Feb 2025 00:11:24 -0000
@@ -241,19 +241,10 @@ struct rwlock if_tmplist_lock = RWLOCK_I
struct mutex if_hooks_mtx = MUTEX_INITIALIZER(IPL_NONE);
void if_hooks_run(struct task_list *);
-int ifq_congestion;
-
-int netisr;
-
-struct softnet {
- char sn_name[16];
- struct taskq *sn_taskq;
-};
-
-#define NET_TASKQ 4
+int ifq_congestion;
+int netisr;
struct softnet softnets[NET_TASKQ];
-
-struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
+struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
/*
* Serialize socket operations to ensure no new sleeping points
@@ -979,9 +970,10 @@ if_output_local(struct ifnet *ifp, struc
}
void
-if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+if_input_process(struct ifnet *ifp, struct mbuf_list *ml, unsigned int idx)
{
struct mbuf *m;
+ struct softnet *sn;
if (ml_empty(ml))
return;
@@ -996,9 +988,25 @@ if_input_process(struct ifnet *ifp, stru
* read only or MP safe. Usually they hold the exclusive net lock.
*/
+ idx = net_idx(idx);
+ sn = &softnets[idx];
+ ml_init(&sn->sn_tcp_ml);
+#ifdef INET6
+ ml_init(&sn->sn_tcp6_ml);
+#endif
+
NET_LOCK_SHARED();
- while ((m = ml_dequeue(ml)) != NULL)
+
+ while ((m = ml_dequeue(ml)) != NULL) {
+ /* add 1 to index, 0 means not set */
+ m->m_pkthdr.ph_softidx = idx + 1;
(*ifp->if_input)(ifp, m);
+ }
+ tcp_input_mlist(&sn->sn_tcp_ml, AF_INET);
+#ifdef INET6
+ tcp_input_mlist(&sn->sn_tcp6_ml, AF_INET6);
+#endif
+
NET_UNLOCK_SHARED();
}
@@ -3672,16 +3680,23 @@ unhandled_af(int af)
panic("unhandled af %d", af);
}
-struct taskq *
-net_tq(unsigned int ifindex)
+unsigned int
+net_idx(unsigned int ifindex)
{
- struct softnet *sn;
static int nettaskqs;
if (nettaskqs == 0)
nettaskqs = min(NET_TASKQ, ncpus);
- sn = &softnets[ifindex % nettaskqs];
+ return (ifindex % nettaskqs);
+}
+
+struct taskq *
+net_tq(unsigned int ifindex)
+{
+ struct softnet *sn;
+
+ sn = &softnets[net_idx(ifindex)];
return (sn->sn_taskq);
}
Index: net/if.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.h,v
diff -u -p -r1.217 if.h
--- net/if.h 9 Jun 2024 16:25:28 -0000 1.217
+++ net/if.h 19 Feb 2025 00:11:24 -0000
@@ -560,7 +560,10 @@ void if_congestion(void);
int if_congested(void);
__dead void unhandled_af(int);
int if_setlladdr(struct ifnet *, const uint8_t *);
-struct taskq * net_tq(unsigned int);
+unsigned int
+ net_idx(unsigned int);
+struct taskq *
+ net_tq(unsigned int);
void net_tq_barriers(const char *);
#endif /* _KERNEL */
Index: net/if_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
diff -u -p -r1.135 if_var.h
--- net/if_var.h 24 Jan 2025 09:19:07 -0000 1.135
+++ net/if_var.h 19 Feb 2025 00:11:24 -0000
@@ -301,6 +301,15 @@ struct ifg_list {
#define IF_WWAN_DEFAULT_PRIORITY 6
#define IF_CARP_DEFAULT_PRIORITY 15
+struct softnet {
+ char sn_name[16];
+ struct taskq *sn_taskq;
+ struct mbuf_list sn_tcp_ml;
+ struct mbuf_list sn_tcp6_ml;
+} __aligned(64);
+#define NET_TASKQ 4
+extern struct softnet softnets[NET_TASKQ];
+
/*
* Network stack input queues.
*/
@@ -331,7 +340,7 @@ int if_enqueue(struct ifnet *, struct mb
int if_enqueue_ifq(struct ifnet *, struct mbuf *);
void if_input(struct ifnet *, struct mbuf_list *);
void if_vinput(struct ifnet *, struct mbuf *);
-void if_input_process(struct ifnet *, struct mbuf_list *);
+void if_input_process(struct ifnet *, struct mbuf_list *, unsigned int);
int if_input_local(struct ifnet *, struct mbuf *, sa_family_t);
int if_output_ml(struct ifnet *, struct mbuf_list *,
struct sockaddr *, struct rtentry *);
Index: net/ifq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
diff -u -p -r1.56 ifq.c
--- net/ifq.c 3 Feb 2025 08:58:52 -0000 1.56
+++ net/ifq.c 18 Feb 2025 12:58:10 -0000
@@ -862,7 +862,7 @@ ifiq_process(void *arg)
ml_init(&ifiq->ifiq_ml);
mtx_leave(&ifiq->ifiq_mtx);
- if_input_process(ifiq->ifiq_if, &ml);
+ if_input_process(ifiq->ifiq_if, &ml, ifiq->ifiq_idx);
}
int
Index: netinet/in_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
diff -u -p -r1.121 in_proto.c
--- netinet/in_proto.c 5 Jan 2025 12:36:48 -0000 1.121
+++ netinet/in_proto.c 18 Feb 2025 13:00:16 -0000
@@ -197,7 +197,8 @@ const struct protosw inetsw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_TCP,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|
+ PR_MPINPUT,
.pr_input = tcp_input,
.pr_ctlinput = tcp_ctlinput,
.pr_ctloutput = tcp_ctloutput,
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.431 tcp_input.c
--- netinet/tcp_input.c 17 Feb 2025 08:56:33 -0000 1.431
+++ netinet/tcp_input.c 19 Feb 2025 00:11:24 -0000
@@ -100,9 +100,6 @@
#include <net/pfvar.h>
#endif
-int tcp_mss_adv(struct rtentry *, int);
-int tcp_flush_queue(struct tcpcb *);
-
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
@@ -177,6 +174,9 @@ do { \
if_put(ifp); \
} while (0)
+int tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **);
+int tcp_mss_adv(struct rtentry *, int);
+int tcp_flush_queue(struct tcpcb *);
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
@@ -347,12 +347,57 @@ tcp_flush_queue(struct tcpcb *tp)
return (flags);
}
+int
+tcp_input(struct mbuf **mp, int *offp, int proto, int af)
+{
+ struct softnet *sn;
+
+ if ((*mp)->m_pkthdr.ph_softidx == 0)
+ return tcp_input_solocked(mp, offp, proto, af, NULL);
+ sn = &softnets[(*mp)->m_pkthdr.ph_softidx - 1];
+ (*mp)->m_pkthdr.ph_softidx = 0;
+ (*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp);
+ switch (af) {
+ case AF_INET:
+ ml_enqueue(&sn->sn_tcp_ml, *mp);
+ break;
+#ifdef INET6
+ case AF_INET6:
+ ml_enqueue(&sn->sn_tcp6_ml, *mp);
+ break;
+#endif
+ default:
+ m_freemp(mp);
+ }
+ *mp = NULL;
+ return IPPROTO_DONE;
+}
+
+void
+tcp_input_mlist(struct mbuf_list *ml, int af)
+{
+ struct socket *so = NULL;
+ struct mbuf *m;
+
+ while ((m = ml_dequeue(ml)) != NULL) {
+ int off, nxt;
+
+ off = (long)m->m_pkthdr.ph_cookie;
+ m->m_pkthdr.ph_cookie = NULL;
+ nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so);
+ KASSERT(nxt == IPPROTO_DONE);
+ }
+
+ in_pcbsounlock_rele(NULL, so);
+}
+
/*
* TCP input routine, follows pages 65-76 of the
* protocol specification dated September, 1981 very closely.
*/
int
-tcp_input(struct mbuf **mp, int *offp, int proto, int af)
+tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af,
+ struct socket ** solocked)
{
struct mbuf *m = *mp;
int iphlen = *offp;
@@ -604,6 +649,25 @@ findpcb:
tcpstat_inc(tcps_noport);
goto dropwithreset_ratelim;
}
+ /*
+ * Avoid needless lock and unlock operation when handling multiple
+ * TCP packets from the same stream consecutively.
+ */
+ if (solocked != NULL && *solocked != NULL &&
+ sotoinpcb(*solocked) == inp) {
+ so = *solocked;
+ *solocked = NULL;
+ } else {
+ if (solocked != NULL && *solocked != NULL) {
+ in_pcbsounlock_rele(NULL, *solocked);
+ *solocked = NULL;
+ }
+ so = in_pcbsolock_ref(inp);
+ }
+ if (so == NULL) {
+ tcpstat_inc(tcps_noport);
+ goto dropwithreset_ratelim;
+ }
KASSERT(sotoinpcb(inp->inp_socket) == inp);
KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
@@ -636,7 +700,6 @@ findpcb:
else
tiwin = th->th_win;
- so = inp->inp_socket;
if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
union syn_cache_sa src;
union syn_cache_sa dst;
@@ -725,6 +788,7 @@ findpcb:
* in use for the reply,
* do not free it.
*/
+ so = NULL;
m = *mp = NULL;
goto drop;
} else {
@@ -732,13 +796,11 @@ findpcb:
* We have created a
* full-blown connection.
*/
- tp = NULL;
in_pcbunref(inp);
inp = in_pcbref(sotoinpcb(so));
tp = intotcpcb(inp);
if (tp == NULL)
goto badsyn; /*XXX*/
-
}
break;
@@ -844,6 +906,10 @@ findpcb:
tcpstat_inc(tcps_dropsyn);
goto drop;
}
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -1019,6 +1085,10 @@ findpcb:
if (so->so_snd.sb_cc ||
tp->t_flags & TF_NEEDOUTPUT)
(void) tcp_output(tp);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -1069,6 +1139,10 @@ findpcb:
tp->t_flags &= ~TF_BLOCKOUTPUT;
if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -1262,6 +1336,8 @@ trimthenstep6:
((arc4random() & 0x7fffffff) | 0x8000);
reuse = &iss;
tp = tcp_close(tp);
+ in_pcbsounlock_rele(inp, so);
+ so = NULL;
in_pcbunref(inp);
inp = NULL;
goto findpcb;
@@ -2066,6 +2142,10 @@ dodata: /* XXX */
*/
if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
@@ -2095,6 +2175,10 @@ dropafterack:
m_freem(m);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
@@ -2130,6 +2214,7 @@ dropwithreset:
(tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now);
}
m_freem(m);
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
@@ -2141,6 +2226,7 @@ drop:
tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen);
m_freem(m);
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -3544,6 +3630,7 @@ syn_cache_get(struct sockaddr *src, stru
sc = syn_cache_lookup(src, dst, &scp, inp->inp_rtableid);
if (sc == NULL) {
mtx_leave(&syn_cache_mtx);
+ in_pcbsounlock_rele(inp, so);
return (NULL);
}
@@ -3557,6 +3644,7 @@ syn_cache_get(struct sockaddr *src, stru
refcnt_take(&sc->sc_refcnt);
mtx_leave(&syn_cache_mtx);
(void) syn_cache_respond(sc, m, now, do_ecn);
+ in_pcbsounlock_rele(inp, so);
syn_cache_put(sc);
return ((struct socket *)(-1));
}
@@ -3697,7 +3785,7 @@ syn_cache_get(struct sockaddr *src, stru
tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
tp->last_ack_sent = tp->rcv_nxt;
- in_pcbsounlock_rele(inp, so);
+ in_pcbsounlock_rele(listeninp, listenso);
tcpstat_inc(tcps_sc_completed);
syn_cache_put(sc);
return (so);
@@ -3710,6 +3798,7 @@ abort:
tp = tcp_drop(tp, ECONNABORTED); /* destroys socket */
m_freem(m);
in_pcbsounlock_rele(inp, so);
+ in_pcbsounlock_rele(listeninp, listenso);
syn_cache_put(sc);
tcpstat_inc(tcps_sc_aborted);
return ((struct socket *)(-1));
@@ -3814,7 +3903,7 @@ syn_cache_add(struct sockaddr *src, stru
struct mbuf *ipopts;
struct rtentry *rt = NULL;
- NET_ASSERT_LOCKED();
+ soassertlocked(so);
tp = sototcpcb(so);
@@ -3990,9 +4079,8 @@ syn_cache_add(struct sockaddr *src, stru
if (syn_cache_respond(sc, m, now, do_ecn) == 0) {
mtx_enter(&syn_cache_mtx);
/*
- * XXXSMP Currently exclusive netlock prevents another insert
- * after our syn_cache_lookup() and before syn_cache_insert().
- * Double insert should be handled and not rely on netlock.
+ * Socket lock prevents another insert after our
+ * syn_cache_lookup() and before syn_cache_insert().
*/
syn_cache_insert(sc, tp);
mtx_leave(&syn_cache_mtx);
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.185 tcp_var.h
--- netinet/tcp_var.h 16 Jan 2025 11:59:20 -0000 1.185
+++ netinet/tcp_var.h 18 Feb 2025 13:00:16 -0000
@@ -718,6 +718,7 @@ int tcp_dooptions(struct tcpcb *, u_cha
struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t);
void tcp_init(void);
int tcp_input(struct mbuf **, int *, int, int);
+void tcp_input_mlist(struct mbuf_list *, int);
int tcp_mss(struct tcpcb *, int);
void tcp_mss_update(struct tcpcb *);
u_int tcp_hdrsz(struct tcpcb *);
Index: netinet6/in6_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v
diff -u -p -r1.124 in6_proto.c
--- netinet6/in6_proto.c 5 Jan 2025 12:36:48 -0000 1.124
+++ netinet6/in6_proto.c 18 Feb 2025 13:00:16 -0000
@@ -147,7 +147,8 @@ const struct protosw inet6sw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_TCP,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|
+ PR_MPINPUT,
.pr_input = tcp_input,
.pr_ctlinput = tcp6_ctlinput,
.pr_ctloutput = tcp_ctloutput,
Index: sys/mbuf.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.265 mbuf.h
--- sys/mbuf.h 5 Nov 2024 13:15:13 -0000 1.265
+++ sys/mbuf.h 19 Feb 2025 00:11:24 -0000
@@ -122,13 +122,13 @@ struct pkthdr_pf {
#endif
/* record/packet header in first mbuf of chain; valid if M_PKTHDR set */
-struct pkthdr {
+struct pkthdr {
void *ph_cookie; /* additional data */
SLIST_HEAD(, m_tag) ph_tags; /* list of packet tags */
int64_t ph_timestamp; /* packet timestamp */
int len; /* total packet length */
u_int ph_rtableid; /* routing table id */
- u_int ph_ifidx; /* rcv interface index */
+ u_int16_t ph_ifidx; /* rcv interface index */
u_int16_t ph_tagsset; /* mtags attached */
u_int16_t ph_flowid; /* pseudo unique flow id */
u_int16_t csum_flags; /* checksum flags */
@@ -136,6 +136,8 @@ struct pkthdr {
u_int16_t ph_mss; /* TCP max segment size */
u_int8_t ph_loopcnt; /* mbuf is looping in kernel */
u_int8_t ph_family; /* af, used when queueing */
+ u_int8_t ph_softidx; /* index of softnet thread */
+ u_int8_t ph_pad[1];
struct pkthdr_pf pf;
};