Download raw body.
route cache per softnet thread
> On 18 Feb 2025, at 08:53, Alexander Bluhm <bluhm@openbsd.org> wrote:
>
> On Mon, Feb 17, 2025 at 05:37:01PM +0100, Claudio Jeker wrote:
>> On Fri, Feb 14, 2025 at 04:19:36PM +0100, Alexander Bluhm wrote:
>>> Hi,
>>>
>>> When experimenting with parallel TCP, I realized that we need local
>>> storage per softnet thread. That allows effective caching.
>>>
>>> The same is true for IP forwarding route. In one of my benchmarks,
>>> which is forwarding 10 parallel TCP streams over OpenBSD, thoughput
>>> increases by 30%.
>>>
>>> As you can see here, route6_mpath() uses 10.5% CPU time basically
>>> for rtable_match().
>>> http://bluhm.genua.de/perform/results/2025-02-09T21%3A10%3A25Z/2025-02-09T00%3A00%3A00Z/btrace/ssh_perform%40lt13_iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0346%3A%3A36_-P10_-t10-btrace-kstack.0.svg?s=route6_mpath
>>>
>>> With the patch below patch only 2.6% CPU are used in route6_mpath().
>>> http://bluhm.genua.de/perform/results/2025-02-09T21%3A10%3A25Z/patch-sys-forward-route-cache.0/btrace/ssh_perform%40lt13_iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0346%3A%3A36_-P10_-t10-btrace-kstack.0.svg?s=route6_mpath
>>>
>>> Idea is that every softnet task has its own storage. It is passed
>>> down to IP input in mbuf cookie. If the cookie is set, use the
>>> route cache in struct softnet. Otherwise use route cache in stack
>>> memory of IP input. From there the route cache is passed to IP
>>> forward and IP output as usual.
>>>
>>> When I sent a similar diff with cache per CPU memory, there were
>>> concerns with sleeping and switching CPU. Per softnet memory should
>>> be safe.
>>>
>>> There was also the question when the cache is invalidated. I have
>>> implemented a route generation number two releases ago. The cache
>>> contains the this number when the route was stored. After a change
>>> of the routing table, the cache is filled with a new route during
>>> next lookup.
>>>
>>> ok?
>>
>> In general I dislike how ph_cookie is used in more and more places.
>> That field has no validity checks and so code just dereferences the
>> returned value. This is a ticking bomb.
>> I would prefer to have a better concept of per softnet / task storage.
>> Maybe dlg@ has a great idea on how to do that.
>
> I am also not a big fan of ph_cookie. One alternative is to store
> softnet index in mbuf. But we want to keep size of mbuf small.
> Interface index is at most 16 bit. So we can steal some storage
> there.
as a rule of thumb i only use ph_cookie to hold state over a queue when both ends are very well understood or have a narrow scope.
i was working on a diff a while ago to provide softnet (or something like it) as an argument throughout the stack. i definitely got it from the if input processing through to ip_output. id prefer this approach since it makes it obvious that the softnet state is part of the code or execution context that the mbuf is being processed in, rather than a property of the mbuf itself. right now it's all too easy for an mbuf to be queued unexpectedly.
if you're ok with waiting a bit, i can try and find the diff and cut it back to just passing a softnet pointer around and we can compare?
>
>> Also if you use an array of struct softnet to back this then you should
>> make sure that the objects end up on different cache lines so you don't
>> constantly invalidate each others caches.
>
> I added an __aligned(64).
>
> Is the diff below better?
>
> bluhm
>
> Index: net/if.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> diff -u -p -r1.726 if.c
> --- net/if.c 3 Feb 2025 08:58:52 -0000 1.726
> +++ net/if.c 17 Feb 2025 22:20:49 -0000
> @@ -241,19 +241,10 @@ struct rwlock if_tmplist_lock = RWLOCK_I
> struct mutex if_hooks_mtx = MUTEX_INITIALIZER(IPL_NONE);
> void if_hooks_run(struct task_list *);
>
> -int ifq_congestion;
> -
> -int netisr;
> -
> -struct softnet {
> - char sn_name[16];
> - struct taskq *sn_taskq;
> -};
> -
> -#define NET_TASKQ 4
> +int ifq_congestion;
> +int netisr;
> struct softnet softnets[NET_TASKQ];
> -
> -struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
> +struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
>
> /*
> * Serialize socket operations to ensure no new sleeping points
> @@ -979,7 +970,7 @@ if_output_local(struct ifnet *ifp, struc
> }
>
> void
> -if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
> +if_input_process(struct ifnet *ifp, struct mbuf_list *ml, unsigned int idx)
> {
> struct mbuf *m;
>
> @@ -996,9 +987,14 @@ if_input_process(struct ifnet *ifp, stru
> * read only or MP safe. Usually they hold the exclusive net lock.
> */
>
> + idx = net_idx(idx);
> + KASSERT(idx < 256 - 1);
> +
> NET_LOCK_SHARED();
> - while ((m = ml_dequeue(ml)) != NULL)
> + while ((m = ml_dequeue(ml)) != NULL) {
> + m->m_pkthdr.ph_softidx = idx + 1;
> (*ifp->if_input)(ifp, m);
> + }
> NET_UNLOCK_SHARED();
> }
>
> @@ -3672,16 +3668,23 @@ unhandled_af(int af)
> panic("unhandled af %d", af);
> }
>
> -struct taskq *
> -net_tq(unsigned int ifindex)
> +unsigned int
> +net_idx(unsigned int ifindex)
> {
> - struct softnet *sn;
> static int nettaskqs;
>
> if (nettaskqs == 0)
> nettaskqs = min(NET_TASKQ, ncpus);
>
> - sn = &softnets[ifindex % nettaskqs];
> + return (ifindex % nettaskqs);
> +}
> +
> +struct taskq *
> +net_tq(unsigned int ifindex)
> +{
> + struct softnet *sn;
> +
> + sn = &softnets[net_idx(ifindex)];
>
> return (sn->sn_taskq);
> }
> Index: net/if.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.h,v
> diff -u -p -r1.217 if.h
> --- net/if.h 9 Jun 2024 16:25:28 -0000 1.217
> +++ net/if.h 17 Feb 2025 22:15:40 -0000
> @@ -560,7 +560,10 @@ void if_congestion(void);
> int if_congested(void);
> __dead void unhandled_af(int);
> int if_setlladdr(struct ifnet *, const uint8_t *);
> -struct taskq * net_tq(unsigned int);
> +unsigned int
> + net_idx(unsigned int);
> +struct taskq *
> + net_tq(unsigned int);
> void net_tq_barriers(const char *);
>
> #endif /* _KERNEL */
> Index: net/if_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
> diff -u -p -r1.135 if_var.h
> --- net/if_var.h 24 Jan 2025 09:19:07 -0000 1.135
> +++ net/if_var.h 17 Feb 2025 22:05:13 -0000
> @@ -46,6 +46,7 @@
> #include <sys/timeout.h>
>
> #include <net/ifq.h>
> +#include <net/route.h>
>
> /*
> * Structures defining a network interface, providing a packet
> @@ -301,6 +302,14 @@ struct ifg_list {
> #define IF_WWAN_DEFAULT_PRIORITY 6
> #define IF_CARP_DEFAULT_PRIORITY 15
>
> +struct softnet {
> + char sn_name[16];
> + struct taskq *sn_taskq;
> + struct route sn_route;
> +} __aligned(64);
> +#define NET_TASKQ 4
> +extern struct softnet softnets[NET_TASKQ];
> +
> /*
> * Network stack input queues.
> */
> @@ -331,7 +340,7 @@ int if_enqueue(struct ifnet *, struct mb
> int if_enqueue_ifq(struct ifnet *, struct mbuf *);
> void if_input(struct ifnet *, struct mbuf_list *);
> void if_vinput(struct ifnet *, struct mbuf *);
> -void if_input_process(struct ifnet *, struct mbuf_list *);
> +void if_input_process(struct ifnet *, struct mbuf_list *, unsigned int);
> int if_input_local(struct ifnet *, struct mbuf *, sa_family_t);
> int if_output_ml(struct ifnet *, struct mbuf_list *,
> struct sockaddr *, struct rtentry *);
> Index: net/ifq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
> diff -u -p -r1.56 ifq.c
> --- net/ifq.c 3 Feb 2025 08:58:52 -0000 1.56
> +++ net/ifq.c 17 Feb 2025 21:03:07 -0000
> @@ -862,7 +862,7 @@ ifiq_process(void *arg)
> ml_init(&ifiq->ifiq_ml);
> mtx_leave(&ifiq->ifiq_mtx);
>
> - if_input_process(ifiq->ifiq_if, &ml);
> + if_input_process(ifiq->ifiq_if, &ml, ifiq->ifiq_idx);
> }
>
> int
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> diff -u -p -r1.403 ip_input.c
> --- netinet/ip_input.c 3 Jan 2025 21:27:40 -0000 1.403
> +++ netinet/ip_input.c 17 Feb 2025 22:17:09 -0000
> @@ -441,7 +441,7 @@ bad:
> int
> ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp)
> {
> - struct route ro;
> + struct route iproute, *ro = NULL;
> struct mbuf *m;
> struct ip *ip;
> int hlen;
> @@ -452,7 +452,6 @@ ip_input_if(struct mbuf **mp, int *offp,
>
> KASSERT(*offp == 0);
>
> - ro.ro_rt = NULL;
> ipstat_inc(ips_total);
> m = *mp = ipv4_check(ifp, *mp);
> if (m == NULL)
> @@ -512,7 +511,16 @@ ip_input_if(struct mbuf **mp, int *offp,
> goto out;
> }
>
> - switch(in_ouraddr(m, ifp, &ro, flags)) {
> + if ((*mp)->m_pkthdr.ph_softidx == 0) {
> + ro = &iproute;
> + ro->ro_rt = NULL;
> + } else {
> + struct softnet *sn;
> +
> + sn = &softnets[m->m_pkthdr.ph_softidx - 1];
> + ro = &sn->sn_route;
> + }
> + switch(in_ouraddr(m, ifp, ro, flags)) {
> case 2:
> goto bad;
> case 1:
> @@ -614,15 +622,17 @@ ip_input_if(struct mbuf **mp, int *offp,
> }
> #endif /* IPSEC */
>
> - ip_forward(m, ifp, &ro, flags);
> + ip_forward(m, ifp, ro, flags);
> *mp = NULL;
> - rtfree(ro.ro_rt);
> + if (ro == &iproute)
> + rtfree(ro->ro_rt);
> return IPPROTO_DONE;
> bad:
> nxt = IPPROTO_DONE;
> m_freemp(mp);
> out:
> - rtfree(ro.ro_rt);
> + if (ro == &iproute)
> + rtfree(ro->ro_rt);
> return nxt;
> }
>
> Index: netinet6/ip6_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v
> diff -u -p -r1.267 ip6_input.c
> --- netinet6/ip6_input.c 21 Nov 2024 20:15:44 -0000 1.267
> +++ netinet6/ip6_input.c 17 Feb 2025 21:03:22 -0000
> @@ -362,7 +362,7 @@ bad:
> int
> ip6_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp)
> {
> - struct route ro;
> + struct route iproute, *ro = NULL;
> struct mbuf *m;
> struct ip6_hdr *ip6;
> struct rtentry *rt;
> @@ -375,7 +375,6 @@ ip6_input_if(struct mbuf **mp, int *offp
>
> KASSERT(*offp == 0);
>
> - ro.ro_rt = NULL;
> ip6stat_inc(ip6s_total);
> m = *mp = ipv6_check(ifp, *mp);
> if (m == NULL)
> @@ -533,7 +532,18 @@ ip6_input_if(struct mbuf **mp, int *offp
> /*
> * Unicast check
> */
> - rt = route6_mpath(&ro, &ip6->ip6_dst, &ip6->ip6_src,
> + if ((*mp)->m_pkthdr.ph_cookie == NULL) {
> + ro = &iproute;
> + ro->ro_rt = NULL;
> + } else {
> + struct softnet *sn;
> +
> + sn = (*mp)->m_pkthdr.ph_cookie;
> + /* sanity check that noone else uses mbuf cookie */
> + KASSERT(sn >= softnets && sn < softnets + sizeof(softnets));
> + ro = &sn->sn_route;
> + }
> + rt = route6_mpath(ro, &ip6->ip6_dst, &ip6->ip6_src,
> m->m_pkthdr.ph_rtableid);
>
> /*
> @@ -631,15 +641,17 @@ ip6_input_if(struct mbuf **mp, int *offp
> }
> #endif /* IPSEC */
>
> - ip6_forward(m, &ro, flags);
> + ip6_forward(m, ro, flags);
> *mp = NULL;
> - rtfree(ro.ro_rt);
> + if (ro == &iproute)
> + rtfree(ro->ro_rt);
> return IPPROTO_DONE;
> bad:
> nxt = IPPROTO_DONE;
> m_freemp(mp);
> out:
> - rtfree(ro.ro_rt);
> + if (ro == &iproute)
> + rtfree(ro->ro_rt);
> return nxt;
> }
>
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.265 mbuf.h
> --- sys/mbuf.h 5 Nov 2024 13:15:13 -0000 1.265
> +++ sys/mbuf.h 17 Feb 2025 22:06:09 -0000
> @@ -122,13 +122,13 @@ struct pkthdr_pf {
> #endif
>
> /* record/packet header in first mbuf of chain; valid if M_PKTHDR set */
> -struct pkthdr {
> +struct pkthdr {
> void *ph_cookie; /* additional data */
> SLIST_HEAD(, m_tag) ph_tags; /* list of packet tags */
> int64_t ph_timestamp; /* packet timestamp */
> int len; /* total packet length */
> u_int ph_rtableid; /* routing table id */
> - u_int ph_ifidx; /* rcv interface index */
> + u_int16_t ph_ifidx; /* rcv interface index */
> u_int16_t ph_tagsset; /* mtags attached */
> u_int16_t ph_flowid; /* pseudo unique flow id */
> u_int16_t csum_flags; /* checksum flags */
> @@ -136,6 +136,8 @@ struct pkthdr {
> u_int16_t ph_mss; /* TCP max segment size */
> u_int8_t ph_loopcnt; /* mbuf is looping in kernel */
> u_int8_t ph_family; /* af, used when queueing */
> + u_int8_t ph_softidx; /* index of softnet thread */
> + u_int8_t ph_pad[1];
> struct pkthdr_pf pf;
> };
>
>
route cache per softnet thread