From: David Gwynne Subject: pf_route simplification, or pf_route vs pfsync To: tech@openbsd.org Date: Sat, 6 Jul 2024 16:06:16 +1000 at work i have a pair of firewalls with pfsync set up, and we have pfsync defer enabled so we cope better in when routes flop between them. i was trying to use route-to in a rule like this: pass in quick proto { tcp udp } from $mydesktop to $rns port domain route-to $newrns the $rns addresses are aliases on the resolver boxes, so i just needed to force the packets to go to the new server without actually rewriting them. this didnt work as expeted. the packet leaving the firewall always went to the original resolver. the weird thing was that for tcp connections the first packet went to the original resolver which would ack the connection attempt, but the firewall would send subsequent packets packet to th new resolvers. this also didnt work. i figured out it was a bad interaction between pfsync defer and route-to. route-to steals packets leaving the firewall and holds onto them until the peer pfsync box acks the state creation or a timeout occurs. if we're using route-to on a "pass in" rule, then pf_route has done it's thing. when it comes time for pfsync to send the packet out, it's lost the work that pf_route did to change the outgoing gateway/interface and assumes it can route it normally. i tried to fix this by using the state links to look at the incoming state and using that to rerun pf_route. however, this doesnt work because we don't really have links between pf states, we end up with links between pf_state_keys (see "link mbufs/inpcbs to pf_states, not pf_state_keys" sent to this list last year), so resolving the incoming state is more complicated than it should be. on top of that, these links don't get set up properly on the first packets in the state, and even if they were set up pf tears them down by the time pfsync steals the packet. my second attempt was to gut pf_route and have it jus add an mbuf tag with the dst address i want to use instead of the real ip. the actual ip stack is then modified to use the tag for the real route lookups. in my opinion this turned out really well. it's long bothered me that pf_route and pf_route6 are a large duplication of the ip_output code. there's been cases where we forget to update pf_route with new stack features. this problem goes away if we just use the stack for everything. a big chunk of code just goes away. pfsync defer with route-to works too. ive been running this in production since the end of april without issue. if there's a performance hit i haven't noticed it. mbuf tag lookups are cheap since henning added what's basically a bloom filter for them to mbufs. ok? Index: net/if_pfsync.c =================================================================== RCS file: /cvs/src/sys/net/if_pfsync.c,v diff -u -p -r1.326 if_pfsync.c --- net/if_pfsync.c 24 May 2024 06:38:41 -0000 1.326 +++ net/if_pfsync.c 6 Jul 2024 05:16:51 -0000 @@ -2068,13 +2068,16 @@ pfsync_deferrals_task(void *arg) static void pfsync_defer_output(struct pfsync_deferral *pd) { - struct pf_pdesc pdesc; struct pf_state *st = pd->pd_st; + struct mbuf *m = pd->pd_m; + + if (st->rt) { + struct pf_pdesc pdesc; - if (st->rt == PF_ROUTETO) { if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af, - st->direction, NULL, pd->pd_m, NULL) != PF_PASS) - return; + st->direction, NULL, m, NULL) != PF_PASS) + goto done; + switch (st->key[PF_SK_WIRE]->af) { case AF_INET: pf_route(&pdesc, st); @@ -2087,26 +2090,27 @@ pfsync_defer_output(struct pfsync_deferr default: unhandled_af(st->key[PF_SK_WIRE]->af); } - pd->pd_m = pdesc.m; - } else { - switch (st->key[PF_SK_WIRE]->af) { - case AF_INET: - ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0); - break; + m = pdesc.m; + + if (m == NULL) + goto done; + } + + switch (st->key[PF_SK_WIRE]->af) { + case AF_INET: + ip_output(m, NULL, NULL, 0, NULL, NULL, 0); + break; #ifdef INET6 - case AF_INET6: - ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL); - break; + case AF_INET6: + ip6_output(m, NULL, NULL, 0, NULL, NULL); + break; #endif /* INET6 */ - default: - unhandled_af(st->key[PF_SK_WIRE]->af); - } - - pd->pd_m = NULL; + default: + unhandled_af(st->key[PF_SK_WIRE]->af); } +done: pf_state_unref(st); - m_freem(pd->pd_m); pool_put(&pfsync_deferrals_pool, pd); } Index: net/pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v diff -u -p -r1.1201 pf.c --- net/pf.c 4 Jul 2024 12:50:08 -0000 1.1201 +++ net/pf.c 6 Jul 2024 05:16:51 -0000 @@ -6568,12 +6568,7 @@ void pf_route(struct pf_pdesc *pd, struct pf_state *st) { struct mbuf *m0; - struct mbuf_list ml; - struct sockaddr_in *dst, sin; - struct rtentry *rt = NULL; - struct ip *ip; - struct ifnet *ifp = NULL; - unsigned int rtableid; + struct m_tag *mtag; if (pd->m->m_pkthdr.pf.routed++ > 3) { m_freem(pd->m); @@ -6582,117 +6577,47 @@ pf_route(struct pf_pdesc *pd, struct pf_ } if (st->rt == PF_DUPTO) { - if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL) - return; - } else { - if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir)) + m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT); + if (m0 == NULL) return; + } else if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir)) + return; + else m0 = pd->m; - pd->m = NULL; - } - - if (m0->m_len < sizeof(struct ip)) { - DPFPRINTF(LOG_ERR, - "%s: m0->m_len < sizeof(struct ip)", __func__); - goto bad; - } - ip = mtod(m0, struct ip *); - - if (pd->dir == PF_IN) { - if (ip->ip_ttl <= IPTTLDEC) { - if (st->rt != PF_DUPTO) { - pf_send_icmp(m0, ICMP_TIMXCEED, - ICMP_TIMXCEED_INTRANS, 0, - pd->af, st->rule.ptr, pd->rdomain); - } - goto bad; - } - ip->ip_ttl -= IPTTLDEC; - } - - memset(&sin, 0, sizeof(sin)); - dst = &sin; - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = st->rt_addr.v4; - rtableid = m0->m_pkthdr.ph_rtableid; - - rt = rtalloc_mpath(sintosa(dst), &ip->ip_src.s_addr, rtableid); - if (!rtisvalid(rt)) { - if (st->rt != PF_DUPTO) { - pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_HOST, - 0, pd->af, st->rule.ptr, pd->rdomain); - } - ipstat_inc(ips_noroute); - goto bad; - } - - ifp = if_get(rt->rt_ifidx); - if (ifp == NULL) - goto bad; - - /* A locally generated packet may have invalid source address. */ - if ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET && - (ifp->if_flags & IFF_LOOPBACK) == 0) - ip->ip_src = ifatoia(rt->rt_ifa)->ia_addr.sin_addr; - - if (st->rt != PF_DUPTO && pd->dir == PF_IN) { - if (pf_test(AF_INET, PF_OUT, ifp, &m0) != PF_PASS) - goto bad; - else if (m0 == NULL) - goto done; - if (m0->m_len < sizeof(struct ip)) { - DPFPRINTF(LOG_ERR, - "%s: m0->m_len < sizeof(struct ip)", __func__); - goto bad; - } - ip = mtod(m0, struct ip *); + mtag = m_tag_find(m0, PACKET_TAG_PF_ROUTE, NULL); + if (mtag == NULL) { + mtag = m_tag_get(PACKET_TAG_PF_ROUTE, sizeof(st->rt_addr), + M_NOWAIT); + if (mtag == NULL) + goto drop; } - if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) || - m0 == NULL) - goto done; + *(struct pf_addr *)(mtag + 1) = st->rt_addr; + m_tag_prepend(m0, mtag); - /* - * Too large for interface; fragment if possible. - * Must be able to put at least 8 bytes per fragment. - */ - if (ip->ip_off & htons(IP_DF)) { - ipstat_inc(ips_cantfrag); - if (st->rt != PF_DUPTO) - pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, - ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain); - goto bad; - } + if (st->rt == PF_DUPTO) { + SET(m0->m_pkthdr.pf.flags, PF_TAG_GENERATED); - if (ip_fragment(m0, &ml, ifp, ifp->if_mtu) || - if_output_ml(ifp, &ml, sintosa(dst), rt)) - goto done; - ipstat_inc(ips_fragmented); + ip_output(m0, NULL, NULL, 0, NULL, NULL, 0); + } else if (pd->dir == PF_OUT) + SET(m0->m_pkthdr.pf.flags, PF_TAG_REROUTE); -done: - if_put(ifp); - rtfree(rt); return; - -bad: +drop: + if (m0 == pd->m) + pd->m = NULL; m_freem(m0); - goto done; } #ifdef INET6 -/* pf_route6() may change pd->m, adjust local copies after calling */ void pf_route6(struct pf_pdesc *pd, struct pf_state *st) { struct mbuf *m0; - struct sockaddr_in6 *dst, sin6; - struct rtentry *rt = NULL; - struct ip6_hdr *ip6; - struct ifnet *ifp = NULL; struct m_tag *mtag; - unsigned int rtableid; + +printf("%s\n", __func__); if (pd->m->m_pkthdr.pf.routed++ > 3) { m_freem(pd->m); @@ -6701,101 +6626,37 @@ pf_route6(struct pf_pdesc *pd, struct pf } if (st->rt == PF_DUPTO) { - if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL) - return; - } else { - if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir)) + m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT); + if (m0 == NULL) return; + } else if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir)) + return; + else m0 = pd->m; - pd->m = NULL; - } - if (m0->m_len < sizeof(struct ip6_hdr)) { - DPFPRINTF(LOG_ERR, - "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__); - goto bad; + mtag = m_tag_find(m0, PACKET_TAG_PF_ROUTE, NULL); + if (mtag == NULL) { + mtag = m_tag_get(PACKET_TAG_PF_ROUTE, sizeof(st->rt_addr), + M_NOWAIT); + if (mtag == NULL) + goto drop; } - ip6 = mtod(m0, struct ip6_hdr *); - if (pd->dir == PF_IN) { - if (ip6->ip6_hlim <= IPV6_HLIMDEC) { - if (st->rt != PF_DUPTO) { - pf_send_icmp(m0, ICMP6_TIME_EXCEEDED, - ICMP6_TIME_EXCEED_TRANSIT, 0, - pd->af, st->rule.ptr, pd->rdomain); - } - goto bad; - } - ip6->ip6_hlim -= IPV6_HLIMDEC; - } - - memset(&sin6, 0, sizeof(sin6)); - dst = &sin6; - dst->sin6_family = AF_INET6; - dst->sin6_len = sizeof(*dst); - dst->sin6_addr = st->rt_addr.v6; - rtableid = m0->m_pkthdr.ph_rtableid; - - rt = rtalloc_mpath(sin6tosa(dst), &ip6->ip6_src.s6_addr32[0], - rtableid); - if (!rtisvalid(rt)) { - if (st->rt != PF_DUPTO) { - pf_send_icmp(m0, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_NOROUTE, 0, - pd->af, st->rule.ptr, pd->rdomain); - } - ip6stat_inc(ip6s_noroute); - goto bad; - } - - ifp = if_get(rt->rt_ifidx); - if (ifp == NULL) - goto bad; - - /* A locally generated packet may have invalid source address. */ - if (IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && - (ifp->if_flags & IFF_LOOPBACK) == 0) - ip6->ip6_src = ifatoia6(rt->rt_ifa)->ia_addr.sin6_addr; - - if (st->rt != PF_DUPTO && pd->dir == PF_IN) { - if (pf_test(AF_INET6, PF_OUT, ifp, &m0) != PF_PASS) - goto bad; - else if (m0 == NULL) - goto done; - if (m0->m_len < sizeof(struct ip6_hdr)) { - DPFPRINTF(LOG_ERR, - "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__); - goto bad; - } - } + *(struct pf_addr *)(mtag + 1) = st->rt_addr; + m_tag_prepend(m0, mtag); - /* - * If packet has been reassembled by PF earlier, we have to - * use pf_refragment6() here to turn it back to fragments. - */ - if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) { - (void) pf_refragment6(&m0, mtag, dst, ifp, rt); - goto done; - } - - if (if_output_tso(ifp, &m0, sin6tosa(dst), rt, ifp->if_mtu) || - m0 == NULL) - goto done; + if (st->rt == PF_DUPTO) { + SET(m0->m_pkthdr.pf.flags, PF_TAG_GENERATED); - ip6stat_inc(ip6s_cantfrag); - if (st->rt != PF_DUPTO) - pf_send_icmp(m0, ICMP6_PACKET_TOO_BIG, 0, - ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain); - goto bad; + ip6_output(m0, NULL, NULL, 0, NULL, NULL); + } else if (pd->dir == PF_OUT) + SET(m0->m_pkthdr.pf.flags, PF_TAG_REROUTE); -done: - if_put(ifp); - rtfree(rt); return; - -bad: +drop: + if (m0 == pd->m) + pd->m = NULL; m_freem(m0); - goto done; } #endif /* INET6 */ @@ -8059,6 +7920,9 @@ pf_ouraddr(struct mbuf *m) { struct pf_state_key *sk; + if (ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_PF_ROUTE)) + return (0); + if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) return (1); @@ -8078,6 +7942,16 @@ pf_ouraddr(struct mbuf *m) void pf_pkt_addr_changed(struct mbuf *m) { + struct m_tag *mtag; + + mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL); + if (mtag != NULL) { + m_tag_delete(m, mtag); + + KASSERTMSG(m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL) == NULL, + "mbuf %p had multiple PACKET_TAG_PF_ROUTE mbuf tags", m); + } + pf_mbuf_unlink_state_key(m); pf_mbuf_unlink_inpcb(m); } Index: netinet/ip_output.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_output.c,v diff -u -p -r1.401 ip_output.c --- netinet/ip_output.c 2 Jul 2024 18:33:47 -0000 1.401 +++ netinet/ip_output.c 6 Jul 2024 05:16:51 -0000 @@ -110,7 +110,9 @@ ip_output(struct mbuf *m, struct mbuf *o struct sockaddr_in *dst; struct tdb *tdb = NULL; u_long mtu; + struct in_addr *rt_dst; #if NPF > 0 + struct m_tag *rt_mtag; u_int orig_rtableid; #endif @@ -128,7 +130,7 @@ ip_output(struct mbuf *m, struct mbuf *o /* * Fill in IP header. */ - if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + if (!ISSET(flags, IP_FORWARDING|IP_RAWOUTPUT)) { ip->ip_v = IPVERSION; ip->ip_off &= htons(IP_DF); ip->ip_id = htons(ip_randomid()); @@ -151,6 +153,7 @@ ip_output(struct mbuf *m, struct mbuf *o orig_rtableid = m->m_pkthdr.ph_rtableid; reroute: #endif + rt_dst = &ip->ip_dst; /* * Do a route lookup now in case we need the source address to @@ -163,11 +166,19 @@ reroute: ro->ro_rt = NULL; } +#if NPF > 0 + rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL); + if (rt_mtag != NULL) { + struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1); + rt_dst = &rt_addr->v4; + } +#endif + /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. */ - route_cache(ro, &ip->ip_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid); + route_cache(ro, rt_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid); dst = &ro->ro_dstsin; if ((IN_MULTICAST(ip->ip_dst.s_addr) || Index: netinet6/ip6_forward.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v diff -u -p -r1.120 ip6_forward.c --- netinet6/ip6_forward.c 4 Jul 2024 12:50:08 -0000 1.120 +++ netinet6/ip6_forward.c 6 Jul 2024 05:16:51 -0000 @@ -95,6 +95,10 @@ ip6_forward(struct mbuf *m, struct route struct tdb *tdb = NULL; #endif /* IPSEC */ char src6[INET6_ADDRSTRLEN], dst6[INET6_ADDRSTRLEN]; + struct in6_addr *rt_dst; +#if NPF > 0 + struct m_tag *rt_mtag; +#endif /* * Do not forward packets to multicast destination (should be handled @@ -166,11 +170,20 @@ reroute: } #endif /* IPSEC */ + rt_dst = &ip6->ip6_dst; +#if NPF > 0 + rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL); + if (rt_mtag != NULL) { + struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1); + rt_dst = &rt_addr->v6; + } +#endif + if (ro == NULL) { ro = &iproute; ro->ro_rt = NULL; } - rt = route6_mpath(ro, &ip6->ip6_dst, &ip6->ip6_src, + rt = route6_mpath(ro, rt_dst, &ip6->ip6_src, m->m_pkthdr.ph_rtableid); if (rt == NULL) { ip6stat_inc(ip6s_noroute); Index: netinet6/ip6_input.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_input.c,v diff -u -p -r1.264 ip6_input.c --- netinet6/ip6_input.c 4 Jul 2024 12:50:08 -0000 1.264 +++ netinet6/ip6_input.c 6 Jul 2024 05:16:51 -0000 @@ -364,7 +364,9 @@ ip6_input_if(struct mbuf **mp, int *offp u_int16_t src_scope, dst_scope; #if NPF > 0 struct in6_addr odst; + struct m_tag *rt_mtag; #endif + struct in6_addr *rt_dst; int flags = 0; KASSERT(*offp == 0); @@ -523,11 +525,19 @@ ip6_input_if(struct mbuf **mp, int *offp goto out; } - /* * Unicast check */ - rt = route6_mpath(&ro, &ip6->ip6_dst, &ip6->ip6_src, + rt_dst = &ip6->ip6_dst; +#if NPF > 0 + rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL); + if (rt_mtag != NULL) { + struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1); + rt_dst = &rt_addr->v6; + } +#endif + + rt = route6_mpath(&ro, rt_dst, &ip6->ip6_src, m->m_pkthdr.ph_rtableid); /* Index: netinet6/ip6_output.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_output.c,v diff -u -p -r1.292 ip6_output.c --- netinet6/ip6_output.c 4 Jul 2024 12:50:08 -0000 1.292 +++ netinet6/ip6_output.c 6 Jul 2024 05:16:51 -0000 @@ -177,6 +177,7 @@ ip6_output(struct mbuf *m, struct ip6_pk u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst; + struct in6_addr *rt_dst; struct route *ro_pmtu = NULL; int hdrsplit = 0; u_int8_t sproto = 0; @@ -184,6 +185,9 @@ ip6_output(struct mbuf *m, struct ip6_pk #ifdef IPSEC struct tdb *tdb = NULL; #endif /* IPSEC */ +#if NPF > 0 + struct m_tag *rt_mtag; +#endif ip6 = mtod(m, struct ip6_hdr *); finaldst = ip6->ip6_dst; @@ -387,6 +391,7 @@ ip6_output(struct mbuf *m, struct ip6_pk #if NPF > 0 reroute: #endif + rt_dst = &ip6->ip6_dst; /* initialize cached route */ if (ro == NULL) { @@ -456,8 +461,16 @@ reroute: ifp = if_get(im6o->im6o_ifidx); } +#if NPF > 0 + rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL); + if (rt_mtag != NULL) { + struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1); + rt_dst = &rt_addr->v6; + } +#endif + if (ifp == NULL) { - rt = in6_selectroute(&ip6->ip6_dst, opt, ro, + rt = in6_selectroute(rt_dst, opt, ro, m->m_pkthdr.ph_rtableid); if (rt == NULL) { ip6stat_inc(ip6s_noroute); @@ -480,7 +493,7 @@ reroute: goto bad; } } else { - route6_cache(ro, &ip6->ip6_dst, NULL, m->m_pkthdr.ph_rtableid); + route6_cache(ro, rt_dst, NULL, m->m_pkthdr.ph_rtableid); } if (rt && (rt->rt_flags & RTF_GATEWAY) && Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v diff -u -p -r1.263 mbuf.h --- sys/mbuf.h 14 Apr 2024 20:46:27 -0000 1.263 +++ sys/mbuf.h 6 Jul 2024 05:16:51 -0000 @@ -477,6 +478,7 @@ struct m_tag *m_tag_next(struct mbuf *, #define PACKET_TAG_GRE 0x0080 /* GRE processing done */ #define PACKET_TAG_DLT 0x0100 /* data link layer type */ #define PACKET_TAG_PF_DIVERT 0x0200 /* pf(4) diverted packet */ +#define PACKET_TAG_PF_ROUTE 0x0400 /* pf(4) route-to */ #define PACKET_TAG_PF_REASSEMBLED 0x0800 /* pf reassembled ipv6 packet */ #define PACKET_TAG_SRCROUTE 0x1000 /* IPv4 source routing options */ #define PACKET_TAG_TUNNEL 0x2000 /* Tunnel endpoint address */