Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: pf_route simplification, or pf_route vs pfsync
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
tech@openbsd.org
Date:
Thu, 11 Jul 2024 22:21:49 +1000

Download raw body.

Thread
On Tue, Jul 09, 2024 at 11:16:46AM +0200, Alexander Bluhm wrote:
> regress sys/net/pf_forward shows that it breaks path MTU discovery
> with pf route-to.

cool, thanks for trying it out.

> Unfortunately this regress test requires 4 machines and is hard to
> set up.  But it covers a bunch of corner cases.

yes...

i had another look at the code and found that the pmtu stuff is
handled by the caller of ip_output. when forwarding that's ip_forward,
which does a route lookup which i hadn't updated to use the address
from the mbuf tag pf sets. ip_forward fiddles with the route it
found, which may be where this regression you found comes from.

i think i've addressed the stuff sashan@ pointed out too.

if you could have a look at this updated diff i'd appreciate it.
if it still sucks i'll try setting up 4 machines.

> 
> bluhm
> 
> On Sat, Jul 06, 2024 at 04:06:16PM +1000, David Gwynne wrote:
> > at work i have a pair of firewalls with pfsync set up, and we have
> > pfsync defer enabled so we cope better when routes flop between
> > them.
> >
> > i was trying to use route-to in a rule like this:
> >
> >  pass in quick proto { tcp udp } from $mydesktop to $rns port domain
> > route-to $newrns
> >
> > the $rns addresses are aliases on the resolver boxes, so i just
> > needed to force the packets to go to the new server without actually
> > rewriting them.
> >
> > this didnt work as expected. the packet leaving the firewall always
> > went to the original resolver. the weird thing was that for tcp
> > connections the first packet went to the original resolver which would
> > ack the connection attempt, but the firewall would send subsequent
> > packets to the new resolvers. this also didnt work.
> >
> > i figured out it was a bad interaction between pfsync defer and
> > route-to.
> >
> > pfsync defer steals packets leaving the firewall and holds onto them
> > until the peer pfsync box acks the state creation or a timeout
> > occurs. if we're using route-to on a "pass in" rule, then pf_route
> > has done its thing. when it comes time for pfsync to send the
> > packet out, it's lost the work that pf_route did to change the
> > outgoing gateway/interface and assumes it can route it normally.
> >
> > i tried to fix this by using the state links to look at the incoming
> > state and using that to rerun pf_route. however, this doesnt work
> > because we don't really have links between pf states, we end up
> > with links between pf_state_keys (see "link mbufs/inpcbs to pf_states,
> > not pf_state_keys" sent to this list last year), so resolving the
> > incoming state is more complicated than it should be. on top of
> > that, these links don't get set up properly on the first packets
> > in the state, and even if they were set up pf tears them down by
> > the time pfsync steals the packet.
> >
> > my second attempt was to gut pf_route and have it just add an mbuf
> > tag with the dst address i want to use instead of the real ip. the
> > actual ip stack is then modified to use the tag for the real route
> > lookups.
> >
> > in my opinion this turned out really well. it's long bothered me that
> > pf_route and pf_route6 are a large duplication of the ip_output code.
> > there's been cases where we forget to update pf_route with new stack
> > features. this problem goes away if we just use the stack for
> > everything. a big chunk of code just goes away.
> >
> > pfsync defer with route-to works too.
> >
> > ive been running this in production since the end of april without
> > issue. if there's a performance hit i haven't noticed it. mbuf tag
> > lookups are cheap since henning added what's basically a bloom
> > filter for them to mbufs.
> >
> > ok?
> >

Index: net/if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
diff -u -p -r1.326 if_pfsync.c
--- net/if_pfsync.c	24 May 2024 06:38:41 -0000	1.326
+++ net/if_pfsync.c	11 Jul 2024 12:00:56 -0000
@@ -2068,13 +2068,16 @@ pfsync_deferrals_task(void *arg)
 static void
 pfsync_defer_output(struct pfsync_deferral *pd)
 {
-	struct pf_pdesc pdesc;
 	struct pf_state *st = pd->pd_st;
+	struct mbuf *m = pd->pd_m;
+
+	if (st->rt) {
+		struct pf_pdesc pdesc;
 
-	if (st->rt == PF_ROUTETO) {
 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
-		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
-			return;
+		    st->direction, NULL, m, NULL) != PF_PASS)
+			goto done;
+
 		switch (st->key[PF_SK_WIRE]->af) {
 		case AF_INET:
 			pf_route(&pdesc, st);
@@ -2087,26 +2090,27 @@ pfsync_defer_output(struct pfsync_deferr
 		default:
 			unhandled_af(st->key[PF_SK_WIRE]->af);
 		}
-		pd->pd_m = pdesc.m;
-	} else {
-		switch (st->key[PF_SK_WIRE]->af) {
-		case AF_INET:
-			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
-			break;
+		m = pdesc.m;
+
+		if (m == NULL)
+			goto done;
+	}
+
+	switch (st->key[PF_SK_WIRE]->af) {
+	case AF_INET:
+		ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+		break;
 #ifdef INET6
-		case AF_INET6:
-			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
-			break;
+	case AF_INET6:
+		ip6_output(m, NULL, NULL, 0, NULL, NULL);
+		break;
 #endif /* INET6 */
-		default:
-			unhandled_af(st->key[PF_SK_WIRE]->af);
-		}
-
-		pd->pd_m = NULL;
+	default:
+		unhandled_af(st->key[PF_SK_WIRE]->af);
 	}
 
+done:
 	pf_state_unref(st);
-	m_freem(pd->pd_m);
 	pool_put(&pfsync_deferrals_pool, pd);
 }
 
Index: net/pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
diff -u -p -r1.1201 pf.c
--- net/pf.c	4 Jul 2024 12:50:08 -0000	1.1201
+++ net/pf.c	11 Jul 2024 12:00:56 -0000
@@ -6563,17 +6563,12 @@ pf_rtlabel_match(struct pf_addr *addr, s
 	return (ret);
 }
 
-/* pf_route() may change pd->m, adjust local copies after calling */
-void
-pf_route(struct pf_pdesc *pd, struct pf_state *st)
+static void
+pf_route_af(struct pf_pdesc *pd, struct pf_state *st,
+    void (*send)(struct mbuf *))
 {
 	struct mbuf		*m0;
-	struct mbuf_list	 ml;
-	struct sockaddr_in	*dst, sin;
-	struct rtentry		*rt = NULL;
-	struct ip		*ip;
-	struct ifnet		*ifp = NULL;
-	unsigned int		 rtableid;
+	struct m_tag		*mtag;
 
 	if (pd->m->m_pkthdr.pf.routed++ > 3) {
 		m_freem(pd->m);
@@ -6582,220 +6577,49 @@ pf_route(struct pf_pdesc *pd, struct pf_
 	}
 
 	if (st->rt == PF_DUPTO) {
-		if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL)
-			return;
-	} else {
-		if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
+		m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT);
+		if (m0 == NULL)
 			return;
+	} else if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
+		return;
+	else
 		m0 = pd->m;
-		pd->m = NULL;
-	}
 
-	if (m0->m_len < sizeof(struct ip)) {
-		DPFPRINTF(LOG_ERR,
-		    "%s: m0->m_len < sizeof(struct ip)", __func__);
-		goto bad;
-	}
-
-	ip = mtod(m0, struct ip *);
-
-	if (pd->dir == PF_IN) {
-		if (ip->ip_ttl <= IPTTLDEC) {
-			if (st->rt != PF_DUPTO) {
-				pf_send_icmp(m0, ICMP_TIMXCEED,
-				    ICMP_TIMXCEED_INTRANS, 0,
-				    pd->af, st->rule.ptr, pd->rdomain);
-			}
-			goto bad;
+	mtag = m_tag_find(m0, PACKET_TAG_PF_ROUTE, NULL);
+	if (mtag == NULL) {
+		mtag = m_tag_get(PACKET_TAG_PF_ROUTE, sizeof(st->rt_addr),
+		    M_NOWAIT);
+		if (mtag == NULL) {
+			if (m0 == pd->m)
+				pd->m = NULL;
+			m_freem(m0);
+			return;
 		}
-		ip->ip_ttl -= IPTTLDEC;
-	}
-
-	memset(&sin, 0, sizeof(sin));
-	dst = &sin;
-	dst->sin_family = AF_INET;
-	dst->sin_len = sizeof(*dst);
-	dst->sin_addr = st->rt_addr.v4;
-	rtableid = m0->m_pkthdr.ph_rtableid;
 
-	rt = rtalloc_mpath(sintosa(dst), &ip->ip_src.s_addr, rtableid);
-	if (!rtisvalid(rt)) {
-		if (st->rt != PF_DUPTO) {
-			pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_HOST,
-			    0, pd->af, st->rule.ptr, pd->rdomain);
-		}
-		ipstat_inc(ips_noroute);
-		goto bad;
+		m_tag_prepend(m0, mtag);
 	}
 
-	ifp = if_get(rt->rt_ifidx);
-	if (ifp == NULL)
-		goto bad;
-
-	/* A locally generated packet may have invalid source address. */
-	if ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET &&
-	    (ifp->if_flags & IFF_LOOPBACK) == 0)
-		ip->ip_src = ifatoia(rt->rt_ifa)->ia_addr.sin_addr;
+	*(struct pf_addr *)(mtag + 1) = st->rt_addr;
 
-	if (st->rt != PF_DUPTO && pd->dir == PF_IN) {
-		if (pf_test(AF_INET, PF_OUT, ifp, &m0) != PF_PASS)
-			goto bad;
-		else if (m0 == NULL)
-			goto done;
-		if (m0->m_len < sizeof(struct ip)) {
-			DPFPRINTF(LOG_ERR,
-			    "%s: m0->m_len < sizeof(struct ip)", __func__);
-			goto bad;
-		}
-		ip = mtod(m0, struct ip *);
-	}
-
-	if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) ||
-	    m0 == NULL)
-		goto done;
-
-	/*
-	 * Too large for interface; fragment if possible.
-	 * Must be able to put at least 8 bytes per fragment.
-	 */
-	if (ip->ip_off & htons(IP_DF)) {
-		ipstat_inc(ips_cantfrag);
-		if (st->rt != PF_DUPTO)
-			pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
-			    ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain);
-		goto bad;
-	}
-
-	if (ip_fragment(m0, &ml, ifp, ifp->if_mtu) ||
-	    if_output_ml(ifp, &ml, sintosa(dst), rt))
-		goto done;
-	ipstat_inc(ips_fragmented);
+	if (st->rt == PF_DUPTO) {
+		SET(m0->m_pkthdr.pf.flags, PF_TAG_GENERATED);
+		(*send)(m0);
+	} else if (pd->dir == PF_OUT)
+		SET(m0->m_pkthdr.pf.flags, PF_TAG_REROUTE);
+}
 
-done:
-	if_put(ifp);
-	rtfree(rt);
-	return;
-
-bad:
-	m_freem(m0);
-	goto done;
+/* pf_route() may change pd->m, adjust local copies after calling */
+void
+pf_route(struct pf_pdesc *pd, struct pf_state *st)
+{
+	pf_route_af(pd, st, ip_send);
 }
 
 #ifdef INET6
-/* pf_route6() may change pd->m, adjust local copies after calling */
 void
 pf_route6(struct pf_pdesc *pd, struct pf_state *st)
 {
-	struct mbuf		*m0;
-	struct sockaddr_in6	*dst, sin6;
-	struct rtentry		*rt = NULL;
-	struct ip6_hdr		*ip6;
-	struct ifnet		*ifp = NULL;
-	struct m_tag		*mtag;
-	unsigned int		 rtableid;
-
-	if (pd->m->m_pkthdr.pf.routed++ > 3) {
-		m_freem(pd->m);
-		pd->m = NULL;
-		return;
-	}
-
-	if (st->rt == PF_DUPTO) {
-		if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL)
-			return;
-	} else {
-		if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
-			return;
-		m0 = pd->m;
-		pd->m = NULL;
-	}
-
-	if (m0->m_len < sizeof(struct ip6_hdr)) {
-		DPFPRINTF(LOG_ERR,
-		    "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__);
-		goto bad;
-	}
-	ip6 = mtod(m0, struct ip6_hdr *);
-
-	if (pd->dir == PF_IN) {
-		if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
-			if (st->rt != PF_DUPTO) {
-				pf_send_icmp(m0, ICMP6_TIME_EXCEEDED,
-				    ICMP6_TIME_EXCEED_TRANSIT, 0,
-				    pd->af, st->rule.ptr, pd->rdomain);
-			}
-			goto bad;
-		}
-		ip6->ip6_hlim -= IPV6_HLIMDEC;
-	}
-
-	memset(&sin6, 0, sizeof(sin6));
-	dst = &sin6;
-	dst->sin6_family = AF_INET6;
-	dst->sin6_len = sizeof(*dst);
-	dst->sin6_addr = st->rt_addr.v6;
-	rtableid = m0->m_pkthdr.ph_rtableid;
-
-	rt = rtalloc_mpath(sin6tosa(dst), &ip6->ip6_src.s6_addr32[0],
-	    rtableid);
-	if (!rtisvalid(rt)) {
-		if (st->rt != PF_DUPTO) {
-			pf_send_icmp(m0, ICMP6_DST_UNREACH,
-			    ICMP6_DST_UNREACH_NOROUTE, 0,
-			    pd->af, st->rule.ptr, pd->rdomain);
-		}
-		ip6stat_inc(ip6s_noroute);
-		goto bad;
-	}
-
-	ifp = if_get(rt->rt_ifidx);
-	if (ifp == NULL)
-		goto bad;
-
-	/* A locally generated packet may have invalid source address. */
-	if (IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
-	    (ifp->if_flags & IFF_LOOPBACK) == 0)
-		ip6->ip6_src = ifatoia6(rt->rt_ifa)->ia_addr.sin6_addr;
-
-	if (st->rt != PF_DUPTO && pd->dir == PF_IN) {
-		if (pf_test(AF_INET6, PF_OUT, ifp, &m0) != PF_PASS)
-			goto bad;
-		else if (m0 == NULL)
-			goto done;
-		if (m0->m_len < sizeof(struct ip6_hdr)) {
-			DPFPRINTF(LOG_ERR,
-			    "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__);
-			goto bad;
-		}
-	}
-
-	/*
-	 * If packet has been reassembled by PF earlier, we have to
-	 * use pf_refragment6() here to turn it back to fragments.
-	 */
-	if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) {
-		(void) pf_refragment6(&m0, mtag, dst, ifp, rt);
-		goto done;
-	}
-
-	if (if_output_tso(ifp, &m0, sin6tosa(dst), rt, ifp->if_mtu) ||
-	    m0 == NULL)
-		goto done;
-
-	ip6stat_inc(ip6s_cantfrag);
-	if (st->rt != PF_DUPTO)
-		pf_send_icmp(m0, ICMP6_PACKET_TOO_BIG, 0,
-		    ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain);
-	goto bad;
-
-done:
-	if_put(ifp);
-	rtfree(rt);
-	return;
-
-bad:
-	m_freem(m0);
-	goto done;
+	pf_route_af(pd, st, ip6_send);
 }
 #endif /* INET6 */
 
@@ -8059,6 +7883,9 @@ pf_ouraddr(struct mbuf *m)
 {
 	struct pf_state_key	*sk;
 
+	if (ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_PF_ROUTE))
+		return (0); 
+
 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED)
 		return (1);
 
@@ -8078,6 +7905,16 @@ pf_ouraddr(struct mbuf *m)
 void
 pf_pkt_addr_changed(struct mbuf *m)
 {
+	struct m_tag *mtag;
+
+	mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (mtag != NULL) {
+		m_tag_delete(m, mtag);
+
+		KASSERTMSG(m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL) == NULL,
+		    "mbuf %p had multiple PACKET_TAG_PF_ROUTE mbuf tags", m);
+	}
+
 	pf_mbuf_unlink_state_key(m);
 	pf_mbuf_unlink_inpcb(m);
 }
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
diff -u -p -r1.397 ip_input.c
--- netinet/ip_input.c	2 Jul 2024 18:33:47 -0000	1.397
+++ netinet/ip_input.c	11 Jul 2024 12:00:56 -0000
@@ -1550,6 +1550,10 @@ ip_forward(struct mbuf *m, struct ifnet 
 	struct mbuf *mcopy;
 	int error = 0, type = 0, code = 0, destmtu = 0;
 	u_int32_t dest;
+	struct in_addr *rt_dst;
+#if NPF > 0
+	struct m_tag *rt_mtag;
+#endif
 
 	dest = 0;
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
@@ -1561,12 +1565,21 @@ ip_forward(struct mbuf *m, struct ifnet 
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
 		goto done;
 	}
+	rt_dst = &ip->ip_dst;
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v4;
+		SET(flags, IP_REDIRECT);
+	}
+#endif
 
 	if (ro == NULL) {
 		ro = &iproute;
 		ro->ro_rt = NULL;
 	}
-	rt = route_mpath(ro, &ip->ip_dst, &ip->ip_src, rtableid);
+	rt = route_mpath(ro, rt_dst, &ip->ip_src, rtableid);
 	if (rt == NULL) {
 		ipstat_inc(ips_noroute);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
Index: netinet/ip_output.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_output.c,v
diff -u -p -r1.401 ip_output.c
--- netinet/ip_output.c	2 Jul 2024 18:33:47 -0000	1.401
+++ netinet/ip_output.c	11 Jul 2024 12:00:56 -0000
@@ -110,7 +110,9 @@ ip_output(struct mbuf *m, struct mbuf *o
 	struct sockaddr_in *dst;
 	struct tdb *tdb = NULL;
 	u_long mtu;
+	struct in_addr *rt_dst;
 #if NPF > 0
+	struct m_tag *rt_mtag;
 	u_int orig_rtableid;
 #endif
 
@@ -128,7 +130,7 @@ ip_output(struct mbuf *m, struct mbuf *o
 	/*
 	 * Fill in IP header.
 	 */
-	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
+	if (!ISSET(flags, IP_FORWARDING|IP_RAWOUTPUT)) {
 		ip->ip_v = IPVERSION;
 		ip->ip_off &= htons(IP_DF);
 		ip->ip_id = htons(ip_randomid());
@@ -151,6 +153,7 @@ ip_output(struct mbuf *m, struct mbuf *o
 	orig_rtableid = m->m_pkthdr.ph_rtableid;
 reroute:
 #endif
+	rt_dst = &ip->ip_dst;
 
 	/*
 	 * Do a route lookup now in case we need the source address to
@@ -163,11 +166,19 @@ reroute:
 		ro->ro_rt = NULL;
 	}
 
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v4;
+	}
+#endif
+
 	/*
 	 * If there is a cached route, check that it is to the same
 	 * destination and is still up.  If not, free it and try again.
 	 */
-	route_cache(ro, &ip->ip_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid);
+	route_cache(ro, rt_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid);
 	dst = &ro->ro_dstsin;
 
 	if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
Index: netinet6/ip6_forward.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v
diff -u -p -r1.120 ip6_forward.c
--- netinet6/ip6_forward.c	4 Jul 2024 12:50:08 -0000	1.120
+++ netinet6/ip6_forward.c	11 Jul 2024 12:00:56 -0000
@@ -95,6 +95,10 @@ ip6_forward(struct mbuf *m, struct route
 	struct tdb *tdb = NULL;
 #endif /* IPSEC */
 	char src6[INET6_ADDRSTRLEN], dst6[INET6_ADDRSTRLEN];
+	struct in6_addr *rt_dst;
+#if NPF > 0
+	struct m_tag *rt_mtag;
+#endif
 
 	/*
 	 * Do not forward packets to multicast destination (should be handled
@@ -166,11 +170,20 @@ reroute:
 	}
 #endif /* IPSEC */
 
+	rt_dst = &ip6->ip6_dst;
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v6;
+	}
+#endif
+
 	if (ro == NULL) {
 		ro = &iproute;
 		ro->ro_rt = NULL;
 	}
-	rt = route6_mpath(ro, &ip6->ip6_dst, &ip6->ip6_src,
+	rt = route6_mpath(ro, rt_dst, &ip6->ip6_src,
 	    m->m_pkthdr.ph_rtableid);
 	if (rt == NULL) {
 		ip6stat_inc(ip6s_noroute);
Index: netinet6/ip6_input.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
diff -u -p -r1.264 ip6_input.c
--- netinet6/ip6_input.c	4 Jul 2024 12:50:08 -0000	1.264
+++ netinet6/ip6_input.c	11 Jul 2024 12:00:56 -0000
@@ -364,7 +364,9 @@ ip6_input_if(struct mbuf **mp, int *offp
 	u_int16_t src_scope, dst_scope;
 #if NPF > 0
 	struct in6_addr odst;
+	struct m_tag *rt_mtag;
 #endif
+	struct in6_addr *rt_dst;
 	int flags = 0;
 
 	KASSERT(*offp == 0);
@@ -523,11 +525,19 @@ ip6_input_if(struct mbuf **mp, int *offp
 		goto out;
 	}
 
-
 	/*
 	 *  Unicast check
 	 */
-	rt = route6_mpath(&ro, &ip6->ip6_dst, &ip6->ip6_src,
+	rt_dst = &ip6->ip6_dst;
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v6;
+	}
+#endif
+
+	rt = route6_mpath(&ro, rt_dst, &ip6->ip6_src,
 	    m->m_pkthdr.ph_rtableid);
 
 	/*
Index: netinet6/ip6_output.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_output.c,v
diff -u -p -r1.292 ip6_output.c
--- netinet6/ip6_output.c	4 Jul 2024 12:50:08 -0000	1.292
+++ netinet6/ip6_output.c	11 Jul 2024 12:00:56 -0000
@@ -177,6 +177,7 @@ ip6_output(struct mbuf *m, struct ip6_pk
 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr finaldst;
+	struct in6_addr *rt_dst;
 	struct route *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	u_int8_t sproto = 0;
@@ -184,6 +185,9 @@ ip6_output(struct mbuf *m, struct ip6_pk
 #ifdef IPSEC
 	struct tdb *tdb = NULL;
 #endif /* IPSEC */
+#if NPF > 0
+	struct m_tag *rt_mtag;
+#endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	finaldst = ip6->ip6_dst;
@@ -387,6 +391,7 @@ ip6_output(struct mbuf *m, struct ip6_pk
 #if NPF > 0
 reroute:
 #endif
+	rt_dst = &ip6->ip6_dst;
 
 	/* initialize cached route */
 	if (ro == NULL) {
@@ -456,8 +461,16 @@ reroute:
 			ifp = if_get(im6o->im6o_ifidx);
 	}
 
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v6;
+	}
+#endif
+
 	if (ifp == NULL) {
-		rt = in6_selectroute(&ip6->ip6_dst, opt, ro,
+		rt = in6_selectroute(rt_dst, opt, ro,
 		    m->m_pkthdr.ph_rtableid);
 		if (rt == NULL) {
 			ip6stat_inc(ip6s_noroute);
@@ -480,7 +493,7 @@ reroute:
 			goto bad;
 		}
 	} else {
-		route6_cache(ro, &ip6->ip6_dst, NULL, m->m_pkthdr.ph_rtableid);
+		route6_cache(ro, rt_dst, NULL, m->m_pkthdr.ph_rtableid);
 	}
 
 	if (rt && (rt->rt_flags & RTF_GATEWAY) &&
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.263 mbuf.h
--- sys/mbuf.h	14 Apr 2024 20:46:27 -0000	1.263
+++ sys/mbuf.h	11 Jul 2024 12:00:57 -0000
@@ -477,6 +478,7 @@ struct m_tag *m_tag_next(struct mbuf *, 
 #define PACKET_TAG_GRE			0x0080  /* GRE processing done */
 #define PACKET_TAG_DLT			0x0100 /* data link layer type */
 #define PACKET_TAG_PF_DIVERT		0x0200 /* pf(4) diverted packet */
+#define PACKET_TAG_PF_ROUTE		0x0400 /* pf(4) route-to */
 #define PACKET_TAG_PF_REASSEMBLED	0x0800 /* pf reassembled ipv6 packet */
 #define PACKET_TAG_SRCROUTE		0x1000 /* IPv4 source routing options */
 #define PACKET_TAG_TUNNEL		0x2000	/* Tunnel endpoint address */