Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: pf_route simplification, or pf_route vs pfsync
To:
David Gwynne <david@gwynne.id.au>
Cc:
tech@openbsd.org
Date:
Tue, 9 Jul 2024 11:16:46 +0200

Download raw body.

Thread
regress sys/net/pf_forward shows that it breaks path MTU discovery
with pf route-to.

Unfortunately this regress test requires 4 machines and is hard to
set up.  But it covers a bunch of corner cases.

bluhm

On Sat, Jul 06, 2024 at 04:06:16PM +1000, David Gwynne wrote:
> at work i have a pair of firewalls with pfsync set up, and we have
> pfsync defer enabled so we cope better when routes flop between
> them.
> 
> i was trying to use route-to in a rule like this:
> 
>  pass in quick proto { tcp udp } from $mydesktop to $rns port domain
> route-to $newrns
> 
> the $rns addresses are aliases on the resolver boxes, so i just
> needed to force the packets to go to the new server without actually
> rewriting them.
> 
> this didnt work as expected. the packet leaving the firewall always
> went to the original resolver. the weird thing was that for tcp
> connections the first packet went to the original resolver which would
> ack the connection attempt, but the firewall would send subsequent
> packets to the new resolvers. this also didnt work.
> 
> i figured out it was a bad interaction between pfsync defer and
> route-to.
> 
> pfsync defer steals packets leaving the firewall and holds onto them
> until the peer pfsync box acks the state creation or a timeout
> occurs. if we're using route-to on a "pass in" rule, then pf_route
> has done its thing. when it comes time for pfsync to send the
> packet out, it's lost the work that pf_route did to change the
> outgoing gateway/interface and assumes it can route it normally.
> 
> i tried to fix this by using the state links to look at the incoming
> state and using that to rerun pf_route. however, this doesnt work
> because we don't really have links between pf states, we end up
> with links between pf_state_keys (see "link mbufs/inpcbs to pf_states,
> not pf_state_keys" sent to this list last year), so resolving the
> incoming state is more complicated than it should be. on top of
> that, these links don't get set up properly on the first packets
> in the state, and even if they were set up pf tears them down by
> the time pfsync steals the packet.
> 
> my second attempt was to gut pf_route and have it just add an mbuf
> tag with the dst address i want to use instead of the real ip. the
> actual ip stack is then modified to use the tag for the real route
> lookups.
> 
> in my opinion this turned out really well. it's long bothered me that
> pf_route and pf_route6 are a large duplication of the ip_output code.
> there have been cases where we forget to update pf_route with new stack
> features. this problem goes away if we just use the stack for
> everything. a big chunk of code just goes away.
> 
> pfsync defer with route-to works too.
> 
> ive been running this in production since the end of april without
> issue. if there's a performance hit i haven't noticed it. mbuf tag
> lookups are cheap since henning added what's basically a bloom
> filter for them to mbufs.
> 
> ok?
> 
> Index: net/if_pfsync.c
> ===================================================================
> RCS file: /cvs/src/sys/net/if_pfsync.c,v
> diff -u -p -r1.326 if_pfsync.c
> --- net/if_pfsync.c	24 May 2024 06:38:41 -0000	1.326
> +++ net/if_pfsync.c	6 Jul 2024 05:16:51 -0000
> @@ -2068,13 +2068,16 @@ pfsync_deferrals_task(void *arg)
>  static void
>  pfsync_defer_output(struct pfsync_deferral *pd)
>  {
> -	struct pf_pdesc pdesc;
>  	struct pf_state *st = pd->pd_st;
> +	struct mbuf *m = pd->pd_m;
> +
> +	if (st->rt) {
> +		struct pf_pdesc pdesc;
>  
> -	if (st->rt == PF_ROUTETO) {
>  		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
> -		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
> -			return;
> +		    st->direction, NULL, m, NULL) != PF_PASS)
> +			goto done;
> +
>  		switch (st->key[PF_SK_WIRE]->af) {
>  		case AF_INET:
>  			pf_route(&pdesc, st);
> @@ -2087,26 +2090,27 @@ pfsync_defer_output(struct pfsync_deferr
>  		default:
>  			unhandled_af(st->key[PF_SK_WIRE]->af);
>  		}
> -		pd->pd_m = pdesc.m;
> -	} else {
> -		switch (st->key[PF_SK_WIRE]->af) {
> -		case AF_INET:
> -			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
> -			break;
> +		m = pdesc.m;
> +
> +		if (m == NULL)
> +			goto done;
> +	}
> +
> +	switch (st->key[PF_SK_WIRE]->af) {
> +	case AF_INET:
> +		ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
> +		break;
>  #ifdef INET6
> -		case AF_INET6:
> -			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
> -			break;
> +	case AF_INET6:
> +		ip6_output(m, NULL, NULL, 0, NULL, NULL);
> +		break;
>  #endif /* INET6 */
> -		default:
> -			unhandled_af(st->key[PF_SK_WIRE]->af);
> -		}
> -
> -		pd->pd_m = NULL;
> +	default:
> +		unhandled_af(st->key[PF_SK_WIRE]->af);
>  	}
>  
> +done:
>  	pf_state_unref(st);
> -	m_freem(pd->pd_m);
>  	pool_put(&pfsync_deferrals_pool, pd);
>  }
>  
> Index: net/pf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf.c,v
> diff -u -p -r1.1201 pf.c
> --- net/pf.c	4 Jul 2024 12:50:08 -0000	1.1201
> +++ net/pf.c	6 Jul 2024 05:16:51 -0000
> @@ -6568,12 +6568,7 @@ void
>  pf_route(struct pf_pdesc *pd, struct pf_state *st)
>  {
>  	struct mbuf		*m0;
> -	struct mbuf_list	 ml;
> -	struct sockaddr_in	*dst, sin;
> -	struct rtentry		*rt = NULL;
> -	struct ip		*ip;
> -	struct ifnet		*ifp = NULL;
> -	unsigned int		 rtableid;
> +	struct m_tag		*mtag;
>  
>  	if (pd->m->m_pkthdr.pf.routed++ > 3) {
>  		m_freem(pd->m);
> @@ -6582,117 +6577,47 @@ pf_route(struct pf_pdesc *pd, struct pf_
>  	}
>  
>  	if (st->rt == PF_DUPTO) {
> -		if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL)
> -			return;
> -	} else {
> -		if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
> +		m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT);
> +		if (m0 == NULL)
>  			return;
> +	} else if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
> +		return;
> +	else
>  		m0 = pd->m;
> -		pd->m = NULL;
> -	}
> -
> -	if (m0->m_len < sizeof(struct ip)) {
> -		DPFPRINTF(LOG_ERR,
> -		    "%s: m0->m_len < sizeof(struct ip)", __func__);
> -		goto bad;
> -	}
>  
> -	ip = mtod(m0, struct ip *);
> -
> -	if (pd->dir == PF_IN) {
> -		if (ip->ip_ttl <= IPTTLDEC) {
> -			if (st->rt != PF_DUPTO) {
> -				pf_send_icmp(m0, ICMP_TIMXCEED,
> -				    ICMP_TIMXCEED_INTRANS, 0,
> -				    pd->af, st->rule.ptr, pd->rdomain);
> -			}
> -			goto bad;
> -		}
> -		ip->ip_ttl -= IPTTLDEC;
> -	}
> -
> -	memset(&sin, 0, sizeof(sin));
> -	dst = &sin;
> -	dst->sin_family = AF_INET;
> -	dst->sin_len = sizeof(*dst);
> -	dst->sin_addr = st->rt_addr.v4;
> -	rtableid = m0->m_pkthdr.ph_rtableid;
> -
> -	rt = rtalloc_mpath(sintosa(dst), &ip->ip_src.s_addr, rtableid);
> -	if (!rtisvalid(rt)) {
> -		if (st->rt != PF_DUPTO) {
> -			pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_HOST,
> -			    0, pd->af, st->rule.ptr, pd->rdomain);
> -		}
> -		ipstat_inc(ips_noroute);
> -		goto bad;
> -	}
> -
> -	ifp = if_get(rt->rt_ifidx);
> -	if (ifp == NULL)
> -		goto bad;
> -
> -	/* A locally generated packet may have invalid source address. */
> -	if ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET &&
> -	    (ifp->if_flags & IFF_LOOPBACK) == 0)
> -		ip->ip_src = ifatoia(rt->rt_ifa)->ia_addr.sin_addr;
> -
> -	if (st->rt != PF_DUPTO && pd->dir == PF_IN) {
> -		if (pf_test(AF_INET, PF_OUT, ifp, &m0) != PF_PASS)
> -			goto bad;
> -		else if (m0 == NULL)
> -			goto done;
> -		if (m0->m_len < sizeof(struct ip)) {
> -			DPFPRINTF(LOG_ERR,
> -			    "%s: m0->m_len < sizeof(struct ip)", __func__);
> -			goto bad;
> -		}
> -		ip = mtod(m0, struct ip *);
> +	mtag = m_tag_find(m0, PACKET_TAG_PF_ROUTE, NULL);
> +	if (mtag == NULL) {
> +		mtag = m_tag_get(PACKET_TAG_PF_ROUTE, sizeof(st->rt_addr),
> +		    M_NOWAIT);
> +		if (mtag == NULL)
> +			goto drop;
>  	}
>  
> -	if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) ||
> -	    m0 == NULL)
> -		goto done;
> +	*(struct pf_addr *)(mtag + 1) = st->rt_addr;
> +	m_tag_prepend(m0, mtag);
>  
> -	/*
> -	 * Too large for interface; fragment if possible.
> -	 * Must be able to put at least 8 bytes per fragment.
> -	 */
> -	if (ip->ip_off & htons(IP_DF)) {
> -		ipstat_inc(ips_cantfrag);
> -		if (st->rt != PF_DUPTO)
> -			pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
> -			    ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain);
> -		goto bad;
> -	}
> +	if (st->rt == PF_DUPTO) {
> +		SET(m0->m_pkthdr.pf.flags, PF_TAG_GENERATED);
>  
> -	if (ip_fragment(m0, &ml, ifp, ifp->if_mtu) ||
> -	    if_output_ml(ifp, &ml, sintosa(dst), rt))
> -		goto done;
> -	ipstat_inc(ips_fragmented);
> +		ip_output(m0, NULL, NULL, 0, NULL, NULL, 0);
> +	} else if (pd->dir == PF_OUT)
> +		SET(m0->m_pkthdr.pf.flags, PF_TAG_REROUTE);
>  
> -done:
> -	if_put(ifp);
> -	rtfree(rt);
>  	return;
> -
> -bad:
> +drop:
> +	if (m0 == pd->m)
> +		pd->m = NULL;
>  	m_freem(m0);
> -	goto done;
>  }
>  
>  #ifdef INET6
> -/* pf_route6() may change pd->m, adjust local copies after calling */
>  void
>  pf_route6(struct pf_pdesc *pd, struct pf_state *st)
>  {
>  	struct mbuf		*m0;
> -	struct sockaddr_in6	*dst, sin6;
> -	struct rtentry		*rt = NULL;
> -	struct ip6_hdr		*ip6;
> -	struct ifnet		*ifp = NULL;
>  	struct m_tag		*mtag;
> -	unsigned int		 rtableid;
> +
> +printf("%s\n", __func__);
>  
>  	if (pd->m->m_pkthdr.pf.routed++ > 3) {
>  		m_freem(pd->m);
> @@ -6701,101 +6626,37 @@ pf_route6(struct pf_pdesc *pd, struct pf
>  	}
>  
>  	if (st->rt == PF_DUPTO) {
> -		if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL)
> -			return;
> -	} else {
> -		if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
> +		m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT);
> +		if (m0 == NULL)
>  			return;
> +	} else if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
> +		return;
> +	else
>  		m0 = pd->m;
> -		pd->m = NULL;
> -	}
>  
> -	if (m0->m_len < sizeof(struct ip6_hdr)) {
> -		DPFPRINTF(LOG_ERR,
> -		    "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__);
> -		goto bad;
> +	mtag = m_tag_find(m0, PACKET_TAG_PF_ROUTE, NULL);
> +	if (mtag == NULL) {
> +		mtag = m_tag_get(PACKET_TAG_PF_ROUTE, sizeof(st->rt_addr),
> +		    M_NOWAIT);
> +		if (mtag == NULL)
> +			goto drop;
>  	}
> -	ip6 = mtod(m0, struct ip6_hdr *);
>  
> -	if (pd->dir == PF_IN) {
> -		if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
> -			if (st->rt != PF_DUPTO) {
> -				pf_send_icmp(m0, ICMP6_TIME_EXCEEDED,
> -				    ICMP6_TIME_EXCEED_TRANSIT, 0,
> -				    pd->af, st->rule.ptr, pd->rdomain);
> -			}
> -			goto bad;
> -		}
> -		ip6->ip6_hlim -= IPV6_HLIMDEC;
> -	}
> -
> -	memset(&sin6, 0, sizeof(sin6));
> -	dst = &sin6;
> -	dst->sin6_family = AF_INET6;
> -	dst->sin6_len = sizeof(*dst);
> -	dst->sin6_addr = st->rt_addr.v6;
> -	rtableid = m0->m_pkthdr.ph_rtableid;
> -
> -	rt = rtalloc_mpath(sin6tosa(dst), &ip6->ip6_src.s6_addr32[0],
> -	    rtableid);
> -	if (!rtisvalid(rt)) {
> -		if (st->rt != PF_DUPTO) {
> -			pf_send_icmp(m0, ICMP6_DST_UNREACH,
> -			    ICMP6_DST_UNREACH_NOROUTE, 0,
> -			    pd->af, st->rule.ptr, pd->rdomain);
> -		}
> -		ip6stat_inc(ip6s_noroute);
> -		goto bad;
> -	}
> -
> -	ifp = if_get(rt->rt_ifidx);
> -	if (ifp == NULL)
> -		goto bad;
> -
> -	/* A locally generated packet may have invalid source address. */
> -	if (IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
> -	    (ifp->if_flags & IFF_LOOPBACK) == 0)
> -		ip6->ip6_src = ifatoia6(rt->rt_ifa)->ia_addr.sin6_addr;
> -
> -	if (st->rt != PF_DUPTO && pd->dir == PF_IN) {
> -		if (pf_test(AF_INET6, PF_OUT, ifp, &m0) != PF_PASS)
> -			goto bad;
> -		else if (m0 == NULL)
> -			goto done;
> -		if (m0->m_len < sizeof(struct ip6_hdr)) {
> -			DPFPRINTF(LOG_ERR,
> -			    "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__);
> -			goto bad;
> -		}
> -	}
> +	*(struct pf_addr *)(mtag + 1) = st->rt_addr;
> +	m_tag_prepend(m0, mtag);
>  
> -	/*
> -	 * If packet has been reassembled by PF earlier, we have to
> -	 * use pf_refragment6() here to turn it back to fragments.
> -	 */
> -	if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) {
> -		(void) pf_refragment6(&m0, mtag, dst, ifp, rt);
> -		goto done;
> -	}
> -
> -	if (if_output_tso(ifp, &m0, sin6tosa(dst), rt, ifp->if_mtu) ||
> -	    m0 == NULL)
> -		goto done;
> +	if (st->rt == PF_DUPTO) {
> +		SET(m0->m_pkthdr.pf.flags, PF_TAG_GENERATED);
>  
> -	ip6stat_inc(ip6s_cantfrag);
> -	if (st->rt != PF_DUPTO)
> -		pf_send_icmp(m0, ICMP6_PACKET_TOO_BIG, 0,
> -		    ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain);
> -	goto bad;
> +		ip6_output(m0, NULL, NULL, 0, NULL, NULL);
> +	} else if (pd->dir == PF_OUT)
> +		SET(m0->m_pkthdr.pf.flags, PF_TAG_REROUTE);
>  
> -done:
> -	if_put(ifp);
> -	rtfree(rt);
>  	return;
> -
> -bad:
> +drop:
> +	if (m0 == pd->m)
> +		pd->m = NULL;
>  	m_freem(m0);
> -	goto done;
>  }
>  #endif /* INET6 */
>  
> @@ -8059,6 +7920,9 @@ pf_ouraddr(struct mbuf *m)
>  {
>  	struct pf_state_key	*sk;
>  
> +	if (ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_PF_ROUTE))
> +		return (0); 
> +
>  	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED)
>  		return (1);
>  
> @@ -8078,6 +7942,16 @@ pf_ouraddr(struct mbuf *m)
>  void
>  pf_pkt_addr_changed(struct mbuf *m)
>  {
> +	struct m_tag *mtag;
> +
> +	mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
> +	if (mtag != NULL) {
> +		m_tag_delete(m, mtag);
> +
> +		KASSERTMSG(m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL) == NULL,
> +		    "mbuf %p had multiple PACKET_TAG_PF_ROUTE mbuf tags", m);
> +	}
> +
>  	pf_mbuf_unlink_state_key(m);
>  	pf_mbuf_unlink_inpcb(m);
>  }
> Index: netinet/ip_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_output.c,v
> diff -u -p -r1.401 ip_output.c
> --- netinet/ip_output.c	2 Jul 2024 18:33:47 -0000	1.401
> +++ netinet/ip_output.c	6 Jul 2024 05:16:51 -0000
> @@ -110,7 +110,9 @@ ip_output(struct mbuf *m, struct mbuf *o
>  	struct sockaddr_in *dst;
>  	struct tdb *tdb = NULL;
>  	u_long mtu;
> +	struct in_addr *rt_dst;
>  #if NPF > 0
> +	struct m_tag *rt_mtag;
>  	u_int orig_rtableid;
>  #endif
>  
> @@ -128,7 +130,7 @@ ip_output(struct mbuf *m, struct mbuf *o
>  	/*
>  	 * Fill in IP header.
>  	 */
> -	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
> +	if (!ISSET(flags, IP_FORWARDING|IP_RAWOUTPUT)) {
>  		ip->ip_v = IPVERSION;
>  		ip->ip_off &= htons(IP_DF);
>  		ip->ip_id = htons(ip_randomid());
> @@ -151,6 +153,7 @@ ip_output(struct mbuf *m, struct mbuf *o
>  	orig_rtableid = m->m_pkthdr.ph_rtableid;
>  reroute:
>  #endif
> +	rt_dst = &ip->ip_dst;
>  
>  	/*
>  	 * Do a route lookup now in case we need the source address to
> @@ -163,11 +166,19 @@ reroute:
>  		ro->ro_rt = NULL;
>  	}
>  
> +#if NPF > 0
> +	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
> +	if (rt_mtag != NULL) {
> +		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
> +		rt_dst = &rt_addr->v4;
> +	}
> +#endif
> +
>  	/*
>  	 * If there is a cached route, check that it is to the same
>  	 * destination and is still up.  If not, free it and try again.
>  	 */
> -	route_cache(ro, &ip->ip_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid);
> +	route_cache(ro, rt_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid);
>  	dst = &ro->ro_dstsin;
>  
>  	if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
> Index: netinet6/ip6_forward.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v
> diff -u -p -r1.120 ip6_forward.c
> --- netinet6/ip6_forward.c	4 Jul 2024 12:50:08 -0000	1.120
> +++ netinet6/ip6_forward.c	6 Jul 2024 05:16:51 -0000
> @@ -95,6 +95,10 @@ ip6_forward(struct mbuf *m, struct route
>  	struct tdb *tdb = NULL;
>  #endif /* IPSEC */
>  	char src6[INET6_ADDRSTRLEN], dst6[INET6_ADDRSTRLEN];
> +	struct in6_addr *rt_dst;
> +#if NPF > 0
> +	struct m_tag *rt_mtag;
> +#endif
>  
>  	/*
>  	 * Do not forward packets to multicast destination (should be handled
> @@ -166,11 +170,20 @@ reroute:
>  	}
>  #endif /* IPSEC */
>  
> +	rt_dst = &ip6->ip6_dst;
> +#if NPF > 0
> +	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
> +	if (rt_mtag != NULL) {
> +		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
> +		rt_dst = &rt_addr->v6;
> +	}
> +#endif
> +
>  	if (ro == NULL) {
>  		ro = &iproute;
>  		ro->ro_rt = NULL;
>  	}
> -	rt = route6_mpath(ro, &ip6->ip6_dst, &ip6->ip6_src,
> +	rt = route6_mpath(ro, rt_dst, &ip6->ip6_src,
>  	    m->m_pkthdr.ph_rtableid);
>  	if (rt == NULL) {
>  		ip6stat_inc(ip6s_noroute);
> Index: netinet6/ip6_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
> diff -u -p -r1.264 ip6_input.c
> --- netinet6/ip6_input.c	4 Jul 2024 12:50:08 -0000	1.264
> +++ netinet6/ip6_input.c	6 Jul 2024 05:16:51 -0000
> @@ -364,7 +364,9 @@ ip6_input_if(struct mbuf **mp, int *offp
>  	u_int16_t src_scope, dst_scope;
>  #if NPF > 0
>  	struct in6_addr odst;
> +	struct m_tag *rt_mtag;
>  #endif
> +	struct in6_addr *rt_dst;
>  	int flags = 0;
>  
>  	KASSERT(*offp == 0);
> @@ -523,11 +525,19 @@ ip6_input_if(struct mbuf **mp, int *offp
>  		goto out;
>  	}
>  
> -
>  	/*
>  	 *  Unicast check
>  	 */
> -	rt = route6_mpath(&ro, &ip6->ip6_dst, &ip6->ip6_src,
> +	rt_dst = &ip6->ip6_dst;
> +#if NPF > 0
> +	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
> +	if (rt_mtag != NULL) {
> +		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
> +		rt_dst = &rt_addr->v6;
> +	}
> +#endif
> +
> +	rt = route6_mpath(&ro, rt_dst, &ip6->ip6_src,
>  	    m->m_pkthdr.ph_rtableid);
>  
>  	/*
> Index: netinet6/ip6_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_output.c,v
> diff -u -p -r1.292 ip6_output.c
> --- netinet6/ip6_output.c	4 Jul 2024 12:50:08 -0000	1.292
> +++ netinet6/ip6_output.c	6 Jul 2024 05:16:51 -0000
> @@ -177,6 +177,7 @@ ip6_output(struct mbuf *m, struct ip6_pk
>  	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
>  	struct ip6_exthdrs exthdrs;
>  	struct in6_addr finaldst;
> +	struct in6_addr *rt_dst;
>  	struct route *ro_pmtu = NULL;
>  	int hdrsplit = 0;
>  	u_int8_t sproto = 0;
> @@ -184,6 +185,9 @@ ip6_output(struct mbuf *m, struct ip6_pk
>  #ifdef IPSEC
>  	struct tdb *tdb = NULL;
>  #endif /* IPSEC */
> +#if NPF > 0
> +	struct m_tag *rt_mtag;
> +#endif
>  
>  	ip6 = mtod(m, struct ip6_hdr *);
>  	finaldst = ip6->ip6_dst;
> @@ -387,6 +391,7 @@ ip6_output(struct mbuf *m, struct ip6_pk
>  #if NPF > 0
>  reroute:
>  #endif
> +	rt_dst = &ip6->ip6_dst;
>  
>  	/* initialize cached route */
>  	if (ro == NULL) {
> @@ -456,8 +461,16 @@ reroute:
>  			ifp = if_get(im6o->im6o_ifidx);
>  	}
>  
> +#if NPF > 0
> +	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
> +	if (rt_mtag != NULL) {
> +		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
> +		rt_dst = &rt_addr->v6;
> +	}
> +#endif
> +
>  	if (ifp == NULL) {
> -		rt = in6_selectroute(&ip6->ip6_dst, opt, ro,
> +		rt = in6_selectroute(rt_dst, opt, ro,
>  		    m->m_pkthdr.ph_rtableid);
>  		if (rt == NULL) {
>  			ip6stat_inc(ip6s_noroute);
> @@ -480,7 +493,7 @@ reroute:
>  			goto bad;
>  		}
>  	} else {
> -		route6_cache(ro, &ip6->ip6_dst, NULL, m->m_pkthdr.ph_rtableid);
> +		route6_cache(ro, rt_dst, NULL, m->m_pkthdr.ph_rtableid);
>  	}
>  
>  	if (rt && (rt->rt_flags & RTF_GATEWAY) &&
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.263 mbuf.h
> --- sys/mbuf.h	14 Apr 2024 20:46:27 -0000	1.263
> +++ sys/mbuf.h	6 Jul 2024 05:16:51 -0000
> @@ -477,6 +478,7 @@ struct m_tag *m_tag_next(struct mbuf *, 
>  #define PACKET_TAG_GRE			0x0080  /* GRE processing done */
>  #define PACKET_TAG_DLT			0x0100 /* data link layer type */
>  #define PACKET_TAG_PF_DIVERT		0x0200 /* pf(4) diverted packet */
> +#define PACKET_TAG_PF_ROUTE		0x0400 /* pf(4) route-to */
>  #define PACKET_TAG_PF_REASSEMBLED	0x0800 /* pf reassembled ipv6 packet */
>  #define PACKET_TAG_SRCROUTE		0x1000 /* IPv4 source routing options */
>  #define PACKET_TAG_TUNNEL		0x2000	/* Tunnel endpoint address */