Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: ipv4 icmp_reflect() source address selection optimisation
To:
Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Wed, 21 Aug 2024 15:40:23 +1000

Download raw body.

Thread
On Fri, Aug 09, 2024 at 11:47:14AM +0200, Claudio Jeker wrote:
> On Fri, Aug 09, 2024 at 11:15:34AM +0200, Alexander Bluhm wrote:
> > On Fri, Aug 09, 2024 at 10:40:11AM +1000, David Gwynne wrote:
> > > tl;dr: i believe this change would let us simplify pf_route (the
> > > loopback ip handling specifically), so it's worth it.
> > 
> > In contrast I have worked on all the corner cases in 15 years to
> > make it work.  Maybe not the best solution, but it works.  I fear
> > that at our customers something will break if we change behavior
> > just because we think it looks better.

all of them? that's a bold claim.

add this config to the PF host in your test setup and then run the regress:

# cat /etc/hostname.vport0
inet 169.254.0.1 255.255.255.0
up
# cat /etc/hostname.pfsync0
syncif vport0
maxupd 128
defer
up


i'm hitting edge cases with pf_route and pfsync, that's why i'm working
on this. this bit is a chunk out of a much larger set of changes to fix
it, which i've included below. i'm starting to worry that i'll have
to carry it as part of my menagerie of local diffs on the firewalls cos
i'm the only person in the world doing multipath and pfsync?

my full diff for the pf_route and pfsync fixes is below. i'm not hugely
happy with chewing up more mbuf tags ids, but we haven't come up with a
less worse idea yet.

> This is not only about looks. ICMP source selection is a massive pain
> point on DFZ routers and we need to fix this. Reducing the amount of
> corners to cover would be very beneficial.
> 
> > In this particular case I think using the first address as source
> > address is wrong.  IPv4 source address selection should be done
> > with a route lookup.  Basically what in_pcbselsrc() does.  And using
> > the route interface address seems reasonable to me.

for the large part my last diff does follow what in_pcbselsrc does
more closely than the current code. it respects the "route sourceaddr"
config while the current code doesn't.

however, the icmp reflect situation is a little different to what
in_pcbselsrc usually handles. in_pcbselsrc is used to pick
an address for a locally terminated connection, while icmp_reflect
can be used to reply to a packet going through the box. let's not
pretend these are completely identical situations.

> To be honest I think it should not matter which IP is selected in the IPv4
> case. All addresses on the interface can be used to send out an ICMP error.
> Now I do agree that using the same logic as in in_pcbselsrc() would benefit
> consistency.

i mostly agree, except selecting 127.0.0.1 to reply to a packet that
arrived off the wire seems bloody minded when there are (what i consider)
reasonable ways to avoid it.

dlg

Index: regress/sys/net/pf_forward/Makefile
===================================================================
RCS file: /cvs/src/regress/sys/net/pf_forward/Makefile,v
diff -u -p -r1.35 Makefile
--- regress/sys/net/pf_forward/Makefile	1 Feb 2021 12:52:07 -0000	1.35
+++ regress/sys/net/pf_forward/Makefile	21 Aug 2024 05:32:39 -0000
@@ -342,8 +342,8 @@ check-setup-pf:
 	    fgrep -q 'gateway: ${RT_IN}'  # ${ip} RT_IN
 .endfor
 .for ip in RTT_IN RTT_OUT RPT_IN RPT_OUT
-	ssh ${PF_SSH} route -n get -inet ${${ip}} | grep -q 'flags: .*REJECT' \
-	    # ${ip} reject
+	ssh ${PF_SSH} route -n get -inet ${${ip}} |\
+	    grep -q 'interface: .*lo' # ${ip} reject
 .endfor
 	ssh ${PF_SSH} ping6 -n -c 1 ${PF_IN6}  # PF_IN6
 	ssh ${PF_SSH} route -n get -inet6 ${PF_IN6} | grep -q 'flags: .*LOCAL' \
@@ -359,7 +359,7 @@ check-setup-pf:
 .endfor
 .for ip in RTT_IN RTT_OUT RPT_IN RPT_OUT
 	ssh ${PF_SSH} route -n get -inet6 ${${ip}6} |\
-	    grep -q 'flags: .*REJECT'  # ${ip}6 reject
+	    grep -q 'interface: .*lo'  # ${ip}6 reject
 .endfor
 	ssh ${PF_SSH} ${SUDO} pfctl -sr | grep '^anchor "regress" all$$'
 	ssh ${PF_SSH} ${SUDO} pfctl -si | grep '^Status: Enabled '
Index: sys/net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
diff -u -p -r1.720 if.c
--- sys/net/if.c	14 Jul 2024 18:53:39 -0000	1.720
+++ sys/net/if.c	21 Aug 2024 05:32:39 -0000
@@ -66,7 +66,6 @@
 #include "carp.h"
 #include "ether.h"
 #include "pf.h"
-#include "pfsync.h"
 #include "ppp.h"
 #include "pppoe.h"
 #include "if_wg.h"
@@ -136,7 +135,13 @@
 
 #if NPF > 0
 #include <net/pfvar.h>
-#endif
+
+#include "pfsync.h"
+#if NPFSYNC > 0
+#include <netinet/ip_ipsp.h> /* for union sockaddr_union */
+#include <net/if_pfsync.h>
+#endif /* NPFSYNC > 0 */
+#endif /* NPF > 0 */
 
 #include <sys/device.h>
 
@@ -725,6 +730,14 @@ if_enqueue(struct ifnet *ifp, struct mbu
 	CLR(m->m_pkthdr.csum_flags, M_TIMESTAMP);
 
 #if NPF > 0
+#if NPFSYNC > 0
+	if (ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_PF_DEFER)) {
+		m = pfsync_defer_out(ifp, m);
+		if (m == NULL)
+			return (0);
+	}
+#endif
+
 	if (m->m_pkthdr.pf.delay > 0)
 		return (pf_delay_pkt(m, ifp->if_index));
 #endif
Index: sys/net/if_loop.c
===================================================================
RCS file: /cvs/src/sys/net/if_loop.c,v
diff -u -p -r1.98 if_loop.c
--- sys/net/if_loop.c	29 Dec 2023 11:43:04 -0000	1.98
+++ sys/net/if_loop.c	21 Aug 2024 05:32:39 -0000
@@ -261,10 +261,10 @@ looutput(struct ifnet *ifp, struct mbuf 
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("%s: no header mbuf", __func__);
 
-	if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+	if (rt != NULL && !ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
 		m_freem(m);
-		return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
-			rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+		return (ISSET(rt->rt_flags, RTF_HOST) ?
+		    EHOSTUNREACH : ENETUNREACH);
 	}
 
 	/*
Index: sys/net/if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
diff -u -p -r1.326 if_pfsync.c
--- sys/net/if_pfsync.c	24 May 2024 06:38:41 -0000	1.326
+++ sys/net/if_pfsync.c	21 Aug 2024 05:32:39 -0000
@@ -109,8 +109,9 @@ struct pfsync_softc;
 struct pfsync_deferral {
 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
 	struct pf_state				*pd_st;
-	struct mbuf				*pd_m;
+	struct mbuf_list			 pd_ml;
 	uint64_t				 pd_deadline;
+	unsigned int				 pd_ifidx;
 };
 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
 
@@ -428,7 +429,6 @@ pfsync_clone_create(struct if_clone *ifc
 			TAILQ_INIT(&s->s_qs[q]);
 		TAILQ_INIT(&s->s_tdb_q);
 
-		/* stupid NET_LOCK */
 		timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s);
 		task_set(&s->s_deferrals_task, pfsync_deferrals_task, s);
 		TAILQ_INIT(&s->s_deferrals);
@@ -1928,10 +1928,9 @@ int
 pfsync_defer(struct pf_state *st, struct mbuf *m)
 {
 	struct pfsync_softc *sc;
-	struct pfsync_slice *s;
-	struct pfsync_deferral *pd;
-	int sched = 0;
-	int rv = 0;
+	struct m_tag *mtag;
+	struct pf_state_cmp *cmp;
+	int defer;
 
 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
 	    ISSET(m->m_flags, M_BCAST|M_MCAST))
@@ -1939,26 +1938,83 @@ pfsync_defer(struct pf_state *st, struct
 
 	smr_read_enter();
 	sc = SMR_PTR_GET(&pfsyncif);
-	if (sc == NULL || !sc->sc_defer)
-		goto leave;
+	defer = (sc != NULL && sc->sc_defer);
+	smr_read_leave();
 
-	pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT);
-	if (pd == NULL) {
+	if (!defer)
+		return (0);
+
+	KASSERTMSG(m_tag_find(m, PACKET_TAG_PF_DEFER, NULL) == NULL,
+	    "mbuf %p already has a PACKET_TAG_PF_DEFER mtag", m);
+
+	mtag = m_tag_get(PACKET_TAG_PF_DEFER, sizeof(*cmp), M_NOWAIT);
+	if (mtag == NULL)
+		return (ENOMEM);
+
+	cmp = (struct pf_state_cmp *)(mtag + 1);
+	cmp->id = st->id;
+	cmp->creatorid = st->creatorid;
+
+	m_tag_prepend(m, mtag);
+	return (0);
+}
+
+struct mbuf *
+pfsync_defer_out(struct ifnet *ifp, struct mbuf *m)
+{
+	struct pfsync_softc *sc;
+	struct pfsync_slice *s;
+	struct pf_state *st;
+	struct pfsync_deferral *pd;
+	struct m_tag *mtag;
+	struct pf_state_cmp *cmp;
+	int sched = 0;
+
+	mtag = m_tag_find(m, PACKET_TAG_PF_DEFER, NULL);
+	KASSERTMSG(mtag != NULL,
+	    "mbuf %p has PACKET_TAG_PF_DEFER set but no tag", m);
+	cmp = (struct pf_state_cmp *)(mtag + 1);
+
+	PF_STATE_ENTER_READ();
+	st = pf_find_state_byid(cmp);
+	pf_state_ref(st);
+	PF_STATE_EXIT_READ();
+
+	m_tag_delete(m, mtag);
+
+	/* the state doesn't exist already^Wanymore */
+	if (st == NULL)
+		return (m);
+
+	smr_read_enter();
+	sc = SMR_PTR_GET(&pfsyncif);
+	if (sc == NULL || !sc->sc_defer)
 		goto leave;
-	}
 
 	s = pfsync_slice_enter(sc, st);
 	s->s_stat_defer_add++;
 
-	pd->pd_st = pf_state_ref(st);
-	pd->pd_m = m;
-	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
+	/* pd is protected by the slice mutex */
+	pd = st->sync_defer;
+	if (pd == NULL) {
+		pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT);
+		if (pd == NULL)
+			goto leave;
+
+		pd->pd_ifidx = ifp->if_index;
+		pd->pd_st = pf_state_ref(st);
+		ml_init(&pd->pd_ml);
+		pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
 
-	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
-	st->sync_defer = pd;
+		st->sync_defer = pd;
 
-	sched = s->s_deferred++;
-	TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
+		sched = s->s_deferred++;
+		TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
+	}
+
+	//m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
+	ml_enqueue(&pd->pd_ml, m);
+	m = NULL; /* take the packet away from the caller */
 
 	if (sched == 0)
 		timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC);
@@ -1970,11 +2026,11 @@ pfsync_defer(struct pf_state *st, struct
 
 	pfsync_slice_sched(s);
 	pfsync_slice_leave(sc, s);
-	rv = 1;
 leave:
 	smr_read_leave();
+	pf_state_unref(st);
 
-	return (rv);
+	return (m);
 }
 
 static void
@@ -2056,57 +2112,31 @@ pfsync_deferrals_task(void *arg)
 	if (TAILQ_EMPTY(&pds))
 		return;
 
-	NET_LOCK();
 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
 		TAILQ_REMOVE(&pds, pd, pd_entry);
 
 		pfsync_defer_output(pd);
 	}
-	NET_UNLOCK();
 }
 
 static void
 pfsync_defer_output(struct pfsync_deferral *pd)
 {
-	struct pf_pdesc pdesc;
 	struct pf_state *st = pd->pd_st;
+	struct ifnet *ifp;
+	struct mbuf *m;
 
-	if (st->rt == PF_ROUTETO) {
-		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
-		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
-			return;
-		switch (st->key[PF_SK_WIRE]->af) {
-		case AF_INET:
-			pf_route(&pdesc, st);
-			break;
-#ifdef INET6
-		case AF_INET6:
-			pf_route6(&pdesc, st);
-			break;
-#endif /* INET6 */
-		default:
-			unhandled_af(st->key[PF_SK_WIRE]->af);
-		}
-		pd->pd_m = pdesc.m;
-	} else {
-		switch (st->key[PF_SK_WIRE]->af) {
-		case AF_INET:
-			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
-			break;
-#ifdef INET6
-		case AF_INET6:
-			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
-			break;
-#endif /* INET6 */
-		default:
-			unhandled_af(st->key[PF_SK_WIRE]->af);
+	ifp = if_get(pd->pd_ifidx);
+	if (ifp != NULL) {
+		while ((m = ml_dequeue(&pd->pd_ml)) != NULL) {
+			if (if_enqueue(ifp, m) != 0)
+				break;
 		}
-
-		pd->pd_m = NULL;
 	}
+	if_put(ifp);
 
 	pf_state_unref(st);
-	m_freem(pd->pd_m);
+	ml_purge(&pd->pd_ml);
 	pool_put(&pfsync_deferrals_pool, pd);
 }
 
Index: sys/net/if_pfsync.h
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.h,v
diff -u -p -r1.62 if_pfsync.h
--- sys/net/if_pfsync.h	13 May 2024 01:15:53 -0000	1.62
+++ sys/net/if_pfsync.h	21 Aug 2024 05:32:39 -0000
@@ -333,6 +333,7 @@ void			pfsync_update_tdb(struct tdb *, i
 void			pfsync_delete_tdb(struct tdb *);
 
 int			pfsync_defer(struct pf_state *, struct mbuf *);
+struct mbuf		*pfsync_defer_out(struct ifnet *, struct mbuf *);
 
 int			pfsync_is_up(void);
 int			pfsync_state_in_use(struct pf_state *);
Index: sys/net/pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
diff -u -p -r1.1204 pf.c
--- sys/net/pf.c	6 Aug 2024 16:56:09 -0000	1.1204
+++ sys/net/pf.c	21 Aug 2024 05:32:39 -0000
@@ -4573,8 +4573,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
 		 * firewall has to know about it to allow
 		 * replies through it.
 		 */
-		if (pfsync_defer(*sm, pd->m))
-			return (PF_DEFER);
+		pfsync_defer(*sm, pd->m);
 	}
 #endif	/* NPFSYNC > 0 */
 
@@ -6563,17 +6562,12 @@ pf_rtlabel_match(struct pf_addr *addr, s
 	return (ret);
 }
 
-/* pf_route() may change pd->m, adjust local copies after calling */
-void
-pf_route(struct pf_pdesc *pd, struct pf_state *st)
+static void
+pf_route_af(struct pf_pdesc *pd, struct pf_state *st,
+    void (*send)(struct mbuf *))
 {
 	struct mbuf		*m0;
-	struct mbuf_list	 ml;
-	struct sockaddr_in	*dst, sin;
-	struct rtentry		*rt = NULL;
-	struct ip		*ip;
-	struct ifnet		*ifp = NULL;
-	unsigned int		 rtableid;
+	struct m_tag		*mtag;
 
 	if (pd->m->m_pkthdr.pf.routed++ > 3) {
 		m_freem(pd->m);
@@ -6582,220 +6576,49 @@ pf_route(struct pf_pdesc *pd, struct pf_
 	}
 
 	if (st->rt == PF_DUPTO) {
-		if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL)
-			return;
-	} else {
-		if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
+		m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT);
+		if (m0 == NULL)
 			return;
+	} else if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
+		return;
+	else
 		m0 = pd->m;
-		pd->m = NULL;
-	}
-
-	if (m0->m_len < sizeof(struct ip)) {
-		DPFPRINTF(LOG_ERR,
-		    "%s: m0->m_len < sizeof(struct ip)", __func__);
-		goto bad;
-	}
-
-	ip = mtod(m0, struct ip *);
-
-	if (pd->dir == PF_IN) {
-		if (ip->ip_ttl <= IPTTLDEC) {
-			if (st->rt != PF_DUPTO) {
-				pf_send_icmp(m0, ICMP_TIMXCEED,
-				    ICMP_TIMXCEED_INTRANS, 0,
-				    pd->af, st->rule.ptr, pd->rdomain);
-			}
-			goto bad;
-		}
-		ip->ip_ttl -= IPTTLDEC;
-	}
-
-	memset(&sin, 0, sizeof(sin));
-	dst = &sin;
-	dst->sin_family = AF_INET;
-	dst->sin_len = sizeof(*dst);
-	dst->sin_addr = st->rt_addr.v4;
-	rtableid = m0->m_pkthdr.ph_rtableid;
-
-	rt = rtalloc_mpath(sintosa(dst), &ip->ip_src.s_addr, rtableid);
-	if (!rtisvalid(rt)) {
-		if (st->rt != PF_DUPTO) {
-			pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_HOST,
-			    0, pd->af, st->rule.ptr, pd->rdomain);
-		}
-		ipstat_inc(ips_noroute);
-		goto bad;
-	}
-
-	ifp = if_get(rt->rt_ifidx);
-	if (ifp == NULL)
-		goto bad;
-
-	/* A locally generated packet may have invalid source address. */
-	if ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET &&
-	    (ifp->if_flags & IFF_LOOPBACK) == 0)
-		ip->ip_src = ifatoia(rt->rt_ifa)->ia_addr.sin_addr;
 
-	if (st->rt != PF_DUPTO && pd->dir == PF_IN) {
-		if (pf_test(AF_INET, PF_OUT, ifp, &m0) != PF_PASS)
-			goto bad;
-		else if (m0 == NULL)
-			goto done;
-		if (m0->m_len < sizeof(struct ip)) {
-			DPFPRINTF(LOG_ERR,
-			    "%s: m0->m_len < sizeof(struct ip)", __func__);
-			goto bad;
+	mtag = m_tag_find(m0, PACKET_TAG_PF_ROUTE, NULL);
+	if (mtag == NULL) {
+		mtag = m_tag_get(PACKET_TAG_PF_ROUTE, sizeof(st->rt_addr),
+		    M_NOWAIT);
+		if (mtag == NULL) {
+			if (m0 == pd->m)
+				pd->m = NULL;
+			m_freem(m0);
+			return;
 		}
-		ip = mtod(m0, struct ip *);
-	}
 
-	if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) ||
-	    m0 == NULL)
-		goto done;
-
-	/*
-	 * Too large for interface; fragment if possible.
-	 * Must be able to put at least 8 bytes per fragment.
-	 */
-	if (ip->ip_off & htons(IP_DF)) {
-		ipstat_inc(ips_cantfrag);
-		if (st->rt != PF_DUPTO)
-			pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
-			    ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain);
-		goto bad;
+		m_tag_prepend(m0, mtag);
 	}
 
-	if (ip_fragment(m0, &ml, ifp, ifp->if_mtu) ||
-	    if_output_ml(ifp, &ml, sintosa(dst), rt))
-		goto done;
-	ipstat_inc(ips_fragmented);
+	*(struct pf_addr *)(mtag + 1) = st->rt_addr;
 
-done:
-	if_put(ifp);
-	rtfree(rt);
-	return;
+	if (st->rt == PF_DUPTO) {
+		SET(m0->m_pkthdr.pf.flags, PF_TAG_GENERATED);
+		(*send)(m0);
+	} else if (pd->dir == PF_OUT)
+		SET(m0->m_pkthdr.pf.flags, PF_TAG_REROUTE);
+}
 
-bad:
-	m_freem(m0);
-	goto done;
+/* pf_route() may change pd->m, adjust local copies after calling */
+void
+pf_route(struct pf_pdesc *pd, struct pf_state *st)
+{
+	pf_route_af(pd, st, ip_send);
 }
 
 #ifdef INET6
-/* pf_route6() may change pd->m, adjust local copies after calling */
 void
 pf_route6(struct pf_pdesc *pd, struct pf_state *st)
 {
-	struct mbuf		*m0;
-	struct sockaddr_in6	*dst, sin6;
-	struct rtentry		*rt = NULL;
-	struct ip6_hdr		*ip6;
-	struct ifnet		*ifp = NULL;
-	struct m_tag		*mtag;
-	unsigned int		 rtableid;
-
-	if (pd->m->m_pkthdr.pf.routed++ > 3) {
-		m_freem(pd->m);
-		pd->m = NULL;
-		return;
-	}
-
-	if (st->rt == PF_DUPTO) {
-		if ((m0 = m_dup_pkt(pd->m, max_linkhdr, M_NOWAIT)) == NULL)
-			return;
-	} else {
-		if ((st->rt == PF_REPLYTO) == (st->direction == pd->dir))
-			return;
-		m0 = pd->m;
-		pd->m = NULL;
-	}
-
-	if (m0->m_len < sizeof(struct ip6_hdr)) {
-		DPFPRINTF(LOG_ERR,
-		    "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__);
-		goto bad;
-	}
-	ip6 = mtod(m0, struct ip6_hdr *);
-
-	if (pd->dir == PF_IN) {
-		if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
-			if (st->rt != PF_DUPTO) {
-				pf_send_icmp(m0, ICMP6_TIME_EXCEEDED,
-				    ICMP6_TIME_EXCEED_TRANSIT, 0,
-				    pd->af, st->rule.ptr, pd->rdomain);
-			}
-			goto bad;
-		}
-		ip6->ip6_hlim -= IPV6_HLIMDEC;
-	}
-
-	memset(&sin6, 0, sizeof(sin6));
-	dst = &sin6;
-	dst->sin6_family = AF_INET6;
-	dst->sin6_len = sizeof(*dst);
-	dst->sin6_addr = st->rt_addr.v6;
-	rtableid = m0->m_pkthdr.ph_rtableid;
-
-	rt = rtalloc_mpath(sin6tosa(dst), &ip6->ip6_src.s6_addr32[0],
-	    rtableid);
-	if (!rtisvalid(rt)) {
-		if (st->rt != PF_DUPTO) {
-			pf_send_icmp(m0, ICMP6_DST_UNREACH,
-			    ICMP6_DST_UNREACH_NOROUTE, 0,
-			    pd->af, st->rule.ptr, pd->rdomain);
-		}
-		ip6stat_inc(ip6s_noroute);
-		goto bad;
-	}
-
-	ifp = if_get(rt->rt_ifidx);
-	if (ifp == NULL)
-		goto bad;
-
-	/* A locally generated packet may have invalid source address. */
-	if (IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
-	    (ifp->if_flags & IFF_LOOPBACK) == 0)
-		ip6->ip6_src = ifatoia6(rt->rt_ifa)->ia_addr.sin6_addr;
-
-	if (st->rt != PF_DUPTO && pd->dir == PF_IN) {
-		if (pf_test(AF_INET6, PF_OUT, ifp, &m0) != PF_PASS)
-			goto bad;
-		else if (m0 == NULL)
-			goto done;
-		if (m0->m_len < sizeof(struct ip6_hdr)) {
-			DPFPRINTF(LOG_ERR,
-			    "%s: m0->m_len < sizeof(struct ip6_hdr)", __func__);
-			goto bad;
-		}
-	}
-
-	/*
-	 * If packet has been reassembled by PF earlier, we have to
-	 * use pf_refragment6() here to turn it back to fragments.
-	 */
-	if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) {
-		(void) pf_refragment6(&m0, mtag, dst, ifp, rt);
-		goto done;
-	}
-
-	if (if_output_tso(ifp, &m0, sin6tosa(dst), rt, ifp->if_mtu) ||
-	    m0 == NULL)
-		goto done;
-
-	ip6stat_inc(ip6s_cantfrag);
-	if (st->rt != PF_DUPTO)
-		pf_send_icmp(m0, ICMP6_PACKET_TOO_BIG, 0,
-		    ifp->if_mtu, pd->af, st->rule.ptr, pd->rdomain);
-	goto bad;
-
-done:
-	if_put(ifp);
-	rtfree(rt);
-	return;
-
-bad:
-	m_freem(m0);
-	goto done;
+	pf_route_af(pd, st, ip6_send);
 }
 #endif /* INET6 */
 
@@ -7935,10 +7758,6 @@ done:
 	case PF_SYNPROXY_DROP:
 		m_freem(pd.m);
 		/* FALLTHROUGH */
-	case PF_DEFER:
-		pd.m = NULL;
-		action = PF_PASS;
-		break;
 	case PF_DIVERT:
 		switch (pd.af) {
 		case AF_INET:
@@ -8059,6 +7878,9 @@ pf_ouraddr(struct mbuf *m)
 {
 	struct pf_state_key	*sk;
 
+	if (ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_PF_ROUTE))
+		return (0); 
+
 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED)
 		return (1);
 
@@ -8078,6 +7900,16 @@ pf_ouraddr(struct mbuf *m)
 void
 pf_pkt_addr_changed(struct mbuf *m)
 {
+	struct m_tag *mtag;
+
+	mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (mtag != NULL) {
+		m_tag_delete(m, mtag);
+
+		KASSERTMSG(m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL) == NULL,
+		    "mbuf %p had multiple PACKET_TAG_PF_ROUTE mbuf tags", m);
+	}
+
 	pf_mbuf_unlink_state_key(m);
 	pf_mbuf_unlink_inpcb(m);
 }
Index: sys/net/pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
diff -u -p -r1.538 pfvar.h
--- sys/net/pfvar.h	13 May 2024 01:15:53 -0000	1.538
+++ sys/net/pfvar.h	21 Aug 2024 05:32:39 -0000
@@ -67,7 +67,7 @@ typedef struct refcnt	pf_refcnt_t;
 
 enum	{ PF_INOUT, PF_IN, PF_OUT, PF_FWD };
 enum	{ PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT,
-	  PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER,
+	  PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP,
 	  PF_MATCH, PF_DIVERT, PF_RT, PF_AFRT };
 enum	{ PF_TRANS_RULESET, PF_TRANS_TABLE };
 enum	{ PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT,
Index: sys/netinet/ip_icmp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
diff -u -p -r1.196 ip_icmp.c
--- sys/netinet/ip_icmp.c	14 Jul 2024 18:53:39 -0000	1.196
+++ sys/netinet/ip_icmp.c	21 Aug 2024 05:32:39 -0000
@@ -684,7 +684,8 @@ icmp_reflect(struct mbuf *m, struct mbuf
 	struct ip *ip = mtod(m, struct ip *);
 	struct mbuf *opts = NULL;
 	struct sockaddr_in sin;
-	struct rtentry *rt = NULL;
+	struct rtentry *rt;
+	struct in_addr ip_src = { INADDR_ANY };
 	int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
 	u_int rtableid;
 	u_int8_t pfflags;
@@ -701,10 +702,6 @@ icmp_reflect(struct mbuf *m, struct mbuf
 		return (ELOOP);
 	}
 	rtableid = m->m_pkthdr.ph_rtableid;
-	pfflags = m->m_pkthdr.pf.flags;
-	m_resethdr(m);
-	m->m_pkthdr.ph_rtableid = rtableid;
-	m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
 
 	/*
 	 * If the incoming packet was addressed directly to us,
@@ -718,41 +715,80 @@ icmp_reflect(struct mbuf *m, struct mbuf
 		sin.sin_addr = ip->ip_dst;
 
 		rt = rtalloc(sintosa(&sin), 0, rtableid);
-		if (rtisvalid(rt) &&
-		    ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST))
-			ia = ifatoia(rt->rt_ifa);
-	}
+		if (rtisvalid(rt)) {
+			if (ISSET(rt->rt_flags, RTF_LOCAL))
+				ip_src = ip->ip_dst;
+			else if (ISSET(rt->rt_flags, RTF_BROADCAST)) {
+				ia = ifatoia(rt->rt_ifa);
+				ip_src = ia->ia_addr.sin_addr;
+			}
+		}
+		rtfree(rt);
+	} else
+		ip_src = ia->ia_addr.sin_addr;
 
 	/*
 	 * The following happens if the packet was not addressed to us.
-	 * Use the new source address and do a route lookup. If it fails
-	 * drop the packet as there is no path to the host.
+	 * If we're directly connected use the closest address, otherwise
+	 * try to use the sourceaddr from the routing table.
 	 */
-	if (ia == NULL) {
-		rtfree(rt);
-
+	if (ip_src.s_addr == INADDR_ANY) {
 		memset(&sin, 0, sizeof(sin));
 		sin.sin_len = sizeof(sin);
 		sin.sin_family = AF_INET;
 		sin.sin_addr = ip->ip_src;
 
-		/* keep packet in the original virtual instance */
-		rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
-		if (rt == NULL) {
-			ipstat_inc(ips_noroute);
-			m_freem(m);
-			return (EHOSTUNREACH);
+		rt = rtalloc_mpath(sintosa(&sin), &ip->ip_dst.s_addr, rtableid);
+		if (rtisvalid(rt) &&
+		    ISSET(rt->rt_flags, RTF_LLINFO|RTF_HOST)) {
+			ia = ifatoia(rt->rt_ifa);
+			ip_src = ia->ia_addr.sin_addr;
+		} else {
+			struct sockaddr *sourceaddr;
+                        struct ifaddr *ifa;
+
+			sourceaddr = rtable_getsource(rtableid, AF_INET);
+			if (sourceaddr != NULL) {
+				ifa = ifa_ifwithaddr(sourceaddr, rtableid);
+				if (ifa != NULL &&
+				    ISSET(ifa->ifa_ifp->if_flags, IFF_UP))
+					ip_src = satosin(sourceaddr)->sin_addr;
+			}
 		}
+		rtfree(rt);
+	}
 
-		ia = ifatoia(rt->rt_ifa);
+	/*
+	 * If the above didn't find an ip_src, get the IP of the
+	 * interface the original packet was received on. If all this
+	 * comes up with nothing, ip_output() will try and fill it
+	 * in for us.
+	 */
+	if (ip_src.s_addr == INADDR_ANY) {
+		struct ifnet *ifp;
+		struct ifaddr *ifa;
+
+		ifp = if_get(m->m_pkthdr.ph_ifidx);
+		if (ifp != NULL) {
+			TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
+				if (ifa->ifa_addr->sa_family != AF_INET)
+					continue;
+
+				ip_src = satosin(ifa->ifa_addr)->sin_addr;
+				break;
+			}
+		}
+		if_put(ifp);
 	}
 
+	pfflags = m->m_pkthdr.pf.flags;
+
+	m_resethdr(m);
+	m->m_pkthdr.ph_rtableid = rtableid;
+	m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
 	ip->ip_dst = ip->ip_src;
+	ip->ip_src = ip_src;
 	ip->ip_ttl = MAXTTL;
-
-	/* It is safe to dereference ``ia'' iff ``rt'' is valid. */
-	ip->ip_src = ia->ia_addr.sin_addr;
-	rtfree(rt);
 
 	if (optlen > 0) {
 		u_char *cp;
Index: sys/netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
diff -u -p -r1.401 ip_input.c
--- sys/netinet/ip_input.c	6 Aug 2024 16:56:09 -0000	1.401
+++ sys/netinet/ip_input.c	21 Aug 2024 05:32:39 -0000
@@ -1560,6 +1560,10 @@ ip_forward(struct mbuf *m, struct ifnet 
 	struct mbuf *mcopy;
 	int error = 0, type = 0, code = 0, destmtu = 0;
 	u_int32_t dest;
+	struct in_addr *rt_dst;
+#if NPF > 0
+	struct m_tag *rt_mtag;
+#endif
 
 	dest = 0;
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
@@ -1571,12 +1575,21 @@ ip_forward(struct mbuf *m, struct ifnet 
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
 		goto done;
 	}
+	rt_dst = &ip->ip_dst;
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v4;
+		SET(flags, IP_REDIRECT);
+	}
+#endif
 
 	if (ro == NULL) {
 		ro = &iproute;
 		ro->ro_rt = NULL;
 	}
-	rt = route_mpath(ro, &ip->ip_dst, &ip->ip_src, rtableid);
+	rt = route_mpath(ro, rt_dst, &ip->ip_src, rtableid);
 	if (rt == NULL) {
 		ipstat_inc(ips_noroute);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
Index: sys/netinet/ip_output.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_output.c,v
diff -u -p -r1.401 ip_output.c
--- sys/netinet/ip_output.c	2 Jul 2024 18:33:47 -0000	1.401
+++ sys/netinet/ip_output.c	21 Aug 2024 05:32:39 -0000
@@ -110,7 +110,9 @@ ip_output(struct mbuf *m, struct mbuf *o
 	struct sockaddr_in *dst;
 	struct tdb *tdb = NULL;
 	u_long mtu;
+	struct in_addr *rt_dst;
 #if NPF > 0
+	struct m_tag *rt_mtag;
 	u_int orig_rtableid;
 #endif
 
@@ -128,7 +130,7 @@ ip_output(struct mbuf *m, struct mbuf *o
 	/*
 	 * Fill in IP header.
 	 */
-	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
+	if (!ISSET(flags, IP_FORWARDING|IP_RAWOUTPUT)) {
 		ip->ip_v = IPVERSION;
 		ip->ip_off &= htons(IP_DF);
 		ip->ip_id = htons(ip_randomid());
@@ -151,6 +153,7 @@ ip_output(struct mbuf *m, struct mbuf *o
 	orig_rtableid = m->m_pkthdr.ph_rtableid;
 reroute:
 #endif
+	rt_dst = &ip->ip_dst;
 
 	/*
 	 * Do a route lookup now in case we need the source address to
@@ -163,11 +166,19 @@ reroute:
 		ro->ro_rt = NULL;
 	}
 
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v4;
+	}
+#endif
+
 	/*
 	 * If there is a cached route, check that it is to the same
 	 * destination and is still up.  If not, free it and try again.
 	 */
-	route_cache(ro, &ip->ip_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid);
+	route_cache(ro, rt_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid);
 	dst = &ro->ro_dstsin;
 
 	if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
@@ -398,6 +409,20 @@ sendit:
 	}
 #endif /* IPSEC */
 
+	if (ro != NULL && ro->ro_rt != NULL) {
+		struct rtentry *rt = ro->ro_rt;
+
+		if (ISSET(rt->rt_flags, RTF_REJECT)) {
+			error = ISSET(rt->rt_flags, RTF_HOST) ?
+			    EHOSTUNREACH : ENETUNREACH;
+			goto bad;
+		}
+		if (ISSET(rt->rt_flags, RTF_BLACKHOLE)) {
+			error = 0;
+			goto bad;
+		}
+	}
+
 	/*
 	 * Packet filter
 	 */
@@ -418,9 +443,6 @@ sendit:
 	else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) {
 		/* tag as generated to skip over pf_test on rerun */
 		m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
-		if (ro == &iproute)
-			rtfree(ro->ro_rt);
-		ro = NULL;
 		if_put(ifp); /* drop reference since target changed */
 		ifp = NULL;
 		goto reroute;
Index: sys/netinet6/ip6_forward.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v
diff -u -p -r1.124 ip6_forward.c
--- sys/netinet6/ip6_forward.c	19 Jul 2024 16:58:32 -0000	1.124
+++ sys/netinet6/ip6_forward.c	21 Aug 2024 05:32:39 -0000
@@ -101,6 +101,10 @@ ip6_forward(struct mbuf *m, struct route
 	struct tdb *tdb = NULL;
 #endif /* IPSEC */
 	char src6[INET6_ADDRSTRLEN], dst6[INET6_ADDRSTRLEN];
+	struct in6_addr *rt_dst;
+#if NPF > 0
+	struct m_tag *rt_mtag;
+#endif
 
 	/*
 	 * Do not forward packets to multicast destination (should be handled
@@ -202,11 +206,20 @@ reroute:
 	}
 #endif /* IPSEC */
 
+	rt_dst = &ip6->ip6_dst;
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v6;
+	}
+#endif
+
 	if (ro == NULL) {
 		ro = &iproute;
 		ro->ro_rt = NULL;
 	}
-	rt = route6_mpath(ro, &ip6->ip6_dst, &ip6->ip6_src,
+	rt = route6_mpath(ro, rt_dst, &ip6->ip6_src,
 	    m->m_pkthdr.ph_rtableid);
 	if (rt == NULL) {
 		ip6stat_inc(ip6s_noroute);
Index: sys/netinet6/ip6_input.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
diff -u -p -r1.266 ip6_input.c
--- sys/netinet6/ip6_input.c	19 Jul 2024 16:58:32 -0000	1.266
+++ sys/netinet6/ip6_input.c	21 Aug 2024 05:32:39 -0000
@@ -99,7 +99,6 @@
 #include <netinet6/nd6.h>
 
 #include "gif.h"
-#include "bpfilter.h"
 
 #ifdef MROUTING
 #include <netinet6/ip6_mroute.h>
@@ -364,7 +363,9 @@ ip6_input_if(struct mbuf **mp, int *offp
 	u_int16_t src_scope, dst_scope;
 #if NPF > 0
 	struct in6_addr odst;
+	struct m_tag *rt_mtag;
 #endif
+	struct in6_addr *rt_dst;
 	int flags = 0;
 
 	KASSERT(*offp == 0);
@@ -523,11 +524,19 @@ ip6_input_if(struct mbuf **mp, int *offp
 		goto out;
 	}
 
-
 	/*
 	 *  Unicast check
 	 */
-	rt = route6_mpath(&ro, &ip6->ip6_dst, &ip6->ip6_src,
+	rt_dst = &ip6->ip6_dst;
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v6;
+	}
+#endif
+
+	rt = route6_mpath(&ro, rt_dst, &ip6->ip6_src,
 	    m->m_pkthdr.ph_rtableid);
 
 	/*
Index: sys/netinet6/ip6_output.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_output.c,v
diff -u -p -r1.292 ip6_output.c
--- sys/netinet6/ip6_output.c	4 Jul 2024 12:50:08 -0000	1.292
+++ sys/netinet6/ip6_output.c	21 Aug 2024 05:32:39 -0000
@@ -177,6 +177,7 @@ ip6_output(struct mbuf *m, struct ip6_pk
 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr finaldst;
+	struct in6_addr *rt_dst;
 	struct route *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	u_int8_t sproto = 0;
@@ -184,6 +185,9 @@ ip6_output(struct mbuf *m, struct ip6_pk
 #ifdef IPSEC
 	struct tdb *tdb = NULL;
 #endif /* IPSEC */
+#if NPF > 0
+	struct m_tag *rt_mtag;
+#endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	finaldst = ip6->ip6_dst;
@@ -387,6 +391,7 @@ ip6_output(struct mbuf *m, struct ip6_pk
 #if NPF > 0
 reroute:
 #endif
+	rt_dst = &ip6->ip6_dst;
 
 	/* initialize cached route */
 	if (ro == NULL) {
@@ -456,8 +461,16 @@ reroute:
 			ifp = if_get(im6o->im6o_ifidx);
 	}
 
+#if NPF > 0
+	rt_mtag = m_tag_find(m, PACKET_TAG_PF_ROUTE, NULL);
+	if (rt_mtag != NULL) {
+		struct pf_addr *rt_addr = (struct pf_addr *)(rt_mtag + 1);
+		rt_dst = &rt_addr->v6;
+	}
+#endif
+
 	if (ifp == NULL) {
-		rt = in6_selectroute(&ip6->ip6_dst, opt, ro,
+		rt = in6_selectroute(rt_dst, opt, ro,
 		    m->m_pkthdr.ph_rtableid);
 		if (rt == NULL) {
 			ip6stat_inc(ip6s_noroute);
@@ -480,7 +493,7 @@ reroute:
 			goto bad;
 		}
 	} else {
-		route6_cache(ro, &ip6->ip6_dst, NULL, m->m_pkthdr.ph_rtableid);
+		route6_cache(ro, rt_dst, NULL, m->m_pkthdr.ph_rtableid);
 	}
 
 	if (rt && (rt->rt_flags & RTF_GATEWAY) &&
Index: sys/sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.263 mbuf.h
--- sys/sys/mbuf.h	14 Apr 2024 20:46:27 -0000	1.263
+++ sys/sys/mbuf.h	21 Aug 2024 05:32:39 -0000
@@ -471,12 +471,14 @@ struct m_tag *m_tag_next(struct mbuf *, 
 #define PACKET_TAG_IPSEC_IN_DONE	0x0001  /* IPsec applied, in */
 #define PACKET_TAG_IPSEC_OUT_DONE	0x0002  /* IPsec applied, out */
 #define PACKET_TAG_IPSEC_FLOWINFO	0x0004	/* IPsec flowinfo */
+#define PACKET_TAG_PF_DEFER		0x0008	/* pfsync deferred packet */
 #define PACKET_TAG_IP_OFFNXT		0x0010  /* IPv4 offset and next proto */
 #define PACKET_TAG_IP6_OFFNXT		0x0020  /* IPv6 offset and next proto */
 #define PACKET_TAG_WIREGUARD		0x0040  /* WireGuard data */
 #define PACKET_TAG_GRE			0x0080  /* GRE processing done */
 #define PACKET_TAG_DLT			0x0100 /* data link layer type */
 #define PACKET_TAG_PF_DIVERT		0x0200 /* pf(4) diverted packet */
+#define PACKET_TAG_PF_ROUTE		0x0400 /* pf(4) route-to */
 #define PACKET_TAG_PF_REASSEMBLED	0x0800 /* pf reassembled ipv6 packet */
 #define PACKET_TAG_SRCROUTE		0x1000 /* IPv4 source routing options */
 #define PACKET_TAG_TUNNEL		0x2000	/* Tunnel endpoint address */