Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: link mbufs/inpcbs to pf_states, not pf_state_keys
To:
tech@openbsd.org
Date:
Fri, 16 Jan 2026 07:26:52 +1000

Download raw body.

Thread
On Mon, Jun 02, 2025 at 10:27:39PM +1000, David Gwynne wrote:
> On Fri, May 23, 2025 at 10:24:28AM +0900, Alexander Bluhm wrote:
> > On Wed, May 21, 2025 at 06:36:27PM +0900, Alexander Bluhm wrote:
> > > On Sat, May 10, 2025 at 10:28:56AM +1000, David Gwynne wrote:
> > > > tl;dr: less pointer chasing, and more precise links between things
> > > > representing connections in the kernel.
> > > 
> > > regress/sys/net/pf_forward and regress/sys/net/pflog fail.
> > > 
> > > I am not 100% sure that it is related to this diff.  Due to other
> > > bugs, my last successful regress run was at May 9th.  Give me another
> > > day, then I can say if these tests are broken in current already.
> > 
> > I have tested i386 snapshots, tests pass.  Apply the diff and they
> > fail.
> > 
> > http://bluhm.genua.de/regress/results/2025-05-22T13%3A51%3A46Z/logs/sys/net/pf_forward/make.log
> > 
> > FAIL	sys/net/pf_forward	*** Error 1 in . (Makefile:178 'run-ping-inet-AF_IN'), *** Error 2 in . (Makefile:197 'run-ping-mtu-1400-inet-AF_IN'), *** Error 2 in . (Makefile:219 'run-ping-mtu-1300-inet-AF_IN'), *** Error 1 in . (Makefile:236 'run-udp-inet-AF_IN'), *** Error 1 in . (Makefile:251 'run-tcp-inet-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-icmp-inet-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-udp-inet-AF_IN'), *** Error 1 in . (Makefile:178 'run-ping-inet6-AF_IN'), *** Error 2 in . (Makefile:199 'run-ping-mtu-1400-inet6-AF_IN'), *** Error 2 in . (Makefile:221 'run-ping-mtu-1300-inet6-AF_IN'), *** Error 1 in . (Makefile:236 'run-udp-inet6-AF_IN'), *** Error 1 in . (Makefile:251 'run-tcp-inet6-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-icmp-inet6-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-udp-inet6-AF_IN')
> > 
> > http://bluhm.genua.de/regress/results/2025-05-22T13%3A51%3A46Z/logs/sys/net/pflog/make.log
> 
> > 
> > FAIL	sys/net/pflog	*** Error 1 in . (Makefile:164 'run-ping-14'), *** Error 1 in . (Makefile:168 'run-ping6-14'), *** Error 1 in . (Makefile:199 'run-bpf-nothing')
> 
> thanks for running those tests, and sorry about the delay getting back
> to this. i only got back to looking at this again today.
> 
> i believe i've straightened these issues out in the diff below.

this fixes a pf_state_key leak in an error path pf_state_insert, and
updates it for current.

Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
diff -u -p -r1.302 uipc_mbuf.c
--- kern/uipc_mbuf.c	6 Aug 2025 14:00:33 -0000	1.302
+++ kern/uipc_mbuf.c	15 Jan 2026 05:34:55 -0000
@@ -294,7 +294,7 @@ m_clearhdr(struct mbuf *m)
 	/* delete all mbuf tags to reset the state */
 	m_tag_delete_chain(m);
 #if NPF > 0
-	pf_mbuf_unlink_state_key(m);
+	pf_mbuf_unlink_state(m);
 	pf_mbuf_unlink_inpcb(m);
 #endif	/* NPF > 0 */
 
@@ -422,7 +422,7 @@ m_free(struct mbuf *m)
 	if (m->m_flags & M_PKTHDR) {
 		m_tag_delete_chain(m);
 #if NPF > 0
-		pf_mbuf_unlink_state_key(m);
+		pf_mbuf_unlink_state(m);
 		pf_mbuf_unlink_inpcb(m);
 #endif	/* NPF > 0 */
 	}
@@ -1380,8 +1380,8 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
 	to->m_pkthdr = from->m_pkthdr;
 
 #if NPF > 0
-	to->m_pkthdr.pf.statekey = NULL;
-	pf_mbuf_link_state_key(to, from->m_pkthdr.pf.statekey);
+	to->m_pkthdr.pf.st = NULL;
+	pf_mbuf_link_state(to, from->m_pkthdr.pf.st);
 	to->m_pkthdr.pf.inp = NULL;
 	pf_mbuf_link_inpcb(to, from->m_pkthdr.pf.inp);
 #endif	/* NPF > 0 */
@@ -1511,8 +1511,8 @@ m_print(void *v,
 		    m->m_pkthdr.csum_flags, MCS_BITS);
 		(*pr)("m_pkthdr.ether_vtag: %u\tm_ptkhdr.ph_rtableid: %u\n",
 		    m->m_pkthdr.ether_vtag, m->m_pkthdr.ph_rtableid);
-		(*pr)("m_pkthdr.pf.statekey: %p\tm_pkthdr.pf.inp %p\n",
-		    m->m_pkthdr.pf.statekey, m->m_pkthdr.pf.inp);
+		(*pr)("m_pkthdr.pf.st: %p\tm_pkthdr.pf.inp %p\n",
+		    m->m_pkthdr.pf.st, m->m_pkthdr.pf.inp);
 		(*pr)("m_pkthdr.pf.qid: %u\tm_pkthdr.pf.tag: %u\n",
 		    m->m_pkthdr.pf.qid, m->m_pkthdr.pf.tag);
 		(*pr)("m_pkthdr.pf.flags: %b\n",
Index: net/if_mpw.c
===================================================================
RCS file: /cvs/src/sys/net/if_mpw.c,v
diff -u -p -r1.69 if_mpw.c
--- net/if_mpw.c	27 Nov 2025 03:06:59 -0000	1.69
+++ net/if_mpw.c	15 Jan 2026 05:34:55 -0000
@@ -622,7 +622,7 @@ mpw_input(struct ifnet *ifp, struct mbuf
 	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
 
 	/* packet has not been processed by PF yet. */
-	KASSERT(m->m_pkthdr.pf.statekey == NULL);
+	KASSERT(m->m_pkthdr.pf.st == NULL);
 
 #if NBPFILTER > 0
 	if_bpf = sc->sc_bpf;
Index: net/if_tpmr.c
===================================================================
RCS file: /cvs/src/sys/net/if_tpmr.c,v
diff -u -p -r1.43 if_tpmr.c
--- net/if_tpmr.c	11 Dec 2025 06:02:11 -0000	1.43
+++ net/if_tpmr.c	15 Jan 2026 05:34:55 -0000
@@ -314,7 +314,7 @@ tpmr_pf(struct ifnet *ifp0, int dir, str
 		return (NULL);
 
 	if (dir == PF_IN && ISSET(m->m_pkthdr.pf.flags, PF_TAG_DIVERTED)) {
-		pf_mbuf_unlink_state_key(m);
+		pf_mbuf_unlink_state(m);
 		pf_mbuf_unlink_inpcb(m);
 		if_input_proto(ifp0, m, fam->ip_input, ns);
 		return (NULL);
Index: net/if_veb.c
===================================================================
RCS file: /cvs/src/sys/net/if_veb.c,v
diff -u -p -r1.68 if_veb.c
--- net/if_veb.c	11 Dec 2025 06:02:11 -0000	1.68
+++ net/if_veb.c	15 Jan 2026 05:34:55 -0000
@@ -841,7 +841,7 @@ veb_pf(struct ifnet *ifp0, int dir, stru
 		return (NULL);
 
 	if (dir == PF_IN && ISSET(m->m_pkthdr.pf.flags, PF_TAG_DIVERTED)) {
-		pf_mbuf_unlink_state_key(m);
+		pf_mbuf_unlink_state(m);
 		pf_mbuf_unlink_inpcb(m);
 		if_input_proto(ifp0, m, fam->ip_input, ns);
 		return (NULL);
Index: net/pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
diff -u -p -r1.1230 pf.c
--- net/pf.c	7 Jan 2026 13:50:05 -0000	1.1230
+++ net/pf.c	15 Jan 2026 05:34:55 -0000
@@ -247,15 +247,16 @@ int			 pf_state_insert(struct pfi_kif *,
 			    struct pf_state_key **, struct pf_state_key **,
 			    struct pf_state *);
 
+int			 pf_state_isvalid(struct pf_state *);
 int			 pf_state_key_isvalid(struct pf_state_key *);
 struct pf_state_key	*pf_state_key_ref(struct pf_state_key *);
 void			 pf_state_key_unref(struct pf_state_key *);
-void			 pf_state_key_link_reverse(struct pf_state_key *,
-			    struct pf_state_key *);
-void			 pf_state_key_unlink_reverse(struct pf_state_key *);
-void			 pf_state_key_link_inpcb(struct pf_state_key *,
+void			 pf_state_link_reverse(struct pf_state *,
+			    struct pf_state *);
+void			 pf_state_unlink_reverse(struct pf_state *);
+void			 pf_state_link_inpcb(struct pf_state *,
 			    struct inpcb *);
-void			 pf_state_key_unlink_inpcb(struct pf_state_key *);
+void			 pf_state_unlink_inpcb(struct pf_state *);
 void			 pf_pktenqueue_delayed(void *);
 int32_t			 pf_state_expires(const struct pf_state *, uint8_t);
 
@@ -1134,8 +1135,6 @@ pf_state_key_detach(struct pf_state *st,
 	if (TAILQ_EMPTY(&sk->sk_states)) {
 		RBT_REMOVE(pf_state_tree, &pf_statetbl, sk);
 		sk->sk_removed = 1;
-		pf_state_key_unlink_reverse(sk);
-		pf_state_key_unlink_inpcb(sk);
 		pf_state_key_unref(sk);
 	}
 
@@ -1316,14 +1315,24 @@ pf_state_insert(struct pfi_kif *kif, str
 
 	if (same) {
 		/* pf_state_key_attach might have swapped skw */
-		pf_state_key_unref(sks);
-		st->key[PF_SK_STACK] = sks = pf_state_key_ref(skw);
+		if (skw != sks) {
+			pf_state_key_unref(sks);
+			sks = pf_state_key_ref(skw);
+		}
+		st->key[PF_SK_STACK] = sks;
 	} else if (pf_state_key_attach(sks, st, PF_SK_STACK) == NULL) {
 		pf_state_key_detach(st, PF_SK_WIRE);
+		pf_state_key_unref(skw);
 		PF_STATE_EXIT_WRITE();
 		return (-1);
 	}
 
+	/*
+	 * st owns the sks and skw refs now. pf_state_unref (maybe
+	 * via pf_detach_state) is responsible for releasing them in
+	 * the future.
+	 */
+
 	if (st->id == 0 && st->creatorid == 0) {
 		st->id = htobe64(pf_status.stateid++);
 		st->creatorid = pf_status.hostid;
@@ -1397,91 +1406,31 @@ pf_compare_state_keys(struct pf_state_ke
 	}
 }
 
-int
-pf_find_state(struct pf_pdesc *pd, struct pf_state_key_cmp *key,
-    struct pf_state **stp)
+static inline struct pf_state *
+pf_find_state_lookup(struct pf_pdesc *pd, const struct pf_state_key_cmp *key)
 {
-	struct pf_state_key	*sk, *pkt_sk;
+	struct pf_state_key	*sk;
 	struct pf_state_item	*si;
-	struct pf_state		*st = NULL;
+	struct pf_state		*st;
 	int			 didx;
 
-	counters_inc(pf_status_fcounters, FCNT_STATE_SEARCH);
-	if (pf_status.debug >= LOG_DEBUG) {
-		log(LOG_DEBUG, "pf: key search, %s on %s: ",
-		    pd->dir == PF_OUT ? "out" : "in", pd->kif->pfik_name);
-		pf_print_state_parts(NULL, (struct pf_state_key *)key, NULL);
-		addlog("\n");
-	}
-
-	pkt_sk = NULL;
-	sk = NULL;
-	if (pd->dir == PF_OUT) {
-		/* first if block deals with outbound forwarded packet */
-		pkt_sk = pd->m->m_pkthdr.pf.statekey;
-
-		if (!pf_state_key_isvalid(pkt_sk)) {
-			pf_mbuf_unlink_state_key(pd->m);
-			pkt_sk = NULL;
-		}
-
-		if (pkt_sk && pf_state_key_isvalid(pkt_sk->sk_reverse))
-			sk = pkt_sk->sk_reverse;
-
-		if (pkt_sk == NULL) {
-			struct inpcb *inp = pd->m->m_pkthdr.pf.inp;
-
-			/* here we deal with local outbound packet */
-			if (inp != NULL) {
-				struct pf_state_key	*inp_sk;
-
-				mtx_enter(&pf_inp_mtx);
-				inp_sk = inp->inp_pf_sk;
-				if (pf_state_key_isvalid(inp_sk)) {
-					sk = inp_sk;
-					mtx_leave(&pf_inp_mtx);
-				} else if (inp_sk != NULL) {
-					KASSERT(inp_sk->sk_inp == inp);
-					inp_sk->sk_inp = NULL;
-					inp->inp_pf_sk = NULL;
-					mtx_leave(&pf_inp_mtx);
-
-					pf_state_key_unref(inp_sk);
-					in_pcbunref(inp);
-				} else
-					mtx_leave(&pf_inp_mtx);
-			}
-		}
-	}
-
-	if (sk == NULL) {
-		if ((sk = RBT_FIND(pf_state_tree, &pf_statetbl,
-		    (struct pf_state_key *)key)) == NULL)
-			return (PF_DROP);
-		if (pd->dir == PF_OUT && pkt_sk &&
-		    pf_compare_state_keys(pkt_sk, sk, pd->kif, pd->dir) == 0)
-			pf_state_key_link_reverse(sk, pkt_sk);
-		else if (pd->dir == PF_OUT)
-			pf_state_key_link_inpcb(sk, pd->m->m_pkthdr.pf.inp);
-	}
-
-	/* remove firewall data from outbound packet */
-	if (pd->dir == PF_OUT)
-		pf_pkt_addr_changed(pd->m);
+	sk = RBT_FIND(pf_state_tree, &pf_statetbl, (struct pf_state_key *)key);
+	if (sk == NULL)
+		return (NULL);
 
 	didx = (pd->dir == PF_IN) ? PF_SK_WIRE : PF_SK_STACK;
 
 	/* list is sorted, if-bound states before floating ones */
 	TAILQ_FOREACH(si, &sk->sk_states, si_entry) {
-		struct pf_state *sist = si->si_st;
-		if (sist->timeout == PFTM_PURGE)
+		st = si->si_st;
+		if (st->timeout == PFTM_PURGE)
 			continue;
-		if (sist->kif != pfi_all && sist->kif != pd->kif)
+		if (st->kif != pfi_all && st->kif != pd->kif)
 			continue;
 
-		/* af-to needs to handled specially */
-		if (sist->key[PF_SK_WIRE]->af == sist->key[PF_SK_STACK]->af) {
-			if (sk != sist->key[didx])
+		/* af-to needs to be handled specially */
+		if (st->key[PF_SK_WIRE]->af == st->key[PF_SK_STACK]->af) {
+			if (sk != st->key[didx])
 				continue;
 
 		/* af-to case */
@@ -1499,15 +1448,100 @@ pf_find_state(struct pf_pdesc *pd, struc
 			/* one of the sist keys has to be sk */
 		}
 
-		st = sist;
-		break;
+		return (st);
+	}
+
+	return (NULL);
+}
+
+static struct pf_state *
+pf_find_state_reverse(struct pf_state *strev)
+{
+	struct pf_state *st;
+
+	st = strev->reverse;
+	if (st != NULL) {
+		if (pf_state_isvalid(st))
+			return (st);
+
+		pf_state_unlink_reverse(st);
+	}
+
+	return (NULL);
+}
+
+static struct pf_state *
+pf_find_state_inpcb(struct inpcb *inp)
+{
+	struct pf_state *st;
+	int valid = 1;
+
+	if (READ_ONCE(inp->inp_pf_st) == NULL)
+		return (NULL);
+
+	mtx_enter(&pf_inp_mtx); /* this should be a per inp mtx */
+	st = inp->inp_pf_st;
+	if (st != NULL) {
+		KASSERT(st->inp == inp);
+		valid = pf_state_isvalid(st);
+		if (!valid) {
+			st->inp = NULL;
+			inp->inp_pf_st = NULL;
+		}
 	}
+	mtx_leave(&pf_inp_mtx);
 
+	if (!valid) {
+		pf_state_unref(st);
+		in_pcbunref(inp);
+		return (NULL);
+	}
+
+	return (st);
+}
+
+int
+pf_find_state(struct pf_pdesc *pd, struct pf_state_key_cmp *key,
+    struct pf_state **stp)
+{
+	struct pf_state		*st = NULL;
+	struct pf_state		*strev = NULL;
+	struct inpcb		*inp = NULL;
+
+	counters_inc(pf_status_fcounters, FCNT_STATE_SEARCH);
+	if (pf_status.debug >= LOG_DEBUG) {
+		log(LOG_DEBUG, "pf: key search, %s on %s: ",
+		    pd->dir == PF_OUT ? "out" : "in", pd->kif->pfik_name);
+		pf_print_state_parts(NULL, (struct pf_state_key *)key, NULL);
+		addlog("\n");
+	}
+
+	if (pd->dir == PF_OUT) {
+		strev = pd->m->m_pkthdr.pf.st;
+		inp = pd->m->m_pkthdr.pf.inp;
+
+		/* first if block deals with outbound forwarded packet */
+		if (strev != NULL) {
+			KASSERT(inp == NULL);
+
+			st = pf_find_state_reverse(strev);
+		} else if (inp != NULL) {
+			/* here we deal with local outbound packet */
+
+			st = pf_find_state_inpcb(inp);
+		}
+
+		if (st != NULL)
+			goto match;
+	}
+
+	st = pf_find_state_lookup(pd, key);
 	if (st == NULL)
 		return (PF_DROP);
 	if (ISSET(st->state_flags, PFSTATE_INP_UNLINKED))
 		return (PF_DROP);
 
+match:
 	if (st->rule.ptr->pktrate.limit && pd->dir == st->direction) {
 		pf_add_threshold(&st->rule.ptr->pktrate);
 		if (pf_check_threshold(&st->rule.ptr->pktrate))
@@ -2078,6 +2112,9 @@ pf_remove_state(struct pf_state *st)
 	st->timeout = PFTM_UNLINKED;
 	mtx_leave(&st->mtx);
 
+	pf_state_unlink_reverse(st);
+	pf_state_unlink_inpcb(st);
+
 	/* handle load balancing related tasks */
 	pf_postprocess_addr(st);
 
@@ -2168,52 +2205,47 @@ pf_remove_state(struct pf_state *st)
 void
 pf_remove_divert_state(struct inpcb *inp)
 {
-	struct pf_state_key	*sk;
-	struct pf_state_item	*si;
+	struct pf_state		*st;
 
 	PF_ASSERT_UNLOCKED();
 
-	if (READ_ONCE(inp->inp_pf_sk) == NULL)
+	if (READ_ONCE(inp->inp_pf_st) == NULL)
 		return;
 
 	mtx_enter(&pf_inp_mtx);
-	sk = pf_state_key_ref(inp->inp_pf_sk);
+	st = pf_state_ref(inp->inp_pf_st);
 	mtx_leave(&pf_inp_mtx);
-	if (sk == NULL)
+	if (st == NULL)
 		return;
 
 	PF_LOCK();
 	PF_STATE_ENTER_WRITE();
-	TAILQ_FOREACH(si, &sk->sk_states, si_entry) {
-		struct pf_state *sist = si->si_st;
-		if (sk == sist->key[PF_SK_STACK] && sist->rule.ptr &&
-		    (sist->rule.ptr->divert.type == PF_DIVERT_TO ||
-		     sist->rule.ptr->divert.type == PF_DIVERT_REPLY)) {
-			if (sist->key[PF_SK_STACK]->proto == IPPROTO_TCP &&
-			    sist->key[PF_SK_WIRE] != sist->key[PF_SK_STACK]) {
-				/*
-				 * If the local address is translated, keep
-				 * the state for "tcp.closed" seconds to
-				 * prevent its source port from being reused.
-				 */
-				if (sist->src.state < TCPS_FIN_WAIT_2 ||
-				    sist->dst.state < TCPS_FIN_WAIT_2) {
-					pf_set_protostate(sist, PF_PEER_BOTH,
-					    TCPS_TIME_WAIT);
-					pf_update_state_timeout(sist,
-					    PFTM_TCP_CLOSED);
-					sist->expire = getuptime();
-				}
-				sist->state_flags |= PFSTATE_INP_UNLINKED;
-			} else
-				pf_remove_state(sist);
-			break;
-		}
+	if (st->rule.ptr &&
+	    (st->rule.ptr->divert.type == PF_DIVERT_TO ||
+	     st->rule.ptr->divert.type == PF_DIVERT_REPLY)) {
+		if (st->key[PF_SK_STACK]->proto == IPPROTO_TCP &&
+		    st->key[PF_SK_WIRE] != st->key[PF_SK_STACK]) {
+			/*
+			 * If the local address is translated, keep
+			 * the state for "tcp.closed" seconds to
+			 * prevent its source port from being reused.
+			 */
+			if (st->src.state < TCPS_FIN_WAIT_2 ||
+			    st->dst.state < TCPS_FIN_WAIT_2) {
+				pf_set_protostate(st, PF_PEER_BOTH,
+				    TCPS_TIME_WAIT);
+				pf_update_state_timeout(st,
+				    PFTM_TCP_CLOSED);
+				st->expire = getuptime();
+			}
+			st->state_flags |= PFSTATE_INP_UNLINKED;
+		} else
+			pf_remove_state(st);
 	}
 	PF_STATE_EXIT_WRITE();
 	PF_UNLOCK();
 
-	pf_state_key_unref(sk);
+	pf_state_unref(st);
 }
 
 void
@@ -8613,13 +8645,20 @@ done:
 		pd.m->m_pkthdr.pf.qid = qid;
 	if (st != NULL) {
 		struct mbuf *m = pd.m;
+		struct pf_state *strev = m->m_pkthdr.pf.st;
 		struct inpcb *inp = m->m_pkthdr.pf.inp;
 
 		if (pd.dir == PF_IN) {
+			KASSERT(strev == NULL);
 			KASSERT(inp == NULL);
-			pf_mbuf_link_state_key(m, st->key[PF_SK_STACK]);
-		} else if (pd.dir == PF_OUT)
-			pf_state_key_link_inpcb(st->key[PF_SK_STACK], inp);
+			pf_mbuf_link_state(m, st);
+		} else if (pd.dir == PF_OUT) {
+			if (strev != NULL) {
+				KASSERT(inp == NULL);
+				pf_state_link_reverse(st, strev);
+			} else
+				pf_state_link_inpcb(st, inp);
+		}
 
 		if (!ISSET(m->m_pkthdr.csum_flags, M_FLOWID)) {
 			m->m_pkthdr.ph_flowid = st->key[PF_SK_WIRE]->hash;
@@ -8803,14 +8842,14 @@ out:
 int
 pf_ouraddr(struct mbuf *m)
 {
-	struct pf_state_key	*sk;
+	struct pf_state		*st;
 
 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED)
 		return (1);
 
-	sk = m->m_pkthdr.pf.statekey;
-	if (sk != NULL) {
-		if (READ_ONCE(sk->sk_inp) != NULL)
+	st = m->m_pkthdr.pf.st;
+	if (st != NULL) {
+		if (READ_ONCE(st->inp) != NULL)
 			return (1);
 	}
 
@@ -8824,7 +8863,7 @@ pf_ouraddr(struct mbuf *m)
 void
 pf_pkt_addr_changed(struct mbuf *m)
 {
-	pf_mbuf_unlink_state_key(m);
+	pf_mbuf_unlink_state(m);
 	pf_mbuf_unlink_inpcb(m);
 }
 
@@ -8832,85 +8871,102 @@ struct inpcb *
 pf_inp_lookup(struct mbuf *m)
 {
 	struct inpcb *inp = NULL;
-	struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
+	struct pf_state *st;
 
-	if (!pf_state_key_isvalid(sk))
-		pf_mbuf_unlink_state_key(m);
-	else if (READ_ONCE(sk->sk_inp) != NULL) {
+	st = m->m_pkthdr.pf.st;
+	if (st == NULL)
+		return (NULL);
+	if (!pf_state_isvalid(st)) {
+		pf_mbuf_unlink_state(m);
+		return (NULL);
+	}
+
+	if (READ_ONCE(st->inp) != NULL) {
 		mtx_enter(&pf_inp_mtx);
-		inp = in_pcbref(sk->sk_inp);
+		inp = in_pcbref(st->inp);
 		mtx_leave(&pf_inp_mtx);
 	}
 
 	return (inp);
 }
 
+/*
+ * This is called from the IP stack after it's found an inpcb for
+ * an mbuf so it can link the pf_state to that pcb.
+ */
 void
 pf_inp_link(struct mbuf *m, struct inpcb *inp)
 {
-	struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
+	struct pf_state *st;
 
-	if (!pf_state_key_isvalid(sk)) {
-		pf_mbuf_unlink_state_key(m);
+	st = m->m_pkthdr.pf.st;
+	if (st == NULL)
 		return;
-	}
 
 	/*
 	 * we don't need to grab PF-lock here. At worst case we link inp to
 	 * state, which might be just being marked as deleted by another
 	 * thread.
 	 */
-	pf_state_key_link_inpcb(sk, inp);
+	if (pf_state_isvalid(st)) {
+		if (READ_ONCE(st->inp) == NULL)
+			pf_state_link_inpcb(st, inp);
+	}
 
 	/* The statekey has finished finding the inp, it is no longer needed. */
-	pf_mbuf_unlink_state_key(m);
+	pf_mbuf_unlink_state(m);
 }
 
 void
 pf_inp_unlink(struct inpcb *inp)
 {
-	struct pf_state_key *sk;
+	struct pf_state *st;
 
-	if (READ_ONCE(inp->inp_pf_sk) == NULL)
+	if (READ_ONCE(inp->inp_pf_st) == NULL)
 		return;
 
 	mtx_enter(&pf_inp_mtx);
-	sk = inp->inp_pf_sk;
-	if (sk == NULL) {
+	st = inp->inp_pf_st;
+	if (st == NULL) {
 		mtx_leave(&pf_inp_mtx);
 		return;
 	}
-	KASSERT(sk->sk_inp == inp);
-	sk->sk_inp = NULL;
-	inp->inp_pf_sk = NULL;
+	KASSERT(st->inp == inp);
+	st->inp = NULL;
+	inp->inp_pf_st = NULL;
 	mtx_leave(&pf_inp_mtx);
 
-	pf_state_key_unref(sk);
+	pf_state_unref(st);
 	in_pcbunref(inp);
 }
 
 void
-pf_state_key_link_reverse(struct pf_state_key *sk, struct pf_state_key *skrev)
+pf_state_link_reverse(struct pf_state *st, struct pf_state *strev)
 {
-	struct pf_state_key *old_reverse;
+	struct pf_state *ost;
 
-	old_reverse = atomic_cas_ptr(&sk->sk_reverse, NULL, skrev);
-	if (old_reverse != NULL)
-		KASSERT(old_reverse == skrev);
-	else {
-		pf_state_key_ref(skrev);
+	ost = atomic_cas_ptr(&st->reverse, NULL, strev);
+	if (ost != NULL) {
+		KASSERTMSG(ost == strev,
+		    "st %p reverse is %p, not strev %p ",
+		    st, ost, strev);
+	} else {
+		pf_state_ref(strev);
 
 		/*
-		 * NOTE: if sk == skrev, then KASSERT() below holds true, we
+		 * NOTE: if st == strev, then KASSERT() below holds true, we
 		 * still want to grab a reference in such case, because
-		 * pf_state_key_unlink_reverse() does not check whether keys
+		 * pf_state_unlink_reverse() does not check whether states
 		 * are identical or not.
 		 */
-		old_reverse = atomic_cas_ptr(&skrev->sk_reverse, NULL, sk);
-		if (old_reverse != NULL)
-			KASSERT(old_reverse == sk);
+		ost = atomic_cas_ptr(&strev->reverse, NULL, st);
+		if (ost != NULL) {
+			KASSERTMSG(ost == st,
+			    "strev %p reverse is %p, not st %p",
+			    strev, ost, st);
+		}
 
-		pf_state_key_ref(sk);
+		pf_state_ref(st);
 	}
 }
 
@@ -8946,10 +9002,6 @@ pf_state_key_unref(struct pf_state_key *
 	if (PF_REF_RELE(sk->sk_refcnt)) {
 		/* state key must be removed from tree */
 		KASSERT(!pf_state_key_isvalid(sk));
-		/* state key must be unlinked from reverse key */
-		KASSERT(sk->sk_reverse == NULL);
-		/* state key must be unlinked from socket */
-		KASSERT(sk->sk_inp == NULL);
 		pool_put(&pf_state_key_pl, sk);
 	}
 }
@@ -8960,21 +9012,28 @@ pf_state_key_isvalid(struct pf_state_key
 	return ((sk != NULL) && (sk->sk_removed == 0));
 }
 
+int
+pf_state_isvalid(struct pf_state *st)
+{
+	return (st->timeout < PFTM_MAX);
+}
+
 void
-pf_mbuf_link_state_key(struct mbuf *m, struct pf_state_key *sk)
+pf_mbuf_link_state(struct mbuf *m, struct pf_state *st)
 {
-	KASSERT(m->m_pkthdr.pf.statekey == NULL);
-	m->m_pkthdr.pf.statekey = pf_state_key_ref(sk);
+	KASSERT(m->m_pkthdr.pf.st == NULL);
+	m->m_pkthdr.pf.st = pf_state_ref(st);
 }
 
 void
-pf_mbuf_unlink_state_key(struct mbuf *m)
+pf_mbuf_unlink_state(struct mbuf *m)
 {
-	struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
+	struct pf_state *st;
 
-	if (sk != NULL) {
-		m->m_pkthdr.pf.statekey = NULL;
-		pf_state_key_unref(sk);
+	st = m->m_pkthdr.pf.st;
+	if (st != NULL) {
+		m->m_pkthdr.pf.st = NULL;
+		pf_state_unref(st);
 	}
 }
 
@@ -8997,57 +9056,58 @@ pf_mbuf_unlink_inpcb(struct mbuf *m)
 }
 
 void
-pf_state_key_link_inpcb(struct pf_state_key *sk, struct inpcb *inp)
+pf_state_link_inpcb(struct pf_state *st, struct inpcb *inp)
 {
-	if (inp == NULL || READ_ONCE(sk->sk_inp) != NULL)
+	if (inp == NULL || READ_ONCE(st->inp) != NULL)
 		return;
 
 	mtx_enter(&pf_inp_mtx);
-	if (inp->inp_pf_sk != NULL || sk->sk_inp != NULL) {
+	if (inp->inp_pf_st != NULL || st->inp != NULL) {
 		mtx_leave(&pf_inp_mtx);
 		return;
 	}
-	sk->sk_inp = in_pcbref(inp);
-	inp->inp_pf_sk = pf_state_key_ref(sk);
+	st->inp = in_pcbref(inp);
+	inp->inp_pf_st = pf_state_ref(st);
 	mtx_leave(&pf_inp_mtx);
 }
 
 void
-pf_state_key_unlink_inpcb(struct pf_state_key *sk)
+pf_state_unlink_inpcb(struct pf_state *st)
 {
 	struct inpcb *inp;
 
-	if (READ_ONCE(sk->sk_inp) == NULL)
+	if (READ_ONCE(st->inp) == NULL)
 		return;
 
 	mtx_enter(&pf_inp_mtx);
-	inp = sk->sk_inp;
+	inp = st->inp;
 	if (inp == NULL) {
 		mtx_leave(&pf_inp_mtx);
 		return;
 	}
-	KASSERT(inp->inp_pf_sk == sk);
-	sk->sk_inp = NULL;
-	inp->inp_pf_sk = NULL;
+	KASSERT(inp->inp_pf_st == st);
+	st->inp = NULL;
+	inp->inp_pf_st = NULL;
 	mtx_leave(&pf_inp_mtx);
 
-	pf_state_key_unref(sk);
+	pf_state_unref(st);
 	in_pcbunref(inp);
 }
 
 void
-pf_state_key_unlink_reverse(struct pf_state_key *sk)
+pf_state_unlink_reverse(struct pf_state *st)
 {
-	struct pf_state_key *skrev = sk->sk_reverse;
-
-	/* Note that sk and skrev may be equal, then we unref twice. */
-	if (skrev != NULL) {
-		KASSERT(skrev->sk_reverse == sk);
-		sk->sk_reverse = NULL;
-		skrev->sk_reverse = NULL;
-		pf_state_key_unref(skrev);
-		pf_state_key_unref(sk);
-	}
+	struct pf_state *strev;
+ 
+	/* Note that st and strev may be equal, then we unref twice. */
+	strev = st->reverse;
+	if (strev != NULL) {
+		KASSERT(strev->reverse == st);
+		st->reverse = NULL;
+		strev->reverse = NULL;
+		pf_state_unref(strev);
+		pf_state_unref(st);
+       }
 }
 
 struct pf_state *
@@ -9073,6 +9133,11 @@ pf_state_unref(struct pf_state *st)
 
 		pf_state_key_unref(st->key[PF_SK_WIRE]);
 		pf_state_key_unref(st->key[PF_SK_STACK]);
+
+		/* state must be unlinked from reverse */
+		KASSERT(st->reverse == NULL);
+		/* state must be unlinked from socket */
+		KASSERT(st->inp == NULL);
 
 		KASSERT(SLIST_EMPTY(&st->linkage));
 
Index: net/pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
diff -u -p -r1.545 pfvar.h
--- net/pfvar.h	7 Jan 2026 13:50:05 -0000	1.545
+++ net/pfvar.h	15 Jan 2026 05:34:55 -0000
@@ -2000,9 +2000,8 @@ int			 pf_map_addr(sa_family_t, struct p
 			    struct pf_pool *, enum pf_sn_types);
 int			 pf_postprocess_addr(struct pf_state *);
 
-void			 pf_mbuf_link_state_key(struct mbuf *,
-			    struct pf_state_key *);
-void			 pf_mbuf_unlink_state_key(struct mbuf *);
+void			 pf_mbuf_link_state(struct mbuf *, struct pf_state *);
+void			 pf_mbuf_unlink_state(struct mbuf *);
 void			 pf_mbuf_link_inpcb(struct mbuf *, struct inpcb *);
 void			 pf_mbuf_unlink_inpcb(struct mbuf *);
 
Index: net/pfvar_priv.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar_priv.h,v
diff -u -p -r1.40 pfvar_priv.h
--- net/pfvar_priv.h	28 Nov 2025 22:55:21 -0000	1.40
+++ net/pfvar_priv.h	15 Jan 2026 05:34:55 -0000
@@ -42,11 +42,6 @@
 #include <sys/pclock.h>
 #include <sys/percpu.h>
 
-/*
- * Locks used to protect struct members in this file:
- *	L	pf_inp_mtx		link pf to inp mutex
- */
-
 struct pfsync_deferral;
 struct kstat;
 
@@ -114,8 +109,6 @@ struct pf_state_key {
 
 	RBT_ENTRY(pf_state_key)	 sk_entry;
 	struct pf_statelisthead	 sk_states;
-	struct pf_state_key	*sk_reverse;
-	struct inpcb		*sk_inp;	/* [L] */
 	pf_refcnt_t		 sk_refcnt;
 	u_int8_t		 sk_removed;
 };
@@ -135,7 +128,8 @@ RBT_PROTOTYPE(pf_state_tree, pf_state_ke
  *	M	pf_state mtx
  *	P	PF_STATE_LOCK
  *	S	pfsync
- *	L	pf_state_list
+ *	L	pf_inp_mtx		link pf to inp mutex
+ *	G	pf_state_list
  *	g	pf_purge gc
  */
 
@@ -147,7 +141,7 @@ struct pf_state {
 
 	TAILQ_ENTRY(pf_state)	 sync_list;	/* [S] */
 	struct pfsync_deferral	*sync_defer;	/* [S] */
-	TAILQ_ENTRY(pf_state)	 entry_list;	/* [L] */
+	TAILQ_ENTRY(pf_state)	 entry_list;	/* [G] */
 	SLIST_ENTRY(pf_state)	 gc_list;	/* [g] */
 	RBT_ENTRY(pf_state)	 entry_id;	/* [P] */
 	struct pf_state_peer	 src;
@@ -160,6 +154,8 @@ struct pf_state {
 	struct pf_sn_head	 src_nodes;	/* [I] */
 	struct pf_state_key	*key[2];	/* [I] stack and wire */
 	struct pfi_kif		*kif;		/* [I] */
+	struct pf_state		*reverse;	/* [M] */
+	struct inpcb		*inp;		/* [L] */
 	struct mutex		 mtx;
 	pf_refcnt_t		 refcnt;
 	u_int64_t		 packets[2];
Index: netinet/in_pcb.h
===================================================================
RCS file: /cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.172 in_pcb.h
--- netinet/in_pcb.h	24 Oct 2025 15:09:56 -0000	1.172
+++ netinet/in_pcb.h	15 Jan 2026 05:34:55 -0000
@@ -109,7 +109,7 @@
  * Protocol input only reads inp_[lf]addr/port during lookup and is safe.
  */
 
-struct pf_state_key;
+struct pf_state;
 
 union inpaddru {
 	struct in_addr iau_addr;
@@ -166,7 +166,7 @@ struct inpcb {
 
 	int	inp_cksum6;
 	struct	icmp6_filter *inp_icmp6filt;
-	struct	pf_state_key *inp_pf_sk; /* [L] */
+	struct	pf_state *inp_pf_st;	/* [L] */
 	struct	mbuf *(*inp_upcall)(void *, struct mbuf *,
 	    struct ip *, struct ip6_hdr *, void *, int, struct netstack *);
 	void	*inp_upcall_arg;
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.267 mbuf.h
--- sys/mbuf.h	25 Jun 2025 20:26:32 -0000	1.267
+++ sys/mbuf.h	15 Jan 2026 05:34:55 -0000
@@ -90,11 +90,11 @@ struct m_hdr {
 };
 
 /* pf stuff */
-struct pf_state_key;
+struct pf_state;
 struct inpcb;
 
 struct pkthdr_pf {
-	struct pf_state_key *statekey;	/* pf stackside statekey */
+	struct pf_state	*st;		/* pf state */
 	struct inpcb	*inp;		/* connected pcb for outgoing packet */
 	u_int32_t	 qid;		/* queue id */
 	u_int16_t	 tag;		/* tag id */
@@ -325,7 +325,7 @@ u_int mextfree_register(void (*)(caddr_t
 	(to)->m_pkthdr = (from)->m_pkthdr;				\
 	(from)->m_flags &= ~M_PKTHDR;					\
 	SLIST_INIT(&(from)->m_pkthdr.ph_tags);				\
-	(from)->m_pkthdr.pf.statekey = NULL;				\
+	(from)->m_pkthdr.pf.st = NULL;					\
 } while (/* CONSTCOND */ 0)
 
 /*