From: David Gwynne Subject: Re: link mbufs/inpcbs to pf_states, not pf_state_keys To: tech@openbsd.org Date: Fri, 16 Jan 2026 07:26:52 +1000 On Mon, Jun 02, 2025 at 10:27:39PM +1000, David Gwynne wrote: > On Fri, May 23, 2025 at 10:24:28AM +0900, Alexander Bluhm wrote: > > On Wed, May 21, 2025 at 06:36:27PM +0900, Alexander Bluhm wrote: > > > On Sat, May 10, 2025 at 10:28:56AM +1000, David Gwynne wrote: > > > > tl;dr: less pointer chasing, and more precise links between things > > > > representing connections in the kernel. > > > > > > regress/sys/net/pf_forward and regress/sys/net/pflog fail. > > > > > > I am not 100% sure that it is related to this diff. Due to other > > > bugs, my last successful regress run was at May 9th. Give me another > > > day, then I can say if these tests are broken in current already. > > > > I have tested i386 snapshots, tests pass. Apply the diff and they > > fail. > > > > http://bluhm.genua.de/regress/results/2025-05-22T13%3A51%3A46Z/logs/sys/net/pf_forward/make.log > > > > FAIL sys/net/pf_forward *** Error 1 in . (Makefile:178 'run-ping-inet-AF_IN'), *** Error 2 in . (Makefile:197 'run-ping-mtu-1400-inet-AF_IN'), *** Error 2 in . (Makefile:219 'run-ping-mtu-1300-inet-AF_IN'), *** Error 1 in . (Makefile:236 'run-udp-inet-AF_IN'), *** Error 1 in . (Makefile:251 'run-tcp-inet-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-icmp-inet-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-udp-inet-AF_IN'), *** Error 1 in . (Makefile:178 'run-ping-inet6-AF_IN'), *** Error 2 in . (Makefile:199 'run-ping-mtu-1400-inet6-AF_IN'), *** Error 2 in . (Makefile:221 'run-ping-mtu-1300-inet6-AF_IN'), *** Error 1 in . (Makefile:236 'run-udp-inet6-AF_IN'), *** Error 1 in . (Makefile:251 'run-tcp-inet6-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-icmp-inet6-AF_IN'), *** Error 1 in . (Makefile:269 'run-traceroute-udp-inet6-AF_IN') > > > > http://bluhm.genua.de/regress/results/2025-05-22T13%3A51%3A46Z/logs/sys/net/pflog/make.log > > > > > FAIL sys/net/pflog *** Error 1 in . (Makefile:164 'run-ping-14'), *** Error 1 in . (Makefile:168 'run-ping6-14'), *** Error 1 in . (Makefile:199 'run-bpf-nothing') > > thanks for running those tests, and sorry about the delay getting back > to this. i only got back to looking at this again today. > > i believe i've straightened these issues out in the diff below. this fixes a pf_state_key leak in an error path pf_state_insert, and updates it for current. Index: kern/uipc_mbuf.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v diff -u -p -r1.302 uipc_mbuf.c --- kern/uipc_mbuf.c 6 Aug 2025 14:00:33 -0000 1.302 +++ kern/uipc_mbuf.c 15 Jan 2026 05:34:55 -0000 @@ -294,7 +294,7 @@ m_clearhdr(struct mbuf *m) /* delete all mbuf tags to reset the state */ m_tag_delete_chain(m); #if NPF > 0 - pf_mbuf_unlink_state_key(m); + pf_mbuf_unlink_state(m); pf_mbuf_unlink_inpcb(m); #endif /* NPF > 0 */ @@ -422,7 +422,7 @@ m_free(struct mbuf *m) if (m->m_flags & M_PKTHDR) { m_tag_delete_chain(m); #if NPF > 0 - pf_mbuf_unlink_state_key(m); + pf_mbuf_unlink_state(m); pf_mbuf_unlink_inpcb(m); #endif /* NPF > 0 */ } @@ -1380,8 +1380,8 @@ m_dup_pkthdr(struct mbuf *to, struct mbu to->m_pkthdr = from->m_pkthdr; #if NPF > 0 - to->m_pkthdr.pf.statekey = NULL; - pf_mbuf_link_state_key(to, from->m_pkthdr.pf.statekey); + to->m_pkthdr.pf.st = NULL; + pf_mbuf_link_state(to, from->m_pkthdr.pf.st); to->m_pkthdr.pf.inp = NULL; pf_mbuf_link_inpcb(to, from->m_pkthdr.pf.inp); #endif /* NPF > 0 */ @@ -1511,8 +1511,8 @@ m_print(void *v, m->m_pkthdr.csum_flags, MCS_BITS); (*pr)("m_pkthdr.ether_vtag: %u\tm_ptkhdr.ph_rtableid: %u\n", m->m_pkthdr.ether_vtag, m->m_pkthdr.ph_rtableid); - (*pr)("m_pkthdr.pf.statekey: %p\tm_pkthdr.pf.inp %p\n", - m->m_pkthdr.pf.statekey, m->m_pkthdr.pf.inp); + (*pr)("m_pkthdr.pf.st: %p\tm_pkthdr.pf.inp %p\n", + m->m_pkthdr.pf.st, m->m_pkthdr.pf.inp); (*pr)("m_pkthdr.pf.qid: %u\tm_pkthdr.pf.tag: %u\n", m->m_pkthdr.pf.qid, m->m_pkthdr.pf.tag); (*pr)("m_pkthdr.pf.flags: %b\n", Index: net/if_mpw.c =================================================================== RCS file: /cvs/src/sys/net/if_mpw.c,v diff -u -p -r1.69 if_mpw.c --- net/if_mpw.c 27 Nov 2025 03:06:59 -0000 1.69 +++ net/if_mpw.c 15 Jan 2026 05:34:55 -0000 @@ -622,7 +622,7 @@ mpw_input(struct ifnet *ifp, struct mbuf m->m_pkthdr.ph_rtableid = ifp->if_rdomain; /* packet has not been processed by PF yet. */ - KASSERT(m->m_pkthdr.pf.statekey == NULL); + KASSERT(m->m_pkthdr.pf.st == NULL); #if NBPFILTER > 0 if_bpf = sc->sc_bpf; Index: net/if_tpmr.c =================================================================== RCS file: /cvs/src/sys/net/if_tpmr.c,v diff -u -p -r1.43 if_tpmr.c --- net/if_tpmr.c 11 Dec 2025 06:02:11 -0000 1.43 +++ net/if_tpmr.c 15 Jan 2026 05:34:55 -0000 @@ -314,7 +314,7 @@ tpmr_pf(struct ifnet *ifp0, int dir, str return (NULL); if (dir == PF_IN && ISSET(m->m_pkthdr.pf.flags, PF_TAG_DIVERTED)) { - pf_mbuf_unlink_state_key(m); + pf_mbuf_unlink_state(m); pf_mbuf_unlink_inpcb(m); if_input_proto(ifp0, m, fam->ip_input, ns); return (NULL); Index: net/if_veb.c =================================================================== RCS file: /cvs/src/sys/net/if_veb.c,v diff -u -p -r1.68 if_veb.c --- net/if_veb.c 11 Dec 2025 06:02:11 -0000 1.68 +++ net/if_veb.c 15 Jan 2026 05:34:55 -0000 @@ -841,7 +841,7 @@ veb_pf(struct ifnet *ifp0, int dir, stru return (NULL); if (dir == PF_IN && ISSET(m->m_pkthdr.pf.flags, PF_TAG_DIVERTED)) { - pf_mbuf_unlink_state_key(m); + pf_mbuf_unlink_state(m); pf_mbuf_unlink_inpcb(m); if_input_proto(ifp0, m, fam->ip_input, ns); return (NULL); Index: net/pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v diff -u -p -r1.1230 pf.c --- net/pf.c 7 Jan 2026 13:50:05 -0000 1.1230 +++ net/pf.c 15 Jan 2026 05:34:55 -0000 @@ -247,15 +247,16 @@ int pf_state_insert(struct pfi_kif *, struct pf_state_key **, struct pf_state_key **, struct pf_state *); +int pf_state_isvalid(struct pf_state *); int pf_state_key_isvalid(struct pf_state_key *); struct pf_state_key *pf_state_key_ref(struct pf_state_key *); void pf_state_key_unref(struct pf_state_key *); -void pf_state_key_link_reverse(struct pf_state_key *, - struct pf_state_key *); -void pf_state_key_unlink_reverse(struct pf_state_key *); -void pf_state_key_link_inpcb(struct pf_state_key *, +void pf_state_link_reverse(struct pf_state *, + struct pf_state *); +void pf_state_unlink_reverse(struct pf_state *); +void pf_state_link_inpcb(struct pf_state *, struct inpcb *); -void pf_state_key_unlink_inpcb(struct pf_state_key *); +void pf_state_unlink_inpcb(struct pf_state *); void pf_pktenqueue_delayed(void *); int32_t pf_state_expires(const struct pf_state *, uint8_t); @@ -1134,8 +1135,6 @@ pf_state_key_detach(struct pf_state *st, if (TAILQ_EMPTY(&sk->sk_states)) { RBT_REMOVE(pf_state_tree, &pf_statetbl, sk); sk->sk_removed = 1; - pf_state_key_unlink_reverse(sk); - pf_state_key_unlink_inpcb(sk); pf_state_key_unref(sk); } @@ -1316,14 +1315,24 @@ pf_state_insert(struct pfi_kif *kif, str if (same) { /* pf_state_key_attach might have swapped skw */ - pf_state_key_unref(sks); - st->key[PF_SK_STACK] = sks = pf_state_key_ref(skw); + if (skw != sks) { + pf_state_key_unref(sks); + sks = pf_state_key_ref(skw); + } + st->key[PF_SK_STACK] = sks; } else if (pf_state_key_attach(sks, st, PF_SK_STACK) == NULL) { pf_state_key_detach(st, PF_SK_WIRE); + pf_state_key_unref(skw); PF_STATE_EXIT_WRITE(); return (-1); } + /* + * st owns the sks and skw refs now. pf_state_unref (maybe + * via pf_detach_state) is responsible for releasing them in + * the future. + */ + if (st->id == 0 && st->creatorid == 0) { st->id = htobe64(pf_status.stateid++); st->creatorid = pf_status.hostid; @@ -1397,91 +1406,31 @@ pf_compare_state_keys(struct pf_state_ke } } -int -pf_find_state(struct pf_pdesc *pd, struct pf_state_key_cmp *key, - struct pf_state **stp) +static inline struct pf_state * +pf_find_state_lookup(struct pf_pdesc *pd, const struct pf_state_key_cmp *key) { - struct pf_state_key *sk, *pkt_sk; + struct pf_state_key *sk; struct pf_state_item *si; - struct pf_state *st = NULL; + struct pf_state *st; int didx; - counters_inc(pf_status_fcounters, FCNT_STATE_SEARCH); - if (pf_status.debug >= LOG_DEBUG) { - log(LOG_DEBUG, "pf: key search, %s on %s: ", - pd->dir == PF_OUT ? "out" : "in", pd->kif->pfik_name); - pf_print_state_parts(NULL, (struct pf_state_key *)key, NULL); - addlog("\n"); - } - - pkt_sk = NULL; - sk = NULL; - if (pd->dir == PF_OUT) { - /* first if block deals with outbound forwarded packet */ - pkt_sk = pd->m->m_pkthdr.pf.statekey; - - if (!pf_state_key_isvalid(pkt_sk)) { - pf_mbuf_unlink_state_key(pd->m); - pkt_sk = NULL; - } - - if (pkt_sk && pf_state_key_isvalid(pkt_sk->sk_reverse)) - sk = pkt_sk->sk_reverse; - - if (pkt_sk == NULL) { - struct inpcb *inp = pd->m->m_pkthdr.pf.inp; - - /* here we deal with local outbound packet */ - if (inp != NULL) { - struct pf_state_key *inp_sk; - - mtx_enter(&pf_inp_mtx); - inp_sk = inp->inp_pf_sk; - if (pf_state_key_isvalid(inp_sk)) { - sk = inp_sk; - mtx_leave(&pf_inp_mtx); - } else if (inp_sk != NULL) { - KASSERT(inp_sk->sk_inp == inp); - inp_sk->sk_inp = NULL; - inp->inp_pf_sk = NULL; - mtx_leave(&pf_inp_mtx); - - pf_state_key_unref(inp_sk); - in_pcbunref(inp); - } else - mtx_leave(&pf_inp_mtx); - } - } - } - - if (sk == NULL) { - if ((sk = RBT_FIND(pf_state_tree, &pf_statetbl, - (struct pf_state_key *)key)) == NULL) - return (PF_DROP); - if (pd->dir == PF_OUT && pkt_sk && - pf_compare_state_keys(pkt_sk, sk, pd->kif, pd->dir) == 0) - pf_state_key_link_reverse(sk, pkt_sk); - else if (pd->dir == PF_OUT) - pf_state_key_link_inpcb(sk, pd->m->m_pkthdr.pf.inp); - } - - /* remove firewall data from outbound packet */ - if (pd->dir == PF_OUT) - pf_pkt_addr_changed(pd->m); + sk = RBT_FIND(pf_state_tree, &pf_statetbl, (struct pf_state_key *)key); + if (sk == NULL) + return (NULL); didx = (pd->dir == PF_IN) ? PF_SK_WIRE : PF_SK_STACK; /* list is sorted, if-bound states before floating ones */ TAILQ_FOREACH(si, &sk->sk_states, si_entry) { - struct pf_state *sist = si->si_st; - if (sist->timeout == PFTM_PURGE) + st = si->si_st; + if (st->timeout == PFTM_PURGE) continue; - if (sist->kif != pfi_all && sist->kif != pd->kif) + if (st->kif != pfi_all && st->kif != pd->kif) continue; - /* af-to needs to handled specially */ - if (sist->key[PF_SK_WIRE]->af == sist->key[PF_SK_STACK]->af) { - if (sk != sist->key[didx]) + /* af-to needs to be handled specially */ + if (st->key[PF_SK_WIRE]->af == st->key[PF_SK_STACK]->af) { + if (sk != st->key[didx]) continue; /* af-to case */ @@ -1499,15 +1448,100 @@ pf_find_state(struct pf_pdesc *pd, struc /* one of the sist keys has to be sk */ } - st = sist; - break; + return (st); + } + + return (NULL); +} + +static struct pf_state * +pf_find_state_reverse(struct pf_state *strev) +{ + struct pf_state *st; + + st = strev->reverse; + if (st != NULL) { + if (pf_state_isvalid(st)) + return (st); + + pf_state_unlink_reverse(st); + } + + return (NULL); +} + +static struct pf_state * +pf_find_state_inpcb(struct inpcb *inp) +{ + struct pf_state *st; + int valid = 1; + + if (READ_ONCE(inp->inp_pf_st) == NULL) + return (NULL); + + mtx_enter(&pf_inp_mtx); /* this should be a per inp mtx */ + st = inp->inp_pf_st; + if (st != NULL) { + KASSERT(st->inp == inp); + valid = pf_state_isvalid(st); + if (!valid) { + st->inp = NULL; + inp->inp_pf_st = NULL; + } } + mtx_leave(&pf_inp_mtx); + if (!valid) { + pf_state_unref(st); + in_pcbunref(inp); + return (NULL); + } + + return (st); +} + +int +pf_find_state(struct pf_pdesc *pd, struct pf_state_key_cmp *key, + struct pf_state **stp) +{ + struct pf_state *st = NULL; + struct pf_state *strev = NULL; + struct inpcb *inp = NULL; + + counters_inc(pf_status_fcounters, FCNT_STATE_SEARCH); + if (pf_status.debug >= LOG_DEBUG) { + log(LOG_DEBUG, "pf: key search, %s on %s: ", + pd->dir == PF_OUT ? "out" : "in", pd->kif->pfik_name); + pf_print_state_parts(NULL, (struct pf_state_key *)key, NULL); + addlog("\n"); + } + + if (pd->dir == PF_OUT) { + strev = pd->m->m_pkthdr.pf.st; + inp = pd->m->m_pkthdr.pf.inp; + + /* first if block deals with outbound forwarded packet */ + if (strev != NULL) { + KASSERT(inp == NULL); + + st = pf_find_state_reverse(strev); + } else if (inp != NULL) { + /* here we deal with local outbound packet */ + + st = pf_find_state_inpcb(inp); + } + + if (st != NULL) + goto match; + } + + st = pf_find_state_lookup(pd, key); if (st == NULL) return (PF_DROP); if (ISSET(st->state_flags, PFSTATE_INP_UNLINKED)) return (PF_DROP); +match: if (st->rule.ptr->pktrate.limit && pd->dir == st->direction) { pf_add_threshold(&st->rule.ptr->pktrate); if (pf_check_threshold(&st->rule.ptr->pktrate)) @@ -2078,6 +2112,9 @@ pf_remove_state(struct pf_state *st) st->timeout = PFTM_UNLINKED; mtx_leave(&st->mtx); + pf_state_unlink_reverse(st); + pf_state_unlink_inpcb(st); + /* handle load balancing related tasks */ pf_postprocess_addr(st); @@ -2168,52 +2205,47 @@ pf_remove_state(struct pf_state *st) void pf_remove_divert_state(struct inpcb *inp) { - struct pf_state_key *sk; - struct pf_state_item *si; + struct pf_state *st; PF_ASSERT_UNLOCKED(); - if (READ_ONCE(inp->inp_pf_sk) == NULL) + if (READ_ONCE(inp->inp_pf_st) == NULL) return; mtx_enter(&pf_inp_mtx); - sk = pf_state_key_ref(inp->inp_pf_sk); + st = pf_state_ref(inp->inp_pf_st); mtx_leave(&pf_inp_mtx); - if (sk == NULL) + if (st == NULL) return; PF_LOCK(); PF_STATE_ENTER_WRITE(); - TAILQ_FOREACH(si, &sk->sk_states, si_entry) { - struct pf_state *sist = si->si_st; - if (sk == sist->key[PF_SK_STACK] && sist->rule.ptr && - (sist->rule.ptr->divert.type == PF_DIVERT_TO || - sist->rule.ptr->divert.type == PF_DIVERT_REPLY)) { - if (sist->key[PF_SK_STACK]->proto == IPPROTO_TCP && - sist->key[PF_SK_WIRE] != sist->key[PF_SK_STACK]) { - /* - * If the local address is translated, keep - * the state for "tcp.closed" seconds to - * prevent its source port from being reused. - */ - if (sist->src.state < TCPS_FIN_WAIT_2 || - sist->dst.state < TCPS_FIN_WAIT_2) { - pf_set_protostate(sist, PF_PEER_BOTH, - TCPS_TIME_WAIT); - pf_update_state_timeout(sist, - PFTM_TCP_CLOSED); - sist->expire = getuptime(); - } - sist->state_flags |= PFSTATE_INP_UNLINKED; - } else - pf_remove_state(sist); - break; - } + if (st->rule.ptr && + (st->rule.ptr->divert.type == PF_DIVERT_TO || + st->rule.ptr->divert.type == PF_DIVERT_REPLY)) { + if (st->key[PF_SK_STACK]->proto == IPPROTO_TCP && + st->key[PF_SK_WIRE] != st->key[PF_SK_STACK]) { + /* + * If the local address is translated, keep + * the state for "tcp.closed" seconds to + * prevent its source port from being reused. + */ + if (st->src.state < TCPS_FIN_WAIT_2 || + st->dst.state < TCPS_FIN_WAIT_2) { + pf_set_protostate(st, PF_PEER_BOTH, + TCPS_TIME_WAIT); + pf_update_state_timeout(st, + PFTM_TCP_CLOSED); + st->expire = getuptime(); + } + st->state_flags |= PFSTATE_INP_UNLINKED; + } else + pf_remove_state(st); } PF_STATE_EXIT_WRITE(); PF_UNLOCK(); - pf_state_key_unref(sk); + pf_state_unref(st); } void @@ -8613,13 +8645,20 @@ done: pd.m->m_pkthdr.pf.qid = qid; if (st != NULL) { struct mbuf *m = pd.m; + struct pf_state *strev = m->m_pkthdr.pf.st; struct inpcb *inp = m->m_pkthdr.pf.inp; if (pd.dir == PF_IN) { + KASSERT(strev == NULL); KASSERT(inp == NULL); - pf_mbuf_link_state_key(m, st->key[PF_SK_STACK]); - } else if (pd.dir == PF_OUT) - pf_state_key_link_inpcb(st->key[PF_SK_STACK], inp); + pf_mbuf_link_state(m, st); + } else if (pd.dir == PF_OUT) { + if (strev != NULL) { + KASSERT(inp == NULL); + pf_state_link_reverse(st, strev); + } else + pf_state_link_inpcb(st, inp); + } if (!ISSET(m->m_pkthdr.csum_flags, M_FLOWID)) { m->m_pkthdr.ph_flowid = st->key[PF_SK_WIRE]->hash; @@ -8803,14 +8842,14 @@ out: int pf_ouraddr(struct mbuf *m) { - struct pf_state_key *sk; + struct pf_state *st; if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) return (1); - sk = m->m_pkthdr.pf.statekey; - if (sk != NULL) { - if (READ_ONCE(sk->sk_inp) != NULL) + st = m->m_pkthdr.pf.st; + if (st != NULL) { + if (READ_ONCE(st->inp) != NULL) return (1); } @@ -8824,7 +8863,7 @@ pf_ouraddr(struct mbuf *m) void pf_pkt_addr_changed(struct mbuf *m) { - pf_mbuf_unlink_state_key(m); + pf_mbuf_unlink_state(m); pf_mbuf_unlink_inpcb(m); } @@ -8832,85 +8871,102 @@ struct inpcb * pf_inp_lookup(struct mbuf *m) { struct inpcb *inp = NULL; - struct pf_state_key *sk = m->m_pkthdr.pf.statekey; + struct pf_state *st; - if (!pf_state_key_isvalid(sk)) - pf_mbuf_unlink_state_key(m); - else if (READ_ONCE(sk->sk_inp) != NULL) { + st = m->m_pkthdr.pf.st; + if (st == NULL) + return (NULL); + if (!pf_state_isvalid(st)) { + pf_mbuf_unlink_state(m); + return (NULL); + } + + if (READ_ONCE(st->inp) != NULL) { mtx_enter(&pf_inp_mtx); - inp = in_pcbref(sk->sk_inp); + inp = in_pcbref(st->inp); mtx_leave(&pf_inp_mtx); } return (inp); } +/* + * This is called from the IP stack after it's found an inpcb for + * an mbuf so it can link the pf_state to that pcb. + */ void pf_inp_link(struct mbuf *m, struct inpcb *inp) { - struct pf_state_key *sk = m->m_pkthdr.pf.statekey; + struct pf_state *st; - if (!pf_state_key_isvalid(sk)) { - pf_mbuf_unlink_state_key(m); + st = m->m_pkthdr.pf.st; + if (st == NULL) return; - } /* * we don't need to grab PF-lock here. At worst case we link inp to * state, which might be just being marked as deleted by another * thread. */ - pf_state_key_link_inpcb(sk, inp); + if (pf_state_isvalid(st)) { + if (READ_ONCE(st->inp) == NULL) + pf_state_link_inpcb(st, inp); + } /* The statekey has finished finding the inp, it is no longer needed. */ - pf_mbuf_unlink_state_key(m); + pf_mbuf_unlink_state(m); } void pf_inp_unlink(struct inpcb *inp) { - struct pf_state_key *sk; + struct pf_state *st; - if (READ_ONCE(inp->inp_pf_sk) == NULL) + if (READ_ONCE(inp->inp_pf_st) == NULL) return; mtx_enter(&pf_inp_mtx); - sk = inp->inp_pf_sk; - if (sk == NULL) { + st = inp->inp_pf_st; + if (st == NULL) { mtx_leave(&pf_inp_mtx); return; } - KASSERT(sk->sk_inp == inp); - sk->sk_inp = NULL; - inp->inp_pf_sk = NULL; + KASSERT(st->inp == inp); + st->inp = NULL; + inp->inp_pf_st = NULL; mtx_leave(&pf_inp_mtx); - pf_state_key_unref(sk); + pf_state_unref(st); in_pcbunref(inp); } void -pf_state_key_link_reverse(struct pf_state_key *sk, struct pf_state_key *skrev) +pf_state_link_reverse(struct pf_state *st, struct pf_state *strev) { - struct pf_state_key *old_reverse; + struct pf_state *ost; - old_reverse = atomic_cas_ptr(&sk->sk_reverse, NULL, skrev); - if (old_reverse != NULL) - KASSERT(old_reverse == skrev); - else { - pf_state_key_ref(skrev); + ost = atomic_cas_ptr(&st->reverse, NULL, strev); + if (ost != NULL) { + KASSERTMSG(ost == strev, + "st %p reverse is %p, not strev %p ", + st, ost, strev); + } else { + pf_state_ref(strev); /* - * NOTE: if sk == skrev, then KASSERT() below holds true, we + * NOTE: if st == strev, then KASSERT() below holds true, we * still want to grab a reference in such case, because - * pf_state_key_unlink_reverse() does not check whether keys + * pf_state_unlink_reverse() does not check whether states * are identical or not. */ - old_reverse = atomic_cas_ptr(&skrev->sk_reverse, NULL, sk); - if (old_reverse != NULL) - KASSERT(old_reverse == sk); + ost = atomic_cas_ptr(&strev->reverse, NULL, st); + if (ost != NULL) { + KASSERTMSG(ost == st, + "strev %p reverse is %p, not st %p", + strev, ost, st); + } - pf_state_key_ref(sk); + pf_state_ref(st); } } @@ -8946,10 +9002,6 @@ pf_state_key_unref(struct pf_state_key * if (PF_REF_RELE(sk->sk_refcnt)) { /* state key must be removed from tree */ KASSERT(!pf_state_key_isvalid(sk)); - /* state key must be unlinked from reverse key */ - KASSERT(sk->sk_reverse == NULL); - /* state key must be unlinked from socket */ - KASSERT(sk->sk_inp == NULL); pool_put(&pf_state_key_pl, sk); } } @@ -8960,21 +9012,28 @@ pf_state_key_isvalid(struct pf_state_key return ((sk != NULL) && (sk->sk_removed == 0)); } +int +pf_state_isvalid(struct pf_state *st) +{ + return (st->timeout < PFTM_MAX); +} + void -pf_mbuf_link_state_key(struct mbuf *m, struct pf_state_key *sk) +pf_mbuf_link_state(struct mbuf *m, struct pf_state *st) { - KASSERT(m->m_pkthdr.pf.statekey == NULL); - m->m_pkthdr.pf.statekey = pf_state_key_ref(sk); + KASSERT(m->m_pkthdr.pf.st == NULL); + m->m_pkthdr.pf.st = pf_state_ref(st); } void -pf_mbuf_unlink_state_key(struct mbuf *m) +pf_mbuf_unlink_state(struct mbuf *m) { - struct pf_state_key *sk = m->m_pkthdr.pf.statekey; + struct pf_state *st; - if (sk != NULL) { - m->m_pkthdr.pf.statekey = NULL; - pf_state_key_unref(sk); + st = m->m_pkthdr.pf.st; + if (st != NULL) { + m->m_pkthdr.pf.st = NULL; + pf_state_unref(st); } } @@ -8997,57 +9056,58 @@ pf_mbuf_unlink_inpcb(struct mbuf *m) } void -pf_state_key_link_inpcb(struct pf_state_key *sk, struct inpcb *inp) +pf_state_link_inpcb(struct pf_state *st, struct inpcb *inp) { - if (inp == NULL || READ_ONCE(sk->sk_inp) != NULL) + if (inp == NULL || READ_ONCE(st->inp) != NULL) return; mtx_enter(&pf_inp_mtx); - if (inp->inp_pf_sk != NULL || sk->sk_inp != NULL) { + if (inp->inp_pf_st != NULL || st->inp != NULL) { mtx_leave(&pf_inp_mtx); return; } - sk->sk_inp = in_pcbref(inp); - inp->inp_pf_sk = pf_state_key_ref(sk); + st->inp = in_pcbref(inp); + inp->inp_pf_st = pf_state_ref(st); mtx_leave(&pf_inp_mtx); } void -pf_state_key_unlink_inpcb(struct pf_state_key *sk) +pf_state_unlink_inpcb(struct pf_state *st) { struct inpcb *inp; - if (READ_ONCE(sk->sk_inp) == NULL) + if (READ_ONCE(st->inp) == NULL) return; mtx_enter(&pf_inp_mtx); - inp = sk->sk_inp; + inp = st->inp; if (inp == NULL) { mtx_leave(&pf_inp_mtx); return; } - KASSERT(inp->inp_pf_sk == sk); - sk->sk_inp = NULL; - inp->inp_pf_sk = NULL; + KASSERT(inp->inp_pf_st == st); + st->inp = NULL; + inp->inp_pf_st = NULL; mtx_leave(&pf_inp_mtx); - pf_state_key_unref(sk); + pf_state_unref(st); in_pcbunref(inp); } void -pf_state_key_unlink_reverse(struct pf_state_key *sk) +pf_state_unlink_reverse(struct pf_state *st) { - struct pf_state_key *skrev = sk->sk_reverse; - - /* Note that sk and skrev may be equal, then we unref twice. */ - if (skrev != NULL) { - KASSERT(skrev->sk_reverse == sk); - sk->sk_reverse = NULL; - skrev->sk_reverse = NULL; - pf_state_key_unref(skrev); - pf_state_key_unref(sk); - } + struct pf_state *strev; + + /* Note that st and strev may be equal, then we unref twice. */ + strev = st->reverse; + if (strev != NULL) { + KASSERT(strev->reverse == st); + st->reverse = NULL; + strev->reverse = NULL; + pf_state_unref(strev); + pf_state_unref(st); + } } struct pf_state * @@ -9073,6 +9133,11 @@ pf_state_unref(struct pf_state *st) pf_state_key_unref(st->key[PF_SK_WIRE]); pf_state_key_unref(st->key[PF_SK_STACK]); + + /* state must be unlinked from reverse */ + KASSERT(st->reverse == NULL); + /* state must be unlinked from socket */ + KASSERT(st->inp == NULL); KASSERT(SLIST_EMPTY(&st->linkage)); Index: net/pfvar.h =================================================================== RCS file: /cvs/src/sys/net/pfvar.h,v diff -u -p -r1.545 pfvar.h --- net/pfvar.h 7 Jan 2026 13:50:05 -0000 1.545 +++ net/pfvar.h 15 Jan 2026 05:34:55 -0000 @@ -2000,9 +2000,8 @@ int pf_map_addr(sa_family_t, struct p struct pf_pool *, enum pf_sn_types); int pf_postprocess_addr(struct pf_state *); -void pf_mbuf_link_state_key(struct mbuf *, - struct pf_state_key *); -void pf_mbuf_unlink_state_key(struct mbuf *); +void pf_mbuf_link_state(struct mbuf *, struct pf_state *); +void pf_mbuf_unlink_state(struct mbuf *); void pf_mbuf_link_inpcb(struct mbuf *, struct inpcb *); void pf_mbuf_unlink_inpcb(struct mbuf *); Index: net/pfvar_priv.h =================================================================== RCS file: /cvs/src/sys/net/pfvar_priv.h,v diff -u -p -r1.40 pfvar_priv.h --- net/pfvar_priv.h 28 Nov 2025 22:55:21 -0000 1.40 +++ net/pfvar_priv.h 15 Jan 2026 05:34:55 -0000 @@ -42,11 +42,6 @@ #include #include -/* - * Locks used to protect struct members in this file: - * L pf_inp_mtx link pf to inp mutex - */ - struct pfsync_deferral; struct kstat; @@ -114,8 +109,6 @@ struct pf_state_key { RBT_ENTRY(pf_state_key) sk_entry; struct pf_statelisthead sk_states; - struct pf_state_key *sk_reverse; - struct inpcb *sk_inp; /* [L] */ pf_refcnt_t sk_refcnt; u_int8_t sk_removed; }; @@ -135,7 +128,8 @@ RBT_PROTOTYPE(pf_state_tree, pf_state_ke * M pf_state mtx * P PF_STATE_LOCK * S pfsync - * L pf_state_list + * L pf_inp_mtx link pf to inp mutex + * G pf_state_list * g pf_purge gc */ @@ -147,7 +141,7 @@ struct pf_state { TAILQ_ENTRY(pf_state) sync_list; /* [S] */ struct pfsync_deferral *sync_defer; /* [S] */ - TAILQ_ENTRY(pf_state) entry_list; /* [L] */ + TAILQ_ENTRY(pf_state) entry_list; /* [G] */ SLIST_ENTRY(pf_state) gc_list; /* [g] */ RBT_ENTRY(pf_state) entry_id; /* [P] */ struct pf_state_peer src; @@ -160,6 +154,8 @@ struct pf_state { struct pf_sn_head src_nodes; /* [I] */ struct pf_state_key *key[2]; /* [I] stack and wire */ struct pfi_kif *kif; /* [I] */ + struct pf_state *reverse; /* [M] */ + struct inpcb *inp; /* [L] */ struct mutex mtx; pf_refcnt_t refcnt; u_int64_t packets[2]; Index: netinet/in_pcb.h =================================================================== RCS file: /cvs/src/sys/netinet/in_pcb.h,v diff -u -p -r1.172 in_pcb.h --- netinet/in_pcb.h 24 Oct 2025 15:09:56 -0000 1.172 +++ netinet/in_pcb.h 15 Jan 2026 05:34:55 -0000 @@ -109,7 +109,7 @@ * Protocol input only reads inp_[lf]addr/port during lookup and is safe. */ -struct pf_state_key; +struct pf_state; union inpaddru { struct in_addr iau_addr; @@ -166,7 +166,7 @@ struct inpcb { int inp_cksum6; struct icmp6_filter *inp_icmp6filt; - struct pf_state_key *inp_pf_sk; /* [L] */ + struct pf_state *inp_pf_st; /* [L] */ struct mbuf *(*inp_upcall)(void *, struct mbuf *, struct ip *, struct ip6_hdr *, void *, int, struct netstack *); void *inp_upcall_arg; Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v diff -u -p -r1.267 mbuf.h --- sys/mbuf.h 25 Jun 2025 20:26:32 -0000 1.267 +++ sys/mbuf.h 15 Jan 2026 05:34:55 -0000 @@ -90,11 +90,11 @@ struct m_hdr { }; /* pf stuff */ -struct pf_state_key; +struct pf_state; struct inpcb; struct pkthdr_pf { - struct pf_state_key *statekey; /* pf stackside statekey */ + struct pf_state *st; /* pf state */ struct inpcb *inp; /* connected pcb for outgoing packet */ u_int32_t qid; /* queue id */ u_int16_t tag; /* tag id */ @@ -325,7 +325,7 @@ u_int mextfree_register(void (*)(caddr_t (to)->m_pkthdr = (from)->m_pkthdr; \ (from)->m_flags &= ~M_PKTHDR; \ SLIST_INIT(&(from)->m_pkthdr.ph_tags); \ - (from)->m_pkthdr.pf.statekey = NULL; \ + (from)->m_pkthdr.pf.st = NULL; \ } while (/* CONSTCOND */ 0) /*