Download raw body.
incpb socket ref count
On Wed, May 21, 2025 at 05:44:45PM +0900, Alexander Bluhm wrote:
> Hi,
>
> Instead of protecting inp_socket with special mutex when being freed
> and reference counting in in_pcbsolock_ref(), better reference count
> inp_socket.
>
> When an inpcb is created, inp_socket is initialized with a socket
> pointer that is reference counted. When the inpcb is freed, release
> the socket reference. As incpb is reference counted itself, we can
> always access the socket memory when we have a valid inpcb counter.
>
> in_pcbsolock() can just lock the socket, no reference counting
> needed. This reduces contention a bit. As in_pcbdetach() is
> protected by socket lock, in_pcbsolock() has to check so_pcb while
> holding the lock to skip sockets that are being closed.
>
> ok?
anyone?
> bluhm
>
> Index: kern/kern_sysctl.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
> diff -u -p -r1.468 kern_sysctl.c
> --- kern/kern_sysctl.c 9 May 2025 14:53:22 -0000 1.468
> +++ kern/kern_sysctl.c 21 May 2025 04:50:15 -0000
> @@ -1722,17 +1722,15 @@ do { \
> mtx_enter(&(table)->inpt_mtx); \
> while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { \
> if (buflen >= elem_size && elem_count > 0) { \
> - mtx_enter(&inp->inp_sofree_mtx); \
> - so = soref(inp->inp_socket); \
> - mtx_leave(&inp->inp_sofree_mtx); \
> + mtx_leave(&(table)->inpt_mtx); \
> + NET_LOCK_SHARED(); \
> + so = in_pcbsolock(inp); \
> if (so == NULL) \
> continue; \
> - mtx_leave(&(table)->inpt_mtx); \
> - solock_shared(so); \
> fill_file(kf, NULL, NULL, 0, NULL, NULL, p, \
> so, show_pointers); \
> - sounlock_shared(so); \
> - sorele(so); \
> + in_pcbsounlock(inp, so); \
> + NET_UNLOCK_SHARED(); \
> error = copyout(kf, dp, outsize); \
> mtx_enter(&(table)->inpt_mtx); \
> if (error) { \
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> diff -u -p -r1.314 in_pcb.c
> --- netinet/in_pcb.c 20 May 2025 05:51:43 -0000 1.314
> +++ netinet/in_pcb.c 21 May 2025 04:50:15 -0000
> @@ -236,8 +236,7 @@ in_pcballoc(struct socket *so, struct in
> if (inp == NULL)
> return (ENOBUFS);
> inp->inp_table = table;
> - inp->inp_socket = so;
> - mtx_init(&inp->inp_sofree_mtx, IPL_SOFTNET);
> + inp->inp_socket = soref(so);
> refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
> inp->inp_seclevel.sl_auth = IPSEC_AUTH_LEVEL_DEFAULT;
> inp->inp_seclevel.sl_esp_trans = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
> @@ -584,10 +583,9 @@ in_pcbdetach(struct inpcb *inp)
> struct socket *so = inp->inp_socket;
> struct inpcbtable *table = inp->inp_table;
>
> + soassertlocked(so);
> +
> so->so_pcb = NULL;
> - mtx_enter(&inp->inp_sofree_mtx);
> - inp->inp_socket = NULL;
> - mtx_leave(&inp->inp_sofree_mtx);
> /*
> * As long as the NET_LOCK() is the default lock for Internet
> * sockets, do not release it to not introduce new sleeping
> @@ -623,22 +621,17 @@ in_pcbdetach(struct inpcb *inp)
> }
>
> struct socket *
> -in_pcbsolock_ref(struct inpcb *inp)
> +in_pcbsolock(struct inpcb *inp)
> {
> - struct socket *so;
> + struct socket *so = inp->inp_socket;
>
> NET_ASSERT_LOCKED();
>
> - mtx_enter(&inp->inp_sofree_mtx);
> - so = soref(inp->inp_socket);
> - mtx_leave(&inp->inp_sofree_mtx);
> if (so == NULL)
> return NULL;
> rw_enter_write(&so->so_lock);
> - /* between mutex and rwlock inpcb could be detached */
> if (so->so_pcb == NULL) {
> rw_exit_write(&so->so_lock);
> - sorele(so);
> return NULL;
> }
> KASSERT(inp->inp_socket == so && sotoinpcb(so) == inp);
> @@ -646,12 +639,13 @@ in_pcbsolock_ref(struct inpcb *inp)
> }
>
> void
> -in_pcbsounlock_rele(struct inpcb *inp, struct socket *so)
> +in_pcbsounlock(struct inpcb *inp, struct socket *so)
> {
> if (so == NULL)
> return;
> + if (inp != NULL && so->so_pcb != NULL)
> + KASSERT(inp->inp_socket == so && sotoinpcb(so) == inp);
> rw_exit_write(&so->so_lock);
> - sorele(so);
> }
>
> struct inpcb *
> @@ -670,6 +664,7 @@ in_pcbunref(struct inpcb *inp)
> return;
> if (refcnt_rele(&inp->inp_refcnt) == 0)
> return;
> + sorele(inp->inp_socket);
> KASSERT((LIST_NEXT(inp, inp_hash) == NULL) ||
> (LIST_NEXT(inp, inp_hash) == _Q_INVALID));
> KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) ||
> @@ -819,10 +814,10 @@ in_pcbnotifyall(struct inpcbtable *table
> continue;
> }
> mtx_leave(&table->inpt_mtx);
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> (*notify)(inp, errno);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> mtx_enter(&table->inpt_mtx);
> }
> mtx_leave(&table->inpt_mtx);
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> diff -u -p -r1.168 in_pcb.h
> --- netinet/in_pcb.h 20 May 2025 05:51:43 -0000 1.168
> +++ netinet/in_pcb.h 21 May 2025 04:51:43 -0000
> @@ -81,7 +81,6 @@
> * t inpt_mtx pcb table mutex
> * L pf_inp_mtx link pf to inp mutex
> * s so_lock socket rwlock
> - * f inp_sofree_mtx socket detach and lock
> */
>
> /*
> @@ -138,8 +137,7 @@ struct inpcb {
> #define inp_laddr6 inp_laddru.iau_addr6
> u_int16_t inp_fport; /* [t] foreign port */
> u_int16_t inp_lport; /* [t] local port */
> - struct socket *inp_socket; /* [f] back pointer to socket */
> - struct mutex inp_sofree_mtx; /* protect socket free */
> + struct socket *inp_socket; /* [I] back pointer to socket */
> caddr_t inp_ppcb; /* pointer to per-protocol pcb */
> struct route inp_route; /* [s] cached route */
> struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
> @@ -311,8 +309,8 @@ int in_pcbaddrisavail(const struct inpc
> int in_pcbconnect(struct inpcb *, struct mbuf *);
> void in_pcbdetach(struct inpcb *);
> struct socket *
> - in_pcbsolock_ref(struct inpcb *);
> -void in_pcbsounlock_rele(struct inpcb *, struct socket *);
> + in_pcbsolock(struct inpcb *);
> +void in_pcbsounlock(struct inpcb *, struct socket *);
> struct inpcb *
> in_pcbref(struct inpcb *);
> void in_pcbunref(struct inpcb *);
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.447 tcp_input.c
> --- netinet/tcp_input.c 19 May 2025 02:27:57 -0000 1.447
> +++ netinet/tcp_input.c 21 May 2025 04:50:15 -0000
> @@ -385,7 +385,7 @@ tcp_input_mlist(struct mbuf_list *ml, in
> KASSERT(nxt == IPPROTO_DONE);
> }
>
> - in_pcbsounlock_rele(NULL, so);
> + in_pcbsounlock(NULL, so);
> }
>
> /*
> @@ -655,10 +655,10 @@ findpcb:
> *solocked = NULL;
> } else {
> if (solocked != NULL && *solocked != NULL) {
> - in_pcbsounlock_rele(NULL, *solocked);
> + in_pcbsounlock(NULL, *solocked);
> *solocked = NULL;
> }
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> }
> if (so == NULL) {
> tcpstat_inc(tcps_noport);
> @@ -905,7 +905,7 @@ findpcb:
> if (solocked != NULL)
> *solocked = so;
> else
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -1084,7 +1084,7 @@ findpcb:
> if (solocked != NULL)
> *solocked = so;
> else
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -1138,7 +1138,7 @@ findpcb:
> if (solocked != NULL)
> *solocked = so;
> else
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -1332,7 +1332,7 @@ trimthenstep6:
> ((arc4random() & 0x7fffffff) | 0x8000);
> reuse = &iss;
> tp = tcp_close(tp);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> so = NULL;
> in_pcbunref(inp);
> inp = NULL;
> @@ -2141,7 +2141,7 @@ dodata: /* XXX */
> if (solocked != NULL)
> *solocked = so;
> else
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
>
> @@ -2174,7 +2174,7 @@ dropafterack:
> if (solocked != NULL)
> *solocked = so;
> else
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
>
> @@ -2210,7 +2210,7 @@ dropwithreset:
> (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now);
> }
> m_freem(m);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
>
> @@ -2222,7 +2222,7 @@ drop:
> tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen);
>
> m_freem(m);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return IPPROTO_DONE;
> }
> @@ -3490,7 +3490,7 @@ syn_cache_timer(void *arg)
> mtx_leave(&syn_cache_mtx);
>
> NET_LOCK_SHARED();
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL) {
> now = tcp_now();
> #ifdef TCP_ECN
> @@ -3499,7 +3499,7 @@ syn_cache_timer(void *arg)
> (void) syn_cache_respond(sc, NULL, now, do_ecn);
> tcpstat_inc(tcps_sc_retransmitted);
> }
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> NET_UNLOCK_SHARED();
>
> in_pcbunref(inp);
> @@ -3622,7 +3622,7 @@ syn_cache_get(struct sockaddr *src, stru
> sc = syn_cache_lookup(src, dst, &scp, inp->inp_rtableid);
> if (sc == NULL) {
> mtx_leave(&syn_cache_mtx);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> return (NULL);
> }
>
> @@ -3636,7 +3636,7 @@ syn_cache_get(struct sockaddr *src, stru
> refcnt_take(&sc->sc_refcnt);
> mtx_leave(&syn_cache_mtx);
> (void) syn_cache_respond(sc, m, now, do_ecn);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> syn_cache_put(sc);
> return ((struct socket *)(-1));
> }
> @@ -3767,7 +3767,7 @@ syn_cache_get(struct sockaddr *src, stru
> tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
> tp->last_ack_sent = tp->rcv_nxt;
>
> - in_pcbsounlock_rele(listeninp, listenso);
> + in_pcbsounlock(listeninp, listenso);
> tcpstat_inc(tcps_sc_completed);
> syn_cache_put(sc);
> return (so);
> @@ -3779,8 +3779,8 @@ abort:
> if (tp != NULL)
> tp = tcp_drop(tp, ECONNABORTED); /* destroys socket */
> m_freem(m);
> - in_pcbsounlock_rele(inp, so);
> - in_pcbsounlock_rele(listeninp, listenso);
> + in_pcbsounlock(inp, so);
> + in_pcbsounlock(listeninp, listenso);
> syn_cache_put(sc);
> tcpstat_inc(tcps_sc_aborted);
> return ((struct socket *)(-1));
> Index: netinet/tcp_subr.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v
> diff -u -p -r1.209 tcp_subr.c
> --- netinet/tcp_subr.c 2 Mar 2025 21:28:32 -0000 1.209
> +++ netinet/tcp_subr.c 21 May 2025 04:50:15 -0000
> @@ -692,7 +692,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *
> return;
> }
> if (inp != NULL)
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> tp = intotcpcb(inp);
> if (tp != NULL) {
> @@ -702,7 +702,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *
> SEQ_LT(seq, tp->snd_max))
> notify(inp, inet6ctlerrmap[cmd]);
> }
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
>
> if (tp == NULL &&
> @@ -762,7 +762,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
> ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport,
> rdomain);
> if (inp != NULL)
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> tp = intotcpcb(inp);
> if (tp != NULL &&
> @@ -779,7 +779,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
> */
> mtu = (u_int)ntohs(icp->icmp_nextmtu);
> if (mtu >= tp->t_pmtud_mtu_sent) {
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return;
> }
> @@ -800,7 +800,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
> */
> if (tp->t_flags & TF_PMTUD_PEND) {
> if (SEQ_LT(tp->t_pmtud_th_seq, seq)) {
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return;
> }
> @@ -810,17 +810,17 @@ tcp_ctlinput(int cmd, struct sockaddr *s
> tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
> tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
> tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return;
> }
> } else {
> /* ignore if we don't have a matching connection */
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> return;
> }
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> notify = tcp_mtudisc, ip = NULL;
> } else if (cmd == PRC_MTUINC)
> @@ -840,7 +840,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
> ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport,
> rdomain);
> if (inp != NULL)
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> tp = intotcpcb(inp);
> if (tp != NULL) {
> @@ -849,7 +849,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s
> SEQ_LT(seq, tp->snd_max))
> notify(inp, errno);
> }
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
>
> if (tp == NULL &&
> Index: netinet/tcp_timer.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
> diff -u -p -r1.83 tcp_timer.c
> --- netinet/tcp_timer.c 12 Feb 2025 21:28:11 -0000 1.83
> +++ netinet/tcp_timer.c 21 May 2025 04:50:15 -0000
> @@ -91,7 +91,7 @@ tcp_timer_enter(struct inpcb *inp, struc
> KASSERT(timer < TCPT_NTIMERS);
>
> NET_LOCK_SHARED();
> - *so = in_pcbsolock_ref(inp);
> + *so = in_pcbsolock(inp);
> if (*so == NULL) {
> *tp = NULL;
> return -1;
> @@ -109,7 +109,7 @@ tcp_timer_enter(struct inpcb *inp, struc
> static inline void
> tcp_timer_leave(struct inpcb *inp, struct socket *so)
> {
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> NET_UNLOCK_SHARED();
> in_pcbunref(inp);
> }
> @@ -237,7 +237,7 @@ tcp_timer_rexmt(void *arg)
> sin.sin_family = AF_INET;
> sin.sin_addr = inp->inp_faddr;
>
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
>
> icmp_mtudisc(&icmp, rtableid);
> Index: netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
> diff -u -p -r1.247 tcp_usrreq.c
> --- netinet/tcp_usrreq.c 20 May 2025 18:41:06 -0000 1.247
> +++ netinet/tcp_usrreq.c 21 May 2025 04:50:15 -0000
> @@ -1214,7 +1214,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
> struct tcpcb *tp = NULL;
>
> if (inp != NULL) {
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> tp = intotcpcb(inp);
> }
> @@ -1223,7 +1223,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
> else
> error = ESRCH;
>
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> NET_UNLOCK_SHARED();
> in_pcbunref(inp);
> return (error);
> @@ -1246,7 +1246,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
> }
>
> if (inp != NULL)
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
>
> if (so != NULL && ISSET(so->so_state, SS_CONNECTOUT)) {
> tir.ruid = so->so_ruid;
> @@ -1256,7 +1256,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v
> tir.euid = -1;
> }
>
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> NET_UNLOCK_SHARED();
> in_pcbunref(inp);
>
> Index: netinet/udp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
> diff -u -p -r1.337 udp_usrreq.c
> --- netinet/udp_usrreq.c 12 May 2025 17:21:21 -0000 1.337
> +++ netinet/udp_usrreq.c 21 May 2025 04:50:15 -0000
> @@ -921,10 +921,10 @@ udp_ctlinput(int cmd, struct sockaddr *s
> ip->ip_dst, uhp->uh_dport, ip->ip_src, uhp->uh_sport,
> rdomain);
> if (inp != NULL)
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> notify(inp, errno);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> in_pcbunref(inp);
> } else
> in_pcbnotifyall(&udbtable, satosin(sa), rdomain, errno, notify);
> Index: netinet6/in6_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
> diff -u -p -r1.148 in6_pcb.c
> --- netinet6/in6_pcb.c 4 May 2025 23:05:17 -0000 1.148
> +++ netinet6/in6_pcb.c 21 May 2025 04:50:15 -0000
> @@ -546,10 +546,10 @@ in6_pcbnotify(struct inpcbtable *table,
> }
> do_notify:
> mtx_leave(&table->inpt_mtx);
> - so = in_pcbsolock_ref(inp);
> + so = in_pcbsolock(inp);
> if (so != NULL)
> (*notify)(inp, errno);
> - in_pcbsounlock_rele(inp, so);
> + in_pcbsounlock(inp, so);
> mtx_enter(&table->inpt_mtx);
> }
> mtx_leave(&table->inpt_mtx);
incpb socket ref count