From: Alexander Bluhm Subject: Re: incpb socket ref count To: tech@openbsd.org Date: Mon, 26 May 2025 22:19:55 +0900 On Wed, May 21, 2025 at 05:44:45PM +0900, Alexander Bluhm wrote: > Hi, > > Instead of protecting inp_socket with special mutex when being freed > and reference counting in in_pcbsolock_ref(), better reference count > inp_socket. > > When an inpcb is created, inp_socket is initialized with a socket > pointer that is reference counted. When the inpcb is freed, release > the socket reference. As incpb is reference counted itself, we can > always access the socket memory when we have a valid inpcb counter. > > in_pcbsolock() can just lock the socket, no reference counting > needed. This reduces contention a bit. As in_pcbdetach() is > protected by socket lock, in_pcbsolock() has to check so_pcb while > holding the lock to skip sockets that are being closed. > > ok? anyone? > bluhm > > Index: kern/kern_sysctl.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v > diff -u -p -r1.468 kern_sysctl.c > --- kern/kern_sysctl.c 9 May 2025 14:53:22 -0000 1.468 > +++ kern/kern_sysctl.c 21 May 2025 04:50:15 -0000 > @@ -1722,17 +1722,15 @@ do { \ > mtx_enter(&(table)->inpt_mtx); \ > while ((inp = in_pcb_iterator(table, inp, &iter)) != NULL) { \ > if (buflen >= elem_size && elem_count > 0) { \ > - mtx_enter(&inp->inp_sofree_mtx); \ > - so = soref(inp->inp_socket); \ > - mtx_leave(&inp->inp_sofree_mtx); \ > + mtx_leave(&(table)->inpt_mtx); \ > + NET_LOCK_SHARED(); \ > + so = in_pcbsolock(inp); \ > if (so == NULL) \ > continue; \ > - mtx_leave(&(table)->inpt_mtx); \ > - solock_shared(so); \ > fill_file(kf, NULL, NULL, 0, NULL, NULL, p, \ > so, show_pointers); \ > - sounlock_shared(so); \ > - sorele(so); \ > + in_pcbsounlock(inp, so); \ > + NET_UNLOCK_SHARED(); \ > error = copyout(kf, dp, outsize); \ > mtx_enter(&(table)->inpt_mtx); \ > if (error) { \ > Index: netinet/in_pcb.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v > diff -u -p -r1.314 in_pcb.c > --- netinet/in_pcb.c 20 May 2025 05:51:43 -0000 1.314 > +++ netinet/in_pcb.c 21 May 2025 04:50:15 -0000 > @@ -236,8 +236,7 @@ in_pcballoc(struct socket *so, struct in > if (inp == NULL) > return (ENOBUFS); > inp->inp_table = table; > - inp->inp_socket = so; > - mtx_init(&inp->inp_sofree_mtx, IPL_SOFTNET); > + inp->inp_socket = soref(so); > refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB); > inp->inp_seclevel.sl_auth = IPSEC_AUTH_LEVEL_DEFAULT; > inp->inp_seclevel.sl_esp_trans = IPSEC_ESP_TRANS_LEVEL_DEFAULT; > @@ -584,10 +583,9 @@ in_pcbdetach(struct inpcb *inp) > struct socket *so = inp->inp_socket; > struct inpcbtable *table = inp->inp_table; > > + soassertlocked(so); > + > so->so_pcb = NULL; > - mtx_enter(&inp->inp_sofree_mtx); > - inp->inp_socket = NULL; > - mtx_leave(&inp->inp_sofree_mtx); > /* > * As long as the NET_LOCK() is the default lock for Internet > * sockets, do not release it to not introduce new sleeping > @@ -623,22 +621,17 @@ in_pcbdetach(struct inpcb *inp) > } > > struct socket * > -in_pcbsolock_ref(struct inpcb *inp) > +in_pcbsolock(struct inpcb *inp) > { > - struct socket *so; > + struct socket *so = inp->inp_socket; > > NET_ASSERT_LOCKED(); > > - mtx_enter(&inp->inp_sofree_mtx); > - so = soref(inp->inp_socket); > - mtx_leave(&inp->inp_sofree_mtx); > if (so == NULL) > return NULL; > rw_enter_write(&so->so_lock); > - /* between mutex and rwlock inpcb could be detached */ > if (so->so_pcb == NULL) { > rw_exit_write(&so->so_lock); > - sorele(so); > return NULL; > } > KASSERT(inp->inp_socket == so && sotoinpcb(so) == inp); > @@ -646,12 +639,13 @@ in_pcbsolock_ref(struct inpcb *inp) > } > > void > -in_pcbsounlock_rele(struct inpcb *inp, struct socket *so) > +in_pcbsounlock(struct inpcb *inp, struct socket *so) > { > if (so == NULL) > return; > + if (inp != NULL && so->so_pcb != NULL) > + KASSERT(inp->inp_socket == so && sotoinpcb(so) == inp); > rw_exit_write(&so->so_lock); > - sorele(so); > } > > struct inpcb * > @@ -670,6 +664,7 @@ in_pcbunref(struct inpcb *inp) > return; > if (refcnt_rele(&inp->inp_refcnt) == 0) > return; > + sorele(inp->inp_socket); > KASSERT((LIST_NEXT(inp, inp_hash) == NULL) || > (LIST_NEXT(inp, inp_hash) == _Q_INVALID)); > KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) || > @@ -819,10 +814,10 @@ in_pcbnotifyall(struct inpcbtable *table > continue; > } > mtx_leave(&table->inpt_mtx); > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > (*notify)(inp, errno); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > mtx_enter(&table->inpt_mtx); > } > mtx_leave(&table->inpt_mtx); > Index: netinet/in_pcb.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v > diff -u -p -r1.168 in_pcb.h > --- netinet/in_pcb.h 20 May 2025 05:51:43 -0000 1.168 > +++ netinet/in_pcb.h 21 May 2025 04:51:43 -0000 > @@ -81,7 +81,6 @@ > * t inpt_mtx pcb table mutex > * L pf_inp_mtx link pf to inp mutex > * s so_lock socket rwlock > - * f inp_sofree_mtx socket detach and lock > */ > > /* > @@ -138,8 +137,7 @@ struct inpcb { > #define inp_laddr6 inp_laddru.iau_addr6 > u_int16_t inp_fport; /* [t] foreign port */ > u_int16_t inp_lport; /* [t] local port */ > - struct socket *inp_socket; /* [f] back pointer to socket */ > - struct mutex inp_sofree_mtx; /* protect socket free */ > + struct socket *inp_socket; /* [I] back pointer to socket */ > caddr_t inp_ppcb; /* pointer to per-protocol pcb */ > struct route inp_route; /* [s] cached route */ > struct refcnt inp_refcnt; /* refcount PCB, delay memory free */ > @@ -311,8 +309,8 @@ int in_pcbaddrisavail(const struct inpc > int in_pcbconnect(struct inpcb *, struct mbuf *); > void in_pcbdetach(struct inpcb *); > struct socket * > - in_pcbsolock_ref(struct inpcb *); > -void in_pcbsounlock_rele(struct inpcb *, struct socket *); > + in_pcbsolock(struct inpcb *); > +void in_pcbsounlock(struct inpcb *, struct socket *); > struct inpcb * > in_pcbref(struct inpcb *); > void in_pcbunref(struct inpcb *); > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.447 tcp_input.c > --- netinet/tcp_input.c 19 May 2025 02:27:57 -0000 1.447 > +++ netinet/tcp_input.c 21 May 2025 04:50:15 -0000 > @@ -385,7 +385,7 @@ tcp_input_mlist(struct mbuf_list *ml, in > KASSERT(nxt == IPPROTO_DONE); > } > > - in_pcbsounlock_rele(NULL, so); > + in_pcbsounlock(NULL, so); > } > > /* > @@ -655,10 +655,10 @@ findpcb: > *solocked = NULL; > } else { > if (solocked != NULL && *solocked != NULL) { > - in_pcbsounlock_rele(NULL, *solocked); > + in_pcbsounlock(NULL, *solocked); > *solocked = NULL; > } > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > } > if (so == NULL) { > tcpstat_inc(tcps_noport); > @@ -905,7 +905,7 @@ findpcb: > if (solocked != NULL) > *solocked = so; > else > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1084,7 +1084,7 @@ findpcb: > if (solocked != NULL) > *solocked = so; > else > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1138,7 +1138,7 @@ findpcb: > if (solocked != NULL) > *solocked = so; > else > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -1332,7 +1332,7 @@ trimthenstep6: > ((arc4random() & 0x7fffffff) | 0x8000); > reuse = &iss; > tp = tcp_close(tp); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > so = NULL; > in_pcbunref(inp); > inp = NULL; > @@ -2141,7 +2141,7 @@ dodata: /* XXX */ > if (solocked != NULL) > *solocked = so; > else > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > @@ -2174,7 +2174,7 @@ dropafterack: > if (solocked != NULL) > *solocked = so; > else > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > @@ -2210,7 +2210,7 @@ dropwithreset: > (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); > } > m_freem(m); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > > @@ -2222,7 +2222,7 @@ drop: > tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen); > > m_freem(m); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return IPPROTO_DONE; > } > @@ -3490,7 +3490,7 @@ syn_cache_timer(void *arg) > mtx_leave(&syn_cache_mtx); > > NET_LOCK_SHARED(); > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) { > now = tcp_now(); > #ifdef TCP_ECN > @@ -3499,7 +3499,7 @@ syn_cache_timer(void *arg) > (void) syn_cache_respond(sc, NULL, now, do_ecn); > tcpstat_inc(tcps_sc_retransmitted); > } > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > NET_UNLOCK_SHARED(); > > in_pcbunref(inp); > @@ -3622,7 +3622,7 @@ syn_cache_get(struct sockaddr *src, stru > sc = syn_cache_lookup(src, dst, &scp, inp->inp_rtableid); > if (sc == NULL) { > mtx_leave(&syn_cache_mtx); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > return (NULL); > } > > @@ -3636,7 +3636,7 @@ syn_cache_get(struct sockaddr *src, stru > refcnt_take(&sc->sc_refcnt); > mtx_leave(&syn_cache_mtx); > (void) syn_cache_respond(sc, m, now, do_ecn); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > syn_cache_put(sc); > return ((struct socket *)(-1)); > } > @@ -3767,7 +3767,7 @@ syn_cache_get(struct sockaddr *src, stru > tp->rcv_adv = tp->rcv_nxt + sc->sc_win; > tp->last_ack_sent = tp->rcv_nxt; > > - in_pcbsounlock_rele(listeninp, listenso); > + in_pcbsounlock(listeninp, listenso); > tcpstat_inc(tcps_sc_completed); > syn_cache_put(sc); > return (so); > @@ -3779,8 +3779,8 @@ abort: > if (tp != NULL) > tp = tcp_drop(tp, ECONNABORTED); /* destroys socket */ > m_freem(m); > - in_pcbsounlock_rele(inp, so); > - in_pcbsounlock_rele(listeninp, listenso); > + in_pcbsounlock(inp, so); > + in_pcbsounlock(listeninp, listenso); > syn_cache_put(sc); > tcpstat_inc(tcps_sc_aborted); > return ((struct socket *)(-1)); > Index: netinet/tcp_subr.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v > diff -u -p -r1.209 tcp_subr.c > --- netinet/tcp_subr.c 2 Mar 2025 21:28:32 -0000 1.209 > +++ netinet/tcp_subr.c 21 May 2025 04:50:15 -0000 > @@ -692,7 +692,7 @@ tcp6_ctlinput(int cmd, struct sockaddr * > return; > } > if (inp != NULL) > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > tp = intotcpcb(inp); > if (tp != NULL) { > @@ -702,7 +702,7 @@ tcp6_ctlinput(int cmd, struct sockaddr * > SEQ_LT(seq, tp->snd_max)) > notify(inp, inet6ctlerrmap[cmd]); > } > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > > if (tp == NULL && > @@ -762,7 +762,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, > rdomain); > if (inp != NULL) > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > tp = intotcpcb(inp); > if (tp != NULL && > @@ -779,7 +779,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > */ > mtu = (u_int)ntohs(icp->icmp_nextmtu); > if (mtu >= tp->t_pmtud_mtu_sent) { > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return; > } > @@ -800,7 +800,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > */ > if (tp->t_flags & TF_PMTUD_PEND) { > if (SEQ_LT(tp->t_pmtud_th_seq, seq)) { > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return; > } > @@ -810,17 +810,17 @@ tcp_ctlinput(int cmd, struct sockaddr *s > tp->t_pmtud_nextmtu = icp->icmp_nextmtu; > tp->t_pmtud_ip_len = icp->icmp_ip.ip_len; > tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl; > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return; > } > } else { > /* ignore if we don't have a matching connection */ > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > return; > } > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > notify = tcp_mtudisc, ip = NULL; > } else if (cmd == PRC_MTUINC) > @@ -840,7 +840,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, > rdomain); > if (inp != NULL) > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > tp = intotcpcb(inp); > if (tp != NULL) { > @@ -849,7 +849,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > SEQ_LT(seq, tp->snd_max)) > notify(inp, errno); > } > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > > if (tp == NULL && > Index: netinet/tcp_timer.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v > diff -u -p -r1.83 tcp_timer.c > --- netinet/tcp_timer.c 12 Feb 2025 21:28:11 -0000 1.83 > +++ netinet/tcp_timer.c 21 May 2025 04:50:15 -0000 > @@ -91,7 +91,7 @@ tcp_timer_enter(struct inpcb *inp, struc > KASSERT(timer < TCPT_NTIMERS); > > NET_LOCK_SHARED(); > - *so = in_pcbsolock_ref(inp); > + *so = in_pcbsolock(inp); > if (*so == NULL) { > *tp = NULL; > return -1; > @@ -109,7 +109,7 @@ tcp_timer_enter(struct inpcb *inp, struc > static inline void > tcp_timer_leave(struct inpcb *inp, struct socket *so) > { > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > NET_UNLOCK_SHARED(); > in_pcbunref(inp); > } > @@ -237,7 +237,7 @@ tcp_timer_rexmt(void *arg) > sin.sin_family = AF_INET; > sin.sin_addr = inp->inp_faddr; > > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > > icmp_mtudisc(&icmp, rtableid); > Index: netinet/tcp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v > diff -u -p -r1.247 tcp_usrreq.c > --- netinet/tcp_usrreq.c 20 May 2025 18:41:06 -0000 1.247 > +++ netinet/tcp_usrreq.c 21 May 2025 04:50:15 -0000 > @@ -1214,7 +1214,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v > struct tcpcb *tp = NULL; > > if (inp != NULL) { > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > tp = intotcpcb(inp); > } > @@ -1223,7 +1223,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v > else > error = ESRCH; > > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > NET_UNLOCK_SHARED(); > in_pcbunref(inp); > return (error); > @@ -1246,7 +1246,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v > } > > if (inp != NULL) > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > > if (so != NULL && ISSET(so->so_state, SS_CONNECTOUT)) { > tir.ruid = so->so_ruid; > @@ -1256,7 +1256,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v > tir.euid = -1; > } > > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > NET_UNLOCK_SHARED(); > in_pcbunref(inp); > > Index: netinet/udp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v > diff -u -p -r1.337 udp_usrreq.c > --- netinet/udp_usrreq.c 12 May 2025 17:21:21 -0000 1.337 > +++ netinet/udp_usrreq.c 21 May 2025 04:50:15 -0000 > @@ -921,10 +921,10 @@ udp_ctlinput(int cmd, struct sockaddr *s > ip->ip_dst, uhp->uh_dport, ip->ip_src, uhp->uh_sport, > rdomain); > if (inp != NULL) > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > notify(inp, errno); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > in_pcbunref(inp); > } else > in_pcbnotifyall(&udbtable, satosin(sa), rdomain, errno, notify); > Index: netinet6/in6_pcb.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v > diff -u -p -r1.148 in6_pcb.c > --- netinet6/in6_pcb.c 4 May 2025 23:05:17 -0000 1.148 > +++ netinet6/in6_pcb.c 21 May 2025 04:50:15 -0000 > @@ -546,10 +546,10 @@ in6_pcbnotify(struct inpcbtable *table, > } > do_notify: > mtx_leave(&table->inpt_mtx); > - so = in_pcbsolock_ref(inp); > + so = in_pcbsolock(inp); > if (so != NULL) > (*notify)(inp, errno); > - in_pcbsounlock_rele(inp, so); > + in_pcbsounlock(inp, so); > mtx_enter(&table->inpt_mtx); > } > mtx_leave(&table->inpt_mtx);