From: Vitaliy Makkoveev Subject: Re: split TCP incpb table in IPv4 and IPv6 To: Alexander Bluhm Cc: tech@openbsd.org Date: Fri, 12 Apr 2024 17:35:27 +0300 On Fri, Apr 12, 2024 at 03:11:23PM +0200, Alexander Bluhm wrote: > Hi, > > A while ago I split the UDP inpcb table into v4 and v6 parts. The idea > was to reduce contention on the table lock. The same can be done with TCP. > > Currently TCP runs with exclusive netlock, so there is not much > difference regarding the lock. But with two hash tables each one > gets smaller. Also we don't need an if around INP_IPV6, but can > assert that it is correct. > > ok? > ok mvs > bluhm > > Index: kern/kern_sysctl.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v > diff -u -p -r1.426 kern_sysctl.c > --- kern/kern_sysctl.c 29 Mar 2024 06:50:06 -0000 1.426 > +++ kern/kern_sysctl.c 11 Apr 2024 16:39:08 -0000 > @@ -1482,6 +1482,12 @@ sysctl_file(int *name, u_int namelen, ch > TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) > FILLSO(inp->inp_socket); > mtx_leave(&tcbtable.inpt_mtx); > +#ifdef INET6 > + mtx_enter(&tcb6table.inpt_mtx); > + TAILQ_FOREACH(inp, &tcb6table.inpt_queue, inp_queue) > + FILLSO(inp->inp_socket); > + mtx_leave(&tcb6table.inpt_mtx); > +#endif > mtx_enter(&udbtable.inpt_mtx); > TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) > FILLSO(inp->inp_socket); > Index: net/pf.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v > diff -u -p -r1.1193 pf.c > --- net/pf.c 10 Jan 2024 16:44:30 -0000 1.1193 > +++ net/pf.c 11 Apr 2024 16:39:08 -0000 > @@ -3788,7 +3788,7 @@ pf_socket_lookup(struct pf_pdesc *pd) > { > struct pf_addr *saddr, *daddr; > u_int16_t sport, dport; > - struct inpcbtable *tb; > + struct inpcbtable *table; > struct inpcb *inp; > > pd->lookup.uid = -1; > @@ -3800,14 +3800,14 @@ pf_socket_lookup(struct pf_pdesc *pd) > dport = pd->hdr.tcp.th_dport; > 
PF_ASSERT_LOCKED(); > NET_ASSERT_LOCKED(); > - tb = &tcbtable; > + table = &tcbtable; > break; > case IPPROTO_UDP: > sport = pd->hdr.udp.uh_sport; > dport = pd->hdr.udp.uh_dport; > PF_ASSERT_LOCKED(); > NET_ASSERT_LOCKED(); > - tb = &udbtable; > + table = &udbtable; > break; > default: > return (-1); > @@ -3830,10 +3830,10 @@ pf_socket_lookup(struct pf_pdesc *pd) > * Fails when rtable is changed while evaluating the ruleset > * The socket looked up will not match the one hit in the end. > */ > - inp = in_pcblookup(tb, saddr->v4, sport, daddr->v4, dport, > + inp = in_pcblookup(table, saddr->v4, sport, daddr->v4, dport, > pd->rdomain); > if (inp == NULL) { > - inp = in_pcblookup_listen(tb, daddr->v4, dport, > + inp = in_pcblookup_listen(table, daddr->v4, dport, > NULL, pd->rdomain); > if (inp == NULL) > return (-1); > @@ -3842,11 +3842,13 @@ pf_socket_lookup(struct pf_pdesc *pd) > #ifdef INET6 > case AF_INET6: > if (pd->virtual_proto == IPPROTO_UDP) > - tb = &udb6table; > - inp = in6_pcblookup(tb, &saddr->v6, sport, &daddr->v6, > + table = &udb6table; > + if (pd->virtual_proto == IPPROTO_TCP) > + table = &tcb6table; > + inp = in6_pcblookup(table, &saddr->v6, sport, &daddr->v6, > dport, pd->rdomain); > if (inp == NULL) { > - inp = in6_pcblookup_listen(tb, &daddr->v6, dport, > + inp = in6_pcblookup_listen(table, &daddr->v6, dport, > NULL, pd->rdomain); > if (inp == NULL) > return (-1); > Index: netinet/in_pcb.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v > diff -u -p -r1.299 in_pcb.c > --- netinet/in_pcb.c 31 Mar 2024 15:53:12 -0000 1.299 > +++ netinet/in_pcb.c 11 Apr 2024 16:39:08 -0000 > @@ -743,10 +743,8 @@ in_pcbnotifyall(struct inpcbtable *table > rw_enter_write(&table->inpt_notify); > mtx_enter(&table->inpt_mtx); > TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { > -#ifdef INET6 > - if (ISSET(inp->inp_flags, INP_IPV6)) > - continue; > -#endif > + 
KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > + > if (inp->inp_faddr.s_addr != dst->sin_addr.s_addr || > rtable_l2(inp->inp_rtableid) != rdomain) { > continue; > @@ -852,8 +850,7 @@ in_pcblookup_local_lock(struct inpcbtabl > wildcard = 0; > #ifdef INET6 > if (ISSET(flags, INPLOOKUP_IPV6)) { > - if (!ISSET(inp->inp_flags, INP_IPV6)) > - continue; > + KASSERT(ISSET(inp->inp_flags, INP_IPV6)); > > if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) > wildcard++; > @@ -869,10 +866,7 @@ in_pcblookup_local_lock(struct inpcbtabl > } else > #endif /* INET6 */ > { > -#ifdef INET6 > - if (ISSET(inp->inp_flags, INP_IPV6)) > - continue; > -#endif /* INET6 */ > + KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > > if (inp->inp_faddr.s_addr != INADDR_ANY) > wildcard++; > @@ -1032,7 +1026,7 @@ in_pcbhash_insert(struct inpcb *inp) > &inp->inp_faddr6, inp->inp_fport, > &inp->inp_laddr6, inp->inp_lport); > else > -#endif /* INET6 */ > +#endif > hash = in_pcbhash(table, rtable_l2(inp->inp_rtableid), > &inp->inp_faddr, inp->inp_fport, > &inp->inp_laddr, inp->inp_lport); > @@ -1052,10 +1046,8 @@ in_pcbhash_lookup(struct inpcbtable *tab > > head = &table->inpt_hashtbl[hash & table->inpt_mask]; > LIST_FOREACH(inp, head, inp_hash) { > -#ifdef INET6 > - if (ISSET(inp->inp_flags, INP_IPV6)) > - continue; > -#endif > + KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); > + > if (inp->inp_fport == fport && inp->inp_lport == lport && > inp->inp_faddr.s_addr == faddr->s_addr && > inp->inp_laddr.s_addr == laddr->s_addr && > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.402 tcp_input.c > --- netinet/tcp_input.c 10 Apr 2024 22:10:03 -0000 1.402 > +++ netinet/tcp_input.c 11 Apr 2024 16:44:02 -0000 > @@ -140,7 +140,8 @@ struct timeval tcp_ackdrop_ppslim_last; > #ifdef INET6 > #define ND6_HINT(tp) \ > do { \ > - if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \ > + if 
(tp && tp->t_inpcb && \ > + ISSET(tp->t_inpcb->inp_flags, INP_IPV6) && \ > rtisvalid(tp->t_inpcb->inp_route.ro_rt)) { \ > nd6_nud_hint(tp->t_inpcb->inp_route.ro_rt); \ > } \ > @@ -540,7 +541,7 @@ findpcb: > switch (af) { > #ifdef INET6 > case AF_INET6: > - inp = in6_pcblookup(&tcbtable, &ip6->ip6_src, > + inp = in6_pcblookup(&tcb6table, &ip6->ip6_src, > th->th_sport, &ip6->ip6_dst, th->th_dport, > m->m_pkthdr.ph_rtableid); > break; > @@ -557,10 +558,10 @@ findpcb: > switch (af) { > #ifdef INET6 > case AF_INET6: > - inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst, > + inp = in6_pcblookup_listen(&tcb6table, &ip6->ip6_dst, > th->th_dport, m, m->m_pkthdr.ph_rtableid); > break; > -#endif /* INET6 */ > +#endif > case AF_INET: > inp = in_pcblookup_listen(&tcbtable, ip->ip_dst, > th->th_dport, m, m->m_pkthdr.ph_rtableid); > @@ -3543,17 +3544,16 @@ syn_cache_get(struct sockaddr *src, stru > sizeof(oldinp->inp_seclevel)); > #endif /* IPSEC */ > #ifdef INET6 > - /* > - * inp still has the OLD in_pcb stuff, set the > - * v6-related flags on the new guy, too. 
> - */ > - inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); > - if (inp->inp_flags & INP_IPV6) { > + if (ISSET(inp->inp_flags, INP_IPV6)) { > + KASSERT(ISSET(oldinp->inp_flags, INP_IPV6)); > + > inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; > inp->inp_hops = oldinp->inp_hops; > } else > -#endif /* INET6 */ > +#endif > { > + KASSERT(!ISSET(oldinp->inp_flags, INP_IPV6)); > + > inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; > inp->inp_options = ip_srcroute(m); > if (inp->inp_options == NULL) { > Index: netinet/tcp_subr.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v > diff -u -p -r1.199 tcp_subr.c > --- netinet/tcp_subr.c 13 Feb 2024 12:22:09 -0000 1.199 > +++ netinet/tcp_subr.c 11 Apr 2024 16:39:08 -0000 > @@ -159,6 +159,9 @@ tcp_init(void) > "sackhl", NULL); > pool_sethardlimit(&sackhl_pool, tcp_sackhole_limit, NULL, 0); > in_pcbinit(&tcbtable, TCB_INITIAL_HASH_SIZE); > +#ifdef INET6 > + in_pcbinit(&tcb6table, TCB_INITIAL_HASH_SIZE); > +#endif > tcpcounters = counters_alloc(tcps_ncounters); > > arc4random_buf(tcp_secret, sizeof(tcp_secret)); > @@ -461,21 +464,15 @@ tcp_newtcpcb(struct inpcb *inp, int wait > tp->t_pmtud_mss_acked = 0; > > #ifdef INET6 > - /* we disallow IPv4 mapped address completely. */ > - if ((inp->inp_flags & INP_IPV6) == 0) > - tp->pf = PF_INET; > - else > + if (ISSET(inp->inp_flags, INP_IPV6)) { > tp->pf = PF_INET6; > -#else > - tp->pf = PF_INET; > -#endif > - > -#ifdef INET6 > - if (inp->inp_flags & INP_IPV6) > inp->inp_ipv6.ip6_hlim = ip6_defhlim; > - else > -#endif /* INET6 */ > + } else > +#endif > + { > + tp->pf = PF_INET; > inp->inp_ip.ip_ttl = ip_defttl; > + } > > inp->inp_ppcb = (caddr_t)tp; > return (tp); > @@ -675,7 +672,7 @@ tcp6_ctlinput(int cmd, struct sockaddr * > * corresponding to the address in the ICMPv6 message > * payload. 
> */ > - inp = in6_pcblookup(&tcbtable, &sa6->sin6_addr, > + inp = in6_pcblookup(&tcb6table, &sa6->sin6_addr, > th.th_dport, &sa6_src->sin6_addr, th.th_sport, rdomain); > if (cmd == PRC_MSGSIZE) { > /* > @@ -703,7 +700,7 @@ tcp6_ctlinput(int cmd, struct sockaddr * > rdomain); > in_pcbunref(inp); > } else { > - in6_pcbnotify(&tcbtable, sa6, 0, > + in6_pcbnotify(&tcb6table, sa6, 0, > sa6_src, 0, rdomain, cmd, NULL, notify); > } > } > @@ -845,7 +842,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > void > tcp6_mtudisc_callback(struct sockaddr_in6 *sin6, u_int rdomain) > { > - in6_pcbnotify(&tcbtable, sin6, 0, > + in6_pcbnotify(&tcb6table, sin6, 0, > &sa6_any, 0, rdomain, PRC_MSGSIZE, NULL, tcp_mtudisc); > } > #endif /* INET6 */ > Index: netinet/tcp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v > diff -u -p -r1.230 tcp_usrreq.c > --- netinet/tcp_usrreq.c 11 Feb 2024 01:27:45 -0000 1.230 > +++ netinet/tcp_usrreq.c 11 Apr 2024 16:39:08 -0000 > @@ -171,6 +171,9 @@ const struct sysctl_bounded_args tcpctl_ > }; > > struct inpcbtable tcbtable; > +#ifdef INET6 > +struct inpcbtable tcb6table; > +#endif > > int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *); > int tcp_ident(void *, size_t *, void *, size_t, int); > @@ -317,7 +320,7 @@ tcp_ctloutput(int op, struct socket *so, > if (ISSET(inp->inp_flags, INP_IPV6)) > error = ip6_ctloutput(op, so, level, optname, m); > else > -#endif /* INET6 */ > +#endif > error = ip_ctloutput(op, so, level, optname, m); > return (error); > } > @@ -452,6 +455,7 @@ tcp_ctloutput(int op, struct socket *so, > int > tcp_attach(struct socket *so, int proto, int wait) > { > + struct inpcbtable *table; > struct tcpcb *tp; > struct inpcb *inp; > int error; > @@ -467,7 +471,13 @@ tcp_attach(struct socket *so, int proto, > } > > NET_ASSERT_LOCKED(); > - error = in_pcballoc(so, &tcbtable, wait); > +#ifdef INET6 > + if 
(so->so_proto->pr_domain->dom_family == PF_INET6) > + table = &tcb6table; > + else > +#endif > + table = &tcbtable; > + error = in_pcballoc(so, table, wait); > if (error) > return (error); > inp = sotoinpcb(so); > @@ -482,14 +492,11 @@ tcp_attach(struct socket *so, int proto, > } > tp->t_state = TCPS_CLOSED; > #ifdef INET6 > - /* we disallow IPv4 mapped address completely. */ > - if (inp->inp_flags & INP_IPV6) > + if (ISSET(inp->inp_flags, INP_IPV6)) > tp->pf = PF_INET6; > else > - tp->pf = PF_INET; > -#else > - tp->pf = PF_INET; > #endif > + tp->pf = PF_INET; > if ((so->so_options & SO_LINGER) && so->so_linger == 0) > so->so_linger = TCP_LINGERTIME; > > @@ -619,7 +626,7 @@ tcp_connect(struct socket *so, struct mb > } > > #ifdef INET6 > - if (inp->inp_flags & INP_IPV6) { > + if (ISSET(inp->inp_flags, INP_IPV6)) { > struct sockaddr_in6 *sin6; > > if ((error = in6_nam2sin6(nam, &sin6))) > @@ -630,7 +637,7 @@ tcp_connect(struct socket *so, struct mb > goto out; > } > } else > -#endif /* INET6 */ > +#endif > { > struct sockaddr_in *sin; > > @@ -1148,7 +1155,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v > switch (tir.faddr.ss_family) { > #ifdef INET6 > case AF_INET6: > - inp = in6_pcblookup(&tcbtable, &f6, > + inp = in6_pcblookup(&tcb6table, &f6, > fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); > break; > #endif > @@ -1175,7 +1182,7 @@ tcp_ident(void *oldp, size_t *oldlenp, v > switch (tir.faddr.ss_family) { > #ifdef INET6 > case AF_INET6: > - inp = in6_pcblookup_listen(&tcbtable, > + inp = in6_pcblookup_listen(&tcb6table, > &l6, lin6->sin6_port, NULL, tir.rdomain); > break; > #endif > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.176 tcp_var.h > --- netinet/tcp_var.h 13 Feb 2024 12:22:09 -0000 1.176 > +++ netinet/tcp_var.h 11 Apr 2024 16:39:08 -0000 > @@ -676,7 +676,7 @@ extern const struct pr_usrreqs tcp6_usrr > #endif > > extern 
struct pool tcpcb_pool; > -extern struct inpcbtable tcbtable; /* head of queue of active tcpcb's */ > +extern struct inpcbtable tcbtable, tcb6table; /* queue of active tcpcb's */ > extern int tcp_do_rfc1323; /* enabled/disabled? */ > extern int tcptv_keep_init; /* [N] time to keep alive initial SYN packet */ > extern int tcp_mssdflt; /* default maximum segment size */ > Index: netinet/udp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v > diff -u -p -r1.318 udp_usrreq.c > --- netinet/udp_usrreq.c 11 Feb 2024 18:14:26 -0000 1.318 > +++ netinet/udp_usrreq.c 11 Apr 2024 16:39:08 -0000 > @@ -1117,10 +1117,10 @@ udp_attach(struct socket *so, int proto, > if ((error = in_pcballoc(so, table, wait))) > return error; > #ifdef INET6 > - if (sotoinpcb(so)->inp_flags & INP_IPV6) > + if (ISSET(sotoinpcb(so)->inp_flags, INP_IPV6)) > sotoinpcb(so)->inp_ipv6.ip6_hlim = ip6_defhlim; > else > -#endif /* INET6 */ > +#endif > sotoinpcb(so)->inp_ip.ip_ttl = ip_defttl; > return 0; > } > @@ -1184,11 +1184,11 @@ udp_connect(struct socket *so, struct mb > soassertlocked(so); > > #ifdef INET6 > - if (inp->inp_flags & INP_IPV6) { > + if (ISSET(inp->inp_flags, INP_IPV6)) { > if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) > return (EISCONN); > } else > -#endif /* INET6 */ > +#endif > { > if (inp->inp_faddr.s_addr != INADDR_ANY) > return (EISCONN); > @@ -1209,11 +1209,11 @@ udp_disconnect(struct socket *so) > soassertlocked(so); > > #ifdef INET6 > - if (inp->inp_flags & INP_IPV6) { > + if (ISSET(inp->inp_flags, INP_IPV6)) { > if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) > return (ENOTCONN); > } else > -#endif /* INET6 */ > +#endif > { > if (inp->inp_faddr.s_addr == INADDR_ANY) > return (ENOTCONN); > @@ -1251,7 +1251,7 @@ udp_send(struct socket *so, struct mbuf > mtod(addr, struct sockaddr *)); > else > #ifdef INET6 > - if (inp->inp_flags & INP_IPV6) > + if (ISSET(inp->inp_flags, INP_IPV6)) > 
session = > pipex_l2tp_userland_lookup_session_ipv6( > m, inp->inp_faddr6); > Index: netinet6/in6_pcb.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v > diff -u -p -r1.143 in6_pcb.c > --- netinet6/in6_pcb.c 31 Mar 2024 15:53:12 -0000 1.143 > +++ netinet6/in6_pcb.c 11 Apr 2024 16:39:08 -0000 > @@ -479,8 +479,7 @@ in6_pcbnotify(struct inpcbtable *table, > rw_enter_write(&table->inpt_notify); > mtx_enter(&table->inpt_mtx); > TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { > - if (!ISSET(inp->inp_flags, INP_IPV6)) > - continue; > + KASSERT(ISSET(inp->inp_flags, INP_IPV6)); > > /* > * Under the following condition, notify of redirects > @@ -580,8 +579,8 @@ in6_pcbhash_lookup(struct inpcbtable *ta > > head = &table->inpt_hashtbl[hash & table->inpt_mask]; > LIST_FOREACH(inp, head, inp_hash) { > - if (!ISSET(inp->inp_flags, INP_IPV6)) > - continue; > + KASSERT(ISSET(inp->inp_flags, INP_IPV6)); > + > if (inp->inp_fport == fport && inp->inp_lport == lport && > IN6_ARE_ADDR_EQUAL(&inp->inp_faddr6, faddr) && > IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr) && >