From: Claudio Jeker Subject: Re: ip6 forward mcopy To: Alexander Bluhm Cc: tech@openbsd.org Date: Mon, 24 Jun 2024 12:23:56 +0200 On Mon, Jun 24, 2024 at 11:33:44AM +0200, Alexander Bluhm wrote: > Hi, > > Forwarding IPv6 packets is slower than IPv4. Reason is the m_copym() > that is done for every packet. Just in case we may have to send > an ICMP6 packet, ip6_forward() creates a mbuf copy. After that > mbuf cluster is read only, so for the ethernet header another mbuf > is allocated. pf NAT and RDR ignores readonly clusters, so it also > modifies the potential ICMP6 packet. > > IPv4 ip_forward() avoids all these problems by copying the leading > 68 bytes of the original packets onto the stack. More is not need > for ICMP. IPv6 RFC requires up to 1232 bytes in the ICMP6 packet. > This cannot be copied to the stack. > > The reason for the difference is that the ICMP6 packet has to contain > the full header chain. I think if we have a simple UDP or TCP > packet without chain, we could do a shortcut and just copy the > header to ICMP6 packet. > > Do we want to violate the RFC and gain 30% performance for the > common TCP/UDP case? Why only TCP/UDP? IMO this could be done for all L4 IP protocols. Only if the packet has hop-to-hop headers or some other strange intermediate header we need to copy all. Do you know what other systems do here? Bouncing back 1232 bytes of payload is just bonkers. > bluhm > > Index: netinet6/ip6_forward.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v > diff -u -p -r1.119 ip6_forward.c > --- netinet6/ip6_forward.c 20 Jun 2024 19:25:42 -0000 1.119 > +++ netinet6/ip6_forward.c 24 Jun 2024 08:16:04 -0000 > @@ -89,8 +89,15 @@ ip6_forward(struct mbuf *m, struct route > struct rtentry *rt; > struct sockaddr *dst; > struct ifnet *ifp = NULL; > - int error = 0, type = 0, code = 0, destmtu = 0; > + u_int rtableid = m->m_pkthdr.ph_rtableid; > + u_int ifidx = m->m_pkthdr.ph_ifidx; > + u_int icmp_len; > + char icmp_buf[sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + > + MAX_TCPOPTLEN]; > + CTASSERT(sizeof(icmp_buf) <= MHLEN); > + u_short mflags, pfflags; > struct mbuf *mcopy; > + int error = 0, type = 0, code = 0, destmtu = 0; > #ifdef IPSEC > struct tdb *tdb = NULL; > #endif /* IPSEC */ > @@ -117,9 +124,7 @@ ip6_forward(struct mbuf *m, struct route > log(LOG_DEBUG, > "cannot forward " > "from %s to %s nxt %d received on interface %u\n", > - src6, dst6, > - ip6->ip6_nxt, > - m->m_pkthdr.ph_ifidx); > + src6, dst6, ip6->ip6_nxt, ifidx); > } > m_freem(m); > goto done; > @@ -138,11 +143,30 @@ ip6_forward(struct mbuf *m, struct route > * we need to generate an ICMP6 message to the src. > * Thanks to M_EXT, in most cases copy will not occur. > * > + * For final protocol header like TCP or UDP, full header chain in > + * ICMP6 packet is not necessary. In this case only copy small > + * part of original packet and save it on stack instead of mbuf. > + * Although this violates RFC, it avoids additional mbuf allocations. > + * Also pf nat and rdr do not affect the shared mbuf cluster. > + * > * It is important to save it before IPsec processing as IPsec > * processing may modify the mbuf. > */ > - mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN), > - M_NOWAIT); > + switch (ip6->ip6_nxt) { > + case IPPROTO_TCP: > + case IPPROTO_UDP: > + icmp_len = min(m->m_pkthdr.len, sizeof(icmp_buf)); > + mflags = m->m_flags; > + pfflags = m->m_pkthdr.pf.flags; > + m_copydata(m, 0, icmp_len, icmp_buf); > + mcopy = NULL; > + break; > + default: > + icmp_len = min(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN); > + mcopy = m_copym(m, 0, icmp_len, M_NOWAIT); > + icmp_len = 0; > + break; > + } > > #if NPF > 0 > reroute: > @@ -174,12 +198,10 @@ reroute: > m->m_pkthdr.ph_rtableid); > if (rt == NULL) { > ip6stat_inc(ip6s_noroute); > - if (mcopy != NULL) { > - icmp6_error(mcopy, ICMP6_DST_UNREACH, > - ICMP6_DST_UNREACH_NOROUTE, 0); > - } > + type = ICMP6_DST_UNREACH; > + code = ICMP6_DST_UNREACH_NOROUTE; > m_freem(m); > - goto done; > + goto icmperror; > } > dst = &ro->ro_dstsa; > > @@ -190,7 +212,7 @@ reroute: > * unreachable error with Code 2 (beyond scope of source address). > * [draft-ietf-ipngwg-icmp-v3-00.txt, Section 3.1] > */ > - if (in6_addr2scopeid(m->m_pkthdr.ph_ifidx, &ip6->ip6_src) != > + if (in6_addr2scopeid(ifidx, &ip6->ip6_src) != > in6_addr2scopeid(rt->rt_ifidx, &ip6->ip6_src)) { > time_t uptime; > > @@ -205,15 +227,12 @@ reroute: > log(LOG_DEBUG, > "cannot forward " > "src %s, dst %s, nxt %d, rcvif %u, outif %u\n", > - src6, dst6, > - ip6->ip6_nxt, > - m->m_pkthdr.ph_ifidx, rt->rt_ifidx); > + src6, dst6, ip6->ip6_nxt, ifidx, rt->rt_ifidx); > } > - if (mcopy != NULL) > - icmp6_error(mcopy, ICMP6_DST_UNREACH, > - ICMP6_DST_UNREACH_BEYONDSCOPE, 0); > + type = ICMP6_DST_UNREACH; > + code = ICMP6_DST_UNREACH_BEYONDSCOPE; > m_freem(m); > - goto done; > + goto icmperror; > } > > #ifdef IPSEC > @@ -248,7 +267,7 @@ reroute: > m_freem(m); > goto freecopy; > } > - if (rt->rt_ifidx == m->m_pkthdr.ph_ifidx && > + if (rt->rt_ifidx == ifidx && > ip6_sendredirects && !ISSET(flags, IPV6_REDIRECT) && > (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { > if ((ifp->if_flags & IFF_POINTOPOINT) && > @@ -268,11 +287,10 @@ reroute: > * type/code is based on suggestion by Rich Draves. > * not sure if it is the best pick. > */ > - if (mcopy != NULL) > - icmp6_error(mcopy, ICMP6_DST_UNREACH, > - ICMP6_DST_UNREACH_ADDR, 0); > + type = ICMP6_DST_UNREACH; > + code = ICMP6_DST_UNREACH_ADDR; > m_freem(m); > - goto done; > + goto icmperror; > } > type = ND_REDIRECT; > } > @@ -323,27 +341,40 @@ reroute: > if (error || m == NULL) > goto senderr; > > - if (mcopy != NULL) > - icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); > + type = ICMP6_PACKET_TOO_BIG; > + destmtu = ifp->if_mtu; > m_freem(m); > - goto done; > + goto icmperror; > > senderr: > - if (mcopy == NULL) > + if (mcopy == NULL && icmp_len == 0) > goto done; > > switch (error) { > case 0: > if (type == ND_REDIRECT) { > - icmp6_redirect_output(mcopy, rt); > - ip6stat_inc(ip6s_redirectsent); > + if (icmp_len != 0) { > + mcopy = m_gethdr(M_DONTWAIT, MT_DATA); > + if (mcopy == NULL) > + goto done; > + mcopy->m_len = mcopy->m_pkthdr.len = icmp_len; > + mcopy->m_flags |= (mflags & M_COPYFLAGS); > + mcopy->m_pkthdr.ph_rtableid = rtableid; > + mcopy->m_pkthdr.ph_ifidx = ifidx; > + mcopy->m_pkthdr.pf.flags |= > + (pfflags & PF_TAG_GENERATED); > + memcpy(mcopy->m_data, icmp_buf, icmp_len); > + } > + if (mcopy != NULL) { > + icmp6_redirect_output(mcopy, rt); > + ip6stat_inc(ip6s_redirectsent); > + } > goto done; > } > goto freecopy; > > case EMSGSIZE: > type = ICMP6_PACKET_TOO_BIG; > - code = 0; > if (rt != NULL) { > if (rt->rt_mtu) { > destmtu = rt->rt_mtu; > @@ -381,7 +412,20 @@ senderr: > code = ICMP6_DST_UNREACH_ADDR; > break; > } > - icmp6_error(mcopy, type, code, destmtu); > + icmperror: > + if (icmp_len != 0) { > + mcopy = m_gethdr(M_DONTWAIT, MT_DATA); > + if (mcopy == NULL) > + goto done; > + mcopy->m_len = mcopy->m_pkthdr.len = icmp_len; > + mcopy->m_flags |= (mflags & M_COPYFLAGS); > + mcopy->m_pkthdr.ph_rtableid = rtableid; > + mcopy->m_pkthdr.ph_ifidx = ifidx; > + mcopy->m_pkthdr.pf.flags |= (pfflags & PF_TAG_GENERATED); > + memcpy(mcopy->m_data, icmp_buf, icmp_len); > + } > + if (mcopy != NULL) > + icmp6_error(mcopy, type, code, destmtu); > goto done; > > freecopy: > -- :wq Claudio