From: Alexander Bluhm Subject: Re: ip6 forward mcopy To: tech@openbsd.org Date: Mon, 24 Jun 2024 23:40:15 +0200 On Mon, Jun 24, 2024 at 12:23:56PM +0200, Claudio Jeker wrote: > On Mon, Jun 24, 2024 at 11:33:44AM +0200, Alexander Bluhm wrote: > > Hi, > > > > Forwarding IPv6 packets is slower than IPv4. Reason is the m_copym() > > that is done for every packet. Just in case we may have to send > > an ICMP6 packet, ip6_forward() creates a mbuf copy. After that > > mbuf cluster is read only, so for the ethernet header another mbuf > > is allocated. pf NAT and RDR ignores readonly clusters, so it also > > modifies the potential ICMP6 packet. > > > > IPv4 ip_forward() avoids all these problems by copying the leading > > 68 bytes of the original packets onto the stack. More is not need > > for ICMP. IPv6 RFC requires up to 1232 bytes in the ICMP6 packet. > > This cannot be copied to the stack. > > > > The reason for the difference is that the ICMP6 packet has to contain > > the full header chain. I think if we have a simple UDP or TCP > > packet without chain, we could do a shortcut and just copy the > > header to ICMP6 packet. > > > > Do we want to violate the RFC and gain 30% performance for the > > common TCP/UDP case? > > Why only TCP/UDP? IMO this could be done for all L4 IP protocols. > Only if the packet has hop-to-hop headers or some other strange > intermediate header we need to copy all. Depends whether you are on the opt-in or opt-out side. pf pf_walk_header6() steps over IPPROTO_FRAGMENT, IPPROTO_ROUTING, IPPROTO_HOPOPTS, IPPROTO_AH, IPPROTO_DSTOPTS, some IPPROTO_ICMPV6. The others are considered final and not analyzed in the chain. I wanted to be on the safe side and took IPPROTO_TCP, IPPROTO_UDP as they are performance critical. But we can extend the list. > Do you know what other systems do here? Bouncing back 1232 bytes of > payload is just bonkers. Both NetBSD and FreeBSD use m_copym(ICMPV6_PLD_MAXLEN) like KAME. Requirement comes from RFC 4443: Internet Control Message Protocol (ICMPv6) for the Internet Protocol Version 6 (IPv6) Specification 2.4. Message Processing Rules (c) Every ICMPv6 error message (type < 128) MUST include as much of the IPv6 offending (invoking) packet (the packet that caused the error) as possible without making the error message packet exceed the minimum IPv6 MTU [IPv6]. Diff below contains ph_loopcnt as suggested by claudio@. bluhm Index: netinet6/ip6_forward.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v diff -u -p -r1.119 ip6_forward.c --- netinet6/ip6_forward.c 20 Jun 2024 19:25:42 -0000 1.119 +++ netinet6/ip6_forward.c 24 Jun 2024 21:05:11 -0000 @@ -89,8 +89,16 @@ ip6_forward(struct mbuf *m, struct route struct rtentry *rt; struct sockaddr *dst; struct ifnet *ifp = NULL; - int error = 0, type = 0, code = 0, destmtu = 0; + u_int rtableid = m->m_pkthdr.ph_rtableid; + u_int ifidx = m->m_pkthdr.ph_ifidx; + u_int8_t loopcnt = m->m_pkthdr.ph_loopcnt; + u_int icmp_len; + char icmp_buf[sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + + MAX_TCPOPTLEN]; + CTASSERT(sizeof(icmp_buf) <= MHLEN); + u_short mflags, pfflags; struct mbuf *mcopy; + int error = 0, type = 0, code = 0, destmtu = 0; #ifdef IPSEC struct tdb *tdb = NULL; #endif /* IPSEC */ @@ -117,9 +125,7 @@ ip6_forward(struct mbuf *m, struct route log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on interface %u\n", - src6, dst6, - ip6->ip6_nxt, - m->m_pkthdr.ph_ifidx); + src6, dst6, ip6->ip6_nxt, ifidx); } m_freem(m); goto done; @@ -138,11 +144,30 @@ ip6_forward(struct mbuf *m, struct route * we need to generate an ICMP6 message to the src. * Thanks to M_EXT, in most cases copy will not occur. * + * For final protocol header like TCP or UDP, full header chain in + * ICMP6 packet is not necessary. In this case only copy small + * part of original packet and save it on stack instead of mbuf. + * Although this violates RFC, it avoids additional mbuf allocations. + * Also pf nat and rdr do not affect the shared mbuf cluster. + * * It is important to save it before IPsec processing as IPsec * processing may modify the mbuf. */ - mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN), - M_NOWAIT); + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + case IPPROTO_UDP: + icmp_len = min(m->m_pkthdr.len, sizeof(icmp_buf)); + mflags = m->m_flags; + pfflags = m->m_pkthdr.pf.flags; + m_copydata(m, 0, icmp_len, icmp_buf); + mcopy = NULL; + break; + default: + icmp_len = min(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN); + mcopy = m_copym(m, 0, icmp_len, M_NOWAIT); + icmp_len = 0; + break; + } #if NPF > 0 reroute: @@ -174,12 +199,10 @@ reroute: m->m_pkthdr.ph_rtableid); if (rt == NULL) { ip6stat_inc(ip6s_noroute); - if (mcopy != NULL) { - icmp6_error(mcopy, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_NOROUTE, 0); - } + type = ICMP6_DST_UNREACH; + code = ICMP6_DST_UNREACH_NOROUTE; m_freem(m); - goto done; + goto icmperror; } dst = &ro->ro_dstsa; @@ -190,7 +213,7 @@ reroute: * unreachable error with Code 2 (beyond scope of source address). * [draft-ietf-ipngwg-icmp-v3-00.txt, Section 3.1] */ - if (in6_addr2scopeid(m->m_pkthdr.ph_ifidx, &ip6->ip6_src) != + if (in6_addr2scopeid(ifidx, &ip6->ip6_src) != in6_addr2scopeid(rt->rt_ifidx, &ip6->ip6_src)) { time_t uptime; @@ -205,15 +228,12 @@ reroute: log(LOG_DEBUG, "cannot forward " "src %s, dst %s, nxt %d, rcvif %u, outif %u\n", - src6, dst6, - ip6->ip6_nxt, - m->m_pkthdr.ph_ifidx, rt->rt_ifidx); + src6, dst6, ip6->ip6_nxt, ifidx, rt->rt_ifidx); } - if (mcopy != NULL) - icmp6_error(mcopy, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_BEYONDSCOPE, 0); + type = ICMP6_DST_UNREACH; + code = ICMP6_DST_UNREACH_BEYONDSCOPE; m_freem(m); - goto done; + goto icmperror; } #ifdef IPSEC @@ -248,7 +268,7 @@ reroute: m_freem(m); goto freecopy; } - if (rt->rt_ifidx == m->m_pkthdr.ph_ifidx && + if (rt->rt_ifidx == ifidx && ip6_sendredirects && !ISSET(flags, IPV6_REDIRECT) && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((ifp->if_flags & IFF_POINTOPOINT) && @@ -268,11 +288,10 @@ reroute: * type/code is based on suggestion by Rich Draves. * not sure if it is the best pick. */ - if (mcopy != NULL) - icmp6_error(mcopy, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_ADDR, 0); + type = ICMP6_DST_UNREACH; + code = ICMP6_DST_UNREACH_ADDR; m_freem(m); - goto done; + goto icmperror; } type = ND_REDIRECT; } @@ -323,27 +342,41 @@ reroute: if (error || m == NULL) goto senderr; - if (mcopy != NULL) - icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); + type = ICMP6_PACKET_TOO_BIG; + destmtu = ifp->if_mtu; m_freem(m); - goto done; + goto icmperror; senderr: - if (mcopy == NULL) + if (mcopy == NULL && icmp_len == 0) goto done; switch (error) { case 0: if (type == ND_REDIRECT) { - icmp6_redirect_output(mcopy, rt); - ip6stat_inc(ip6s_redirectsent); + if (icmp_len != 0) { + mcopy = m_gethdr(M_DONTWAIT, MT_DATA); + if (mcopy == NULL) + goto done; + mcopy->m_len = mcopy->m_pkthdr.len = icmp_len; + mcopy->m_flags |= (mflags & M_COPYFLAGS); + mcopy->m_pkthdr.ph_rtableid = rtableid; + mcopy->m_pkthdr.ph_ifidx = ifidx; + mcopy->m_pkthdr.ph_loopcnt = loopcnt; + mcopy->m_pkthdr.pf.flags |= + (pfflags & PF_TAG_GENERATED); + memcpy(mcopy->m_data, icmp_buf, icmp_len); + } + if (mcopy != NULL) { + icmp6_redirect_output(mcopy, rt); + ip6stat_inc(ip6s_redirectsent); + } goto done; } goto freecopy; case EMSGSIZE: type = ICMP6_PACKET_TOO_BIG; - code = 0; if (rt != NULL) { if (rt->rt_mtu) { destmtu = rt->rt_mtu; @@ -381,7 +414,21 @@ senderr: code = ICMP6_DST_UNREACH_ADDR; break; } - icmp6_error(mcopy, type, code, destmtu); + icmperror: + if (icmp_len != 0) { + mcopy = m_gethdr(M_DONTWAIT, MT_DATA); + if (mcopy == NULL) + goto done; + mcopy->m_len = mcopy->m_pkthdr.len = icmp_len; + mcopy->m_flags |= (mflags & M_COPYFLAGS); + mcopy->m_pkthdr.ph_rtableid = rtableid; + mcopy->m_pkthdr.ph_ifidx = ifidx; + mcopy->m_pkthdr.ph_loopcnt = loopcnt; + mcopy->m_pkthdr.pf.flags |= (pfflags & PF_TAG_GENERATED); + memcpy(mcopy->m_data, icmp_buf, icmp_len); + } + if (mcopy != NULL) + icmp6_error(mcopy, type, code, destmtu); goto done; freecopy: