Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: ip6 forward mcopy
To:
tech@openbsd.org
Date:
Mon, 24 Jun 2024 23:40:15 +0200

Download raw body.

Thread
On Mon, Jun 24, 2024 at 12:23:56PM +0200, Claudio Jeker wrote:
> On Mon, Jun 24, 2024 at 11:33:44AM +0200, Alexander Bluhm wrote:
> > Hi,
> > 
> > Forwarding IPv6 packets is slower than IPv4.  Reason is the m_copym()
> > that is done for every packet.  Just in case we may have to send
> > an ICMP6 packet, ip6_forward() creates a mbuf copy.  After that
> > mbuf cluster is read only, so for the ethernet header another mbuf
> > is allocated.  pf NAT and RDR ignores readonly clusters, so it also
> > modifies the potential ICMP6 packet.
> > 
> > IPv4 ip_forward() avoids all these problems by copying the leading
> > 68 bytes of the original packets onto the stack.  More is not need
> > for ICMP.  IPv6 RFC requires up to 1232 bytes in the ICMP6 packet.
> > This cannot be copied to the stack.
> > 
> > The reason for the difference is that the ICMP6 packet has to contain
> > the full header chain.  I think if we have a simple UDP or TCP
> > packet without chain, we could do a shortcut and just copy the
> > header to ICMP6 packet.
> > 
> > Do we want to violate the RFC and gain 30% performance for the
> > common TCP/UDP case?
> 
> Why only TCP/UDP? IMO this could be done for all L4 IP protocols.
> Only if the packet has hop-to-hop headers or some other strange
> intermediate header we need to copy all.

Depends whether you are on the opt-in or opt-out side.  pf
pf_walk_header6() steps over IPPROTO_FRAGMENT, IPPROTO_ROUTING,
IPPROTO_HOPOPTS, IPPROTO_AH, IPPROTO_DSTOPTS, some IPPROTO_ICMPV6.
The others are considered final and not analyzed in the chain.  I
wanted to be on the safe side and took IPPROTO_TCP, IPPROTO_UDP as
they are performance critical.  But we can extend the list.

> Do you know what other systems do here? Bouncing back 1232 bytes of
> payload is just bonkers.

Both NetBSD and FreeBSD use m_copym(ICMPV6_PLD_MAXLEN) like KAME.

Requirement comes from RFC 4443:
               Internet Control Message Protocol (ICMPv6)
        for the Internet Protocol Version 6 (IPv6) Specification
2.4.  Message Processing Rules
   (c) Every ICMPv6 error message (type < 128) MUST include as much of
       the IPv6 offending (invoking) packet (the packet that caused the
       error) as possible without making the error message packet exceed
       the minimum IPv6 MTU [IPv6].

Diff below contains ph_loopcnt as suggested by claudio@.

bluhm

Index: netinet6/ip6_forward.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
diff -u -p -r1.119 ip6_forward.c
--- netinet6/ip6_forward.c	20 Jun 2024 19:25:42 -0000	1.119
+++ netinet6/ip6_forward.c	24 Jun 2024 21:05:11 -0000
@@ -89,8 +89,16 @@ ip6_forward(struct mbuf *m, struct route
 	struct rtentry *rt;
 	struct sockaddr *dst;
 	struct ifnet *ifp = NULL;
-	int error = 0, type = 0, code = 0, destmtu = 0;
+	u_int rtableid = m->m_pkthdr.ph_rtableid;
+	u_int ifidx = m->m_pkthdr.ph_ifidx;
+	u_int8_t loopcnt = m->m_pkthdr.ph_loopcnt;
+	u_int icmp_len;
+	char icmp_buf[sizeof(struct ip6_hdr) + sizeof(struct tcphdr) +
+	    MAX_TCPOPTLEN];
+	CTASSERT(sizeof(icmp_buf) <= MHLEN);
+	u_short mflags, pfflags;
 	struct mbuf *mcopy;
+	int error = 0, type = 0, code = 0, destmtu = 0;
 #ifdef IPSEC
 	struct tdb *tdb = NULL;
 #endif /* IPSEC */
@@ -117,9 +125,7 @@ ip6_forward(struct mbuf *m, struct route
 			log(LOG_DEBUG,
 			    "cannot forward "
 			    "from %s to %s nxt %d received on interface %u\n",
-			    src6, dst6,
-			    ip6->ip6_nxt,
-			    m->m_pkthdr.ph_ifidx);
+			    src6, dst6, ip6->ip6_nxt, ifidx);
 		}
 		m_freem(m);
 		goto done;
@@ -138,11 +144,30 @@ ip6_forward(struct mbuf *m, struct route
 	 * we need to generate an ICMP6 message to the src.
 	 * Thanks to M_EXT, in most cases copy will not occur.
 	 *
+	 * For final protocol header like TCP or UDP, full header chain in
+	 * ICMP6 packet is not necessary.  In this case only copy small
+	 * part of original packet and save it on stack instead of mbuf.
+	 * Although this violates RFC, it avoids additional mbuf allocations.
+	 * Also pf nat and rdr do not affect the shared mbuf cluster.
+	 *
 	 * It is important to save it before IPsec processing as IPsec
 	 * processing may modify the mbuf.
 	 */
-	mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN),
-	    M_NOWAIT);
+	switch (ip6->ip6_nxt) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		icmp_len = min(m->m_pkthdr.len, sizeof(icmp_buf));
+		mflags = m->m_flags;
+		pfflags = m->m_pkthdr.pf.flags;
+		m_copydata(m, 0, icmp_len, icmp_buf);
+		mcopy = NULL;
+		break;
+	default:
+		icmp_len = min(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN);
+		mcopy = m_copym(m, 0, icmp_len, M_NOWAIT);
+		icmp_len = 0;
+		break;
+	}
 
 #if NPF > 0
 reroute:
@@ -174,12 +199,10 @@ reroute:
 	    m->m_pkthdr.ph_rtableid);
 	if (rt == NULL) {
 		ip6stat_inc(ip6s_noroute);
-		if (mcopy != NULL) {
-			icmp6_error(mcopy, ICMP6_DST_UNREACH,
-				    ICMP6_DST_UNREACH_NOROUTE, 0);
-		}
+		type = ICMP6_DST_UNREACH;
+		code = ICMP6_DST_UNREACH_NOROUTE;
 		m_freem(m);
-		goto done;
+		goto icmperror;
 	}
 	dst = &ro->ro_dstsa;
 
@@ -190,7 +213,7 @@ reroute:
 	 * unreachable error with Code 2 (beyond scope of source address).
 	 * [draft-ietf-ipngwg-icmp-v3-00.txt, Section 3.1]
 	 */
-	if (in6_addr2scopeid(m->m_pkthdr.ph_ifidx, &ip6->ip6_src) !=
+	if (in6_addr2scopeid(ifidx, &ip6->ip6_src) !=
 	    in6_addr2scopeid(rt->rt_ifidx, &ip6->ip6_src)) {
 		time_t uptime;
 
@@ -205,15 +228,12 @@ reroute:
 			log(LOG_DEBUG,
 			    "cannot forward "
 			    "src %s, dst %s, nxt %d, rcvif %u, outif %u\n",
-			    src6, dst6,
-			    ip6->ip6_nxt,
-			    m->m_pkthdr.ph_ifidx, rt->rt_ifidx);
+			    src6, dst6, ip6->ip6_nxt, ifidx, rt->rt_ifidx);
 		}
-		if (mcopy != NULL)
-			icmp6_error(mcopy, ICMP6_DST_UNREACH,
-				    ICMP6_DST_UNREACH_BEYONDSCOPE, 0);
+		type = ICMP6_DST_UNREACH;
+		code = ICMP6_DST_UNREACH_BEYONDSCOPE;
 		m_freem(m);
-		goto done;
+		goto icmperror;
 	}
 
 #ifdef IPSEC
@@ -248,7 +268,7 @@ reroute:
 		m_freem(m);
 		goto freecopy;
 	}
-	if (rt->rt_ifidx == m->m_pkthdr.ph_ifidx &&
+	if (rt->rt_ifidx == ifidx &&
 	    ip6_sendredirects && !ISSET(flags, IPV6_REDIRECT) &&
 	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) &&
@@ -268,11 +288,10 @@ reroute:
 			 * type/code is based on suggestion by Rich Draves.
 			 * not sure if it is the best pick.
 			 */
-			if (mcopy != NULL)
-				icmp6_error(mcopy, ICMP6_DST_UNREACH,
-				    ICMP6_DST_UNREACH_ADDR, 0);
+			type = ICMP6_DST_UNREACH;
+			code = ICMP6_DST_UNREACH_ADDR;
 			m_freem(m);
-			goto done;
+			goto icmperror;
 		}
 		type = ND_REDIRECT;
 	}
@@ -323,27 +342,41 @@ reroute:
 	if (error || m == NULL)
 		goto senderr;
 
-	if (mcopy != NULL)
-		icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+	type = ICMP6_PACKET_TOO_BIG;
+	destmtu = ifp->if_mtu;
 	m_freem(m);
-	goto done;
+	goto icmperror;
 
 senderr:
-	if (mcopy == NULL)
+	if (mcopy == NULL && icmp_len == 0)
 		goto done;
 
 	switch (error) {
 	case 0:
 		if (type == ND_REDIRECT) {
-			icmp6_redirect_output(mcopy, rt);
-			ip6stat_inc(ip6s_redirectsent);
+			if (icmp_len != 0) {
+				mcopy = m_gethdr(M_DONTWAIT, MT_DATA);
+				if (mcopy == NULL)
+					goto done;
+				mcopy->m_len = mcopy->m_pkthdr.len = icmp_len;
+				mcopy->m_flags |= (mflags & M_COPYFLAGS);
+				mcopy->m_pkthdr.ph_rtableid = rtableid;
+				mcopy->m_pkthdr.ph_ifidx = ifidx;
+				mcopy->m_pkthdr.ph_loopcnt = loopcnt;
+				mcopy->m_pkthdr.pf.flags |=
+				    (pfflags & PF_TAG_GENERATED);
+				memcpy(mcopy->m_data, icmp_buf, icmp_len);
+			}
+			if (mcopy != NULL) {
+				icmp6_redirect_output(mcopy, rt);
+				ip6stat_inc(ip6s_redirectsent);
+			}
 			goto done;
 		}
 		goto freecopy;
 
 	case EMSGSIZE:
 		type = ICMP6_PACKET_TOO_BIG;
-		code = 0;
 		if (rt != NULL) {
 			if (rt->rt_mtu) {
 				destmtu = rt->rt_mtu;
@@ -381,7 +414,21 @@ senderr:
 		code = ICMP6_DST_UNREACH_ADDR;
 		break;
 	}
-	icmp6_error(mcopy, type, code, destmtu);
+ icmperror:
+	if (icmp_len != 0) {
+		mcopy = m_gethdr(M_DONTWAIT, MT_DATA);
+		if (mcopy == NULL)
+			goto done;
+		mcopy->m_len = mcopy->m_pkthdr.len = icmp_len;
+		mcopy->m_flags |= (mflags & M_COPYFLAGS);
+		mcopy->m_pkthdr.ph_rtableid = rtableid;
+		mcopy->m_pkthdr.ph_ifidx = ifidx;
+		mcopy->m_pkthdr.ph_loopcnt = loopcnt;
+		mcopy->m_pkthdr.pf.flags |= (pfflags & PF_TAG_GENERATED);
+		memcpy(mcopy->m_data, icmp_buf, icmp_len);
+	}
+	if (mcopy != NULL)
+		icmp6_error(mcopy, type, code, destmtu);
 	goto done;
 
  freecopy: