Download raw body.
ip6 forward mcopy
On Mon, Jun 24, 2024 at 12:23:56PM +0200, Claudio Jeker wrote:
> On Mon, Jun 24, 2024 at 11:33:44AM +0200, Alexander Bluhm wrote:
> > Hi,
> >
> > Forwarding IPv6 packets is slower than IPv4. Reason is the m_copym()
> > that is done for every packet. Just in case we may have to send
> > an ICMP6 packet, ip6_forward() creates a mbuf copy. After that
> > mbuf cluster is read only, so for the ethernet header another mbuf
> > is allocated. pf NAT and RDR ignores readonly clusters, so it also
> > modifies the potential ICMP6 packet.
> >
> > IPv4 ip_forward() avoids all these problems by copying the leading
> > 68 bytes of the original packets onto the stack. More is not need
> > for ICMP. IPv6 RFC requires up to 1232 bytes in the ICMP6 packet.
> > This cannot be copied to the stack.
> >
> > The reason for the difference is that the ICMP6 packet has to contain
> > the full header chain. I think if we have a simple UDP or TCP
> > packet without chain, we could do a shortcut and just copy the
> > header to ICMP6 packet.
> >
> > Do we want to violate the RFC and gain 30% performance for the
> > common TCP/UDP case?
>
> Why only TCP/UDP? IMO this could be done for all L4 IP protocols.
> Only if the packet has hop-to-hop headers or some other strange
> intermediate header we need to copy all.
Depends whether you are on the opt-in or opt-out side. pf
pf_walk_header6() steps over IPPROTO_FRAGMENT, IPPROTO_ROUTING,
IPPROTO_HOPOPTS, IPPROTO_AH, IPPROTO_DSTOPTS, some IPPROTO_ICMPV6.
The others are considered final and not analyzed in the chain. I
wanted to be on the safe side and took IPPROTO_TCP, IPPROTO_UDP as
they are performance critical. But we can extend the list.
> Do you know what other systems do here? Bouncing back 1232 bytes of
> payload is just bonkers.
Both NetBSD and FreeBSD use m_copym(ICMPV6_PLD_MAXLEN) like KAME.
Requirement comes from RFC 4443:
Internet Control Message Protocol (ICMPv6)
for the Internet Protocol Version 6 (IPv6) Specification
2.4. Message Processing Rules
(c) Every ICMPv6 error message (type < 128) MUST include as much of
the IPv6 offending (invoking) packet (the packet that caused the
error) as possible without making the error message packet exceed
the minimum IPv6 MTU [IPv6].
Diff below contains ph_loopcnt as suggested by claudio@.
bluhm
Index: netinet6/ip6_forward.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
diff -u -p -r1.119 ip6_forward.c
--- netinet6/ip6_forward.c 20 Jun 2024 19:25:42 -0000 1.119
+++ netinet6/ip6_forward.c 24 Jun 2024 21:05:11 -0000
@@ -89,8 +89,16 @@ ip6_forward(struct mbuf *m, struct route
struct rtentry *rt;
struct sockaddr *dst;
struct ifnet *ifp = NULL;
- int error = 0, type = 0, code = 0, destmtu = 0;
+ u_int rtableid = m->m_pkthdr.ph_rtableid;
+ u_int ifidx = m->m_pkthdr.ph_ifidx;
+ u_int8_t loopcnt = m->m_pkthdr.ph_loopcnt;
+ u_int icmp_len;
+ char icmp_buf[sizeof(struct ip6_hdr) + sizeof(struct tcphdr) +
+ MAX_TCPOPTLEN];
+ CTASSERT(sizeof(icmp_buf) <= MHLEN);
+ u_short mflags, pfflags;
struct mbuf *mcopy;
+ int error = 0, type = 0, code = 0, destmtu = 0;
#ifdef IPSEC
struct tdb *tdb = NULL;
#endif /* IPSEC */
@@ -117,9 +125,7 @@ ip6_forward(struct mbuf *m, struct route
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on interface %u\n",
- src6, dst6,
- ip6->ip6_nxt,
- m->m_pkthdr.ph_ifidx);
+ src6, dst6, ip6->ip6_nxt, ifidx);
}
m_freem(m);
goto done;
@@ -138,11 +144,30 @@ ip6_forward(struct mbuf *m, struct route
* we need to generate an ICMP6 message to the src.
* Thanks to M_EXT, in most cases copy will not occur.
*
+ * For final protocol header like TCP or UDP, full header chain in
+ * ICMP6 packet is not necessary. In this case only copy small
+ * part of original packet and save it on stack instead of mbuf.
+ * Although this violates RFC, it avoids additional mbuf allocations.
+ * Also pf nat and rdr do not affect the shared mbuf cluster.
+ *
* It is important to save it before IPsec processing as IPsec
* processing may modify the mbuf.
*/
- mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN),
- M_NOWAIT);
+ switch (ip6->ip6_nxt) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ icmp_len = min(m->m_pkthdr.len, sizeof(icmp_buf));
+ mflags = m->m_flags;
+ pfflags = m->m_pkthdr.pf.flags;
+ m_copydata(m, 0, icmp_len, icmp_buf);
+ mcopy = NULL;
+ break;
+ default:
+ icmp_len = min(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN);
+ mcopy = m_copym(m, 0, icmp_len, M_NOWAIT);
+ icmp_len = 0;
+ break;
+ }
#if NPF > 0
reroute:
@@ -174,12 +199,10 @@ reroute:
m->m_pkthdr.ph_rtableid);
if (rt == NULL) {
ip6stat_inc(ip6s_noroute);
- if (mcopy != NULL) {
- icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_NOROUTE, 0);
- }
+ type = ICMP6_DST_UNREACH;
+ code = ICMP6_DST_UNREACH_NOROUTE;
m_freem(m);
- goto done;
+ goto icmperror;
}
dst = &ro->ro_dstsa;
@@ -190,7 +213,7 @@ reroute:
* unreachable error with Code 2 (beyond scope of source address).
* [draft-ietf-ipngwg-icmp-v3-00.txt, Section 3.1]
*/
- if (in6_addr2scopeid(m->m_pkthdr.ph_ifidx, &ip6->ip6_src) !=
+ if (in6_addr2scopeid(ifidx, &ip6->ip6_src) !=
in6_addr2scopeid(rt->rt_ifidx, &ip6->ip6_src)) {
time_t uptime;
@@ -205,15 +228,12 @@ reroute:
log(LOG_DEBUG,
"cannot forward "
"src %s, dst %s, nxt %d, rcvif %u, outif %u\n",
- src6, dst6,
- ip6->ip6_nxt,
- m->m_pkthdr.ph_ifidx, rt->rt_ifidx);
+ src6, dst6, ip6->ip6_nxt, ifidx, rt->rt_ifidx);
}
- if (mcopy != NULL)
- icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_BEYONDSCOPE, 0);
+ type = ICMP6_DST_UNREACH;
+ code = ICMP6_DST_UNREACH_BEYONDSCOPE;
m_freem(m);
- goto done;
+ goto icmperror;
}
#ifdef IPSEC
@@ -248,7 +268,7 @@ reroute:
m_freem(m);
goto freecopy;
}
- if (rt->rt_ifidx == m->m_pkthdr.ph_ifidx &&
+ if (rt->rt_ifidx == ifidx &&
ip6_sendredirects && !ISSET(flags, IPV6_REDIRECT) &&
(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) {
if ((ifp->if_flags & IFF_POINTOPOINT) &&
@@ -268,11 +288,10 @@ reroute:
* type/code is based on suggestion by Rich Draves.
* not sure if it is the best pick.
*/
- if (mcopy != NULL)
- icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_ADDR, 0);
+ type = ICMP6_DST_UNREACH;
+ code = ICMP6_DST_UNREACH_ADDR;
m_freem(m);
- goto done;
+ goto icmperror;
}
type = ND_REDIRECT;
}
@@ -323,27 +342,41 @@ reroute:
if (error || m == NULL)
goto senderr;
- if (mcopy != NULL)
- icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+ type = ICMP6_PACKET_TOO_BIG;
+ destmtu = ifp->if_mtu;
m_freem(m);
- goto done;
+ goto icmperror;
senderr:
- if (mcopy == NULL)
+ if (mcopy == NULL && icmp_len == 0)
goto done;
switch (error) {
case 0:
if (type == ND_REDIRECT) {
- icmp6_redirect_output(mcopy, rt);
- ip6stat_inc(ip6s_redirectsent);
+ if (icmp_len != 0) {
+ mcopy = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (mcopy == NULL)
+ goto done;
+ mcopy->m_len = mcopy->m_pkthdr.len = icmp_len;
+ mcopy->m_flags |= (mflags & M_COPYFLAGS);
+ mcopy->m_pkthdr.ph_rtableid = rtableid;
+ mcopy->m_pkthdr.ph_ifidx = ifidx;
+ mcopy->m_pkthdr.ph_loopcnt = loopcnt;
+ mcopy->m_pkthdr.pf.flags |=
+ (pfflags & PF_TAG_GENERATED);
+ memcpy(mcopy->m_data, icmp_buf, icmp_len);
+ }
+ if (mcopy != NULL) {
+ icmp6_redirect_output(mcopy, rt);
+ ip6stat_inc(ip6s_redirectsent);
+ }
goto done;
}
goto freecopy;
case EMSGSIZE:
type = ICMP6_PACKET_TOO_BIG;
- code = 0;
if (rt != NULL) {
if (rt->rt_mtu) {
destmtu = rt->rt_mtu;
@@ -381,7 +414,21 @@ senderr:
code = ICMP6_DST_UNREACH_ADDR;
break;
}
- icmp6_error(mcopy, type, code, destmtu);
+ icmperror:
+ if (icmp_len != 0) {
+ mcopy = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (mcopy == NULL)
+ goto done;
+ mcopy->m_len = mcopy->m_pkthdr.len = icmp_len;
+ mcopy->m_flags |= (mflags & M_COPYFLAGS);
+ mcopy->m_pkthdr.ph_rtableid = rtableid;
+ mcopy->m_pkthdr.ph_ifidx = ifidx;
+ mcopy->m_pkthdr.ph_loopcnt = loopcnt;
+ mcopy->m_pkthdr.pf.flags |= (pfflags & PF_TAG_GENERATED);
+ memcpy(mcopy->m_data, icmp_buf, icmp_len);
+ }
+ if (mcopy != NULL)
+ icmp6_error(mcopy, type, code, destmtu);
goto done;
freecopy:
ip6 forward mcopy