Download raw body.
ip6 forward mcopy
On Mon, Jun 24, 2024 at 11:40:15PM +0200, Alexander Bluhm wrote:
> On Mon, Jun 24, 2024 at 12:23:56PM +0200, Claudio Jeker wrote:
> > On Mon, Jun 24, 2024 at 11:33:44AM +0200, Alexander Bluhm wrote:
> > > Hi,
> > >
> > > Forwarding IPv6 packets is slower than IPv4. Reason is the m_copym()
> > > that is done for every packet. Just in case we may have to send
> > > an ICMP6 packet, ip6_forward() creates a mbuf copy. After that
> > > mbuf cluster is read only, so for the ethernet header another mbuf
> > > is allocated. pf NAT and RDR ignores readonly clusters, so it also
> > > modifies the potential ICMP6 packet.
> > >
> > > IPv4 ip_forward() avoids all these problems by copying the leading
> > > 68 bytes of the original packets onto the stack. More is not need
> > > for ICMP. IPv6 RFC requires up to 1232 bytes in the ICMP6 packet.
> > > This cannot be copied to the stack.
> > >
> > > The reason for the difference is that the ICMP6 packet has to contain
> > > the full header chain. I think if we have a simple UDP or TCP
> > > packet without chain, we could do a shortcut and just copy the
> > > header to ICMP6 packet.
> > >
> > > Do we want to violate the RFC and gain 30% performance for the
> > > common TCP/UDP case?
> >
> > Why only TCP/UDP? IMO this could be done for all L4 IP protocols.
> > Only if the packet has hop-to-hop headers or some other strange
> > intermediate header we need to copy all.
>
> Depends whether you are on the opt-in or opt-out side. pf
> pf_walk_header6() steps over IPPROTO_FRAGMENT, IPPROTO_ROUTING,
> IPPROTO_HOPOPTS, IPPROTO_AH, IPPROTO_DSTOPTS, some IPPROTO_ICMPV6.
> The others are considered final and not analyzed in the chain. I
> wanted to be on the safe side and took IPPROTO_TCP, IPPROTO_UDP as
> they are performance critical. But we can extend the list.
I would like to enable this feature only for specific protocols.
TCP, UDP, ESP can be considered final header. There I enable it.
Feel free to add more.
Updated diff. Copy only relevant header to stack. Use stack instead
of m_copym() for all protocols if packet is short.
markus@ has tried alternative solutions. He tried to copy
ICMPV6_PLD_MAXLEN bytes to stack. This is risky due to stack
exhaustion. We could also replace m_copym() with m_dup_pkt(). This
reduces contention of m_extref_mtx mutex. But it keeps mbuf
allocations and does more memory copying.
What do we want to do?
1. keep everything as it is
2. use my diff below
3. copy ICMPV6_PLD_MAXLEN to stack
4. replace m_copym() with m_dup_pkt()
5. someone has a better idea
bluhm
Index: netinet6/ip6_forward.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
diff -u -p -r1.119 ip6_forward.c
--- netinet6/ip6_forward.c 20 Jun 2024 19:25:42 -0000 1.119
+++ netinet6/ip6_forward.c 28 Jun 2024 13:08:09 -0000
@@ -89,8 +89,16 @@ ip6_forward(struct mbuf *m, struct route
struct rtentry *rt;
struct sockaddr *dst;
struct ifnet *ifp = NULL;
- int error = 0, type = 0, code = 0, destmtu = 0;
+ u_int rtableid = m->m_pkthdr.ph_rtableid;
+ u_int ifidx = m->m_pkthdr.ph_ifidx;
+ u_int8_t loopcnt = m->m_pkthdr.ph_loopcnt;
+ u_int icmp_len;
+ char icmp_buf[sizeof(struct ip6_hdr) + sizeof(struct tcphdr) +
+ MAX_TCPOPTLEN];
+ CTASSERT(sizeof(icmp_buf) <= MHLEN);
+ u_short mflags, pfflags;
struct mbuf *mcopy;
+ int error = 0, type = 0, code = 0, destmtu = 0;
#ifdef IPSEC
struct tdb *tdb = NULL;
#endif /* IPSEC */
@@ -117,9 +125,7 @@ ip6_forward(struct mbuf *m, struct route
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on interface %u\n",
- src6, dst6,
- ip6->ip6_nxt,
- m->m_pkthdr.ph_ifidx);
+ src6, dst6, ip6->ip6_nxt, ifidx);
}
m_freem(m);
goto done;
@@ -138,11 +144,42 @@ ip6_forward(struct mbuf *m, struct route
* we need to generate an ICMP6 message to the src.
* Thanks to M_EXT, in most cases copy will not occur.
*
+ * For final protocol header like TCP or UDP, full header chain in
+ * ICMP6 packet is not necessary. In this case only copy small
+ * part of original packet and save it on stack instead of mbuf.
+ * Although this violates RFC 4443 2.4. (c), it avoids additional
+ * mbuf allocations. Also pf nat and rdr do not affect the shared
+ * mbuf cluster.
+ *
* It is important to save it before IPsec processing as IPsec
* processing may modify the mbuf.
*/
- mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN),
- M_NOWAIT);
+ switch (ip6->ip6_nxt) {
+ case IPPROTO_TCP:
+ icmp_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) +
+ MAX_TCPOPTLEN;
+ break;
+ case IPPROTO_UDP:
+ icmp_len = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
+ break;
+ case IPPROTO_ESP:
+ icmp_len = sizeof(struct ip6_hdr) + 2 * sizeof(u_int32_t);
+ break;
+ default:
+ icmp_len = ICMPV6_PLD_MAXLEN;
+ break;
+ }
+ if (icmp_len > m->m_pkthdr.len)
+ icmp_len = m->m_pkthdr.len;
+ if (icmp_len <= sizeof(icmp_buf)) {
+ mflags = m->m_flags;
+ pfflags = m->m_pkthdr.pf.flags;
+ m_copydata(m, 0, icmp_len, icmp_buf);
+ mcopy = NULL;
+ } else {
+ mcopy = m_copym(m, 0, icmp_len, M_NOWAIT);
+ icmp_len = 0;
+ }
#if NPF > 0
reroute:
@@ -174,12 +211,10 @@ reroute:
m->m_pkthdr.ph_rtableid);
if (rt == NULL) {
ip6stat_inc(ip6s_noroute);
- if (mcopy != NULL) {
- icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_NOROUTE, 0);
- }
+ type = ICMP6_DST_UNREACH;
+ code = ICMP6_DST_UNREACH_NOROUTE;
m_freem(m);
- goto done;
+ goto icmperror;
}
dst = &ro->ro_dstsa;
@@ -190,7 +225,7 @@ reroute:
* unreachable error with Code 2 (beyond scope of source address).
* [draft-ietf-ipngwg-icmp-v3-00.txt, Section 3.1]
*/
- if (in6_addr2scopeid(m->m_pkthdr.ph_ifidx, &ip6->ip6_src) !=
+ if (in6_addr2scopeid(ifidx, &ip6->ip6_src) !=
in6_addr2scopeid(rt->rt_ifidx, &ip6->ip6_src)) {
time_t uptime;
@@ -205,15 +240,12 @@ reroute:
log(LOG_DEBUG,
"cannot forward "
"src %s, dst %s, nxt %d, rcvif %u, outif %u\n",
- src6, dst6,
- ip6->ip6_nxt,
- m->m_pkthdr.ph_ifidx, rt->rt_ifidx);
+ src6, dst6, ip6->ip6_nxt, ifidx, rt->rt_ifidx);
}
- if (mcopy != NULL)
- icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_BEYONDSCOPE, 0);
+ type = ICMP6_DST_UNREACH;
+ code = ICMP6_DST_UNREACH_BEYONDSCOPE;
m_freem(m);
- goto done;
+ goto icmperror;
}
#ifdef IPSEC
@@ -248,7 +280,7 @@ reroute:
m_freem(m);
goto freecopy;
}
- if (rt->rt_ifidx == m->m_pkthdr.ph_ifidx &&
+ if (rt->rt_ifidx == ifidx &&
ip6_sendredirects && !ISSET(flags, IPV6_REDIRECT) &&
(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) {
if ((ifp->if_flags & IFF_POINTOPOINT) &&
@@ -268,11 +300,10 @@ reroute:
* type/code is based on suggestion by Rich Draves.
* not sure if it is the best pick.
*/
- if (mcopy != NULL)
- icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_ADDR, 0);
+ type = ICMP6_DST_UNREACH;
+ code = ICMP6_DST_UNREACH_ADDR;
m_freem(m);
- goto done;
+ goto icmperror;
}
type = ND_REDIRECT;
}
@@ -323,27 +354,41 @@ reroute:
if (error || m == NULL)
goto senderr;
- if (mcopy != NULL)
- icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+ type = ICMP6_PACKET_TOO_BIG;
+ destmtu = ifp->if_mtu;
m_freem(m);
- goto done;
+ goto icmperror;
senderr:
- if (mcopy == NULL)
+ if (mcopy == NULL && icmp_len == 0)
goto done;
switch (error) {
case 0:
if (type == ND_REDIRECT) {
- icmp6_redirect_output(mcopy, rt);
- ip6stat_inc(ip6s_redirectsent);
+ if (icmp_len != 0) {
+ mcopy = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (mcopy == NULL)
+ goto done;
+ mcopy->m_len = mcopy->m_pkthdr.len = icmp_len;
+ mcopy->m_flags |= (mflags & M_COPYFLAGS);
+ mcopy->m_pkthdr.ph_rtableid = rtableid;
+ mcopy->m_pkthdr.ph_ifidx = ifidx;
+ mcopy->m_pkthdr.ph_loopcnt = loopcnt;
+ mcopy->m_pkthdr.pf.flags |=
+ (pfflags & PF_TAG_GENERATED);
+ memcpy(mcopy->m_data, icmp_buf, icmp_len);
+ }
+ if (mcopy != NULL) {
+ icmp6_redirect_output(mcopy, rt);
+ ip6stat_inc(ip6s_redirectsent);
+ }
goto done;
}
goto freecopy;
case EMSGSIZE:
type = ICMP6_PACKET_TOO_BIG;
- code = 0;
if (rt != NULL) {
if (rt->rt_mtu) {
destmtu = rt->rt_mtu;
@@ -381,7 +426,21 @@ senderr:
code = ICMP6_DST_UNREACH_ADDR;
break;
}
- icmp6_error(mcopy, type, code, destmtu);
+ icmperror:
+ if (icmp_len != 0) {
+ mcopy = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (mcopy == NULL)
+ goto done;
+ mcopy->m_len = mcopy->m_pkthdr.len = icmp_len;
+ mcopy->m_flags |= (mflags & M_COPYFLAGS);
+ mcopy->m_pkthdr.ph_rtableid = rtableid;
+ mcopy->m_pkthdr.ph_ifidx = ifidx;
+ mcopy->m_pkthdr.ph_loopcnt = loopcnt;
+ mcopy->m_pkthdr.pf.flags |= (pfflags & PF_TAG_GENERATED);
+ memcpy(mcopy->m_data, icmp_buf, icmp_len);
+ }
+ if (mcopy != NULL)
+ icmp6_error(mcopy, type, code, destmtu);
goto done;
freecopy:
ip6 forward mcopy