Download raw body.
ether extract headers alingment memcpy
> Date: Tue, 13 Feb 2024 14:47:36 +0100
> From: Marcus Glocker <marcus@nazgul.ch>
I would really like to understand what is happening here. What bluhm@
described just didn' make sense. And I think some of the testing
being done is flawed.
> On Tue, Feb 13, 2024 at 01:23:14AM +0100, Alexander Bluhm wrote:
>
> > On Wed, Feb 07, 2024 at 01:07:15AM +0100, Alexander Bluhm wrote:
> > > This is my next aproach to retract IP and TCP header information
> > > from an mbuf in driver level. Idea is to use memcpy() to move data
> > > from the unaligned header bit fields to integers.
> > >
> > > I create a union of bytes and bits on the stack. memcpy() gets the
> > > byte from the mbuf, compiler cares about alingment on the stack.
> > > A lot of code to trick gcc into generating byte access on sparc64.
> > >
> > > The IP and TCP header fields are stored in struct ether_extracted
> > > so the drivers can just use them without memcpy() magic.
> > >
> > > I also added more length checks. Do clear pointers to headers if
> > > they have bgous length information. On top I also track packet and
> > > payload length and also check them for consistency. I think this
> > > is better than putting stupid values into driver registers based
> > > on evil packets.
> > >
> > > To make driver debugging easier, I added a lot of debug prints if
> > > packets look strange.
> > >
> > > Basic testing seems to work, but I will test on more network hardware
> > > and architectures. Diff is not ready for commit.
> > >
> > > Is the idea with memcpy() and union of bit fields a good one?
> > > Are there arguments against more consistency checks at driver level?
> > > Do we want all the debug prints?
> > > Should we keep the m_pkthdr.len checks and track the payload?
> >
> > With this diff, all access to IP and TCP header is done with sparc64
> > ldub instructions.
> >
> > 1133 memcpy(&hdrcpy.hc_data, ext->ip4, 1);
> > 1134 hlen = hdrcpy.hc_ip.hl << 2;
> > 1135 if (m->m_len - hoff < hlen) {
> >
> > /usr/src/sys/net/if_ethersubr.c:1133
> > 490: c2 08 80 03 ldub [ %g2 + %g3 ], %g1
> > 494: c2 2f a7 df stb %g1, [ %fp + 0x7df ]
> > /usr/src/sys/net/if_ethersubr.c:1134
> > 498: 82 08 60 ff and %g1, 0xff, %g1
> > /usr/src/sys/net/if_ethersubr.c:1135
> > 49c: c4 02 20 18 ld [ %o0 + 0x18 ], %g2
> > /usr/src/sys/net/if_ethersubr.c:1134
> > 4a0: 82 08 60 0f and %g1, 0xf, %g1
> > 4a4: 83 28 60 02 sll %g1, 2, %g1
> > 4a8: 87 38 60 00 sra %g1, 0, %g3
> > /usr/src/sys/net/if_ethersubr.c:1135
> >
> > 1187 memcpy(&hdrcpy.hc_data, &ext->tcp->th_flags - 1, 1);
> > 1188 hlen = hdrcpy.hc_th.off << 2;
> > 1189 if (m->m_len - hoff < hlen) {
> >
> > /usr/src/sys/net/if_ethersubr.c:1187
> > 5f4: c4 08 60 0c ldub [ %g1 + 0xc ], %g2
> > 5f8: c4 2f a7 df stb %g2, [ %fp + 0x7df ]
> > /usr/src/sys/net/if_ethersubr.c:1188
> > 5fc: c2 5f a7 df ldx [ %fp + 0x7df ], %g1
> > /usr/src/sys/net/if_ethersubr.c:1189
> > 600: c6 02 20 18 ld [ %o0 + 0x18 ], %g3
> > /usr/src/sys/net/if_ethersubr.c:1188
> > 604: 83 30 70 3c srlx %g1, 0x3c, %g1
> > 608: 83 28 60 02 sll %g1, 2, %g1
> > /usr/src/sys/net/if_ethersubr.c:1189
> > 60c: 86 20 c0 04 sub %g3, %g4, %g3
> > /usr/src/sys/net/if_ethersubr.c:1188
> > 610: 85 38 60 00 sra %g1, 0, %g2
> > /usr/src/sys/net/if_ethersubr.c:1189
> >
> > I have tested it on sparc64 with em, ix, ixl.
> >
> > ok?
> >
> > bluhm
>
> Quickly tested with ix(4) and em(4). I like the simplified length
> variables. If it fixes the alignment issues seen on sparc64,
> even better. I think it's a step forward to the ix(4) and em(4)
> issue which we are currently investigating. Next steps as
> discussed.
>
> ok mglocker@
>
> > Index: dev/pci/if_bnxt.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_bnxt.c,v
> > diff -u -p -r1.45 if_bnxt.c
> > --- dev/pci/if_bnxt.c 19 Jan 2024 03:25:13 -0000 1.45
> > +++ dev/pci/if_bnxt.c 11 Feb 2024 19:52:37 -0000
> > @@ -1433,13 +1433,13 @@ bnxt_start(struct ifqueue *ifq)
> > lflags |= TX_BD_LONG_LFLAGS_LSO;
> > hdrsize = sizeof(*ext.eh);
> > if (ext.ip4)
> > - hdrsize += ext.ip4->ip_hl << 2;
> > + hdrsize += ext.ip4hlen;
> > else if (ext.ip6)
> > hdrsize += sizeof(*ext.ip6);
> > else
> > tcpstat_inc(tcps_outbadtso);
> >
> > - hdrsize += ext.tcp->th_off << 2;
> > + hdrsize += ext.tcphlen;
> > txhi->hdr_size = htole16(hdrsize / 2);
> >
> > outlen = m->m_pkthdr.ph_mss;
> > Index: dev/pci/if_em.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_em.c,v
> > diff -u -p -r1.371 if_em.c
> > --- dev/pci/if_em.c 28 Jan 2024 18:42:58 -0000 1.371
> > +++ dev/pci/if_em.c 11 Feb 2024 19:52:37 -0000
> > @@ -2433,7 +2433,7 @@ em_tx_ctx_setup(struct em_queue *que, st
> > vlan_macip_lens |= (sizeof(*ext.eh) << E1000_ADVTXD_MACLEN_SHIFT);
> >
> > if (ext.ip4) {
> > - iphlen = ext.ip4->ip_hl << 2;
> > + iphlen = ext.ip4hlen;
> >
> > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
> > if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
> > Index: dev/pci/if_igc.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_igc.c,v
> > diff -u -p -r1.15 if_igc.c
> > --- dev/pci/if_igc.c 23 Jan 2024 08:48:12 -0000 1.15
> > +++ dev/pci/if_igc.c 11 Feb 2024 19:52:37 -0000
> > @@ -2028,7 +2028,7 @@ igc_tx_ctx_setup(struct tx_ring *txr, st
> > ether_extract_headers(mp, &ext);
> >
> > if (ext.ip4) {
> > - iphlen = ext.ip4->ip_hl << 2;
> > + iphlen = ext.ip4hlen;
> >
> > type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_IPV4;
> > if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
> > Index: dev/pci/if_ix.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v
> > diff -u -p -r1.206 if_ix.c
> > --- dev/pci/if_ix.c 10 Nov 2023 15:51:20 -0000 1.206
> > +++ dev/pci/if_ix.c 11 Feb 2024 19:52:37 -0000
> > @@ -2502,7 +2502,7 @@ ixgbe_tx_offload(struct mbuf *mp, uint32
> > *vlan_macip_lens |= (ethlen << IXGBE_ADVTXD_MACLEN_SHIFT);
> >
> > if (ext.ip4) {
> > - iphlen = ext.ip4->ip_hl << 2;
> > + iphlen = ext.ip4hlen;
> >
> > if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
> > *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8;
> > @@ -2542,7 +2542,7 @@ ixgbe_tx_offload(struct mbuf *mp, uint32
> > if (ext.tcp) {
> > uint32_t hdrlen, thlen, paylen, outlen;
> >
> > - thlen = ext.tcp->th_off << 2;
> > + thlen = ext.tcphlen;
> >
> > outlen = mp->m_pkthdr.ph_mss;
> > *mss_l4len_idx |= outlen << IXGBE_ADVTXD_MSS_SHIFT;
> > @@ -3277,11 +3277,11 @@ ixgbe_rxeof(struct rx_ring *rxr)
> > hdrlen += ETHER_VLAN_ENCAP_LEN;
> > #endif
> > if (ext.ip4)
> > - hdrlen += ext.ip4->ip_hl << 2;
> > + hdrlen += ext.ip4hlen;
> > if (ext.ip6)
> > hdrlen += sizeof(*ext.ip6);
> > if (ext.tcp) {
> > - hdrlen += ext.tcp->th_off << 2;
> > + hdrlen += ext.tcphlen;
> > tcpstat_inc(tcps_inhwlro);
> > tcpstat_add(tcps_inpktlro, pkts);
> > } else {
> > Index: dev/pci/if_ixl.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
> > diff -u -p -r1.95 if_ixl.c
> > --- dev/pci/if_ixl.c 7 Jan 2024 21:01:45 -0000 1.95
> > +++ dev/pci/if_ixl.c 11 Feb 2024 19:52:37 -0000
> > @@ -2827,7 +2827,7 @@ ixl_tx_setup_offload(struct mbuf *m0, st
> > IXL_TX_DESC_CMD_IIPT_IPV4_CSUM :
> > IXL_TX_DESC_CMD_IIPT_IPV4;
> >
> > - hlen = ext.ip4->ip_hl << 2;
> > + hlen = ext.ip4hlen;
> > #ifdef INET6
> > } else if (ext.ip6) {
> > offload |= IXL_TX_DESC_CMD_IIPT_IPV6;
> > @@ -2844,10 +2844,12 @@ ixl_tx_setup_offload(struct mbuf *m0, st
> >
> > if (ext.tcp && ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
> > offload |= IXL_TX_DESC_CMD_L4T_EOFT_TCP;
> > - offload |= (uint64_t)ext.tcp->th_off << IXL_TX_DESC_L4LEN_SHIFT;
> > + offload |= (uint64_t)(ext.tcphlen >> 2)
> > + << IXL_TX_DESC_L4LEN_SHIFT;
> > } else if (ext.udp && ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
> > offload |= IXL_TX_DESC_CMD_L4T_EOFT_UDP;
> > - offload |= (sizeof(*ext.udp) >> 2) << IXL_TX_DESC_L4LEN_SHIFT;
> > + offload |= (uint64_t)(sizeof(*ext.udp) >> 2)
> > + << IXL_TX_DESC_L4LEN_SHIFT;
> > }
> >
> > if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) {
> > @@ -2855,7 +2857,7 @@ ixl_tx_setup_offload(struct mbuf *m0, st
> > struct ixl_tx_desc *ring, *txd;
> > uint64_t cmd = 0, paylen, outlen;
> >
> > - hlen += ext.tcp->th_off << 2;
> > + hlen += ext.tcphlen;
> >
> > outlen = m0->m_pkthdr.ph_mss;
> > paylen = m0->m_pkthdr.len - ETHER_HDR_LEN - hlen;
> > Index: dev/pv/if_vio.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pv/if_vio.c,v
> > diff -u -p -r1.29 if_vio.c
> > --- dev/pv/if_vio.c 20 Dec 2023 09:51:06 -0000 1.29
> > +++ dev/pv/if_vio.c 11 Feb 2024 19:52:37 -0000
> > @@ -765,7 +765,7 @@ again:
> > hdr->csum_offset = offsetof(struct udphdr, uh_sum);
> >
> > if (ext.ip4)
> > - hdr->csum_start += ext.ip4->ip_hl << 2;
> > + hdr->csum_start += ext.ip4hlen;
> > #ifdef INET6
> > else if (ext.ip6)
> > hdr->csum_start += sizeof(*ext.ip6);
> > Index: net/if_ethersubr.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
> > diff -u -p -r1.291 if_ethersubr.c
> > --- net/if_ethersubr.c 27 Jul 2023 20:21:25 -0000 1.291
> > +++ net/if_ethersubr.c 11 Feb 2024 19:52:37 -0000
> > @@ -140,6 +140,20 @@ didn't get a copy, you may request one f
> > #include <netmpls/mpls.h>
> > #endif /* MPLS */
> >
> > +/* #define ETHERDEBUG 1 */
> > +#ifdef ETHERDEBUG
> > +int etherdebug = ETHERDEBUG;
> > +#define DNPRINTF(level, fmt, args...) \
> > + do { \
> > + if (etherdebug >= level) \
> > + printf("%s: " fmt "\n", __func__, ## args); \
> > + } while (0)
> > +#else
> > +#define DNPRINTF(level, fmt, args...) \
> > + do { } while (0)
> > +#endif
> > +#define DPRINTF(fmt, args...) DNPRINTF(1, fmt, args)
> > +
> > u_int8_t etherbroadcastaddr[ETHER_ADDR_LEN] =
> > { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
> > u_int8_t etheranyaddr[ETHER_ADDR_LEN] =
> > @@ -1034,56 +1048,126 @@ ether_e64_to_addr(struct ether_addr *ea,
> >
> > /* Parse different TCP/IP protocol headers for a quick view inside an mbuf. */
> > void
> > -ether_extract_headers(struct mbuf *mp, struct ether_extracted *ext)
> > +ether_extract_headers(struct mbuf *m0, struct ether_extracted *ext)
> > {
> > struct mbuf *m;
> > - uint64_t hlen;
> > + size_t hlen;
> > int hoff;
> > uint8_t ipproto;
> > uint16_t ether_type;
> > + /* gcc 4.2.1 on sparc64 may create 32 bit loads on unaligned mbuf */
> > + union {
> > + u_char hc_data;
> > +#if _BYTE_ORDER == _LITTLE_ENDIAN
> > + struct {
> > + u_int hl:4, /* header length */
> > + v:4; /* version */
> > + } hc_ip;
> > + struct {
> > + u_int x2:4, /* (unused) */
> > + off:4; /* data offset */
> > + } hc_th;
> > +#endif
> > +#if _BYTE_ORDER == _BIG_ENDIAN
> > + struct {
> > + u_int v:4, /* version */
> > + hl:4; /* header length */
> > + } hc_ip;
> > + struct {
> > + u_int off:4, /* data offset */
> > + x2:4; /* (unused) */
> > + } hc_th;
> > +#endif
> > + } hdrcpy;
> >
> > /* Return NULL if header was not recognized. */
> > memset(ext, 0, sizeof(*ext));
> >
> > - if (mp->m_len < sizeof(*ext->eh))
> > - return;
> > + KASSERT(ISSET(m0->m_flags, M_PKTHDR));
> > + ext->paylen = m0->m_pkthdr.len;
> >
> > - ext->eh = mtod(mp, struct ether_header *);
> > + if (m0->m_len < sizeof(*ext->eh)) {
> > + DPRINTF("m_len %d, eh %zu", m0->m_len, sizeof(*ext->eh));
> > + return;
> > + }
> > + ext->eh = mtod(m0, struct ether_header *);
> > ether_type = ntohs(ext->eh->ether_type);
> > hlen = sizeof(*ext->eh);
> > + if (ext->paylen < hlen) {
> > + DPRINTF("paylen %u, ehlen %zu", ext->paylen, hlen);
> > + ext->eh = NULL;
> > + return;
> > + }
> > + ext->paylen -= hlen;
> >
> > #if NVLAN > 0
> > if (ether_type == ETHERTYPE_VLAN) {
> > - ext->evh = mtod(mp, struct ether_vlan_header *);
> > + if (m0->m_len < sizeof(*ext->evh)) {
> > + DPRINTF("m_len %d, evh %zu",
> > + m0->m_len, sizeof(*ext->evh));
> > + return;
> > + }
> > + ext->evh = mtod(m0, struct ether_vlan_header *);
> > ether_type = ntohs(ext->evh->evl_proto);
> > hlen = sizeof(*ext->evh);
> > + if (sizeof(*ext->eh) + ext->paylen < hlen) {
> > + DPRINTF("paylen %zu, evhlen %zu",
> > + sizeof(*ext->eh) + ext->paylen, hlen);
> > + ext->evh = NULL;
> > + return;
> > + }
> > + ext->paylen = sizeof(*ext->eh) + ext->paylen - hlen;
> > }
> > #endif
> >
> > switch (ether_type) {
> > case ETHERTYPE_IP:
> > - m = m_getptr(mp, hlen, &hoff);
> > - if (m == NULL || m->m_len - hoff < sizeof(*ext->ip4))
> > + m = m_getptr(m0, hlen, &hoff);
> > + if (m == NULL || m->m_len - hoff < sizeof(*ext->ip4)) {
> > + DPRINTF("m_len %d, hoff %d, ip4 %zu",
> > + m ? m->m_len : -1, hoff, sizeof(*ext->ip4));
> > return;
> > + }
> > ext->ip4 = (struct ip *)(mtod(m, caddr_t) + hoff);
> >
> > - if (ISSET(ntohs(ext->ip4->ip_off), IP_MF|IP_OFFMASK))
> > + memcpy(&hdrcpy.hc_data, ext->ip4, 1);
> > + hlen = hdrcpy.hc_ip.hl << 2;
> > + if (m->m_len - hoff < hlen) {
> > + DPRINTF("m_len %d, hoff %d, iphl %zu",
> > + m ? m->m_len : -1, hoff, hlen);
> > + ext->ip4 = NULL;
> > return;
> > -
> > - hlen = ext->ip4->ip_hl << 2;
> > + }
> > + if (ext->paylen < hlen) {
> > + DPRINTF("paylen %u, ip4hlen %zu", ext->paylen, hlen);
> > + ext->ip4 = NULL;
> > + return;
> > + }
> > + ext->ip4hlen = hlen;
> > + ext->paylen -= hlen;
> > ipproto = ext->ip4->ip_p;
> >
> > + if (ISSET(ntohs(ext->ip4->ip_off), IP_MF|IP_OFFMASK))
> > + return;
> > break;
> > #ifdef INET6
> > case ETHERTYPE_IPV6:
> > - m = m_getptr(mp, hlen, &hoff);
> > - if (m == NULL || m->m_len - hoff < sizeof(*ext->ip6))
> > + m = m_getptr(m0, hlen, &hoff);
> > + if (m == NULL || m->m_len - hoff < sizeof(*ext->ip6)) {
> > + DPRINTF("m_len %d, hoff %d, ip6 %zu",
> > + m ? m->m_len : -1, hoff, sizeof(*ext->ip6));
> > return;
> > + }
> > ext->ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
> >
> > hlen = sizeof(*ext->ip6);
> > + if (ext->paylen < hlen) {
> > + DPRINTF("paylen %u, ip6hlen %zu", ext->paylen, hlen);
> > + ext->ip6 = NULL;
> > + return;
> > + }
> > + ext->paylen -= hlen;
> > ipproto = ext->ip6->ip6_nxt;
> > -
> > break;
> > #endif
> > default:
> > @@ -1093,16 +1177,51 @@ ether_extract_headers(struct mbuf *mp, s
> > switch (ipproto) {
> > case IPPROTO_TCP:
> > m = m_getptr(m, hoff + hlen, &hoff);
> > - if (m == NULL || m->m_len - hoff < sizeof(*ext->tcp))
> > + if (m == NULL || m->m_len - hoff < sizeof(*ext->tcp)) {
> > + DPRINTF("m_len %d, hoff %d, tcp %zu",
> > + m ? m->m_len : -1, hoff, sizeof(*ext->tcp));
> > return;
> > + }
> > ext->tcp = (struct tcphdr *)(mtod(m, caddr_t) + hoff);
> > +
> > + memcpy(&hdrcpy.hc_data, &ext->tcp->th_flags - 1, 1);
> > + hlen = hdrcpy.hc_th.off << 2;
> > + if (m->m_len - hoff < hlen) {
> > + DPRINTF("m_len %d, hoff %d, thoff %zu",
> > + m ? m->m_len : -1, hoff, hlen);
> > + ext->tcp = NULL;
> > + return;
> > + }
> > + if (ext->paylen < hlen) {
> > + DPRINTF("paylen %u, tcphlen %zu", ext->paylen, hlen);
> > + ext->tcp = NULL;
> > + return;
> > + }
> > + ext->tcphlen = hlen;
> > + ext->paylen -= hlen;
> > break;
> >
> > case IPPROTO_UDP:
> > m = m_getptr(m, hoff + hlen, &hoff);
> > - if (m == NULL || m->m_len - hoff < sizeof(*ext->udp))
> > + if (m == NULL || m->m_len - hoff < sizeof(*ext->udp)) {
> > + DPRINTF("m_len %d, hoff %d, tcp %zu",
> > + m ? m->m_len : -1, hoff, sizeof(*ext->tcp));
> > return;
> > + }
> > ext->udp = (struct udphdr *)(mtod(m, caddr_t) + hoff);
> > +
> > + hlen = sizeof(*ext->udp);
> > + if (ext->paylen < hlen) {
> > + DPRINTF("paylen %u, udphlen %zu", ext->paylen, hlen);
> > + ext->udp = NULL;
> > + return;
> > + }
> > break;
> > }
> > +
> > + DNPRINTF(2, "%s%s%s%s%s%s ip4h %u, tcph %u, payl %u",
> > + ext->eh ? "eh," : "", ext->evh ? "evh," : "",
> > + ext->ip4 ? "ip4," : "", ext->ip6 ? "ip6," : "",
> > + ext->tcp ? "tcp," : "", ext->udp ? "udp," : "",
> > + ext->ip4hlen, ext->tcphlen, ext->paylen);
> > }
> > Index: netinet/if_ether.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/if_ether.h,v
> > diff -u -p -r1.90 if_ether.h
> > --- netinet/if_ether.h 27 Jul 2023 20:21:25 -0000 1.90
> > +++ netinet/if_ether.h 11 Feb 2024 19:52:37 -0000
> > @@ -307,6 +307,9 @@ struct ether_extracted {
> > struct ip6_hdr *ip6;
> > struct tcphdr *tcp;
> > struct udphdr *udp;
> > + u_int ip4hlen;
> > + u_int tcphlen;
> > + u_int paylen;
> > };
> >
> > void ether_extract_headers(struct mbuf *, struct ether_extracted *);
>
>
ether extract headers alingment memcpy