merge vlan and carp input back into ether_input

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
6 messages Options
Reply | Threaded
Open this post in threaded view
|

merge vlan and carp input back into ether_input

David Gwynne-5
while we were working on making the various pseudo interfaces you
stack on top of ethernet mpsafe, we split their input processing
off so they could be attacked one by one. they're all mpsafe now,
so this separation is not strictly necessary anymore.

this moves carp and vlan input back into ether_input. a lot of care
is taken to correctly order when we give the packets to the sub
interfaces. basically, any ethernet packet with a vlan tag is
unconditionally given to vlan_input. carp input is only attempted
if the packet is not for the parent interface, but before the
multicast handling is done. by checking the interfaces mac address
first, carp interfaces can get their packets immediately, which
means we can  stop messing around with the M_BCAST and M_MCAST flags
on carp mbufs. the relevant chunk of code is:

#if NVLAN > 0
        if (ISSET(m->m_flags, M_VLANTAG) ||
            etype == ETHERTYPE_VLAN ||
            etype == ETHERTYPE_QINQ) {
                vlan_input(ifp, ml, m);
                return;
        }
#endif

        if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
                /* The packet doesn't match the ether addr on this iface */

#if NCARP > 0
                /* It may be addressed to a child carp iface */
                if (ifp->if_type != IFT_CARP &&
                    !SRPL_EMPTY_LOCKED(&ifp->if_carp) &&
                    carp_input(ifp, ml, m)) {
                        /* carp_input has consumed the packet */
                        return;
                }
#endif

                /* It must be multicast if it isn't for us or a child carp */
                if (!ETHER_IS_MULTICAST(eh->ether_dhost))
                        goto dropanyway;

                /* Drop it if it came from us in the first place */
                if (!ISSET(ifp->if_flags, IFF_SIMPLEX) &&
                    memcmp(ac->ac_enaddr, eh->ether_shost,
                     ETHER_ADDR_LEN) == 0)
                        goto dropanyway;

                SET(m->m_flags, (memcmp(etherbroadcastaddr, eh->ether_dhost,
                    ETHER_ADDR_LEN) == 0) ? M_BCAST : M_MCAST);
                ifp->if_imcasts++; /* XXX lock? */
        }

doing vlan and carp input here let's us remove special handling for
VLAN packets in carp code.

this diff also gets rid of the use of the pseudo interfaces input
queues, it processes their packets off an mbuf list for each real
ethernet packet. if we can tie all the work done on behalf of a
physical ring to a single task it makes rx ring moderation for
physical interfaces a lot easier to implement.

note that trunk and bridge/switch are still implemented using
interface input handlers at the moment.

ok?

Index: net/if_ethersubr.c
===================================================================
RCS file: /cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.250
diff -u -p -r1.250 if_ethersubr.c
--- net/if_ethersubr.c 10 Jan 2018 00:14:38 -0000 1.250
+++ net/if_ethersubr.c 11 Jan 2018 01:20:54 -0000
@@ -103,6 +103,16 @@ didn't get a copy, you may request one f
 #include <net/bpf.h>
 #endif
 
+#include "vlan.h"
+#if NVLAN > 0
+#include <net/if_vlan_var.h>
+#endif
+
+#include "carp.h"
+#if NCARP > 0
+#include <netinet/ip_carp.h>
+#endif
+
 #include "pppoe.h"
 #if NPPPOE > 0
 #include <net/if_pppoe.h>
@@ -121,6 +131,8 @@ didn't get a copy, you may request one f
 #include <netmpls/mpls.h>
 #endif /* MPLS */
 
+void ether_input_m(struct ifnet *, struct mbuf_list *, struct mbuf *);
+
 u_int8_t etherbroadcastaddr[ETHER_ADDR_LEN] =
     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 u_int8_t etheranyaddr[ETHER_ADDR_LEN] =
@@ -306,69 +318,108 @@ bad:
  return (error);
 }
 
-/*
- * Process a received Ethernet packet;
- * the packet is in the mbuf chain m without
- * the ether header, which is provided separately.
- */
+void
+ether_enqueue(struct ifnet *ifp, struct mbuf_list *ml, struct mbuf *m)
+{
+#if NBPFILTER > 0
+ caddr_t if_bpf;
+#endif
+
+ m->m_pkthdr.ph_ifidx = ifp->if_index;
+ m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
+
+ /* XXX lock? */
+ ifp->if_ipackets++;
+ ifp->if_ibytes += m->m_pkthdr.len;
+
+#if NBPFILTER > 0
+ if_bpf = ifp->if_bpf;
+ if (if_bpf) {
+ if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_IN)) {
+ m_freem(m);
+ return;
+ }
+ }
+#endif
+
+ ml_enqueue(ml, m);
+}
+
 int
 ether_input(struct ifnet *ifp, struct mbuf *m, void *cookie)
 {
+ struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+
+ /* Drop short frames */
+ if (m->m_len < ETHER_HDR_LEN) {
+ m_freem(m);
+ return (1);
+ }
+
+ /* We have a reference to this ifp already */
+ ether_input_m(ifp, &ml, m);
+
+ /* Run the packet through any child interfaces */
+ while ((m = ml_dequeue(&ml)) != NULL) {
+ ifp = if_get(m->m_pkthdr.ph_ifidx);
+ if (ifp != NULL)
+ ether_input_m(ifp, &ml, m);
+ else
+ m_freem(m);
+ if_put(ifp);
+ }
+
+ return (1);
+}
+
+void
+ether_input_m(struct ifnet *ifp, struct mbuf_list *ml, struct mbuf *m)
+{
  struct ether_header *eh;
  void (*input)(struct ifnet *, struct mbuf *);
  u_int16_t etype;
  struct arpcom *ac;
 
- /* Drop short frames */
- if (m->m_len < ETHER_HDR_LEN)
- goto dropanyway;
-
  ac = (struct arpcom *)ifp;
  eh = mtod(m, struct ether_header *);
+ etype = ntohs(eh->ether_type);
 
- if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
- /*
- * If this is not a simplex interface, drop the packet
- * if it came from us.
- */
- if ((ifp->if_flags & IFF_SIMPLEX) == 0) {
- if (memcmp(ac->ac_enaddr, eh->ether_shost,
-    ETHER_ADDR_LEN) == 0) {
- m_freem(m);
- return (1);
- }
- }
-
- if (memcmp(etherbroadcastaddr, eh->ether_dhost,
-    sizeof(etherbroadcastaddr)) == 0)
- m->m_flags |= M_BCAST;
- else
- m->m_flags |= M_MCAST;
- ifp->if_imcasts++;
+#if NVLAN > 0
+ if (ISSET(m->m_flags, M_VLANTAG) ||
+    etype == ETHERTYPE_VLAN ||
+    etype == ETHERTYPE_QINQ) {
+ vlan_input(ifp, ml, m);
+ return;
  }
+#endif
 
- /*
- * HW vlan tagged packets that were not collected by vlan(4) must
- * be dropped now.
- */
- if (m->m_flags & M_VLANTAG) {
- m_freem(m);
- return (1);
- }
+ if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
+ /* The packet doesn't match the ether addr on this iface */
 
- /*
- * If packet is unicast, make sure it is for us.  Drop otherwise.
- * This check is required in promiscous mode, and for some hypervisors
- * where the MAC filter is 'best effort' only.
- */
- if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
- if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN)) {
- m_freem(m);
- return (1);
+#if NCARP > 0
+ /* It may be addressed to a child carp iface */
+ if (ifp->if_type != IFT_CARP &&
+    !SRPL_EMPTY_LOCKED(&ifp->if_carp) &&
+    carp_input(ifp, ml, m)) {
+ /* carp_input has consumed the packet */
+ return;
  }
- }
+#endif
 
- etype = ntohs(eh->ether_type);
+ /* It must be multicast if it isn't for us or a child carp */
+ if (!ETHER_IS_MULTICAST(eh->ether_dhost))
+ goto dropanyway;
+
+ /* Drop it if it came from us in the first place */
+ if (!ISSET(ifp->if_flags, IFF_SIMPLEX) &&
+    memcmp(ac->ac_enaddr, eh->ether_shost,
+      ETHER_ADDR_LEN) == 0)
+ goto dropanyway;
+
+ SET(m->m_flags, (memcmp(etherbroadcastaddr, eh->ether_dhost,
+    ETHER_ADDR_LEN) == 0) ? M_BCAST : M_MCAST);
+ ifp->if_imcasts++; /* XXX lock? */
+ }
 
  switch (etype) {
  case ETHERTYPE_IP:
@@ -406,7 +457,7 @@ ether_input(struct ifnet *ifp, struct mb
 
  if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
  pipex_pppoe_input(m, session);
- return (1);
+ return;
  }
  }
 #endif
@@ -414,7 +465,7 @@ ether_input(struct ifnet *ifp, struct mb
  niq_enqueue(&pppoediscinq, m);
  else
  niq_enqueue(&pppoeinq, m);
- return (1);
+ return;
 #endif
 #ifdef MPLS
  case ETHERTYPE_MPLS:
@@ -428,10 +479,9 @@ ether_input(struct ifnet *ifp, struct mb
 
  m_adj(m, sizeof(*eh));
  (*input)(ifp, m);
- return (1);
+ return;
 dropanyway:
  m_freem(m);
- return (1);
 }
 
 /*
Index: net/if_vlan.c
===================================================================
RCS file: /cvs/src/sys/net/if_vlan.c,v
retrieving revision 1.175
diff -u -p -r1.175 if_vlan.c
--- net/if_vlan.c 9 Jan 2018 15:24:24 -0000 1.175
+++ net/if_vlan.c 11 Jan 2018 01:20:55 -0000
@@ -84,7 +84,6 @@ void vlanattach(int count);
 int vlan_clone_create(struct if_clone *, int);
 int vlan_clone_destroy(struct ifnet *);
 
-int vlan_input(struct ifnet *, struct mbuf *, void *);
 void vlan_start(struct ifqueue *ifq);
 int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
 
@@ -325,11 +324,8 @@ vlan_inject(struct mbuf *m, uint16_t typ
  return (m);
  }
 
-/*
- * vlan_input() returns 1 if it has consumed the packet, 0 otherwise.
- */
-int
-vlan_input(struct ifnet *ifp0, struct mbuf *m, void *cookie)
+void
+vlan_input(struct ifnet *ifp0, struct mbuf_list *ml, struct mbuf *m)
 {
  struct ifvlan *ifv;
  struct ether_vlan_header *evl;
@@ -337,28 +333,22 @@ vlan_input(struct ifnet *ifp0, struct mb
  SRPL_HEAD(, ifvlan) *tagh, *list;
  struct srp_ref sr;
  u_int tag;
- struct mbuf_list ml = MBUF_LIST_INITIALIZER();
  u_int16_t etype;
 
- eh = mtod(m, struct ether_header *);
- etype = ntohs(eh->ether_type);
-
  if (m->m_flags & M_VLANTAG) {
  etype = ETHERTYPE_VLAN;
  tagh = vlan_tagh;
- } else if ((etype == ETHERTYPE_VLAN) || (etype == ETHERTYPE_QINQ)) {
+ } else {
  if (m->m_len < sizeof(*evl) &&
     (m = m_pullup(m, sizeof(*evl))) == NULL) {
  ifp0->if_ierrors++;
- return (1);
+ return;
  }
 
  evl = mtod(m, struct ether_vlan_header *);
  m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
+ etype = ntohs(evl->evl_encap_proto);
  tagh = etype == ETHERTYPE_QINQ ? svlan_tagh : vlan_tagh;
- } else {
- /* Skip non-VLAN packets. */
- return (0);
  }
 
  /* From now on ether_vtag is fine */
@@ -376,15 +366,11 @@ vlan_input(struct ifnet *ifp0, struct mb
  break;
  }
 
- if (ifv == NULL) {
+ if (ifv == NULL || !ISSET(ifv->ifv_if.if_flags, IFF_RUNNING)) {
  ifp0->if_noproto++;
  goto drop;
  }
 
- if ((ifv->ifv_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
-    (IFF_UP|IFF_RUNNING))
- goto drop;
-
  /*
  * Having found a valid vlan interface corresponding to
  * the given source interface and vlan tag, remove the
@@ -393,20 +379,19 @@ vlan_input(struct ifnet *ifp0, struct mb
  if (m->m_flags & M_VLANTAG) {
  m->m_flags &= ~M_VLANTAG;
  } else {
+ eh = mtod(m, struct ether_header *);
  eh->ether_type = evl->evl_proto;
  memmove((char *)eh + EVL_ENCAPLEN, eh, sizeof(*eh));
  m_adj(m, EVL_ENCAPLEN);
  }
 
- ml_enqueue(&ml, m);
- if_input(&ifv->ifv_if, &ml);
+ ether_enqueue(&ifv->ifv_if, ml, m);
  SRPL_LEAVE(&sr);
- return (1);
+ return;
 
 drop:
  SRPL_LEAVE(&sr);
  m_freem(m);
- return (1);
 }
 
 int
@@ -430,8 +415,6 @@ vlan_parent_up(struct ifvlan *ifv, struc
 
  vlan_multi_apply(ifv, ifp0, SIOCADDMULTI);
 
- if_ih_insert(ifp0, vlan_input, NULL);
-
  return (0);
 }
 
@@ -549,7 +532,6 @@ vlan_down(struct ifvlan *ifv)
 
  ifp0 = if_get(ifv->ifv_ifp0);
  if (ifp0 != NULL) {
- if_ih_remove(ifp0, vlan_input, NULL);
  if (ISSET(ifv->ifv_flags, IFVF_PROMISC))
  ifpromisc(ifp0, 0);
  vlan_multi_apply(ifv, ifp0, SIOCDELMULTI);
Index: net/if_vlan_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_vlan_var.h,v
retrieving revision 1.37
diff -u -p -r1.37 if_vlan_var.h
--- net/if_vlan_var.h 24 Jan 2017 10:08:30 -0000 1.37
+++ net/if_vlan_var.h 11 Jan 2018 01:20:55 -0000
@@ -85,6 +85,7 @@ struct ifvlan {
 #define IFVF_LLADDR 0x02 /* don't inherit the parents mac */
 
 struct mbuf *vlan_inject(struct mbuf *, uint16_t, uint16_t);
+void vlan_input(struct ifnet *, struct mbuf_list *, struct mbuf *);
 #endif /* _KERNEL */
 
 #endif /* _NET_IF_VLAN_VAR_H_ */
Index: netinet/if_ether.h
===================================================================
RCS file: /cvs/src/sys/netinet/if_ether.h,v
retrieving revision 1.73
diff -u -p -r1.73 if_ether.h
--- netinet/if_ether.h 29 Nov 2016 10:09:57 -0000 1.73
+++ netinet/if_ether.h 11 Jan 2018 01:20:56 -0000
@@ -240,6 +240,7 @@ void ether_ifattach(struct ifnet *);
 void ether_ifdetach(struct ifnet *);
 int ether_ioctl(struct ifnet *, struct arpcom *, u_long, caddr_t);
 int ether_input(struct ifnet *, struct mbuf *, void *);
+void ether_enqueue(struct ifnet *, struct mbuf_list *, struct mbuf *);
 int ether_output(struct ifnet *,
     struct mbuf *, struct sockaddr *, struct rtentry *);
 void ether_rtrequest(struct ifnet *, int, struct rtentry *);
Index: netinet/ip_carp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_carp.c,v
retrieving revision 1.324
diff -u -p -r1.324 ip_carp.c
--- netinet/ip_carp.c 11 Jan 2018 00:14:15 -0000 1.324
+++ netinet/ip_carp.c 11 Jan 2018 01:20:56 -0000
@@ -207,7 +207,6 @@ void carp_hmac_generate(struct carp_vhos
     unsigned char *, u_int8_t);
 int carp_hmac_verify(struct carp_vhost_entry *, u_int32_t *,
     unsigned char *);
-int carp_input(struct ifnet *, struct mbuf *, void *);
 void carp_proto_input_c(struct ifnet *, struct mbuf *,
     struct carp_header *, int, sa_family_t);
 int carp_proto_input_if(struct ifnet *, struct mbuf **, int *, int);
@@ -926,9 +925,6 @@ carpdetach(struct carp_softc *sc)
 
  cif = &ifp0->if_carp;
 
- /* Restore previous input handler. */
- if_ih_remove(ifp0, carp_input, NULL);
-
  if (sc->lh_cookie != NULL)
  hook_disestablish(ifp0->if_linkstatehooks, sc->lh_cookie);
 
@@ -1379,58 +1375,32 @@ carp_vhe_match(struct carp_softc *sc, ui
 }
 
 int
-carp_input(struct ifnet *ifp0, struct mbuf *m, void *cookie)
+carp_input(struct ifnet *ifp0, struct mbuf_list *ml, struct mbuf *m)
 {
  struct ether_header *eh;
- struct mbuf_list ml = MBUF_LIST_INITIALIZER();
  struct srpl *cif;
  struct carp_softc *sc;
+ struct ifnet *ifp;
  struct srp_ref sr;
 
-#if NVLAN > 0
- /*
- * If the underlying interface removed the VLAN header itself,
- * it's not for us.
- */
- if (ISSET(m->m_flags, M_VLANTAG))
- return (0);
-#endif
-
  eh = mtod(m, struct ether_header *);
  cif = &ifp0->if_carp;
 
  SRPL_FOREACH(sc, &sr, cif, sc_list) {
- if ((sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
-    (IFF_UP|IFF_RUNNING))
+ ifp = &sc->sc_if;
+
+ if (!ISSET(ifp->if_flags, IFF_RUNNING))
  continue;
 
  if (carp_vhe_match(sc, eh->ether_dhost)) {
- /*
- * These packets look like layer 2 multicast but they
- * are unicast at layer 3. With help of the tag the
- * mbuf's M_MCAST flag can be removed by carp_lsdrop()
- * after we have passed layer 2.
- */
- if (sc->sc_balancing == CARP_BAL_IP) {
- struct m_tag *mtag;
- mtag = m_tag_get(PACKET_TAG_CARP_BAL_IP, 0,
-    M_NOWAIT);
- if (mtag == NULL) {
- m_freem(m);
- goto out;
- }
- m_tag_prepend(m, mtag);
- }
- break;
+ ether_enqueue(ifp, ml, m);
+ SRPL_LEAVE(&sr);
+ return (1);
  }
  }
+ SRPL_LEAVE(&sr);
 
- if (sc == NULL) {
- SRPL_LEAVE(&sr);
-
- if (!ETHER_IS_MULTICAST(eh->ether_dhost))
- return (0);
-
+ if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
  /*
  * XXX Should really check the list of multicast addresses
  * for each CARP interface _before_ copying.
@@ -1438,29 +1408,22 @@ carp_input(struct ifnet *ifp0, struct mb
  SRPL_FOREACH(sc, &sr, cif, sc_list) {
  struct mbuf *m0;
 
- if (!(sc->sc_if.if_flags & IFF_UP))
+ ifp = &sc->sc_if;
+ if (!ISSET(ifp->if_flags, IFF_RUNNING))
  continue;
 
+ /* check carp_vhe_match? */
+
  m0 = m_dup_pkt(m, ETHER_ALIGN, M_DONTWAIT);
  if (m0 == NULL)
  continue;
 
- ml_init(&ml);
- ml_enqueue(&ml, m0);
-
- if_input(&sc->sc_if, &ml);
+ ether_enqueue(ifp, ml, m0);
  }
  SRPL_LEAVE(&sr);
-
- return (0);
  }
 
- ml_enqueue(&ml, m);
- if_input(&sc->sc_if, &ml);
-out:
- SRPL_LEAVE(&sr);
-
- return (1);
+ return (0);
 }
 
 int
@@ -1747,9 +1710,6 @@ carp_set_ifp(struct carp_softc *sc, stru
  sc->lh_cookie = hook_establish(ifp0->if_linkstatehooks, 1,
     carp_carpdev_state, ifp0);
 
- /* Change input handler of the physical interface. */
- if_ih_insert(ifp0, carp_input, NULL);
-
  carp_carpdev_state(ifp0);
 
  return (0);
Index: netinet/ip_carp.h
===================================================================
RCS file: /cvs/src/sys/netinet/ip_carp.h,v
retrieving revision 1.45
diff -u -p -r1.45 ip_carp.h
--- netinet/ip_carp.h 10 Jan 2018 23:50:39 -0000 1.45
+++ netinet/ip_carp.h 11 Jan 2018 01:20:56 -0000
@@ -200,6 +200,7 @@ void carp_group_demote_adj(struct ifne
 int carp6_proto_input(struct mbuf **, int *, int, int);
 int carp_iamatch(struct ifnet *);
 int carp_ourether(struct ifnet *, u_int8_t *);
+int carp_input(struct ifnet *, struct mbuf_list *, struct mbuf *);
 int carp_output(struct ifnet *, struct mbuf *, struct sockaddr *,
      struct rtentry *);
 int carp_sysctl(int *, u_int,  void *, size_t *, void *, size_t);


Reply | Threaded
Open this post in threaded view
|

Re: merge vlan and carp input back into ether_input

Martin Pieuchot
On 11/01/18(Thu) 11:50, David Gwynne wrote:
> while we were working on making the various pseudo interfaces you
> stack on top of ethernet mpsafe, we split their input processing
> off so they could be attacked one by one. they're all mpsafe now,
> so this separation is not strictly necessary anymore.

Well at that time we weren't sure how to split the work between
CPUs.  The current design allow to have multiple tasks doing some
part of the work and, like we are pushing now, process all the
incoming packets in the same context.

> this moves carp and vlan input back into ether_input. [...]

It does much more than that.  It's also doing a conversion from mbuf to
mbuf_list, it contains some IFF_RUNNING "fixes",  it breaks carp(4)
balancing, it breaks vlan/carp on top of trunk(4)...

> this diff also gets rid of the use of the pseudo interfaces input
> queues, it processes their packets off an mbuf list for each real
> ethernet packet. if we can tie all the work done on behalf of a
> physical ring to a single task it makes rx ring moderation for
> physical interfaces a lot easier to implement.

This needs to be done.  But please don't mix that with a rewrite of
the input handlers.  I believe that's the easiest way to do that is
to modify if_input() do process the packets directly if `ifp' is a
pseudo-interface:

        if (ISSET(ifp->if_xflags, IFXF_CLONED) {
                struct ifih *ifih;
                struct srp_ref sr;

                NET_ASSERT_LOCKED();

                SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
                        if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
                                break;
                }
                SRPL_LEAVE(&sr)
        } else
                ifiq_input(&ifp->if_rcv, ml, 2048);

If if_input() is called on a pseudo-interface we know we're already in
a softnet process.

We could also think of doing something similar for if_enqueue() and call
if_start() directly for pseudo-interface.

> note that trunk and bridge/switch are still implemented using
> interface input handlers at the moment.

If you want to get rid of the input handlers, I'd suggest doing it in
the beginning of a release cycle and for all pseudo drivers at once.

Reply | Threaded
Open this post in threaded view
|

Re: merge vlan and carp input back into ether_input

David Gwynne-5


> On 11 Jan 2018, at 8:58 pm, Martin Pieuchot <[hidden email]> wrote:
>
> On 11/01/18(Thu) 11:50, David Gwynne wrote:
>> while we were working on making the various pseudo interfaces you
>> stack on top of ethernet mpsafe, we split their input processing
>> off so they could be attacked one by one. they're all mpsafe now,
>> so this separation is not strictly necessary anymore.
>
> Well at that time we weren't sure how to split the work between
> CPUs.  The current design allow to have multiple tasks doing some
> part of the work and, like we are pushing now, process all the
> incoming packets in the same context.

yes.

now though we assign interfaces to softnettqs, no matter what their relationship to each other is. a physical interface could be on tq 0, and it's child vlan on 1, which goes against the goal of processing all incoming packets in the same context.

>
>> this moves carp and vlan input back into ether_input. [...]
>
> It does much more than that.  It's also doing a conversion from mbuf to
> mbuf_list, it contains some IFF_RUNNING "fixes",  it breaks carp(4)
> balancing, it breaks vlan/carp on top of trunk(4)...

it doesnt replace mbufs with mbuf_lists, it gets rid of the cookies for vlan and carp input and gives them the list to requeue their packets on for input.

wrt to IFF_RUNNING, if we agree that we should be looking at it to know if an interface is functional i can make a bunch of those fixes without oks.

when you say i break carp balancing, are you talking about the removal of the PACKET_TAG_CARP_BAL_IP tagging? PACKET_TAG_CARP_BAL_IP is only used in carp_lsdrop to clear the M_MCAST flag on the mbuf. M_MCAST wont be set on packets destined for the carp interface because we check ac_enaddr before checking if the packet is multicast or broadcast.

are you sure? carp and vlan on top of trunk should still function. however, trunk or bridge/switch on vlan is broken though :(

>
>> this diff also gets rid of the use of the pseudo interfaces input
>> queues, it processes their packets off an mbuf list for each real
>> ethernet packet. if we can tie all the work done on behalf of a
>> physical ring to a single task it makes rx ring moderation for
>> physical interfaces a lot easier to implement.
>
> This needs to be done.  But please don't mix that with a rewrite of
> the input handlers.  I believe that's the easiest way to do that is
> to modify if_input() do process the packets directly if `ifp' is a
> pseudo-interface:
>
> if (ISSET(ifp->if_xflags, IFXF_CLONED) {
> struct ifih *ifih;
> struct srp_ref sr;
>
> NET_ASSERT_LOCKED();
>
>                SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
>                        if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
>                                break;
>                }
>                SRPL_LEAVE(&sr)
> } else
> ifiq_input(&ifp->if_rcv, ml, 2048);
>
> If if_input() is called on a pseudo-interface we know we're already in
> a softnet process.

or a syscall.

the code above has pseudo interfaces recurse, where they'd loop either on ifih or at the task level. probably not a huge concern though.

>
> We could also think of doing something similar for if_enqueue() and call
> if_start() directly for pseudo-interface.

ill think about that. ive had some other experiments in that area we could look at too.

>
>> note that trunk and bridge/switch are still implemented using
>> interface input handlers at the moment.
>
> If you want to get rid of the input handlers, I'd suggest doing it in
> the beginning of a release cycle and for all pseudo drivers at once.

considering the trunk and bridge/switch issue, it probably is best to do them all at once.

Reply | Threaded
Open this post in threaded view
|

Re: merge vlan and carp input back into ether_input

Martin Pieuchot
On 11/01/18(Thu) 21:59, David Gwynne wrote:
> [...]
> when you say i break carp balancing, are you talking about the removal of the PACKET_TAG_CARP_BAL_IP tagging? PACKET_TAG_CARP_BAL_IP is only used in carp_lsdrop to clear the M_MCAST flag on the mbuf. M_MCAST wont be set on packets destined for the carp interface because we check ac_enaddr before checking if the packet is multicast or broadcast.

I might be mistaken.  I just know that this code is fragile and I'd
prefer to see such change tested isolated because carp(4) itself has
multiple configurations.  CARP balancing has been fixed for 6.2 by
friehm@ after being broken for multiple releases.

> are you sure? carp and vlan on top of trunk should still function. however, trunk or bridge/switch on vlan is broken though :(

I'm not sure, but I believe mixing input handler and hardcoding some in
ether_input() won't fly :)

> > If if_input() is called on a pseudo-interface we know we're already in
> > a softnet process.
>
> or a syscall.
>
> the code above has pseudo interfaces recurse, where they'd loop either on ifih or at the task level. probably not a huge concern though.

We can unroll the loop afterward.  What we need now is get rid of the
queues.

> > We could also think of doing something similar for if_enqueue() and call
> > if_start() directly for pseudo-interface.
>
> ill think about that. ive had some other experiments in that area we could look at too.

I'd be glad to look at your experiments :)

> >> note that trunk and bridge/switch are still implemented using
> >> interface input handlers at the moment.
> >
> > If you want to get rid of the input handlers, I'd suggest doing it in
> > the beginning of a release cycle and for all pseudo drivers at once.
>
> considering the trunk and bridge/switch issue, it probably is best to do them all at once.

Yes.  I'm aware that the SRP/input handler loop might be considered.
We can probably gain some percents if we remove it.  However this is
a micro optimization compared to other improvements that can be done.
Plus it has the advantage of not having fragile #ifdef maze in the
rest of the code.

Reply | Threaded
Open this post in threaded view
|

Re: merge vlan and carp input back into ether_input

David Gwynne-5
On Thu, Jan 11, 2018 at 02:51:40PM +0100, Martin Pieuchot wrote:
> On 11/01/18(Thu) 21:59, David Gwynne wrote:
> > [...]
> > when you say i break carp balancing, are you talking about the removal of the PACKET_TAG_CARP_BAL_IP tagging? PACKET_TAG_CARP_BAL_IP is only used in carp_lsdrop to clear the M_MCAST flag on the mbuf. M_MCAST wont be set on packets destined for the carp interface because we check ac_enaddr before checking if the packet is multicast or broadcast.
>
> I might be mistaken.  I just know that this code is fragile and I'd
> prefer to see such change tested isolated because carp(4) itself has
> multiple configurations.  CARP balancing has been fixed for 6.2 by
> friehm@ after being broken for multiple releases.

ok.

i had a look the commit related to this at
https://github.com/openbsd/src/commit/76dda2b0279f3c37adf1c059c3bab4d74bc96602

that change happened because ethernet_input checks and sets the
multicast bits before doing the ac_enaddr comparison. that's reversed
in the big diff i sent, but i pulled it out and attached it below.
it doesn't take away the mbuf tag carp uses, but it would make it
unnecessary.

> > are you sure? carp and vlan on top of trunk should still function. however, trunk or bridge/switch on vlan is broken though :(
>
> I'm not sure, but I believe mixing input handler and hardcoding some in
> ether_input() won't fly :)

agreed.

>
> > > If if_input() is called on a pseudo-interface we know we're already in
> > > a softnet process.
> >
> > or a syscall.
> >
> > the code above has pseudo interfaces recurse, where they'd loop either on ifih or at the task level. probably not a huge concern though.
>
> We can unroll the loop afterward.  What we need now is get rid of the
> queues.

ok.

>
> > > We could also think of doing something similar for if_enqueue() and call
> > > if_start() directly for pseudo-interface.
> >
> > ill think about that. ive had some other experiments in that area we could look at too.
>
> I'd be glad to look at your experiments :)

but im shy :(

> > >> note that trunk and bridge/switch are still implemented using
> > >> interface input handlers at the moment.
> > >
> > > If you want to get rid of the input handlers, I'd suggest doing it in
> > > the beginning of a release cycle and for all pseudo drivers at once.
> >
> > considering the trunk and bridge/switch issue, it probably is best to do them all at once.
>
> Yes.  I'm aware that the SRP/input handler loop might be considered.
> We can probably gain some percents if we remove it.  However this is
> a micro optimization compared to other improvements that can be done.
> Plus it has the advantage of not having fragile #ifdef maze in the
> rest of the code.

im not expecting a performance difference with this stuff, it's
more about correctness. right now the behaviour of the stack is
arguably incorrect depending on when you configure pseudo-interfaces.
here are two examples:

if you configure a carp interface on myx0 after configuring a vlan
interface, carp_input input handler will be in the SRPL before
vlan_input. because of that we have this chunk in carp_input:

#if NVLAN > 0
        /*
         * If the underlying interface removed the VLAN header itself,
         * it's not for us.
         */
        if (ISSET(m->m_flags, M_VLANTAG))
                return (0);
#endif

however, myx does not do hw vlan tagging and we dont check for
ETHERTYPE_VLAN or _QINQ there. carp may take a packet that vlan
should have taken.

secondly, if you configure switch(4) on an interface after configuring
it as a trunk port, switch will happily take all the packets on it.

it could be argued that these examples are a bit contrived, which
i will accept, and they could be fixed with some stricter code
checks, but they do demonstrate a problem with the stack.

in my mind, the order of processing by pseudo interfaces attached
to an ethernet interface would be:

1. let trunk(4) look at it

if trunk is configured on the port...

while we're talking about this, i would like to implement "independent"
ports on trunks. independent ports mean that if lacp isnt negotiated
on the ethernet interface, it can be used as a normal interface.
cisco do this by default on port channels, but other vendors require
explicit config. eg, on a force10^Wdell i need to add something
like "lacp ungroup member-independent port-channel 12" to allow
members of po12 to function as normal ports when lacp isnt negotiated.
this is useful if you want to pxe boot boxes that are usually
connected using lacp.

if we supported independent trunk ports, then we should allow
otherwise normal ethernet interface configuration.

2. let bridge(4) or switch(4) look at it

not both.

it only makes sense for a bridge/switch to get the packet if trunk
doesnt want it.

3. if it is vlan or qinq tagged, give it to vlan or drop it.

4. check ac_enaddr to see if it is for the current interface.

5. otherwise, give it to carp to try and match on

im arguing the system should enforce this order. right now people
tend to configure it in orders that make sense, but the exceptions
are ugly.

i did have a super big diff that did all this, but i didnt like the
bridge/switch handling in it. i can tart it up again though.

anyway, here's the ether_input shuffle:

Index: if_ethersubr.c
===================================================================
RCS file: /cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.250
diff -u -p -r1.250 if_ethersubr.c
--- if_ethersubr.c 10 Jan 2018 00:14:38 -0000 1.250
+++ if_ethersubr.c 12 Jan 2018 07:07:32 -0000
@@ -326,17 +326,21 @@ ether_input(struct ifnet *ifp, struct mb
  ac = (struct arpcom *)ifp;
  eh = mtod(m, struct ether_header *);
 
- if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
+ /* Is the packet for us? */
+ if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
+
+ /* If not, it must be multicast or broadcast to go further */
+ if (!ETHER_IS_MULTICAST(eh->ether_dhost))
+ goto dropanyway;
+
  /*
  * If this is not a simplex interface, drop the packet
  * if it came from us.
  */
  if ((ifp->if_flags & IFF_SIMPLEX) == 0) {
  if (memcmp(ac->ac_enaddr, eh->ether_shost,
-    ETHER_ADDR_LEN) == 0) {
- m_freem(m);
- return (1);
- }
+    ETHER_ADDR_LEN) == 0)
+ goto dropanyway;
  }
 
  if (memcmp(etherbroadcastaddr, eh->ether_dhost,
@@ -354,18 +358,6 @@ ether_input(struct ifnet *ifp, struct mb
  if (m->m_flags & M_VLANTAG) {
  m_freem(m);
  return (1);
- }
-
- /*
- * If packet is unicast, make sure it is for us.  Drop otherwise.
- * This check is required in promiscous mode, and for some hypervisors
- * where the MAC filter is 'best effort' only.
- */
- if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
- if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN)) {
- m_freem(m);
- return (1);
- }
  }
 
  etype = ntohs(eh->ether_type);

Reply | Threaded
Open this post in threaded view
|

Re: merge vlan and carp input back into ether_input

Florian Riehm-2
In reply to this post by Martin Pieuchot
On 01/11/18 14:51, Martin Pieuchot wrote:
> On 11/01/18(Thu) 21:59, David Gwynne wrote:
>> [...]
>> when you say i break carp balancing, are you talking about the removal of the PACKET_TAG_CARP_BAL_IP tagging? PACKET_TAG_CARP_BAL_IP is only used in carp_lsdrop to clear the M_MCAST flag on the mbuf. M_MCAST wont be set on packets destined for the carp interface because we check ac_enaddr before checking if the packet is multicast or broadcast.
>
> I might be mistaken.  I just know that this code is fragile and I'd
> prefer to see such change tested isolated because carp(4) itself has
> multiple configurations.  CARP balancing has been fixed for 6.2 by
> friehm@ after being broken for multiple releases.
>
Hi David,

let me know if you all you carp changes are commited, then
I will test if carp balancing is still working.
I tested this commit already and it seems to be fine.

Thanks & Regards,

friehm