fix races in if_clone_create()

classic Classic list List threaded Threaded
15 messages Options
Reply | Threaded
Open this post in threaded view
|

fix races in if_clone_create()

Vitaliy Makkoveev
if_clone_create() has the races caused by context switch.

---- cut begin ----
if_clone_create(const char *name, int rdomain)
{
        struct if_clone *ifc;
        struct ifnet *ifp;
        int unit, ret;

        ifc = if_clone_lookup(name, &unit); /* [1] */
        if (ifc == NULL)
                return (EINVAL);

        if (ifunit(name) != NULL) /* [2] */
                return (EEXIST);

        ret = (*ifc->ifc_create)(ifc, unit); /* [3] */

        if (ret != 0 || (ifp = ifunit(name)) == NULL) /* [4] */
                return (ret);

        NET_LOCK();
        if_addgroup(ifp, ifc->ifc_name);
        if (rdomain != 0)
                if_setrdomain(ifp, rdomain);
        NET_UNLOCK();

        return (ret);
}

---- cut end ----


The race is:

Thread 1:
1. We pass the string "ifacename0" with contains interface name and
it's unit to if_clone_lookup(). if_clone_lookup() extracts the unit '0'
and return it to us as integer `unit'. this `unit' is our local
variable, nobody knows what we are going to create interface with this
unit.

2. There is no "ifacename0" yet, so ifunit() will return NULL.

3. We did some checks add call `ifc_create' which will call
pseudo-driver's callback to create instance with passed `unit'. We
initialize ifnet with our softwere context and the `if_xname' set to
"ifacename0".

3.1. We do if_attach() which call NET_LOCK() before we link `ifnet' to
`if_list'. Also if_attach() doesn't check anything so we always attach
passed `ifnet'.

Context was switched to Thread 2:

1. We also passed string "ifacename0" to if_clone_lookup() and received
`unit' with value `0'

2. There is no "ifacename0" yet, so ifunit() will return NULL.

3. All checks done, we call `ifc_create'. We initialize out softwere
context with passed `unit', Who knows do we chech is software context
for `unit' already exists before? In fact we don't :)
3.1 We initialized `ifnet' with the `if_xname' set to "ifacename0" and
we link it `if_list'.

4. This check is passed, `ifc_create' returned `0' and we have
"ifacename0" linked so ifunit() will return `ifp'.

We returned to userland with success and switched to Thread 1:

We continue 3.1. if_attach() continue his work and we have another
`ifnet' with `if_xname' set to "ifacename0" in list.

Now we have inconsistent `if_list'.

4. We do these checks. ifunit() will return pointer to `ifp' with
requested "ifacename0". All ok we return to userland with success.

Diff below fixes this reces.

Each `struct if_clone' has the bitmap where requested `unit' marked as
busy if it was not allocated before. We have new function
if_clone_alloc_unit() for. If `unit' already was allocated
if_clone_alloc_unit() will return EEXIST. Also we have
if_clone_rele_unit() to release `unit'. We do unit allocation before we
do context switch so now we can't allocate the same unit twice or more.

Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.610
diff -u -p -r1.610 if.c
--- net/if.c 22 Jun 2020 09:45:13 -0000 1.610
+++ net/if.c 26 Jun 2020 13:49:08 -0000
@@ -157,6 +157,8 @@ void if_linkstate_task(void *);
 
 int if_clone_list(struct if_clonereq *);
 struct if_clone *if_clone_lookup(const char *, int *);
+int if_clone_alloc_unit(struct if_clone *, int);
+void if_clone_rele_unit(struct if_clone *, int);
 
 int if_group_egress_build(void);
 
@@ -1244,19 +1246,23 @@ if_clone_create(const char *name, int rd
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int unit, ret;
+ int unit, error;
 
  ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
+ error = if_clone_alloc_unit(ifc, unit);
+ if (error != 0)
+ return (error);
 
- if (ifunit(name) != NULL)
- return (EEXIST);
-
- ret = (*ifc->ifc_create)(ifc, unit);
+ if (ifunit(name) != NULL) {
+ error = (EEXIST);
+ goto rele;
+ }
 
- if (ret != 0 || (ifp = ifunit(name)) == NULL)
- return (ret);
+ error = (*ifc->ifc_create)(ifc, unit);
+ if (error != 0 || (ifp = ifunit(name)) == NULL)
+ return (0);
 
  NET_LOCK();
  if_addgroup(ifp, ifc->ifc_name);
@@ -1264,7 +1270,10 @@ if_clone_create(const char *name, int rd
  if_setrdomain(ifp, rdomain);
  NET_UNLOCK();
 
- return (ret);
+ return (0);
+rele:
+ if_clone_rele_unit(ifc, unit);
+ return (error);
 }
 
 /*
@@ -1275,9 +1284,9 @@ if_clone_destroy(const char *name)
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int ret;
+ int ret, unit;
 
- ifc = if_clone_lookup(name, NULL);
+ ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
 
@@ -1297,6 +1306,7 @@ if_clone_destroy(const char *name)
  }
  NET_UNLOCK();
  ret = (*ifc->ifc_destroy)(ifp);
+ if_clone_rele_unit(ifc, unit);
 
  return (ret);
 }
@@ -1342,12 +1352,96 @@ if_clone_lookup(const char *name, int *u
  unit = (unit * 10) + (*cp++ - '0');
  }
 
- if (unitp != NULL)
- *unitp = unit;
+ *unitp = unit;
  return (ifc);
 }
 
 /*
+ * Allocate unit for cloned network interface.
+ */
+int if_clone_alloc_unit(struct if_clone *ifc, int unit)
+{
+ int word, bit, ret;
+
+ word = unit / (sizeof(*ifc->ifc_map) * 8);
+ bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+ rw_enter_write(&ifc->ifc_lock);
+
+ if(word >= ifc->ifc_map_size) {
+ u_long *map;
+ int size;
+
+ size = word + 1;
+ map = mallocarray(size, sizeof(*map), M_TEMP, M_WAITOK |
+    M_ZERO);
+
+ if (ifc->ifc_map != NULL) {
+ memcpy(map, ifc->ifc_map, ifc->ifc_map_size);
+ free(ifc->ifc_map, M_TEMP,
+    ifc->ifc_map_size * sizeof(*map));
+ }
+
+ ifc->ifc_map = map;
+ ifc->ifc_map_size = size;
+ }
+
+ if (ifc->ifc_map[word] & (1UL << bit))
+ ret = EEXIST;
+ else {
+ ifc->ifc_map[word] |= (1UL << bit);
+ ret = 0;
+ }
+
+ rw_exit_write(&ifc->ifc_lock);
+
+ return ret;
+}
+
+/*
+ * Release allocated unit for cloned network interface.
+ */
+void if_clone_rele_unit(struct if_clone *ifc, int unit)
+{
+ int word, bit;
+
+ word = unit / (sizeof(*ifc->ifc_map) * 8);
+ bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+ rw_enter_write(&ifc->ifc_lock);
+ KASSERT(word < ifc->ifc_map_size);
+
+ ifc->ifc_map[word] &= ~(1UL << bit);
+
+ if (ifc->ifc_map[word] == 0) {
+ u_long *map;
+ int size;
+
+ size = ifc->ifc_map_size - 2;
+ while (size>=0) {
+ if (ifc->ifc_map[size] != 0)
+ break;
+ --size;
+ }
+ if (size<0) {
+ size = 0;
+ map = NULL;
+ } else {
+ size += 1;
+ map = mallocarray(size, sizeof(*map), M_TEMP,
+    M_WAITOK);
+ memcpy(map, ifc->ifc_map, size);
+ free(ifc->ifc_map, M_TEMP,
+    ifc->ifc_map_size * sizeof(*map));
+ }
+
+ ifc->ifc_map = map;
+ ifc->ifc_map_size = size;
+ }
+ rw_exit_write(&ifc->ifc_lock);
+}
+
+/*
  * Register a network interface cloner.
  */
 void
@@ -1360,6 +1454,7 @@ if_clone_attach(struct if_clone *ifc)
  * initialization, the if_cloners becomes immutable.
  */
  KASSERT(pdevinit_done == 0);
+ rw_init(&ifc->ifc_lock, "icflck");
  LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
  if_cloners_count++;
 }
Index: net/if_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.105
diff -u -p -r1.105 if_var.h
--- net/if_var.h 12 May 2020 08:49:54 -0000 1.105
+++ net/if_var.h 26 Jun 2020 13:49:08 -0000
@@ -45,6 +45,7 @@
 #include <sys/task.h>
 #include <sys/time.h>
 #include <sys/timeout.h>
+#include <sys/rwlock.h>
 
 #include <net/ifq.h>
 
@@ -86,6 +87,10 @@ struct if_clone {
  const char *ifc_name; /* name of device, e.g. `gif' */
  size_t ifc_namelen; /* length of name */
 
+ struct rwlock ifc_lock; /* lock for map */
+ u_long *ifc_map; /* units map */
+ int ifc_map_size; /* units map size */
+
  int (*ifc_create)(struct if_clone *, int);
  int (*ifc_destroy)(struct ifnet *);
 };
@@ -95,6 +100,8 @@ struct if_clone {
   .ifc_list = { NULL, NULL }, \
   .ifc_name = name, \
   .ifc_namelen = sizeof(name) - 1, \
+  .ifc_map = NULL, \
+  .ifc_map_size = 0, \
   .ifc_create = create, \
   .ifc_destroy = destroy, \
 }

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
Sorry, I found a memory leak. Updated diff below.

Index: sys/net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.610
diff -u -p -r1.610 if.c
--- sys/net/if.c 22 Jun 2020 09:45:13 -0000 1.610
+++ sys/net/if.c 26 Jun 2020 17:19:41 -0000
@@ -157,6 +157,8 @@ void if_linkstate_task(void *);
 
 int if_clone_list(struct if_clonereq *);
 struct if_clone *if_clone_lookup(const char *, int *);
+int if_clone_alloc_unit(struct if_clone *, int);
+void if_clone_rele_unit(struct if_clone *, int);
 
 int if_group_egress_build(void);
 
@@ -1244,19 +1246,23 @@ if_clone_create(const char *name, int rd
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int unit, ret;
+ int unit, error;
 
  ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
+ error = if_clone_alloc_unit(ifc, unit);
+ if (error != 0)
+ return (error);
 
- if (ifunit(name) != NULL)
- return (EEXIST);
-
- ret = (*ifc->ifc_create)(ifc, unit);
+ if (ifunit(name) != NULL) {
+ error = (EEXIST);
+ goto rele;
+ }
 
- if (ret != 0 || (ifp = ifunit(name)) == NULL)
- return (ret);
+ error = (*ifc->ifc_create)(ifc, unit);
+ if (error != 0 || (ifp = ifunit(name)) == NULL)
+ return (0);
 
  NET_LOCK();
  if_addgroup(ifp, ifc->ifc_name);
@@ -1264,7 +1270,10 @@ if_clone_create(const char *name, int rd
  if_setrdomain(ifp, rdomain);
  NET_UNLOCK();
 
- return (ret);
+ return (0);
+rele:
+ if_clone_rele_unit(ifc, unit);
+ return (error);
 }
 
 /*
@@ -1275,9 +1284,9 @@ if_clone_destroy(const char *name)
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int ret;
+ int ret, unit;
 
- ifc = if_clone_lookup(name, NULL);
+ ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
 
@@ -1297,6 +1306,7 @@ if_clone_destroy(const char *name)
  }
  NET_UNLOCK();
  ret = (*ifc->ifc_destroy)(ifp);
+ if_clone_rele_unit(ifc, unit);
 
  return (ret);
 }
@@ -1342,12 +1352,95 @@ if_clone_lookup(const char *name, int *u
  unit = (unit * 10) + (*cp++ - '0');
  }
 
- if (unitp != NULL)
- *unitp = unit;
+ *unitp = unit;
  return (ifc);
 }
 
 /*
+ * Allocate unit for cloned network interface.
+ */
+int if_clone_alloc_unit(struct if_clone *ifc, int unit)
+{
+ int word, bit, ret;
+
+ word = unit / (sizeof(*ifc->ifc_map) * 8);
+ bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+ rw_enter_write(&ifc->ifc_lock);
+
+ if(word >= ifc->ifc_map_size) {
+ u_long *map;
+ int size;
+
+ size = word + 1;
+ map = mallocarray(size, sizeof(*map), M_TEMP, M_WAITOK |
+    M_ZERO);
+
+ if (ifc->ifc_map != NULL) {
+ memcpy(map, ifc->ifc_map, ifc->ifc_map_size);
+ free(ifc->ifc_map, M_TEMP,
+    ifc->ifc_map_size * sizeof(*map));
+ }
+
+ ifc->ifc_map = map;
+ ifc->ifc_map_size = size;
+ }
+
+ if (ifc->ifc_map[word] & (1UL << bit))
+ ret = EEXIST;
+ else {
+ ifc->ifc_map[word] |= (1UL << bit);
+ ret = 0;
+ }
+
+ rw_exit_write(&ifc->ifc_lock);
+
+ return ret;
+}
+
+/*
+ * Release allocated unit for cloned network interface.
+ */
+void if_clone_rele_unit(struct if_clone *ifc, int unit)
+{
+ int word, bit;
+
+ word = unit / (sizeof(*ifc->ifc_map) * 8);
+ bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+ rw_enter_write(&ifc->ifc_lock);
+ KASSERT(word < ifc->ifc_map_size);
+
+ ifc->ifc_map[word] &= ~(1UL << bit);
+
+ if (ifc->ifc_map[word] == 0) {
+ u_long *map;
+ int size;
+
+ size = ifc->ifc_map_size - 2;
+ while (size>=0) {
+ if (ifc->ifc_map[size] != 0)
+ break;
+ --size;
+ }
+ if (size<0) {
+ size = 0;
+ map = NULL;
+ } else {
+ size += 1;
+ map = mallocarray(size, sizeof(*map), M_TEMP,
+    M_WAITOK);
+ memcpy(map, ifc->ifc_map, size);
+ }
+
+ free(ifc->ifc_map, M_TEMP, ifc->ifc_map_size * sizeof(*map));
+ ifc->ifc_map = map;
+ ifc->ifc_map_size = size;
+ }
+ rw_exit_write(&ifc->ifc_lock);
+}
+
+/*
  * Register a network interface cloner.
  */
 void
@@ -1360,6 +1453,7 @@ if_clone_attach(struct if_clone *ifc)
  * initialization, the if_cloners becomes immutable.
  */
  KASSERT(pdevinit_done == 0);
+ rw_init(&ifc->ifc_lock, "icflck");
  LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
  if_cloners_count++;
 }
Index: sys/net/if_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.105
diff -u -p -r1.105 if_var.h
--- sys/net/if_var.h 12 May 2020 08:49:54 -0000 1.105
+++ sys/net/if_var.h 26 Jun 2020 17:19:41 -0000
@@ -45,6 +45,7 @@
 #include <sys/task.h>
 #include <sys/time.h>
 #include <sys/timeout.h>
+#include <sys/rwlock.h>
 
 #include <net/ifq.h>
 
@@ -86,6 +87,10 @@ struct if_clone {
  const char *ifc_name; /* name of device, e.g. `gif' */
  size_t ifc_namelen; /* length of name */
 
+ struct rwlock ifc_lock; /* lock for map */
+ u_long *ifc_map; /* units map */
+ int ifc_map_size; /* units map size */
+
  int (*ifc_create)(struct if_clone *, int);
  int (*ifc_destroy)(struct ifnet *);
 };
@@ -95,6 +100,8 @@ struct if_clone {
   .ifc_list = { NULL, NULL }, \
   .ifc_name = name, \
   .ifc_namelen = sizeof(name) - 1, \
+  .ifc_map = NULL, \
+  .ifc_map_size = 0, \
   .ifc_create = create, \
   .ifc_destroy = destroy, \
 }

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Martin Pieuchot
In reply to this post by Vitaliy Makkoveev
On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> if_clone_create() has the races caused by context switch.

Can you share a backtrace of such race?  Where does the kernel panic?

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > if_clone_create() has the races caused by context switch.
>
> Can you share a backtrace of such race?  Where does the kernel panic?
>

This diff was inspired by thread [1]. As I explained [2] here is 3
issues that cause panics produced by command below:

---- cut begin ----
for i in 1 2 3; do while true; do ifconfig bridge0 create& \
        ifconfig bridge0 destroy& done& done
---- cut end ----

My system was stable with the last diff I did for thread [1]. But since
this final diff [3] which include fixes for tun(4) is quick and dirty
and not for commit I decided to make the diff to fix the races caused by
if_clone_create() at first.

I included screenshot with panic.
Also the code to reproduce below:

---- cut begin ----
#!/bin/sh
while true; do ifconfig bridge0 create& done&
while true; do ifconfig bridge0 destroy done
---- cut end ----

1. https://marc.info/?t=159289590100001&r=1&w=2
2. https://marc.info/?l=openbsd-tech&m=159307900124245&w=2
3. https://marc.info/?l=openbsd-tech&m=159308633126243&w=2

panic.png (18K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Martin Pieuchot
On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:

> On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > if_clone_create() has the races caused by context switch.
> >
> > Can you share a backtrace of such race?  Where does the kernel panic?
> >
>
> This diff was inspired by thread [1]. As I explained [2] here is 3
> issues that cause panics produced by command below:
>
> ---- cut begin ----
> for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> ifconfig bridge0 destroy& done& done
> ---- cut end ----

Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
managed to reproduce it with other pseudo-devices or just with bridge0?

> My system was stable with the last diff I did for thread [1]. But since
> this final diff [3] which include fixes for tun(4) is quick and dirty
> and not for commit I decided to make the diff to fix the races caused by
> if_clone_create() at first.
>
> I included screenshot with panic.

Thanks, interesting that the corruption happens on a list that should be
initialized.  Does that mean the context switch on Thread 1 is happening
before if_attach_common() is called?

You said your previous email that there's a context switch.  Do you know
when it happens?  You could see that in ddb by looking at the backtrace
of the other CPU.

Is the context switch leading to the race common to all pseudo-drivers
or is it in the bridge(4) driver?

Regarding your solution, do I understand correctly that the goal is to
serialize all if_clone_create()?  Is it really needed to remember which
unit is being currently created or can't we just serialize all of them?

The fact that a lock is not held over the cloning operation is imho
positive.

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
On Sat, Jun 27, 2020 at 12:10:24PM +0200, Martin Pieuchot wrote:

> On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> > On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > > if_clone_create() has the races caused by context switch.
> > >
> > > Can you share a backtrace of such race?  Where does the kernel panic?
> > >
> >
> > This diff was inspired by thread [1]. As I explained [2] here is 3
> > issues that cause panics produced by command below:
> >
> > ---- cut begin ----
> > for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> > ifconfig bridge0 destroy& done& done
> > ---- cut end ----
>
> Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
> managed to reproduce it with other pseudo-devices or just with bridge0?
>

In thread [1] you talked about bridge(4), tun(4) and vether(4). A first
I fixed races in if_clone_destroy() and I caught the races with
if_clone_create() while I run your initial comman but with vether(4)

---- cut begin ----
for i in 0 1 2 3 4 5 6 7; do while true; \
        do cat /dev/vether0& ifconfig vether0 destroy& done& done
---- cut end ----

It's hard to reproduce this issue. The best chances for me is bare metal
8 cores, fully unloaded system, no X, no active processes, test started
at console and all output redirected to /dev/null. And it can take
*hours* to catch. I can't reproduce this on 2 cores. I can't reproduce
this at 4 cores under kvm but it's reproducible under virtual box under
osx. The hosts has 8 cores. I can reproduce this on bare metal with 4
cores, but also it takes time.

Routine called by `ifc_create' within if_clone_attach() is very specific
to each pseudo interface. if_attach() is the only common point to sleep
for them, but you also can sleep in any point of sleep before
`ifc_create' will call if_attach(), For exmaple you will alloc software
context with `M_WAITOK'.

bridge(4) is just the best way to reproduce to me.

You have all `ifnet's linked to `if_list'. ifunit() does linear search
in this list by compare `ifp->if_xname' and given `name'. So if you
inserted many `ifnet's to this list ifunit() will return you first. but
if_get(9) doesn't work with this list. So if you have the case I talk
above if_get(9) and ifname() are inconsistent. Some times in the stack
you use if_get(9) sometimes you use ifunit() so you work every time with
diffetrent `ifnet's with the same `if_xname'. You can't predict where
`ifnet' will be corrupted.

> > My system was stable with the last diff I did for thread [1]. But since
> > this final diff [3] which include fixes for tun(4) is quick and dirty
> > and not for commit I decided to make the diff to fix the races caused by
> > if_clone_create() at first.
> >
> > I included screenshot with panic.
>
> Thanks, interesting that the corruption happens on a list that should be
> initialized.  Does that mean the context switch on Thread 1 is happening
> before if_attach_common() is called?
>

I don't know where it was. if_attach() doesn't checks if `ifnet' with
the name in `if_xname' already linked. You will insert passed `ifnet' in
any cases. If you have more then one `ifnet' with identical `if_xname'
you have broken ifunit() and if_get() logic.

Look at if_attach():

---- cut begin ----
if_attach(struct ifnet *ifp)
{
        if_attach_common(ifp);
        NET_LOCK();
        TAILQ_INSERT_TAIL(&ifnet, ifp, if_list); /* (1) */
        if_attachsetup(ifp);
        NET_UNLOCK();
}

You link `ifp' at (1). And it's still your `ifp' before and after context
switch ot without context switch. You will brake it later. The reason is
pseudo driver received the same `unit' more than once. And it created
two or more software context with identical `unit'. And internal pseudo
driver's logic is broken. Also ifunit() and if_get(9) are inconsistent
now. You can break memory everythere.

---- cut end ----
> You said your previous email that there's a context switch.  Do you know
> when it happens?  You could see that in ddb by looking at the backtrace
> of the other CPU.
>
> Is the context switch leading to the race common to all pseudo-drivers
> or is it in the bridge(4) driver?

ddb(4) is useless. The panic occured while we are trying to if_detach()
already broken `ifnet'. There is no reces here. But the rases was
*before* and we inserted two or more `ifnet's with the same name to
`if_list'. This insertion is no panic condition.

The first time I caught this races while I connected to you [1] thread.
I inserted ifunit() call to if_attach() as below and received panic so
I'am shure about the reason:

---- cut begin ----
if_attach(struct ifnet *ifp)
{
        if_attach_common(ifp);
        NET_LOCK();
        KASSERT(ifunit(ifp->if_xname));
        TAILQ_INSERT_TAIL(&ifnet, ifp, if_list);
        if_attachsetup(ifp);
        NET_UNLOCK();
}
---- cut end ----

But in thread [1] you said these races with pseudo interfaces are very
old well know issue, so I didn't take photos of panics caused by races
of if_clone_{create,destroy}().

>
> Regarding your solution, do I understand correctly that the goal is to
> serialize all if_clone_create()?  Is it really needed to remember which
> unit is being currently created or can't we just serialize all of them?
>

No. There is no serialization required. And you can do simultaneous
if_clone_create() but with different units:
if_clone_create("ifname0");
if_clone_create("ifname1");
if_clone_create("ifname2");
....
if_clone_create("ifnameN");

The goal is to be shure that obtained `unit' will not be used until we
release it. This denies to initialize software context with the same
`units' and you can't insert duplicate `if_xname' to `if_list'. This
makes if_get(9) and ifunit() happy.

And yes it's really needed to store already used units. Or pseudo
driver should store it's units and check it. Since you obtained the
`unit' you should *deny* to use it to anyone else until you release it.
Pesudo driver trusts you that `unit' passed to `ifc_create' is unique.
And it should be unique elsewhere you should duplicate the same check
logic in *every* pseudo-driver.

> The fact that a lock is not held over the cloning operation is imho
> positive.
>

Also I guess I fixed the races you discussed in thread [1]. So I decided
to rework my diff [2] to many diffs to be ready to commit.

Diff I posted to this thread is for races in if_clone_create() only. But
also you have use-after-free issues caused by races in
if_clone_destroy(). And panics caused by these races are easy to catch.

Also pppx(4) and pipex(4) had fully identical issues and I fixed them
already :)

1. https://marc.info/?t=159289590100001&r=1&w=2
2. https://marc.info/?l=openbsd-tech&m=159308633126243&w=2

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
In reply to this post by Martin Pieuchot
On Sat, Jun 27, 2020 at 12:10:24PM +0200, Martin Pieuchot wrote:

> On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> > On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > > if_clone_create() has the races caused by context switch.
> > >
> > > Can you share a backtrace of such race?  Where does the kernel panic?
> > >
> >
> > This diff was inspired by thread [1]. As I explained [2] here is 3
> > issues that cause panics produced by command below:
> >
> > ---- cut begin ----
> > for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> > ifconfig bridge0 destroy& done& done
> > ---- cut end ----
>
> Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
> managed to reproduce it with other pseudo-devices or just with bridge0?
>
> > My system was stable with the last diff I did for thread [1]. But since
> > this final diff [3] which include fixes for tun(4) is quick and dirty
> > and not for commit I decided to make the diff to fix the races caused by
> > if_clone_create() at first.
> >
> > I included screenshot with panic.
>
> Thanks, interesting that the corruption happens on a list that should be
> initialized.  Does that mean the context switch on Thread 1 is happening
> before if_attach_common() is called?
>
> You said your previous email that there's a context switch.  Do you know
> when it happens?  You could see that in ddb by looking at the backtrace
> of the other CPU.
>
> Is the context switch leading to the race common to all pseudo-drivers
> or is it in the bridge(4) driver?
>
> Regarding your solution, do I understand correctly that the goal is to
> serialize all if_clone_create()?  Is it really needed to remember which
> unit is being currently created or can't we just serialize all of them?
>
> The fact that a lock is not held over the cloning operation is imho
> positive.
>
I reworked tool for reproduce. Now I avoided fork()/exec() route and it
takes couple of minutes to take panic on 4 cores. Also some screenshots
attached.

I hope anyone else will try it.

---- cut begin ----

#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

static struct ifreq ifr;

static void *clone_create(void *arg)
{
        int s;

        if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
                err(1, "socket()");
        while(1){
                if(ioctl(s, SIOCIFCREATE, &ifr)<0)
                        if(errno==EINVAL)
                                exit(1);
        }

        return NULL;
}

static void *clone_destroy(void *arg)
{
        int s;

        if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
                err(1, "socket()");
        while(1){
                if(ioctl(s, SIOCIFDESTROY, &ifr)<0)
                        if(errno==EINVAL)
                                exit(1);
        }

        return NULL;
}

int main(int argc, char *argv[])
{
        pthread_t thr;
        int i;

        if(argc!=2){
                fprintf(stderr, "usage: %s ifname\n", getprogname());
                return 1;
        }

        if(getuid()!=0){
                fprintf(stderr, "should be root\n");
                return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));

        for(i=0; i<8*4; ++i){
                if(pthread_create(&thr, NULL, clone_create, NULL)!=0)
                        errx(1, "pthread_create(clone_create)");
        }

        clone_destroy(NULL);

        return 0;
}

---- cut end ----

etherip.png (18K) Download Attachment
pflog.png (18K) Download Attachment
vether.png (17K) Download Attachment
switch.png (18K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
In reply to this post by Martin Pieuchot
On Sat, Jun 27, 2020 at 12:10:24PM +0200, Martin Pieuchot wrote:

> On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> > On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > > if_clone_create() has the races caused by context switch.
> > >
> > > Can you share a backtrace of such race?  Where does the kernel panic?
> > >
> >
> > This diff was inspired by thread [1]. As I explained [2] here is 3
> > issues that cause panics produced by command below:
> >
> > ---- cut begin ----
> > for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> > ifconfig bridge0 destroy& done& done
> > ---- cut end ----
>
> Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
> managed to reproduce it with other pseudo-devices or just with bridge0?
>
> > My system was stable with the last diff I did for thread [1]. But since
> > this final diff [3] which include fixes for tun(4) is quick and dirty
> > and not for commit I decided to make the diff to fix the races caused by
> > if_clone_create() at first.
> >
> > I included screenshot with panic.
>
> Thanks, interesting that the corruption happens on a list that should be
> initialized.  Does that mean the context switch on Thread 1 is happening
> before if_attach_common() is called?
>
> You said your previous email that there's a context switch.  Do you know
> when it happens?  You could see that in ddb by looking at the backtrace
> of the other CPU.
>
> Is the context switch leading to the race common to all pseudo-drivers
> or is it in the bridge(4) driver?
>
> Regarding your solution, do I understand correctly that the goal is to
> serialize all if_clone_create()?  Is it really needed to remember which
> unit is being currently created or can't we just serialize all of them?
>
> The fact that a lock is not held over the cloning operation is imho
> positive.
>

I reworked tool for reproduce. Now I avoided fork()/exec() route and it
takes couple of minutes to take panic on 4 cores. I attached some
screenshots with panics caused by various pseudo-interfaces but my
previous mail was banned. I will try to attach them with separate mails.

I hope anyone else will try it. Now switch(4) is the bast way to
reproduce.

---- cut begin ----

#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

static struct ifreq ifr;

static void *clone_create(void *arg)
{
        int s;

        if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
                err(1, "socket()");
        while(1){
                if(ioctl(s, SIOCIFCREATE, &ifr)<0)
                        if(errno==EINVAL)
                                exit(1);
        }

        return NULL;
}

static void *clone_destroy(void *arg)
{
        int s;

        if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
                err(1, "socket()");
        while(1){
                if(ioctl(s, SIOCIFDESTROY, &ifr)<0)
                        if(errno==EINVAL)
                                exit(1);
        }

        return NULL;
}

int main(int argc, char *argv[])
{
        pthread_t thr;
        int i;

        if(argc!=2){
                fprintf(stderr, "usage: %s ifname\n", getprogname());
                return 1;
        }

        if(getuid()!=0){
                fprintf(stderr, "should be root\n");
                return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));

        for(i=0; i<8*4; ++i){
                if(pthread_create(&thr, NULL, clone_create, NULL)!=0)
                        errx(1, "pthread_create(clone_create)");
        }

        clone_destroy(NULL);

        return 0;
}

---- cut end ----

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
In reply to this post by Martin Pieuchot
screenshot

etherip.png (18K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Hrvoje Popovski
In reply to this post by Vitaliy Makkoveev
On 29.6.2020. 10:59, Vitaliy Makkoveev wrote:
> I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> takes couple of minutes to take panic on 4 cores. Also some screenshots
> attached.
>
> I hope anyone else will try it.

Hi,

i'm getting panic quite fast :)
i will leave box in ddb if more information is needed

r620-1# ./a.out bridge0
panic: kernel diagnostic assertion "TAILQ_EMPTY(&ifp->if_addrhooks)"
failed: file "/sys/net/if.c", line 1168
Stopped at      db_enter+0x10:  popq    %rbp
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
 475311   7753   1000         0x3          0    0  ifconfig
*128110   3280      0         0x3          0    1K a.out
  86419   3280      0         0x3  0x4000000    4  a.out
 352360   3280      0         0x3  0x4000000    3  a.out
 309715   3280      0         0x3  0x4000000    5  a.out
 268210   3280      0         0x3  0x4000000    2  a.out
db_enter() at db_enter+0x10
panic(ffffffff81df42d3) at panic+0x128
__assert(ffffffff81e5d55e,ffffffff81e5b1fa,490,ffffffff81e408d9) at
__assert+0x2b
if_detach(ffff800001169000) at if_detach+0x45f
bridge_clone_destroy(ffff800001169000) at bridge_clone_destroy+0x17b
ifioctl(fffffd839209c828,80206979,ffff8000248fa980,ffff800024902618) at
ifioctl+0x1c2
soo_ioctl(fffffd83b04b34c8,80206979,ffff8000248fa980,ffff800024902618)
at soo_ioctl+0x171
sys_ioctl(ffff800024902618,ffff8000248faa90,ffff8000248faaf0) at
sys_ioctl+0x2df
syscall(ffff8000248fab60) at syscall+0x389
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7f7ffffd3600, count: 5
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{1}>

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
On Mon, Jun 29, 2020 at 04:27:50PM +0200, Hrvoje Popovski wrote:

> On 29.6.2020. 10:59, Vitaliy Makkoveev wrote:
> > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > attached.
> >
> > I hope anyone else will try it.
>
> Hi,
>
> i'm getting panic quite fast :)
> i will leave box in ddb if more information is needed
>

Thanks. Right now it takes seconds to catch panic at least with switch(4),
bridge(4), pflog(4), vether(4) and etherip(4). So you can leave ddb(4).

I like to someone will try my solution for this issue. And reviews are
welcomed :)

The latest diff below. If the `unit' was obtained it's guaranteed that
there is no pseudo interface with `name' is system. ifunit() now useless
here and can be dropped.


Index: sys/net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.610
diff -u -p -r1.610 if.c
--- sys/net/if.c 22 Jun 2020 09:45:13 -0000 1.610
+++ sys/net/if.c 29 Jun 2020 13:54:29 -0000
@@ -157,6 +157,8 @@ void if_linkstate_task(void *);
 
 int if_clone_list(struct if_clonereq *);
 struct if_clone *if_clone_lookup(const char *, int *);
+int if_clone_alloc_unit(struct if_clone *, int);
+void if_clone_rele_unit(struct if_clone *, int);
 
 int if_group_egress_build(void);
 
@@ -1244,19 +1246,18 @@ if_clone_create(const char *name, int rd
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int unit, ret;
+ int unit, error;
 
  ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
+ error = if_clone_alloc_unit(ifc, unit);
+ if (error != 0)
+ return (error);
 
- if (ifunit(name) != NULL)
- return (EEXIST);
-
- ret = (*ifc->ifc_create)(ifc, unit);
-
- if (ret != 0 || (ifp = ifunit(name)) == NULL)
- return (ret);
+ error = (*ifc->ifc_create)(ifc, unit);
+ if (error != 0 || (ifp = ifunit(name)) == NULL)
+ return (error);
 
  NET_LOCK();
  if_addgroup(ifp, ifc->ifc_name);
@@ -1264,7 +1265,7 @@ if_clone_create(const char *name, int rd
  if_setrdomain(ifp, rdomain);
  NET_UNLOCK();
 
- return (ret);
+ return (0);
 }
 
 /*
@@ -1275,9 +1276,9 @@ if_clone_destroy(const char *name)
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int ret;
+ int ret, unit;
 
- ifc = if_clone_lookup(name, NULL);
+ ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
 
@@ -1297,6 +1298,7 @@ if_clone_destroy(const char *name)
  }
  NET_UNLOCK();
  ret = (*ifc->ifc_destroy)(ifp);
+ if_clone_rele_unit(ifc, unit);
 
  return (ret);
 }
@@ -1342,12 +1344,95 @@ if_clone_lookup(const char *name, int *u
  unit = (unit * 10) + (*cp++ - '0');
  }
 
- if (unitp != NULL)
- *unitp = unit;
+ *unitp = unit;
  return (ifc);
 }
 
 /*
+ * Allocate unit for cloned network interface.
+ */
+int if_clone_alloc_unit(struct if_clone *ifc, int unit)
+{
+ int word, bit, ret;
+
+ word = unit / (sizeof(*ifc->ifc_map) * 8);
+ bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+ rw_enter_write(&ifc->ifc_lock);
+
+ if(word >= ifc->ifc_map_size) {
+ u_long *map;
+ int size;
+
+ size = word + 1;
+ map = mallocarray(size, sizeof(*map), M_TEMP, M_WAITOK |
+    M_ZERO);
+
+ if (ifc->ifc_map != NULL) {
+ memcpy(map, ifc->ifc_map, ifc->ifc_map_size);
+ free(ifc->ifc_map, M_TEMP,
+    ifc->ifc_map_size * sizeof(*map));
+ }
+
+ ifc->ifc_map = map;
+ ifc->ifc_map_size = size;
+ }
+
+ if (ifc->ifc_map[word] & (1UL << bit))
+ ret = EEXIST;
+ else {
+ ifc->ifc_map[word] |= (1UL << bit);
+ ret = 0;
+ }
+
+ rw_exit_write(&ifc->ifc_lock);
+
+ return ret;
+}
+
+/*
+ * Release allocated unit for cloned network interface.
+ */
+void if_clone_rele_unit(struct if_clone *ifc, int unit)
+{
+ int word, bit;
+
+ word = unit / (sizeof(*ifc->ifc_map) * 8);
+ bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+ rw_enter_write(&ifc->ifc_lock);
+ KASSERT(word < ifc->ifc_map_size);
+
+ ifc->ifc_map[word] &= ~(1UL << bit);
+
+ if (ifc->ifc_map[word] == 0) {
+ u_long *map;
+ int size;
+
+ size = ifc->ifc_map_size - 2;
+ while (size>=0) {
+ if (ifc->ifc_map[size] != 0)
+ break;
+ --size;
+ }
+ if (size<0) {
+ size = 0;
+ map = NULL;
+ } else {
+ size += 1;
+ map = mallocarray(size, sizeof(*map), M_TEMP,
+    M_WAITOK);
+ memcpy(map, ifc->ifc_map, size);
+ }
+
+ free(ifc->ifc_map, M_TEMP, ifc->ifc_map_size * sizeof(*map));
+ ifc->ifc_map = map;
+ ifc->ifc_map_size = size;
+ }
+ rw_exit_write(&ifc->ifc_lock);
+}
+
+/*
  * Register a network interface cloner.
  */
 void
@@ -1360,6 +1445,7 @@ if_clone_attach(struct if_clone *ifc)
  * initialization, the if_cloners becomes immutable.
  */
  KASSERT(pdevinit_done == 0);
+ rw_init(&ifc->ifc_lock, "icflck");
  LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
  if_cloners_count++;
 }
Index: sys/net/if_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.105
diff -u -p -r1.105 if_var.h
--- sys/net/if_var.h 12 May 2020 08:49:54 -0000 1.105
+++ sys/net/if_var.h 29 Jun 2020 13:54:29 -0000
@@ -45,6 +45,7 @@
 #include <sys/task.h>
 #include <sys/time.h>
 #include <sys/timeout.h>
+#include <sys/rwlock.h>
 
 #include <net/ifq.h>
 
@@ -86,6 +87,10 @@ struct if_clone {
  const char *ifc_name; /* name of device, e.g. `gif' */
  size_t ifc_namelen; /* length of name */
 
+ struct rwlock ifc_lock; /* lock for map */
+ u_long *ifc_map; /* units map */
+ int ifc_map_size; /* units map size */
+
  int (*ifc_create)(struct if_clone *, int);
  int (*ifc_destroy)(struct ifnet *);
 };
@@ -95,6 +100,8 @@ struct if_clone {
   .ifc_list = { NULL, NULL }, \
   .ifc_name = name, \
   .ifc_namelen = sizeof(name) - 1, \
+  .ifc_map = NULL, \
+  .ifc_map_size = 0, \
   .ifc_create = create, \
   .ifc_destroy = destroy, \
 }

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Martin Pieuchot
In reply to this post by Vitaliy Makkoveev
On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> [...]
> I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> takes couple of minutes to take panic on 4 cores. Also some screenshots
> attached.

Setting kern.pool_debug=2 makes the race reproducible in seconds.

Could you turn this test into something committable in regress/?  We can
link it to the build once a fix is committed.

> #include <sys/socket.h>
> #include <sys/ioctl.h>
> #include <net/if.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <err.h>
> #include <errno.h>
> #include <pthread.h>
> #include <string.h>
> #include <unistd.h>
>
> static struct ifreq ifr;
>
> static void *clone_create(void *arg)
> {
> int s;
>
> if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> err(1, "socket()");
> while(1){
> if(ioctl(s, SIOCIFCREATE, &ifr)<0)
> if(errno==EINVAL)
> exit(1);
> }
>
> return NULL;
> }
>
> static void *clone_destroy(void *arg)
> {
> int s;
>
> if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> err(1, "socket()");
> while(1){
> if(ioctl(s, SIOCIFDESTROY, &ifr)<0)
> if(errno==EINVAL)
> exit(1);
> }
>
> return NULL;
> }
>
> int main(int argc, char *argv[])
> {
> pthread_t thr;
> int i;
>
> if(argc!=2){
> fprintf(stderr, "usage: %s ifname\n", getprogname());
> return 1;
> }
>
> if(getuid()!=0){
> fprintf(stderr, "should be root\n");
> return 1;
> }
>
> memset(&ifr, 0, sizeof(ifr));
> strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
>
> for(i=0; i<8*4; ++i){
> if(pthread_create(&thr, NULL, clone_create, NULL)!=0)
> errx(1, "pthread_create(clone_create)");
> }
>
> clone_destroy(NULL);
>
> return 0;
> }
>
> ---- cut end ----





Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:

> On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> > [...]
> > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > attached.
>
> Setting kern.pool_debug=2 makes the race reproducible in seconds.
>
> Could you turn this test into something committable in regress/?  We can
> link it to the build once a fix is committed.
>

We have 3 races with cloned interfaces:
1. if_clone_create() vs if_clone_create()
2. if_clone_destroy() vs if_clone_destroy()
3. if_clone_destroy() vs the rest of stack

It makes sences to commit unified test to regress/, so I suggest to wait
a little.

> > #include <sys/socket.h>
> > #include <sys/ioctl.h>
> > #include <net/if.h>
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <err.h>
> > #include <errno.h>
> > #include <pthread.h>
> > #include <string.h>
> > #include <unistd.h>
> >
> > static struct ifreq ifr;
> >
> > static void *clone_create(void *arg)
> > {
> > int s;
> >
> > if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> > err(1, "socket()");
> > while(1){
> > if(ioctl(s, SIOCIFCREATE, &ifr)<0)
> > if(errno==EINVAL)
> > exit(1);
> > }
> >
> > return NULL;
> > }
> >
> > static void *clone_destroy(void *arg)
> > {
> > int s;
> >
> > if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> > err(1, "socket()");
> > while(1){
> > if(ioctl(s, SIOCIFDESTROY, &ifr)<0)
> > if(errno==EINVAL)
> > exit(1);
> > }
> >
> > return NULL;
> > }
> >
> > int main(int argc, char *argv[])
> > {
> > pthread_t thr;
> > int i;
> >
> > if(argc!=2){
> > fprintf(stderr, "usage: %s ifname\n", getprogname());
> > return 1;
> > }
> >
> > if(getuid()!=0){
> > fprintf(stderr, "should be root\n");
> > return 1;
> > }
> >
> > memset(&ifr, 0, sizeof(ifr));
> > strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
> >
> > for(i=0; i<8*4; ++i){
> > if(pthread_create(&thr, NULL, clone_create, NULL)!=0)
> > errx(1, "pthread_create(clone_create)");
> > }
> >
> > clone_destroy(NULL);
> >
> > return 0;
> > }
> >
> > ---- cut end ----
>
>
>
>
>

Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
On Tue, Jun 30, 2020 at 03:48:22PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
> > On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> > > [...]
> > > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > > attached.
> >
> > Setting kern.pool_debug=2 makes the race reproducible in seconds.

Unfortunately you will catch splassert() caused by kern/sched_bsd.c:304.
malloc() will call yield() while we are holding NET_LOCK(). I attached
screenshot with splassertion to this mail.

> >
> > Could you turn this test into something committable in regress/?  We can
> > link it to the build once a fix is committed.
> >
>
> We have 3 races with cloned interfaces:
> 1. if_clone_create() vs if_clone_create()
> 2. if_clone_destroy() vs if_clone_destroy()
> 3. if_clone_destroy() vs the rest of stack
>
> It makes sences to commit unified test to regress/, so I suggest to wait
> a little.
The another solution.

Diff below introduces per-`ifc' serialization for if_clone_create() and
if_clone_destroy(). There is no index bitmap anymore.

Diff fixes the following races:
1. if_clone_create() vs if_clone_create()
2. if_clone_destroy() vs if_clone_destroy()

`ifc_create' will go the same lock path for each cloner, and
`ifc_destroy' will go this path but in reverse order. It seems
reasonable to allow simultaneous create/destroy for different cloners
but since different instances of one cloner will block each other it's
no reason have parallelism here.

Updated test tool
---- cut begin ----
#include <sys/socket.h>
#include <sys/select.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

static struct ifreq ifr;

static void *clone_create(void *arg)
{
        int s;

        if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
                err(1, "socket()");
        while(1){
                if(ioctl(s, SIOCIFCREATE, &ifr)<0)
                        if(errno==EINVAL)
                                exit(1);
        }

        return NULL;
}

static void *clone_destroy(void *arg)
{
        int s;

        if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
                err(1, "socket()");
        while(1){
                if(ioctl(s, SIOCIFDESTROY, &ifr)<0)
                        if(errno==EINVAL)
                                exit(1);
        }

        return NULL;
}

int main(int argc, char *argv[])
{
        pthread_t thr;
        int i;

        if(argc!=2){
                fprintf(stderr, "usage: %s ifname\n", getprogname());
                return 1;
        }

        if(getuid()!=0){
                fprintf(stderr, "should be root\n");
                return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));

        for(i=0; i<8*4; ++i){
                if(pthread_create(&thr, NULL, clone_create, NULL)!=0)
                        errx(1, "pthread_create(clone_create)");
                if(pthread_create(&thr, NULL, clone_destroy, NULL)!=0)
                        errx(1, "pthread_create(clone_destroy)");
        }

        select(0, NULL, NULL, NULL, NULL);

        return 0;
}
---- cut end ----



Index: sys/net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.611
diff -u -p -r1.611 if.c
--- sys/net/if.c 30 Jun 2020 09:31:38 -0000 1.611
+++ sys/net/if.c 30 Jun 2020 20:41:50 -0000
@@ -155,6 +155,8 @@ int if_getgrouplist(caddr_t);
 void if_linkstate(struct ifnet *);
 void if_linkstate_task(void *);
 
+int if_clone_lock(struct if_clone *);
+void if_clone_unlock(struct if_clone *);
 int if_clone_list(struct if_clonereq *);
 struct if_clone *if_clone_lookup(const char *, int *);
 
@@ -1244,27 +1246,35 @@ if_clone_create(const char *name, int rd
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int unit, ret;
+ int unit, error;
 
  ifc = if_clone_lookup(name, &unit);
  if (ifc == NULL)
  return (EINVAL);
 
- if (ifunit(name) != NULL)
- return (EEXIST);
+ error = if_clone_lock(ifc);
+ if (error != 0)
+ return (error);
+
+ if (ifunit(name) != NULL) {
+ error = (EEXIST);
+ goto unlock;
+ }
 
- ret = (*ifc->ifc_create)(ifc, unit);
+ error = (*ifc->ifc_create)(ifc, unit);
 
- if (ret != 0 || (ifp = ifunit(name)) == NULL)
- return (ret);
+ if (error != 0 || (ifp = ifunit(name)) == NULL)
+ goto unlock;
 
  NET_LOCK();
  if_addgroup(ifp, ifc->ifc_name);
  if (rdomain != 0)
  if_setrdomain(ifp, rdomain);
  NET_UNLOCK();
+unlock:
+ if_clone_unlock(ifc);
 
- return (ret);
+ return (error);
 }
 
 /*
@@ -1275,18 +1285,26 @@ if_clone_destroy(const char *name)
 {
  struct if_clone *ifc;
  struct ifnet *ifp;
- int ret;
+ int error;
 
  ifc = if_clone_lookup(name, NULL);
  if (ifc == NULL)
  return (EINVAL);
 
+ error = if_clone_lock(ifc);
+ if (error != 0)
+ return (error);
+
  ifp = ifunit(name);
- if (ifp == NULL)
- return (ENXIO);
+ if (ifp == NULL) {
+ error = (ENXIO);
+ goto unlock;
+ }
 
- if (ifc->ifc_destroy == NULL)
- return (EOPNOTSUPP);
+ if (ifc->ifc_destroy == NULL) {
+ error = (EOPNOTSUPP);
+ goto unlock;
+ }
 
  NET_LOCK();
  if (ifp->if_flags & IFF_UP) {
@@ -1296,9 +1314,56 @@ if_clone_destroy(const char *name)
  splx(s);
  }
  NET_UNLOCK();
- ret = (*ifc->ifc_destroy)(ifp);
+ error = (*ifc->ifc_destroy)(ifp);
+unlock:
+ if_clone_unlock(ifc);
+
+ return (error);
+}
+
+/*
+ * Lock a clone network interface.
+ */
+int
+if_clone_lock(struct if_clone *ifc)
+{
+ int error;
+
+ rw_enter_write(&ifc->ifc_lock);
+
+ while (ifc->ifc_flags & IFC_CREATE_LOCKED) {
+ ifc->ifc_flags |= IFC_CREATE_LOCKWAIT;
+ error = rwsleep_nsec(&ifc->ifc_flags, &ifc->ifc_lock,
+    PWAIT|PCATCH, "ifclk", INFSLP);
+ if(error != 0) {
+ ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
+ rw_exit_write(&ifc->ifc_lock);
+ return error;
+ }
+ }
+ ifc->ifc_flags |= IFC_CREATE_LOCKED;
+ ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
+
+ rw_exit_write(&ifc->ifc_lock);
+
+ return 0;
+}
+
+/*
+ * Unlock a clone network interface.
+ */
+void
+if_clone_unlock(struct if_clone *ifc)
+{
+ rw_enter_write(&ifc->ifc_lock);
+
+ ifc->ifc_flags &= ~IFC_CREATE_LOCKED;
+ if (ifc->ifc_flags & IFC_CREATE_LOCKWAIT) {
+ ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
+ wakeup(&ifc->ifc_flags);
+ }
 
- return (ret);
+ rw_exit_write(&ifc->ifc_lock);
 }
 
 /*
Index: sys/net/if_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.105
diff -u -p -r1.105 if_var.h
--- sys/net/if_var.h 12 May 2020 08:49:54 -0000 1.105
+++ sys/net/if_var.h 30 Jun 2020 20:41:50 -0000
@@ -45,6 +45,7 @@
 #include <sys/task.h>
 #include <sys/time.h>
 #include <sys/timeout.h>
+#include <sys/rwlock.h>
 
 #include <net/ifq.h>
 
@@ -85,16 +86,23 @@ struct if_clone {
  LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */
  const char *ifc_name; /* name of device, e.g. `gif' */
  size_t ifc_namelen; /* length of name */
+ struct rwlock ifc_lock; /* lock for ifc_flags */
+ u_int ifc_flags; /* flags */
 
  int (*ifc_create)(struct if_clone *, int);
  int (*ifc_destroy)(struct ifnet *);
 };
 
+#define IFC_CREATE_LOCKED 0x1
+#define IFC_CREATE_LOCKWAIT 0x2
+
 #define IF_CLONE_INITIALIZER(name, create, destroy) \
 { \
   .ifc_list = { NULL, NULL }, \
   .ifc_name = name, \
   .ifc_namelen = sizeof(name) - 1, \
+  .ifc_lock = RWLOCK_INITIALIZER("ifclk"), \
+  .ifc_flags = 0, \
   .ifc_create = create, \
   .ifc_destroy = destroy, \
 }

panic.png (18K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: fix races in if_clone_create()

Vitaliy Makkoveev
ping?

> On 1 Jul 2020, at 00:02, Vitaliy Makkoveev <[hidden email]> wrote:
>
> On Tue, Jun 30, 2020 at 03:48:22PM +0300, Vitaliy Makkoveev wrote:
>> On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
>>> On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
>>>> [...]
>>>> I reworked tool for reproduce. Now I avoided fork()/exec() route and it
>>>> takes couple of minutes to take panic on 4 cores. Also some screenshots
>>>> attached.
>>>
>>> Setting kern.pool_debug=2 makes the race reproducible in seconds.
>
> Unfortunately you will catch splassert() caused by kern/sched_bsd.c:304.
> malloc() will call yield() while we are holding NET_LOCK(). I attached
> screenshot with splassertion to this mail.
>
>>>
>>> Could you turn this test into something committable in regress/?  We can
>>> link it to the build once a fix is committed.
>>>
>>
>> We have 3 races with cloned interfaces:
>> 1. if_clone_create() vs if_clone_create()
>> 2. if_clone_destroy() vs if_clone_destroy()
>> 3. if_clone_destroy() vs the rest of stack
>>
>> It makes sences to commit unified test to regress/, so I suggest to wait
>> a little.
>
> The another solution.
>
> Diff below introduces per-`ifc' serialization for if_clone_create() and
> if_clone_destroy(). There is no index bitmap anymore.
>
> Diff fixes the following races:
> 1. if_clone_create() vs if_clone_create()
> 2. if_clone_destroy() vs if_clone_destroy()
>
> `ifc_create' will go the same lock path for each cloner, and
> `ifc_destroy' will go this path but in reverse order. It seems
> reasonable to allow simultaneous create/destroy for different cloners
> but since different instances of one cloner will block each other it's
> no reason have parallelism here.
>
> Updated test tool
> ---- cut begin ----
> #include <sys/socket.h>
> #include <sys/select.h>
> #include <sys/ioctl.h>
> #include <net/if.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <err.h>
> #include <errno.h>
> #include <pthread.h>
> #include <string.h>
> #include <unistd.h>
>
> static struct ifreq ifr;
>
> static void *clone_create(void *arg)
> {
> int s;
>
> if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> err(1, "socket()");
> while(1){
> if(ioctl(s, SIOCIFCREATE, &ifr)<0)
> if(errno==EINVAL)
> exit(1);
> }
>
> return NULL;
> }
>
> static void *clone_destroy(void *arg)
> {
> int s;
>
> if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> err(1, "socket()");
> while(1){
> if(ioctl(s, SIOCIFDESTROY, &ifr)<0)
> if(errno==EINVAL)
> exit(1);
> }
>
> return NULL;
> }
>
> int main(int argc, char *argv[])
> {
> pthread_t thr;
> int i;
>
> if(argc!=2){
> fprintf(stderr, "usage: %s ifname\n", getprogname());
> return 1;
> }
>
> if(getuid()!=0){
> fprintf(stderr, "should be root\n");
> return 1;
> }
>
> memset(&ifr, 0, sizeof(ifr));
> strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
>
> for(i=0; i<8*4; ++i){
> if(pthread_create(&thr, NULL, clone_create, NULL)!=0)
> errx(1, "pthread_create(clone_create)");
> if(pthread_create(&thr, NULL, clone_destroy, NULL)!=0)
> errx(1, "pthread_create(clone_destroy)");
> }
>
> select(0, NULL, NULL, NULL, NULL);
>
> return 0;
> }
> ---- cut end ----
>
>
>
> Index: sys/net/if.c
> ===================================================================
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.611
> diff -u -p -r1.611 if.c
> --- sys/net/if.c 30 Jun 2020 09:31:38 -0000 1.611
> +++ sys/net/if.c 30 Jun 2020 20:41:50 -0000
> @@ -155,6 +155,8 @@ int if_getgrouplist(caddr_t);
> void if_linkstate(struct ifnet *);
> void if_linkstate_task(void *);
>
> +int if_clone_lock(struct if_clone *);
> +void if_clone_unlock(struct if_clone *);
> int if_clone_list(struct if_clonereq *);
> struct if_clone *if_clone_lookup(const char *, int *);
>
> @@ -1244,27 +1246,35 @@ if_clone_create(const char *name, int rd
> {
> struct if_clone *ifc;
> struct ifnet *ifp;
> - int unit, ret;
> + int unit, error;
>
> ifc = if_clone_lookup(name, &unit);
> if (ifc == NULL)
> return (EINVAL);
>
> - if (ifunit(name) != NULL)
> - return (EEXIST);
> + error = if_clone_lock(ifc);
> + if (error != 0)
> + return (error);
> +
> + if (ifunit(name) != NULL) {
> + error = (EEXIST);
> + goto unlock;
> + }
>
> - ret = (*ifc->ifc_create)(ifc, unit);
> + error = (*ifc->ifc_create)(ifc, unit);
>
> - if (ret != 0 || (ifp = ifunit(name)) == NULL)
> - return (ret);
> + if (error != 0 || (ifp = ifunit(name)) == NULL)
> + goto unlock;
>
> NET_LOCK();
> if_addgroup(ifp, ifc->ifc_name);
> if (rdomain != 0)
> if_setrdomain(ifp, rdomain);
> NET_UNLOCK();
> +unlock:
> + if_clone_unlock(ifc);
>
> - return (ret);
> + return (error);
> }
>
> /*
> @@ -1275,18 +1285,26 @@ if_clone_destroy(const char *name)
> {
> struct if_clone *ifc;
> struct ifnet *ifp;
> - int ret;
> + int error;
>
> ifc = if_clone_lookup(name, NULL);
> if (ifc == NULL)
> return (EINVAL);
>
> + error = if_clone_lock(ifc);
> + if (error != 0)
> + return (error);
> +
> ifp = ifunit(name);
> - if (ifp == NULL)
> - return (ENXIO);
> + if (ifp == NULL) {
> + error = (ENXIO);
> + goto unlock;
> + }
>
> - if (ifc->ifc_destroy == NULL)
> - return (EOPNOTSUPP);
> + if (ifc->ifc_destroy == NULL) {
> + error = (EOPNOTSUPP);
> + goto unlock;
> + }
>
> NET_LOCK();
> if (ifp->if_flags & IFF_UP) {
> @@ -1296,9 +1314,56 @@ if_clone_destroy(const char *name)
> splx(s);
> }
> NET_UNLOCK();
> - ret = (*ifc->ifc_destroy)(ifp);
> + error = (*ifc->ifc_destroy)(ifp);
> +unlock:
> + if_clone_unlock(ifc);
> +
> + return (error);
> +}
> +
> +/*
> + * Lock a clone network interface.
> + */
> +int
> +if_clone_lock(struct if_clone *ifc)
> +{
> + int error;
> +
> + rw_enter_write(&ifc->ifc_lock);
> +
> + while (ifc->ifc_flags & IFC_CREATE_LOCKED) {
> + ifc->ifc_flags |= IFC_CREATE_LOCKWAIT;
> + error = rwsleep_nsec(&ifc->ifc_flags, &ifc->ifc_lock,
> +    PWAIT|PCATCH, "ifclk", INFSLP);
> + if(error != 0) {
> + ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
> + rw_exit_write(&ifc->ifc_lock);
> + return error;
> + }
> + }
> + ifc->ifc_flags |= IFC_CREATE_LOCKED;
> + ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
> +
> + rw_exit_write(&ifc->ifc_lock);
> +
> + return 0;
> +}
> +
> +/*
> + * Unlock a clone network interface.
> + */
> +void
> +if_clone_unlock(struct if_clone *ifc)
> +{
> + rw_enter_write(&ifc->ifc_lock);
> +
> + ifc->ifc_flags &= ~IFC_CREATE_LOCKED;
> + if (ifc->ifc_flags & IFC_CREATE_LOCKWAIT) {
> + ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
> + wakeup(&ifc->ifc_flags);
> + }
>
> - return (ret);
> + rw_exit_write(&ifc->ifc_lock);
> }
>
> /*
> Index: sys/net/if_var.h
> ===================================================================
> RCS file: /cvs/src/sys/net/if_var.h,v
> retrieving revision 1.105
> diff -u -p -r1.105 if_var.h
> --- sys/net/if_var.h 12 May 2020 08:49:54 -0000 1.105
> +++ sys/net/if_var.h 30 Jun 2020 20:41:50 -0000
> @@ -45,6 +45,7 @@
> #include <sys/task.h>
> #include <sys/time.h>
> #include <sys/timeout.h>
> +#include <sys/rwlock.h>
>
> #include <net/ifq.h>
>
> @@ -85,16 +86,23 @@ struct if_clone {
> LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */
> const char *ifc_name; /* name of device, e.g. `gif' */
> size_t ifc_namelen; /* length of name */
> + struct rwlock ifc_lock; /* lock for ifc_flags */
> + u_int ifc_flags; /* flags */
>
> int (*ifc_create)(struct if_clone *, int);
> int (*ifc_destroy)(struct ifnet *);
> };
>
> +#define IFC_CREATE_LOCKED 0x1
> +#define IFC_CREATE_LOCKWAIT 0x2
> +
> #define IF_CLONE_INITIALIZER(name, create, destroy) \
> { \
>   .ifc_list = { NULL, NULL }, \
>   .ifc_name = name, \
>   .ifc_namelen = sizeof(name) - 1, \
> +  .ifc_lock = RWLOCK_INITIALIZER("ifclk"), \
> +  .ifc_flags = 0, \
>   .ifc_create = create, \
>   .ifc_destroy = destroy, \
> }
> <panic.png>