make kevent(2) (a bit) mpsafe

David Gwynne
i originally came at this from the other side, where i wanted to run
kqueue_enqueue and _dequeue without the KERNEL_LOCK, but that implied
making kqueue_scan use the mutex too, which allowed the syscall to
become less locked.

it assumes that the existing locking in kqueue_scan is in the right
place, it just turns it into a mutex instead of KERNEL_LOCK with
splhigh. it leaves the kqueue_register code under KERNEL_LOCK, but if
you're not making changes with kevent then this should be a win.

there's an extra rwlock around the kqueue_scan call. this protects the
kq_head list from having multiple marker structs attached to it. that is
an extremely rare situation, ie, you'd have to have two threads execute
kevent on the same kq fd concurrently, but that never happens. right?
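for anyone who hasn't read kqueue_scan recently: the marker is a knote on the
scanner's stack that gets appended to kq_head, and the scan pulls pending
knotes off the head of the list until it runs into its own marker. a heavily
simplified sketch of that loop (locking, disabled knotes, and the copyout to
userland all elided; see kern_event.c for the real thing):

struct knote marker, *kn;

/* append our marker so we only scan what was pending when we started */
TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
while (count) {
	kn = TAILQ_FIRST(&kq->kq_head);
	if (kn == &marker)
		break;		/* hit our own marker, nothing left to scan */
	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	/* copy the event out to userland; requeue unless it's oneshot */
	count--;
}
TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);

a second thread scanning the same kq at the same time would append a second
marker to the same list, which is exactly what the rwlock rules out.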

it seems to work ok, but i haven't tested it extensively.

thoughts?

Index: sys/eventvar.h
===================================================================
RCS file: /cvs/src/sys/sys/eventvar.h,v
retrieving revision 1.5
diff -u -p -r1.5 eventvar.h
--- sys/eventvar.h 17 Jun 2018 08:22:02 -0000 1.5
+++ sys/eventvar.h 1 May 2019 06:29:43 -0000
@@ -35,6 +35,8 @@
 #define KQEXTENT 256 /* linear growth by this amount */
 
 struct kqueue {
+ struct rwlock kq_kevent; /* serialise kevent syscall */
+ struct mutex kq_mtx;
  TAILQ_HEAD(kqlist, knote) kq_head; /* list of pending event */
  int kq_count; /* number of pending events */
  int kq_refs; /* number of references */
Index: kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.102
diff -u -p -r1.102 kern_event.c
--- kern/kern_event.c 1 May 2019 06:22:39 -0000 1.102
+++ kern/kern_event.c 1 May 2019 06:29:43 -0000
@@ -455,6 +455,8 @@ sys_kqueue(struct proc *p, void *v, regi
  fp->f_type = DTYPE_KQUEUE;
  fp->f_ops = &kqueueops;
  kq = pool_get(&kqueue_pool, PR_WAITOK|PR_ZERO);
+ rw_init(&kq->kq_kevent, "kevent");
+ mtx_init(&kq->kq_mtx, IPL_HIGH);
  TAILQ_INIT(&kq->kq_head);
  fp->f_data = kq;
  KQREF(kq);
@@ -509,37 +511,42 @@ sys_kevent(struct proc *p, void *v, regi
  kq = fp->f_data;
  nerrors = 0;
 
- while (SCARG(uap, nchanges) > 0) {
- n = SCARG(uap, nchanges) > KQ_NEVENTS ?
-    KQ_NEVENTS : SCARG(uap, nchanges);
- error = copyin(SCARG(uap, changelist), kev,
-    n * sizeof(struct kevent));
- if (error)
- goto done;
+ if (SCARG(uap, nchanges) > 0) {
+ KERNEL_LOCK();
+ do {
+ n = SCARG(uap, nchanges) > KQ_NEVENTS ?
+    KQ_NEVENTS : SCARG(uap, nchanges);
+ error = copyin(SCARG(uap, changelist), kev,
+    n * sizeof(struct kevent));
+ if (error)
+ goto done;
 #ifdef KTRACE
- if (KTRPOINT(p, KTR_STRUCT))
- ktrevent(p, kev, n);
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, kev, n);
 #endif
- for (i = 0; i < n; i++) {
- kevp = &kev[i];
- kevp->flags &= ~EV_SYSFLAGS;
- error = kqueue_register(kq, kevp, p);
- if (error || (kevp->flags & EV_RECEIPT)) {
- if (SCARG(uap, nevents) != 0) {
- kevp->flags = EV_ERROR;
- kevp->data = error;
- copyout(kevp, SCARG(uap, eventlist),
-    sizeof(*kevp));
- SCARG(uap, eventlist)++;
- SCARG(uap, nevents)--;
- nerrors++;
- } else {
- goto done;
+ for (i = 0; i < n; i++) {
+ kevp = &kev[i];
+ kevp->flags &= ~EV_SYSFLAGS;
+ error = kqueue_register(kq, kevp, p);
+ if (error || (kevp->flags & EV_RECEIPT)) {
+ if (SCARG(uap, nevents) != 0) {
+ kevp->flags = EV_ERROR;
+ kevp->data = error;
+ copyout(kevp,
+    SCARG(uap, eventlist),
+    sizeof(*kevp));
+ SCARG(uap, eventlist)++;
+ SCARG(uap, nevents)--;
+ nerrors++;
+ } else {
+ goto done;
+ }
  }
  }
- }
- SCARG(uap, nchanges) -= n;
- SCARG(uap, changelist) += n;
+ SCARG(uap, nchanges) -= n;
+ SCARG(uap, changelist) += n;
+ } while (SCARG(uap, nchanges) > 0);
+ KERNEL_UNLOCK();
  }
  if (nerrors) {
  *retval = nerrors;
@@ -547,12 +554,18 @@ sys_kevent(struct proc *p, void *v, regi
  goto done;
  }
 
+ error = rw_enter(&kq->kq_kevent, RW_WRITE|RW_NOSLEEP);
+ if (error != 0)
+ goto done;
+
  KQREF(kq);
  FRELE(fp, p);
  error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
     SCARG(uap, timeout), p, &n);
  KQRELE(kq);
+ rw_exit(&kq->kq_kevent);
  *retval = n;
+
  return (error);
 
  done:
@@ -701,7 +714,7 @@ kqueue_scan(struct kqueue *kq, int maxev
  struct kevent *kevp;
  struct timespec ats, rts, tts;
  struct knote *kn, marker;
- int s, count, timeout, nkev = 0, error = 0;
+ int count, timeout, nkev = 0, error = 0;
  struct kevent kev[KQ_NEVENTS];
 
  count = maxevents;
@@ -749,15 +762,16 @@ start:
  }
 
  kevp = &kev[0];
- s = splhigh();
+ mtx_enter(&kq->kq_mtx);
  if (kq->kq_count == 0) {
  if (timeout < 0) {
  error = EWOULDBLOCK;
  } else {
  kq->kq_state |= KQ_SLEEP;
- error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
+ error = msleep(kq, &kq->kq_mtx, PSOCK | PCATCH,
+    "kqread", timeout);
  }
- splx(s);
+ mtx_leave(&kq->kq_mtx);
  if (error == 0)
  goto retry;
  /* don't restart after signals... */
@@ -773,7 +787,7 @@ start:
  kn = TAILQ_FIRST(&kq->kq_head);
  if (kn == &marker) {
  TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
- splx(s);
+ mtx_leave(&kq->kq_mtx);
  if (count == maxevents)
  goto retry;
  goto done;
@@ -796,10 +810,12 @@ start:
  nkev++;
  if (kn->kn_flags & EV_ONESHOT) {
  kn->kn_status &= ~KN_QUEUED;
- splx(s);
+ mtx_leave(&kq->kq_mtx);
+ KERNEL_LOCK();
  kn->kn_fop->f_detach(kn);
+ KERNEL_UNLOCK();
  knote_drop(kn, p);
- s = splhigh();
+ mtx_enter(&kq->kq_mtx);
  } else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
  if (kn->kn_flags & EV_CLEAR) {
  kn->kn_data = 0;
@@ -814,7 +830,7 @@ start:
  }
  count--;
  if (nkev == KQ_NEVENTS) {
- splx(s);
+ mtx_leave(&kq->kq_mtx);
 #ifdef KTRACE
  if (KTRPOINT(p, KTR_STRUCT))
  ktrevent(p, kev, nkev);
@@ -824,13 +840,13 @@ start:
  ulistp += nkev;
  nkev = 0;
  kevp = &kev[0];
- s = splhigh();
+ mtx_enter(&kq->kq_mtx);
  if (error)
  break;
  }
  }
  TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
- splx(s);
+ mtx_leave(&kq->kq_mtx);
 done:
  if (nkev != 0) {
 #ifdef KTRACE
@@ -1070,15 +1086,14 @@ void
 knote_enqueue(struct knote *kn)
 {
  struct kqueue *kq = kn->kn_kq;
- int s = splhigh();
 
+ mtx_enter(&kq->kq_mtx);
  KASSERT((kn->kn_status & KN_QUEUED) == 0);
- KERNEL_ASSERT_LOCKED();
 
  TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
  kn->kn_status |= KN_QUEUED;
  kq->kq_count++;
- splx(s);
+ mtx_leave(&kq->kq_mtx);
  kqueue_wakeup(kq);
 }
 
@@ -1086,15 +1101,14 @@ void
 knote_dequeue(struct knote *kn)
 {
  struct kqueue *kq = kn->kn_kq;
- int s = splhigh();
 
+ mtx_enter(&kq->kq_mtx);
  KASSERT(kn->kn_status & KN_QUEUED);
- KERNEL_ASSERT_LOCKED();
 
  TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
  kn->kn_status &= ~KN_QUEUED;
  kq->kq_count--;
- splx(s);
+ mtx_leave(&kq->kq_mtx);
 }
 
 void
Index: kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.189
diff -u -p -r1.189 syscalls.master
--- kern/syscalls.master 11 Jan 2019 18:46:30 -0000 1.189
+++ kern/syscalls.master 1 May 2019 06:29:44 -0000
@@ -166,7 +166,7 @@
     struct itimerval *itv); }
 71 STD { int sys_select(int nd, fd_set *in, fd_set *ou, \
     fd_set *ex, struct timeval *tv); }
-72 STD { int sys_kevent(int fd, \
+72 STD NOLOCK { int sys_kevent(int fd, \
     const struct kevent *changelist, int nchanges, \
     struct kevent *eventlist, int nevents, \
     const struct timespec *timeout); }


Re: make kevent(2) (a bit) mpsafe

Martin Pieuchot
On 01/05/19(Wed) 16:35, David Gwynne wrote:

> i originally came at this from the other side, where i wanted to run
> kqueue_enqueue and _dequeue without the KERNEL_LOCK, but that implied
> making kqueue_scan use the mutex too, which allowed the syscall to
> become less locked.
>
> it assumes that the existing locking in kqueue_scan is in the right
> place, it just turns it into a mutex instead of KERNEL_LOCK with
> splhigh. it leaves the kqueue_register code under KERNEL_LOCK, but if
> you're not making changes with kevent then this should be a win.
>
> there's an extra rwlock around the kqueue_scan call. this protects the
> kq_head list from having multiple marker structs attached to it. that is
> an extremely rare situation, ie, you'd have to have two threads execute
> kevent on the same kq fd concurrently, but that never happens. right?

If you look at the archives on bugs@, that can happen.  The diff that has
been backed out exposed an issue possibly related to multiple threads
playing with the queue.

> it seems to work ok, but i haven't tested it extensively.
>
> thoughts?

I believe removing the KERNEL_LOCK() from the syscall should be done in a
second step, see below.

> @@ -796,10 +810,12 @@ start:
>   nkev++;
>   if (kn->kn_flags & EV_ONESHOT) {
>   kn->kn_status &= ~KN_QUEUED;
> - splx(s);
> + mtx_leave(&kq->kq_mtx);
> + KERNEL_LOCK();
>   kn->kn_fop->f_detach(kn);
> + KERNEL_UNLOCK();
>   knote_drop(kn, p);
> - s = splhigh();
> + mtx_enter(&kq->kq_mtx);

By releasing the mutex here aren't we allowing multiple callers to play
with the queue?  Is it safe?  Does that mean that another thread might
end up removing the existing marker?

Please look at my previous change "Make it possible for multiple
threads to enter kqueue_scan() in parallel", which is based on the
approach taken by DragonFly BSD.  I believe there are many more dragons
in there :o)


Re: make kevent(2) (a bit) mpsafe

William Ahern
In reply to this post by David Gwynne
On Wed, May 01, 2019 at 04:35:02PM +1000, David Gwynne wrote:

> i originally came at this from the other side, where i wanted to run
> kqueue_enqueue and _dequeue without the KERNEL_LOCK, but that implied
> making kqueue_scan use the mutex too, which allowed the syscall to
> become less locked.
>
> it assumes that the existing locking in kqueue_scan is in the right
> place, it just turns it into a mutex instead of KERNEL_LOCK with
> splhigh. it leaves the kqueue_register code under KERNEL_LOCK, but if
> you're not making changes with kevent then this should be a win.
>
> there's an extra rwlock around the kqueue_scan call. this protects the
> kq_head list from having multiple marker structs attached to it. that is
> an extremely rare situation, ie, you'd have to have two threads execute
> kevent on the same kq fd concurrently, but that never happens. right?

FWIW, in Linux-land a shared event descriptor with edge-triggered events is
a not uncommon pattern for multithreaded dispatch loops, as it removes the
need for userspace locking. kqueue supports the same pattern, and some
portable event loops likely mix threads and shared event descriptors this
way. epoll and kqueue (and Solaris event ports, for that matter) are similar
enough that a thin wrapper doesn't even need to explicitly support this
pattern for it to be available to an application.
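
To make that concrete, here is a rough, entirely hypothetical sketch of the
pattern against the kqueue API (not from the diff: worker(), handle() and
NWORKERS are made-up names, and stdin stands in for whatever shared socket an
application would actually use). Several workers block in kevent(2) on the
same kq descriptor, the fd is registered EV_CLEAR so each readiness
transition is reported once, and nothing in the wait itself takes a lock:

/*
 * hypothetical sketch: a shared kq fd with an EV_CLEAR (edge-triggered)
 * registration, drained by several threads calling kevent(2) concurrently.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <pthread.h>
#include <unistd.h>

#define NWORKERS	4

static int kq;

static void
handle(const struct kevent *ev)
{
	/* read from ev->ident until EAGAIN; EV_CLEAR only fires on new data */
}

static void *
worker(void *arg)
{
	struct kevent ev;

	(void)arg;
	for (;;) {
		/* no userspace lock around the wait */
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
			handle(&ev);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t t[NWORKERS];
	struct kevent ch;
	int i, fd = STDIN_FILENO;	/* stands in for a shared socket */

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	EV_SET(&ch, fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &ch, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	for (i = 0; i < NWORKERS; i++)
		if (pthread_create(&t[i], NULL, worker, NULL) != 0)
			errx(1, "pthread_create");
	for (i = 0; i < NWORKERS; i++)
		pthread_join(t[i], NULL);

	return (0);
}

The point is only that the dispatch loop itself needs no userspace locking,
which is the attraction of the pattern.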

I don't see any reason to optimize for it at the moment, though.[1] That lock
doesn't appear to change semantics, just serializes threads waiting on the
queue, right? Even if the order in which threads are awoken changes, it
shouldn't impact correctness.

[1] Or ever. Edge-triggered events and shared mutable data are individually
brittle, difficult-to-maintain design choices; combining them is just asking
for trouble. I've seen projects go down this path and then switch to oneshot
events instead of edge-triggered events to "solve" thread race bugs, which
can result in worse performance than a classic select loop, for example in
low-latency and/or high-load scenarios where the extra syscalls spent
constantly rearming descriptors push the total count higher. Linux still doesn't have
batched updates, AFAIK, and they're prohibitively difficult to implement for
a multithreaded, lock-free dispatch loop, anyhow, so it's not a common
optimization.
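
With kqueue the re-arm cost is at least easy to amortise, since the
changelist lets you batch the EV_ONESHOT re-registrations with the next wait
in a single kevent(2) call. A hypothetical helper sketching that
(rearm_and_wait(), fds[] and BATCH are invented names; ev must have room for
BATCH entries):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#define BATCH	64

/*
 * hypothetical helper: re-register the descriptors handled in the previous
 * pass as EV_ONESHOT and collect the next batch of events, all in one
 * kevent(2) call.
 */
int
rearm_and_wait(int kq, const int *fds, int nhandled, struct kevent *ev)
{
	struct kevent ch[BATCH];
	int i, nch = 0;

	for (i = 0; i < nhandled && nch < BATCH; i++) {
		EV_SET(&ch[nch], fds[i], EVFILT_READ,
		    EV_ADD | EV_ONESHOT, 0, 0, NULL);
		nch++;
	}

	return (kevent(kq, ch, nch, ev, BATCH, NULL));
}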