Kill NFS-only kqueue poller thread

classic Classic list List threaded Threaded
7 messages Options
Reply | Threaded
Open this post in threaded view
|

Kill NFS-only kqueue poller thread

Martin Pieuchot
When it comes to kqueue filters NFS is special.  A custom thread is
created when the first event is registered.  Its purpose is to poll
for changes every 2.5sec.  This logic has been inherited from NetBSD
and is not present in FreeBSD.

Since filt_nfsread() only checks `n_size' of a given nfsnode, having a
different context to emulate stat(2) behavior in the kernel is questionable.
So the diff below gets rid of this logic.  The rationales are KISS,
coherency with nfs_poll() and matching what other FS kqueue filters do.

While here add a missing write filter.

Matching the nfs_poll() logic and the write filter are necessary to keep
the current behavior of poll(2) & select(2) once they start querying
kqfilter handlers.

ok?

Index: nfs/nfs_kq.c
===================================================================
RCS file: /cvs/src/sys/nfs/nfs_kq.c,v
retrieving revision 1.30
diff -u -p -r1.30 nfs_kq.c
--- nfs/nfs_kq.c 7 Apr 2020 13:27:52 -0000 1.30
+++ nfs/nfs_kq.c 28 May 2020 09:52:52 -0000
@@ -32,183 +32,26 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
 #include <sys/mount.h>
-#include <sys/malloc.h>
 #include <sys/vnode.h>
-#include <sys/unistd.h>
 #include <sys/file.h>
-#include <sys/kthread.h>
-#include <sys/rwlock.h>
-#include <sys/queue.h>
 
-#include <nfs/rpcv2.h>
 #include <nfs/nfsproto.h>
 #include <nfs/nfs.h>
 #include <nfs/nfsnode.h>
 #include <nfs/nfs_var.h>
 
-void nfs_kqpoll(void *);
-
 void filt_nfsdetach(struct knote *);
 int filt_nfsread(struct knote *, long);
+int filt_nfswrite(struct knote *, long);
 int filt_nfsvnode(struct knote *, long);
 
-struct kevq {
- SLIST_ENTRY(kevq) kev_link;
- struct vnode *vp;
- u_int usecount;
- u_int flags;
-#define KEVQ_BUSY 0x01 /* currently being processed */
-#define KEVQ_WANT 0x02 /* want to change this entry */
- struct timespec omtime; /* old modification time */
- struct timespec octime; /* old change time */
- nlink_t onlink; /* old number of references to file */
-};
-SLIST_HEAD(kevqlist, kevq);
-
-struct rwlock nfskevq_lock = RWLOCK_INITIALIZER("nfskqlk");
-struct proc *pnfskq;
-struct kevqlist kevlist = SLIST_HEAD_INITIALIZER(kevlist);
-
-/*
- * This quite simplistic routine periodically checks for server changes
- * of any of the watched files every NFS_MINATTRTIMO/2 seconds.
- * Only changes in size, modification time, change time and nlinks
- * are being checked, everything else is ignored.
- * The routine only calls VOP_GETATTR() when it's likely it would get
- * some new data, i.e. when the vnode expires from attrcache. This
- * should give same result as periodically running stat(2) from userland,
- * while keeping CPU/network usage low, and still provide proper kevent
- * semantics.
- * The poller thread is created when first vnode is added to watch list,
- * and exits when the watch list is empty. The overhead of thread creation
- * isn't really important, neither speed of attach and detach of knote.
- */
-/* ARGSUSED */
-void
-nfs_kqpoll(void *arg)
-{
- struct kevq *ke;
- struct vattr attr;
- struct proc *p = pnfskq;
- u_quad_t osize;
- int error;
-
- for(;;) {
- rw_enter_write(&nfskevq_lock);
- SLIST_FOREACH(ke, &kevlist, kev_link) {
- struct nfsnode *np = VTONFS(ke->vp);
-
-#ifdef DEBUG
- printf("nfs_kqpoll on: ");
- VOP_PRINT(ke->vp);
-#endif
- /* skip if still in attrcache */
- if (nfs_getattrcache(ke->vp, &attr) != ENOENT)
- continue;
-
- /*
- * Mark entry busy, release lock and check
- * for changes.
- */
- ke->flags |= KEVQ_BUSY;
- rw_exit_write(&nfskevq_lock);
-
- /* save v_size, nfs_getattr() updates it */
- osize = np->n_size;
-
- error = VOP_GETATTR(ke->vp, &attr, p->p_ucred, p);
- if (error == ESTALE) {
- NFS_INVALIDATE_ATTRCACHE(np);
- VN_KNOTE(ke->vp, NOTE_DELETE);
- goto next;
- }
-
- /* following is a bit fragile, but about best
- * we can get */
- if (attr.va_size != osize) {
- int flags = NOTE_WRITE;
-
- if (attr.va_size > osize)
- flags |= NOTE_EXTEND;
- else
- flags |= NOTE_TRUNCATE;
-
- VN_KNOTE(ke->vp, flags);
- ke->omtime = attr.va_mtime;
- } else if (attr.va_mtime.tv_sec != ke->omtime.tv_sec
-    || attr.va_mtime.tv_nsec != ke->omtime.tv_nsec) {
- VN_KNOTE(ke->vp, NOTE_WRITE);
- ke->omtime = attr.va_mtime;
- }
-
- if (attr.va_ctime.tv_sec != ke->octime.tv_sec
-    || attr.va_ctime.tv_nsec != ke->octime.tv_nsec) {
- VN_KNOTE(ke->vp, NOTE_ATTRIB);
- ke->octime = attr.va_ctime;
- }
-
- if (attr.va_nlink != ke->onlink) {
- VN_KNOTE(ke->vp, NOTE_LINK);
- ke->onlink = attr.va_nlink;
- }
-
-next:
- rw_enter_write(&nfskevq_lock);
- ke->flags &= ~KEVQ_BUSY;
- if (ke->flags & KEVQ_WANT) {
- ke->flags &= ~KEVQ_WANT;
- wakeup(ke);
- }
- }
-
- if (SLIST_EMPTY(&kevlist)) {
- /* Nothing more to watch, exit */
- pnfskq = NULL;
- rw_exit_write(&nfskevq_lock);
- kthread_exit(0);
- }
- rw_exit_write(&nfskevq_lock);
-
- /* wait a while before checking for changes again */
- tsleep_nsec(pnfskq, PSOCK, "nfskqpw",
-    SEC_TO_NSEC(NFS_MINATTRTIMO) / 2);
- }
-}
-
 void
 filt_nfsdetach(struct knote *kn)
 {
  struct vnode *vp = (struct vnode *)kn->kn_hook;
- struct kevq *ke;
 
  klist_remove(&vp->v_selectinfo.si_note, kn);
-
- /* Remove the vnode from watch list */
- rw_enter_write(&nfskevq_lock);
- SLIST_FOREACH(ke, &kevlist, kev_link) {
- if (ke->vp == vp) {
- while (ke->flags & KEVQ_BUSY) {
- ke->flags |= KEVQ_WANT;
- rw_exit_write(&nfskevq_lock);
- tsleep_nsec(ke, PSOCK, "nfskqdet", INFSLP);
- rw_enter_write(&nfskevq_lock);
- }
-
- if (ke->usecount > 1) {
- /* keep, other kevents need this */
- ke->usecount--;
- } else {
- /* last user, g/c */
- SLIST_REMOVE(&kevlist, ke, kevq, kev_link);
- free(ke, M_KEVENT, sizeof(*ke));
- }
- break;
- }
- }
- rw_exit_write(&nfskevq_lock);
 }
 
 int
@@ -227,9 +70,6 @@ filt_nfsread(struct knote *kn, long hint
  }
 
  kn->kn_data = np->n_size - foffset(kn->kn_fp);
-#ifdef DEBUG
- printf("nfsread event. %lld\n", kn->kn_data);
-#endif
  if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
  kn->kn_fflags |= NOTE_EOF;
  return (1);
@@ -238,6 +78,22 @@ filt_nfsread(struct knote *kn, long hint
 }
 
 int
+filt_nfswrite(struct knote *kn, long hint)
+{
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE) {
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ return (1);
+ }
+
+ kn->kn_data = 0;
+ return (1);
+}
+
+int
 filt_nfsvnode(struct knote *kn, long hint)
 {
  if (kn->kn_sfflags & hint)
@@ -256,6 +112,13 @@ static const struct filterops nfsread_fi
  .f_event = filt_nfsread,
 };
 
+const struct filterops nfswrite_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_nfsdetach,
+ .f_event = filt_nfswrite,
+};
+
 static const struct filterops nfsvnode_filtops = {
  .f_flags = FILTEROP_ISFD,
  .f_attach = NULL,
@@ -267,25 +130,16 @@ int
 nfs_kqfilter(void *v)
 {
  struct vop_kqfilter_args *ap = v;
- struct vnode *vp;
- struct knote *kn;
- struct kevq *ke;
- int error = 0;
- struct vattr attr;
- struct proc *p = curproc; /* XXX */
-
- vp = ap->a_vp;
- kn = ap->a_kn;
-
-#ifdef DEBUG
- printf("nfs_kqfilter(%d) on: ", kn->kn_filter);
- VOP_PRINT(vp);
-#endif
+ struct vnode *vp = ap->a_vp;
+ struct knote *kn = ap->a_kn;
 
  switch (kn->kn_filter) {
  case EVFILT_READ:
  kn->kn_fop = &nfsread_filtops;
  break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &nfswrite_filtops;
+ break;
  case EVFILT_VNODE:
  kn->kn_fop = &nfsvnode_filtops;
  break;
@@ -295,53 +149,7 @@ nfs_kqfilter(void *v)
 
  kn->kn_hook = vp;
 
- /*
- * Put the vnode to watched list.
- */
-
- /*
- * Fetch current attributes. It's only needed when the vnode
- * is not watched yet, but we need to do this without lock
- * held. This is likely cheap due to attrcache, so do it now.
- */
- memset(&attr, 0, sizeof(attr));
- (void) VOP_GETATTR(vp, &attr, p->p_ucred, p);
-
- rw_enter_write(&nfskevq_lock);
-
- /* ensure the poller is running */
- if (!pnfskq) {
- error = kthread_create(nfs_kqpoll, NULL, &pnfskq,
- "nfskqpoll");
- if (error)
- goto out;
- }
-
- SLIST_FOREACH(ke, &kevlist, kev_link)
- if (ke->vp == vp)
- break;
-
- if (ke) {
- /* already watched, so just bump usecount */
- ke->usecount++;
- } else {
- /* need a new one */
- ke = malloc(sizeof(*ke), M_KEVENT, M_WAITOK);
- ke->vp = vp;
- ke->usecount = 1;
- ke->flags = 0;
- ke->omtime = attr.va_mtime;
- ke->octime = attr.va_ctime;
- ke->onlink = attr.va_nlink;
- SLIST_INSERT_HEAD(&kevlist, ke, kev_link);
- }
-
- /* kick the poller */
- wakeup(pnfskq);
-
  klist_insert(&vp->v_selectinfo.si_note, kn);
 
-out:
- rw_exit_write(&nfskevq_lock);
- return (error);
+ return (0);
 }

Reply | Threaded
Open this post in threaded view
|

Re: Kill NFS-only kqueue poller thread

Visa Hankala-2
On Thu, May 28, 2020 at 12:11:20PM +0200, Martin Pieuchot wrote:

> When it comes to kqueue filters NFS is special.  A custom thread is
> created when the first event is registered.  Its purpose is to poll
> for changes every 2.5sec.  This logic has been inherited from NetBSD
> and is not present in FreeBSD.
>
> Since filt_nfsread() only check `n_size' of a given nfsnode having a
> different context to emulate stat(2) behavior in kernel is questionable.
> So the diff below gets rid of this logic.  The rationals are KISS,
> coherency with nfs_poll() and match what other FS kqueue filters do.
>
> While here add a missing write filter.
>
> Matching the nfs_poll() logic and the write filter are necessary to keep
> the current behavior of poll(2) & select(2) once they start querying
> kqfilter handlers.

I think it is not good to remove nfs_kqpoll(). The function does more
than just supports the read filter. It generates various vnode events.
If the poller is removed, it becomes impossible for example to monitor
an NFS directory with kevent(2).

It is true that the code is not perfect, but having even basic and
best-effort functionality with kevent(2) and NFS is better than nothing
in my opinion. The kqueue subsystem should not dumb down for emulating
poll(2) and select(2).

Reply | Threaded
Open this post in threaded view
|

Re: Kill NFS-only kqueue poller thread

Martin Pieuchot
On 30/05/20(Sat) 09:22, Visa Hankala wrote:

> On Thu, May 28, 2020 at 12:11:20PM +0200, Martin Pieuchot wrote:
> > When it comes to kqueue filters NFS is special.  A custom thread is
> > created when the first event is registered.  Its purpose is to poll
> > for changes every 2.5sec.  This logic has been inherited from NetBSD
> > and is not present in FreeBSD.
> >
> > Since filt_nfsread() only check `n_size' of a given nfsnode having a
> > different context to emulate stat(2) behavior in kernel is questionable.
> > So the diff below gets rid of this logic.  The rationals are KISS,
> > coherency with nfs_poll() and match what other FS kqueue filters do.
> >
> > While here add a missing write filter.
> >
> > Matching the nfs_poll() logic and the write filter are necessary to keep
> > the current behavior of poll(2) & select(2) once they start querying
> > kqfilter handlers.
>
> I think it is not good to remove nfs_kqpoll(). The function does more
> than just supports the read filter. It generates various vnode events.
> If the poller is removed, it becomes impossible for example to monitor
> an NFS directory with kevent(2).

What do you mean by "impossible to monitor an NFS directory"?  Are you
saying that NFS being the only FS with a specific thread implementing
a poller is a wanted feature?  If it is wanted, why is it NFS-only?  Which
use case involves monitoring an NFS directory with kevent(2)?  When are we
aware of the existence of a "nfskqpoll" kernel thread?

> It is true that the code is not perfect, but having even basic and
> best-effort functionality with kevent(2) and NFS is better than nothing
> in my opinion. The kqueue subsystem should not dumb down for emulating
> poll(2) and select(2).

In that case we have to accept or work around the fact that a thread
is allocated, an operation that can fail, in the kqfilter().  Do you have
an opinion on that?  What should we do for other FSs if uniformity is wanted?

Reply | Threaded
Open this post in threaded view
|

Re: Kill NFS-only kqueue poller thread

Visa Hankala-2
On Sat, May 30, 2020 at 03:34:06PM +0200, Martin Pieuchot wrote:

> On 30/05/20(Sat) 09:22, Visa Hankala wrote:
> > On Thu, May 28, 2020 at 12:11:20PM +0200, Martin Pieuchot wrote:
> > > When it comes to kqueue filters NFS is special.  A custom thread is
> > > created when the first event is registered.  Its purpose is to poll
> > > for changes every 2.5sec.  This logic has been inherited from NetBSD
> > > and is not present in FreeBSD.
> > >
> > > Since filt_nfsread() only check `n_size' of a given nfsnode having a
> > > different context to emulate stat(2) behavior in kernel is questionable.
> > > So the diff below gets rid of this logic.  The rationals are KISS,
> > > coherency with nfs_poll() and match what other FS kqueue filters do.
> > >
> > > While here add a missing write filter.
> > >
> > > Matching the nfs_poll() logic and the write filter are necessary to keep
> > > the current behavior of poll(2) & select(2) once they start querying
> > > kqfilter handlers.
> >
> > I think it is not good to remove nfs_kqpoll(). The function does more
> > than just supports the read filter. It generates various vnode events.
> > If the poller is removed, it becomes impossible for example to monitor
> > an NFS directory with kevent(2).
>
> What do you mean with "impossible to monitor a NFS directory"?  Are you
> saying that NFS being is the only FS to have a specific thread to implement
> a poller is wanted feature?  If it is wanted, why is it NFS only?  Which
> use case involve monitoring a NFS directory with kevent(2)?  When are we
> aware of the existence of a "nfskqpoll" kernel thread?

Local filesystems can observe changes at the source, which makes polling
unnecessary. NFS clients do not have that benefit. The NFSv3 protocol
lacks a mechanism to notify clients of changes.

The NFS polling mechanism is in use for example when running tail -f
on a remote file. Directory monitoring can be utilized for example
by a directory browser application or a distributed batch processing
system.

> > It is true that the code is not perfect, but having even basic and
> > best-effort functionality with kevent(2) and NFS is better than nothing
> > in my opinion. The kqueue subsystem should not dumb down for emulating
> > poll(2) and select(2).
>
> In that case we've to accept or work-around with the fact that a thread
> is allocated, an operation that can fail, in the kqfilter().  Do you have
> an opinion on that?  What should we do for other FSs is uniformity wanted?

Can't the emulation of poll(2) and select(2) operate without the poller?
These system calls are supposed to return true for regular files for
reading and writing. No polling of remote state is needed for that.

The read behaviour of poll(2) and select(2) differs from kevent(2)'s
EVFILT_READ. Consequently, there has to be an indicator that makes
f_event work in poll(2)/select(2) mode. I guess the same indicator can
control nfs_kqfilter().

Reply | Threaded
Open this post in threaded view
|

Re: Kill NFS-only kqueue poller thread

Martin Pieuchot
In reply to this post by Martin Pieuchot
On 30/05/20(Sat) 09:25, Theo de Raadt wrote:
> [...]
> What does this have to do with threads?  That is an implimentation detail.

This implementation detail is specific to NFS; no other FS does anything
like that.  So I'm questioning whether calling kthread_create(9) inside a
kqueue(2) handler, which is called from kevent(2) and could be called
from poll(2), is the wanted behavior.

> This is about behaviour.  If you open a FFS directory, do you eventually
> get updates?

FFS and all the other FSs have the same behavior when it comes to poll(2)
and kqueue(2).  They don't check for I/O and return true.  NFS tries to
be clever for kqueue(2) only; I'm questioning the added value of that
cleverness.

> Should NFS not try to act the same?

That's exactly what my diff is doing: bring it in sync with other FS.

Reply | Threaded
Open this post in threaded view
|

Re: Kill NFS-only kqueue poller thread

Martin Pieuchot
In reply to this post by Visa Hankala-2
On 30/05/20(Sat) 15:49, Visa Hankala wrote:

> [...]
> Local filesystems can observe changes at the source, which makes polling
> unnecessary. NFS clients do not have that benefit. The NFSv3 protocol
> lacks a mechanism to notify clients of changes.
>
> The NFS polling mechanism is in use for example when running tail -f
> on a remote file. Directory monitoring can be utilized for example
> by a directory browser application or a distributed batch processing
> system.
>
> > > It is true that the code is not perfect, but having even basic and
> > > best-effort functionality with kevent(2) and NFS is better than nothing
> > > in my opinion. The kqueue subsystem should not dumb down for emulating
> > > poll(2) and select(2).
> >
> > In that case we've to accept or work-around with the fact that a thread
> > is allocated, an operation that can fail, in the kqfilter().  Do you have
> > an opinion on that?  What should we do for other FSs is uniformity wanted?
>
> Can't the emulation of poll(2) and select(2) operate without the poller?
> These system calls are supposed to return true for regular files for
> reading and writing. No polling of remote state is needed for that.
>
> The read behaviour of poll(2) and select(2) differs from kevent(2)'s
> EVFILT_READ. Consequently, there has to be an indicator that makes
> f_event work in poll(2)/select(2) mode. I guess the same indicator can
> control nfs_kqfilter().

So are you saying that poll(2) is broken on NFS and suggest that I extend
the kqfilter to match this behavior?

Reply | Threaded
Open this post in threaded view
|

Re: Kill NFS-only kqueue poller thread

Visa Hankala-2
On Sun, May 31, 2020 at 09:40:47AM +0200, Martin Pieuchot wrote:

> On 30/05/20(Sat) 15:49, Visa Hankala wrote:
> > [...]
> > Local filesystems can observe changes at the source, which makes polling
> > unnecessary. NFS clients do not have that benefit. The NFSv3 protocol
> > lacks a mechanism to notify clients of changes.
> >
> > The NFS polling mechanism is in use for example when running tail -f
> > on a remote file. Directory monitoring can be utilized for example
> > by a directory browser application or a distributed batch processing
> > system.
> >
> > > > It is true that the code is not perfect, but having even basic and
> > > > best-effort functionality with kevent(2) and NFS is better than nothing
> > > > in my opinion. The kqueue subsystem should not dumb down for emulating
> > > > poll(2) and select(2).
> > >
> > > In that case we've to accept or work-around with the fact that a thread
> > > is allocated, an operation that can fail, in the kqfilter().  Do you have
> > > an opinion on that?  What should we do for other FSs is uniformity wanted?
> >
> > Can't the emulation of poll(2) and select(2) operate without the poller?
> > These system calls are supposed to return true for regular files for
> > reading and writing. No polling of remote state is needed for that.
> >
> > The read behaviour of poll(2) and select(2) differs from kevent(2)'s
> > EVFILT_READ. Consequently, there has to be an indicator that makes
> > f_event work in poll(2)/select(2) mode. I guess the same indicator can
> > control nfs_kqfilter().
>
> So are you saying that poll(2) is broken on NFS and suggest that I extend
> the kqfilter to match this behavior?

I am not saying that. I am saying that poll(2) and kevent(2) have
different semantics with regular files. Except that the write filter
is missing at the moment, the current NFS poll(2) and kevent(2)
behaviour looks about similar to the other filesystems.

poll(2) returns true immediately, whereas kevent(2) EVFILT_READ returns
true once the file pointer is not at the end of file (unless NOTE_EOF
is set).

For the semantics of poll(2), the NFS poller is pointless. However,
the poller is necessary for approximating kevent(2) semantics on NFS.