Different fix for vnode deadlock

Martin Pieuchot
Page faults on vnode-backed objects commonly end up calling VOP_READ(9)
or VOP_WRITE(9) to go through the buffer cache.  This implies grabbing
an inode lock after any UVM locking.

On the other hand, changing the size of a vnode results in entering UVM,
generally via a call to uvm_vnp_setsize() with the inode locked.
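
To make the conflicting lock orders concrete, here is a minimal sketch
of the two paths (illustrative only, not verbatim code from the tree):

	/* fault path: UVM structures locked first, inode lock second */
	uvn_io()
		/* ... pages selected under UVM locking ... */
		vn_lock(vn, LK_EXCLUSIVE);	/* takes the inode lock */
		VOP_READ(vn, &uio, 0, curproc->p_ucred);
		VOP_UNLOCK(vn);

	/* truncate path: inode locked first, UVM entered second */
	VOP_SETATTR() with the inode locked
		-> uvm_vnp_setsize(vp, newsize);	/* enters UVM */

Run one thread down each path and each ends up waiting for a lock the
other one holds.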

Syzkaller exposed a deadlock that anton@ fixed in r1.108 of uvm/uvm_vnode.c
by making the page fault path grab the inode lock earlier.  Sadly, that
change isn't compatible with the finer-grained locking required to unlock
the lower part of the UVM fault handler.

UVM's code makes use of the PG_BUSY flag to ask other threads not to touch
a given page.  This is done to keep ownership of a page after having
released its associated lock.  This is currently hard to follow because
the locking code has been removed ;)
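
For readers without the removed code at hand, the idiom looks roughly
like this (a sketch, not an excerpt; the PG_WANTED/wakeup half is still
visible in uvn_get() below):

	/* owner: claim the page, then drop the object lock for the I/O */
	atomic_setbits_int(&pg->pg_flags, PG_BUSY);
	/* ... release `uobj->vmlock', do the I/O, relock ... */
	atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
	if (pg->pg_flags & PG_WANTED)
		wakeup(pg);

	/* everyone else: a busy page belongs to its owner, wait instead */
	if (pg->pg_flags & PG_BUSY) {
		atomic_setbits_int(&pg->pg_flags, PG_WANTED);
		/* ... sleep on the page, then retry ... */
	}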

With the current fix, the PG_BUSY flag is set after grabbing the inode
lock, which creates a lock ordering problem: `uobj->vmlock' is released
after setting the flag.

So the diff below takes a different approach: if the thread that faulted
finds that the `inode' lock is contended, it stops there and restarts the
fault.  This has the side effect of un-busying (clearing PG_BUSY on) the
pages, which allows the other thread to make progress.
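
In code terms, the heart of it is the new PGO_NOWAIT flag plumbed down
to vn_lock() in uvn_io(); roughly:

	/* sketch of the new uvn_io() behaviour, see the diff below */
	result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | LK_NOWAIT);
	if (result == EBUSY)		/* inode lock contended */
		return VM_PAGER_AGAIN;	/* caller un-busies, fault restarts */

VM_PAGER_AGAIN is what makes the faulting thread back off and retry, as
described above.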

This is enough to move forward with `uobj->vmlock' without changing the
interaction between the existing buffer cache and UVM locking (thanks!).

I couldn't trigger the deadlock with regress/sys/uvm/vnode with this
diff applied.
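
To reproduce, the test runs the usual way for the regress tree,
something like (assuming a source tree under /usr/src):

	$ cd /usr/src/regress/sys/uvm/vnode
	$ make regress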

Is the explanation clear enough?  Comments?  Oks?

Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.108
diff -u -p -r1.108 uvm_vnode.c
--- uvm/uvm_vnode.c 26 Oct 2020 19:48:19 -0000 1.108
+++ uvm/uvm_vnode.c 23 Feb 2021 10:46:50 -0000
@@ -90,9 +90,6 @@ int uvn_io(struct uvm_vnode *, vm_page
 int uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
 void uvn_reference(struct uvm_object *);
 
-int uvm_vnode_lock(struct uvm_vnode *);
-void uvm_vnode_unlock(struct uvm_vnode *);
-
 /*
  * master pager structure
  */
@@ -878,16 +875,11 @@ uvn_cluster(struct uvm_object *uobj, vof
 int
 uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
 {
- struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
  int retval;
 
  KERNEL_ASSERT_LOCKED();
 
- retval = uvm_vnode_lock(uvn);
- if (retval)
- return(retval);
- retval = uvn_io(uvn, pps, npages, flags, UIO_WRITE);
- uvm_vnode_unlock(uvn);
+ retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
 
  return(retval);
 }
@@ -905,10 +897,9 @@ int
 uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
     int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
 {
- struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
  voff_t current_offset;
  struct vm_page *ptmp;
- int lcv, result, gotpages, retval;
+ int lcv, result, gotpages;
  boolean_t done;
 
  KERNEL_ASSERT_LOCKED();
@@ -983,18 +974,6 @@ uvn_get(struct uvm_object *uobj, voff_t
  }
 
  /*
- * Before getting non-resident pages which must be populate with data
- * using I/O on the backing vnode, lock the same vnode. Such pages are
- * about to be allocated and busied (i.e. PG_BUSY) by the current
- * thread. Allocating and busying the page(s) before acquiring the
- * vnode lock could cause a deadlock with uvn_flush() which acquires the
- * vnode lock before waiting on pages to become unbusy and then flushed.
- */
- retval = uvm_vnode_lock(uvn);
- if (retval)
- return(retval);
-
- /*
  * step 2: get non-resident or busy pages.
  * data structures are unlocked.
  *
@@ -1080,15 +1059,14 @@ uvn_get(struct uvm_object *uobj, voff_t
  * we have a "fake/busy/clean" page that we just allocated.  do
  * I/O to fill it with valid data.
  */
- result = uvn_io(uvn, &ptmp, 1, PGO_SYNCIO, UIO_READ);
+ result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1,
+    PGO_SYNCIO|PGO_NOWAIT, UIO_READ);
 
  /*
  * I/O done.  because we used syncio the result can not be
  * PEND or AGAIN.
  */
  if (result != VM_PAGER_OK) {
- uvm_vnode_unlock(uvn);
-
  if (ptmp->pg_flags & PG_WANTED)
  wakeup(ptmp);
 
@@ -1119,15 +1097,12 @@ uvn_get(struct uvm_object *uobj, voff_t
 
  }
 
- uvm_vnode_unlock(uvn);
-
  return (VM_PAGER_OK);
 }
 
 /*
  * uvn_io: do I/O to a vnode
  *
- * => uvn: the backing vnode must be locked
  * => prefer map unlocked (not required)
  * => flags: PGO_SYNCIO -- use sync. I/O
  * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
@@ -1145,6 +1120,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t
  int waitf, result, mapinflags;
  size_t got, wanted;
  int netunlocked = 0;
+ int lkflags = (flags & PGO_NOWAIT) ? LK_NOWAIT : 0;
 
  /* init values */
  waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
@@ -1213,17 +1189,42 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t
  }
 
  /* do the I/O!  (XXX: curproc?) */
- if (rw == UIO_READ)
- result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
- else
- result = VOP_WRITE(vn, &uio,
-    (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
-    curproc->p_ucred);
+ /*
+ * This process may already have this vnode locked, if we faulted in
+ * copyin() or copyout() on a region backed by this vnode
+ * while doing I/O to the vnode.  If this is the case, don't
+ * panic.. instead, return the error to the user.
+ *
+ * XXX this is a stopgap to prevent a panic.
+ * Ideally, this kind of operation *should* work.
+ */
+ result = 0;
+ if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
+ result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | lkflags);
+ if (result == 0) {
+ /* NOTE: vnode now locked! */
+ if (rw == UIO_READ)
+ result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
+ else
+ result = VOP_WRITE(vn, &uio,
+    (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
+    curproc->p_ucred);
+
+ if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
+ VOP_UNLOCK(vn);
+
+ }
 
  if (netunlocked)
  NET_LOCK();
 
- /* zero out rest of buffer (if needed) */
+
+ /* NOTE: vnode now unlocked (unless vnislocked) */
+ /*
+ * result == unix style errno (0 == OK!)
+ *
+ * zero out rest of buffer (if needed)
+ */
  if (result == 0) {
  got = wanted - uio.uio_resid;
 
@@ -1244,14 +1245,16 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t
  wakeup(&uvn->u_nio);
  }
 
- if (result == 0)
+ if (result == 0) {
  return(VM_PAGER_OK);
-
- if (result == EIO) {
- /* Signal back to uvm_vnode_unlock(). */
- uvn->u_flags |= UVM_VNODE_IOERROR;
+ } else if (result == EBUSY) {
+ KASSERT(flags & PGO_NOWAIT);
+ return(VM_PAGER_AGAIN);
+ } else {
+ while (rebooting)
+ tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
+ return(VM_PAGER_ERROR);
  }
- return(VM_PAGER_ERROR);
 }
 
 /*
@@ -1467,53 +1470,4 @@ uvm_vnp_sync(struct mount *mp)
  }
 
  rw_exit_write(&uvn_sync_lock);
-}
-
-int
-uvm_vnode_lock(struct uvm_vnode *uvn)
-{
- int error;
- int netunlocked = 0;
-
- if (uvn->u_flags & UVM_VNODE_VNISLOCKED)
- return(VM_PAGER_OK);
-
- /*
- * This thread may already have the net lock, if we faulted in copyin()
- * or copyout() in the network stack.
- */
- if (rw_status(&netlock) == RW_WRITE) {
- NET_UNLOCK();
- netunlocked = 1;
- }
-
- /*
- * This thread may already have this vnode locked, if we faulted in
- * copyin() or copyout() on a region backed by this vnode
- * while doing I/O to the vnode. If this is the case, don't panic but
- * instead return an error; as dictated by the LK_RECURSEFAIL flag.
- *
- * XXX this is a stopgap to prevent a panic.
- * Ideally, this kind of operation *should* work.
- */
- error = vn_lock(uvn->u_vnode, LK_EXCLUSIVE | LK_RECURSEFAIL);
- if (netunlocked)
- NET_LOCK();
- return(error ? VM_PAGER_ERROR : VM_PAGER_OK);
-}
-
-void
-uvm_vnode_unlock(struct uvm_vnode *uvn)
-{
- int error;
-
- if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
- VOP_UNLOCK(uvn->u_vnode);
-
- error = uvn->u_flags & UVM_VNODE_IOERROR;
- uvn->u_flags &= ~UVM_VNODE_IOERROR;
- if (error) {
- while (rebooting)
- tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
- }
 }
Index: uvm/uvm_vnode.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.h,v
retrieving revision 1.15
diff -u -p -r1.15 uvm_vnode.h
--- uvm/uvm_vnode.h 26 Oct 2020 19:48:19 -0000 1.15
+++ uvm/uvm_vnode.h 23 Feb 2021 10:43:25 -0000
@@ -84,7 +84,6 @@ struct uvm_vnode {
    i/o sync to clear so it can do
    i/o */
 #define UVM_VNODE_WRITEABLE 0x200 /* uvn has pages that are writeable */
-#define UVM_VNODE_IOERROR 0x400 /* i/o error occurred in uvn_io() */
 
 /*
  * UVM_VNODE_BLOCKED: any condition that should new processes from
Index: uvm/uvm_pager.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.29
diff -u -p -r1.29 uvm_pager.h
--- uvm/uvm_pager.h 11 Jul 2014 16:35:40 -0000 1.29
+++ uvm/uvm_pager.h 23 Feb 2021 10:46:26 -0000
@@ -111,6 +111,7 @@ struct uvm_pagerops {
 #define PGO_LOCKED 0x040 /* fault data structures are locked [get] */
 #define PGO_PDFREECLUST 0x080 /* daemon's free cluster flag [uvm_pager_put] */
 #define PGO_REALLOCSWAP 0x100 /* reallocate swap area [pager_dropcluster] */
+#define PGO_NOWAIT 0x200 /* do not wait for inode lock */
 
 /* page we are not interested in getting */
 #define PGO_DONTCARE ((struct vm_page *) -1L) /* [get only] */