unlock PF_UNIX sockets

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

unlock PF_UNIX sockets

Vitaliy Makkoveev
socket(2) layer is already protected by solock(). It grabs NET_LOCK()
for inet{,6}(4) sockets, but all other sockets are still under
KERNEL_LOCK().

I guess solock is already placed everythere it's required. Also `struct
file' is already mp-safe.

Diff below introduces rwlock `unp_lock'. It's like NET_LOCK()'s
`netlock' but for PF_UNIX sockets. This lock protects the whole PF_UNIX
layer. Vnode layer accessed in "connect" "bind" and "desconnect" cases
is still under KERNEL_LOCK().

I'am experimenting with his diff since 19.04.2020 and my test systems,
include Gnome workstaion are stable, without any issue.


Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_socket2.c
--- sys/kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104
+++ sys/kern/uipc_socket2.c 1 May 2020 13:47:11 -0000
@@ -53,6 +53,8 @@ u_long sb_max = SB_MAX; /* patchable */
 extern struct pool mclpools[];
 extern struct pool mbpool;
 
+extern struct rwlock unp_lock;
+
 /*
  * Procedures to manipulate state flags of socket
  * and do appropriate wakeups.  Normal sequence from the
@@ -282,6 +284,8 @@ solock(struct socket *so)
  NET_LOCK();
  break;
  case PF_UNIX:
+ rw_enter_write(&unp_lock);
+ break;
  case PF_ROUTE:
  case PF_KEY:
  default:
@@ -306,6 +310,8 @@ sounlock(struct socket *so, int s)
  NET_UNLOCK();
  break;
  case PF_UNIX:
+ rw_exit_write(&unp_lock);
+ break;
  case PF_ROUTE:
  case PF_KEY:
  default:
@@ -323,6 +329,8 @@ soassertlocked(struct socket *so)
  NET_ASSERT_LOCKED();
  break;
  case PF_UNIX:
+ rw_assert_wrlock(&unp_lock);
+ break;
  case PF_ROUTE:
  case PF_KEY:
  default:
@@ -335,12 +343,24 @@ int
 sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
     uint64_t nsecs)
 {
- if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
-    (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
-    (so->so_proto->pr_domain->dom_family != PF_KEY)) {
- return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
- } else
- return tsleep_nsec(ident, prio, wmesg, nsecs);
+ int ret;
+
+ switch (so->so_proto->pr_domain->dom_family) {
+ case PF_INET:
+ case PF_INET6:
+ ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+ break;
+ case PF_UNIX:
+ ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
+ break;
+ case PF_ROUTE:
+ case PF_KEY:
+ default:
+ ret = tsleep_nsec(ident, prio, wmesg, nsecs);
+ break;
+ }
+
+ return ret;
 }
 
 /*
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.142
diff -u -p -r1.142 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142
+++ sys/kern/uipc_usrreq.c 1 May 2020 13:47:11 -0000
@@ -51,10 +51,17 @@
 #include <sys/task.h>
 #include <sys/pledge.h>
 #include <sys/pool.h>
+#include <sys/rwlock.h>                                                                            
+/*
+ * Locks used to protect global data and struct members:
+ *      I       immutable after creation
+ *      U       unp_lock
+ */
+struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
 
 void uipc_setaddr(const struct unpcb *, struct mbuf *);
 
-/* list of all UNIX domain sockets, for unp_gc() */
+/* [U] list of all UNIX domain sockets, for unp_gc() */
 LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);
 
 /*
@@ -62,10 +69,10 @@ LIST_HEAD(unp_head, unpcb) unp_head = LI
  * not received and need to be closed.
  */
 struct unp_deferral {
- SLIST_ENTRY(unp_deferral) ud_link;
- int ud_n;
+ SLIST_ENTRY(unp_deferral) ud_link; /* [U] */
+ int ud_n; /* [I] */
  /* followed by ud_n struct fdpass */
- struct fdpass ud_fp[];
+ struct fdpass ud_fp[]; /* [I] */
 };
 
 void unp_discard(struct fdpass *, int);
@@ -74,7 +81,7 @@ void unp_scan(struct mbuf *, void (*)(st
 int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
 
 struct pool unpcb_pool;
-/* list of sets of files that were sent over sockets that are now closed */
+/* [U] list of sets of files that were sent over sockets that are now closed */
 SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);
 
 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
@@ -89,7 +96,7 @@ struct task unp_gc_task = TASK_INITIALIZ
  * need a proper out-of-band
  */
 struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
-ino_t unp_ino; /* prototype for fake inode numbers */
+ino_t unp_ino; /* [U] prototype for fake inode numbers */
 
 void
 unp_init(void)
@@ -351,7 +358,7 @@ u_long unpst_recvspace = PIPSIZ;
 u_long unpdg_sendspace = 2*1024; /* really max datagram size */
 u_long unpdg_recvspace = 4*1024;
 
-int unp_rights; /* file descriptors in flight */
+int unp_rights; /* [U] file descriptors in flight */
 
 int
 uipc_attach(struct socket *so, int proto)
@@ -359,6 +366,8 @@ uipc_attach(struct socket *so, int proto
  struct unpcb *unp;
  int error;
 
+ rw_assert_wrlock(&unp_lock);
+
  if (so->so_pcb)
  return EISCONN;
  if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -409,12 +418,17 @@ unp_detach(struct unpcb *unp)
 {
  struct vnode *vp;
 
+ rw_assert_wrlock(&unp_lock);
+
  LIST_REMOVE(unp, unp_link);
  if (unp->unp_vnode) {
+ /* this is an unlocked cleaning of `v_socket', but it's safe */
  unp->unp_vnode->v_socket = NULL;
  vp = unp->unp_vnode;
  unp->unp_vnode = NULL;
+ KERNEL_LOCK();
  vrele(vp);
+ KERNEL_UNLOCK();
  }
  if (unp->unp_conn)
  unp_disconnect(unp);
@@ -458,10 +472,11 @@ unp_bind(struct unpcb *unp, struct mbuf
  NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
     soun->sun_path, p);
  nd.ni_pledge = PLEDGE_UNIX;
+ KERNEL_LOCK();
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
  if ((error = namei(&nd)) != 0) {
  m_freem(nam2);
- return (error);
+ goto unlock;
  }
  vp = nd.ni_vp;
  if (vp != NULL) {
@@ -472,7 +487,8 @@ unp_bind(struct unpcb *unp, struct mbuf
  vput(nd.ni_dvp);
  vrele(vp);
  m_freem(nam2);
- return (EADDRINUSE);
+ error = EADDRINUSE;
+ goto unlock;
  }
  VATTR_NULL(&vattr);
  vattr.va_type = VSOCK;
@@ -481,7 +497,7 @@ unp_bind(struct unpcb *unp, struct mbuf
  vput(nd.ni_dvp);
  if (error) {
  m_freem(nam2);
- return (error);
+ goto unlock;
  }
  unp->unp_addr = nam2;
  vp = nd.ni_vp;
@@ -492,7 +508,10 @@ unp_bind(struct unpcb *unp, struct mbuf
  unp->unp_connid.pid = p->p_p->ps_pid;
  unp->unp_flags |= UNP_FEIDSBIND;
  VOP_UNLOCK(vp);
- return (0);
+unlock:
+ KERNEL_UNLOCK();
+
+ return error;
 }
 
 int
@@ -510,8 +529,9 @@ unp_connect(struct socket *so, struct mb
 
  NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
  nd.ni_pledge = PLEDGE_UNIX;
+ KERNEL_LOCK();
  if ((error = namei(&nd)) != 0)
- return (error);
+ goto unlock;
  vp = nd.ni_vp;
  if (vp->v_type != VSOCK) {
  error = ENOTSOCK;
@@ -553,6 +573,9 @@ unp_connect(struct socket *so, struct mb
  error = unp_connect2(so, so2);
 bad:
  vput(vp);
+unlock:
+ KERNEL_UNLOCK();
+
  return (error);
 }
 
@@ -562,6 +585,8 @@ unp_connect2(struct socket *so, struct s
  struct unpcb *unp = sotounpcb(so);
  struct unpcb *unp2;
 
+ rw_assert_wrlock(&unp_lock);
+
  if (so2->so_type != so->so_type)
  return (EPROTOTYPE);
  unp2 = sotounpcb(so2);
@@ -635,15 +660,16 @@ unp_drop(struct unpcb *unp, int errno)
 {
  struct socket *so = unp->unp_socket;
 
- KERNEL_ASSERT_LOCKED();
+ rw_assert_wrlock(&unp_lock);
 
  so->so_error = errno;
  unp_disconnect(unp);
  if (so->so_head) {
  so->so_pcb = NULL;
  /*
- * As long as the KERNEL_LOCK() is the default lock for Unix
- * sockets, do not release it.
+ * As long as `unp_lock' is taken before entering
+ * uipc_usrreq() releasing it here would lead to a
+ * double unlock.
  */
  sofree(so, SL_NOUNLOCK);
  m_freem(unp->unp_addr);
@@ -685,6 +711,8 @@ unp_externalize(struct mbuf *rights, soc
  struct file *fp;
  int nfds, error = 0;
 
+ rw_assert_wrlock(&unp_lock);
+
  /*
  * This code only works because SCM_RIGHTS is the only supported
  * control message type on unix sockets. Enforce this here.
@@ -705,6 +733,10 @@ unp_externalize(struct mbuf *rights, soc
 
  /* Make sure the recipient should be able to see the descriptors.. */
  rp = (struct fdpass *)CMSG_DATA(cm);
+
+ /* fdp->fd_rdir requires KERNEL_LOCK() */
+ KERNEL_LOCK();
+
  for (i = 0; i < nfds; i++) {
  fp = rp->fp;
  rp++;
@@ -728,6 +760,8 @@ unp_externalize(struct mbuf *rights, soc
  }
  }
 
+ KERNEL_UNLOCK();
+
  fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
 
 restart:
@@ -825,6 +859,8 @@ unp_internalize(struct mbuf *control, st
  int i, error;
  int nfds, *ip, fd, neededspace;
 
+ rw_assert_wrlock(&unp_lock);
+
  /*
  * Check for two potential msg_controllen values because
  * IETF stuck their nose in a place it does not belong.
@@ -923,7 +959,8 @@ fail:
  return (error);
 }
 
-int unp_defer, unp_gcing;
+int unp_defer; /* [U] number of deferred fp to close by the GC task */
+int unp_gcing; /* [U] GC task currently running */
 
 void
 unp_gc(void *arg __unused)
@@ -934,8 +971,10 @@ unp_gc(void *arg __unused)
  struct unpcb *unp;
  int nunref, i;
 
+ rw_enter_write(&unp_lock);
+
  if (unp_gcing)
- return;
+ goto unlock;
  unp_gcing = 1;
 
  /* close any fds on the deferred list */
@@ -950,7 +989,9 @@ unp_gc(void *arg __unused)
  if ((unp = fptounp(fp)) != NULL)
  unp->unp_msgcount--;
  unp_rights--;
+ rw_exit_write(&unp_lock);
  (void) closef(fp, NULL);
+ rw_enter_write(&unp_lock);
  }
  free(defer, M_TEMP, sizeof(*defer) +
     sizeof(struct fdpass) * defer->ud_n);
@@ -1021,6 +1062,8 @@ unp_gc(void *arg __unused)
  }
  }
  unp_gcing = 0;
+unlock:
+ rw_exit_write(&unp_lock);
 }
 
 void
@@ -1066,6 +1109,8 @@ unp_mark(struct fdpass *rp, int nfds)
  struct unpcb *unp;
  int i;
 
+ rw_assert_wrlock(&unp_lock);
+
  for (i = 0; i < nfds; i++) {
  if (rp[i].fp == NULL)
  continue;
@@ -1087,6 +1132,8 @@ void
 unp_discard(struct fdpass *rp, int nfds)
 {
  struct unp_deferral *defer;
+
+ rw_assert_wrlock(&unp_lock);
 
  /* copy the file pointers to a deferral structure */
  defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.17
diff -u -p -r1.17 unpcb.h
--- sys/sys/unpcb.h 15 Jul 2019 12:28:06 -0000 1.17
+++ sys/sys/unpcb.h 1 May 2020 13:47:12 -0000
@@ -56,22 +56,27 @@
  * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt
  * so that changes in the sockbuf may be computed to modify
  * back pressure on the sender accordingly.
+ *
+ * Locks used to protect struct members:
+ *      I       immutable after creation
+ *      U       unp_lock
  */
 
+
 struct unpcb {
- struct socket *unp_socket; /* pointer back to socket */
- struct vnode *unp_vnode; /* if associated with file */
- struct file *unp_file; /* backpointer for unp_gc() */
- struct unpcb *unp_conn; /* control block of connected socket */
- ino_t unp_ino; /* fake inode number */
- SLIST_HEAD(,unpcb) unp_refs; /* referencing socket linked list */
- SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */
- struct mbuf *unp_addr; /* bound address of socket */
- long unp_msgcount; /* references from socket rcv buf */
- int unp_flags; /* this unpcb contains peer eids */
- struct sockpeercred unp_connid;/* id of peer process */
- struct timespec unp_ctime; /* holds creation time */
- LIST_ENTRY(unpcb) unp_link; /* link in per-AF list of sockets */
+ struct socket *unp_socket; /* [I] pointer back to socket */
+ struct vnode *unp_vnode; /* [U] if associated with file */
+ struct file *unp_file; /* [U] backpointer for unp_gc() */
+ struct unpcb *unp_conn; /* [U] control block of connected socket */
+ ino_t unp_ino; /* [U] fake inode number */
+ SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */
+ SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
+ struct mbuf *unp_addr; /* [U] bound address of socket */
+ long unp_msgcount; /* [U] references from socket rcv buf */
+ int unp_flags; /* [U] this unpcb contains peer eids */
+ struct sockpeercred unp_connid;/* [U] id of peer process */
+ struct timespec unp_ctime; /* [I] holds creation time */
+ LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */
 };
 
 /*

Reply | Threaded
Open this post in threaded view
|

Re: unlock PF_UNIX sockets

Martin Pieuchot
On 28/05/20(Thu) 14:59, Vitaliy Makkoveev wrote:

> socket(2) layer is already protected by solock(). It grabs NET_LOCK()
> for inet{,6}(4) sockets, but all other sockets are still under
> KERNEL_LOCK().
>
> I guess solock is already placed everythere it's required. Also `struct
> file' is already mp-safe.
>
> Diff below introduces rwlock `unp_lock'. It's like NET_LOCK()'s
> `netlock' but for PF_UNIX sockets. This lock protects the whole PF_UNIX
> layer. Vnode layer accessed in "connect" "bind" and "desconnect" cases
> is still under KERNEL_LOCK().
>
> I'am experimenting with his diff since 19.04.2020 and my test systems,
> include Gnome workstaion are stable, without any issue.

Looks great, more tests are required :)

Your diff has many trailing spaces, could you remove them?  Commonly
used editors or diff programs have a way to highlight them :)

One comment:

> Index: sys/kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.142
> diff -u -p -r1.142 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142
> +++ sys/kern/uipc_usrreq.c 1 May 2020 13:47:11 -0000
> @@ -409,12 +418,17 @@ unp_detach(struct unpcb *unp)
>  {
>   struct vnode *vp;
>  
> + rw_assert_wrlock(&unp_lock);
> +
>   LIST_REMOVE(unp, unp_link);
>   if (unp->unp_vnode) {
> + /* this is an unlocked cleaning of `v_socket', but it's safe */
>   unp->unp_vnode->v_socket = NULL;
>   vp = unp->unp_vnode;
>   unp->unp_vnode = NULL;
> + KERNEL_LOCK();
>   vrele(vp);
> + KERNEL_UNLOCK();

Why is it safe?  That's what the comment should explain :)  If the code
doesn't take the lock that implies it is not required.  Why it is not
required is not obvious.

Reply | Threaded
Open this post in threaded view
|

Re: unlock PF_UNIX sockets

Vitaliy Makkoveev
On Fri, May 29, 2020 at 08:48:14AM +0200, Martin Pieuchot wrote:

> On 28/05/20(Thu) 14:59, Vitaliy Makkoveev wrote:
> > socket(2) layer is already protected by solock(). It grabs NET_LOCK()
> > for inet{,6}(4) sockets, but all other sockets are still under
> > KERNEL_LOCK().
> >
> > I guess solock is already placed everythere it's required. Also `struct
> > file' is already mp-safe.
> >
> > Diff below introduces rwlock `unp_lock'. It's like NET_LOCK()'s
> > `netlock' but for PF_UNIX sockets. This lock protects the whole PF_UNIX
> > layer. Vnode layer accessed in "connect" "bind" and "desconnect" cases
> > is still under KERNEL_LOCK().
> >
> > I'am experimenting with his diff since 19.04.2020 and my test systems,
> > include Gnome workstaion are stable, without any issue.
>
> Looks great, more tests are required :)
>
> Your diff has many trailing spaces, could you remove them?  Commonly
> used editors or diff programs have a way to highlight them :)

Ok I will fix it.

>
> One comment:
>
> > Index: sys/kern/uipc_usrreq.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.142
> > diff -u -p -r1.142 uipc_usrreq.c
> > --- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142
> > +++ sys/kern/uipc_usrreq.c 1 May 2020 13:47:11 -0000
> > @@ -409,12 +418,17 @@ unp_detach(struct unpcb *unp)
> >  {
> >   struct vnode *vp;
> >  
> > + rw_assert_wrlock(&unp_lock);
> > +
> >   LIST_REMOVE(unp, unp_link);
> >   if (unp->unp_vnode) {
> > + /* this is an unlocked cleaning of `v_socket', but it's safe */
> >   unp->unp_vnode->v_socket = NULL;
> >   vp = unp->unp_vnode;
> >   unp->unp_vnode = NULL;
> > + KERNEL_LOCK();
> >   vrele(vp);
> > + KERNEL_UNLOCK();
>
> Why is it safe?  That's what the comment should explain :)  If the code
> doesn't take the lock that implies it is not required.  Why it is not
> required is not obvious.
>

The only place where `v_socket' is accessed outside PF_UNIX layer is
kern/kern_sysctl.c:1143. This is just pointer to integer cast. Pointer
assignment is atomic.

Also we are accessing this `unp' in unp_detach(), unp_bind() and
unp_connect() but they are serialized by `unp_lock'.

This is the last reference to this `unp' and also there is the only
reference in socket layer to `unp_vnode'. Since `unpcb' has no reference
counter we can't add assertion to be sure this is the last reference to
`unp'. We can check reference count of vnode. But since sys_open() and
unp_detach() are not serialized by KERNEL_LOCK(), vn_open() called by
doopenat() will raise `v_usecount' before `v_type' check
(kern/vfs_vnops.c:112). And `vn->v_usecount' in this unp_detach() call
can be "2".

In fact, only vrele(9) should be called under KERNEL_LOCK() to protect
vnode. If we want to put "KASSERT(vn->v_usecount==1)" we should move it
under KERNEL_LOCK() before `v_socket' cleaning to be sure there is no
concurency with sys_open().

Something like this:

        if (unp->unp_vnode) {
                KERNEL_LOCK();
                KASSERT(unp->unp_vnode->v_usecount == 1);
                unp->unp_vnode->v_socket = NULL;
                vp = unp->unp_vnode;
                unp->unp_vnode = NULL;
                vrele(vp);
                KERNEL_UNLOCK();
        }

I will update diff if this is required.