mbuf cluster limit pool wakeup

classic Classic list List threaded Threaded
6 messages Options
Reply | Threaded
Open this post in threaded view
|

mbuf cluster limit pool wakeup

Alexander Bluhm
Hi,

When the kernel reaches the sysctl kern.maxclusters limit, operations
get stuck while holding the net lock.  Increasing the limit does
not help as there is no wakeup of the pools.  So run through the
mbuf pool request list when the limit changes.

There seem to be more problems when recovering from mbuf shortage, but
this is the most obvious one.

ok?

bluhm

Index: kern/subr_pool.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_pool.c,v
retrieving revision 1.227
diff -u -p -r1.227 subr_pool.c
--- kern/subr_pool.c 23 Apr 2019 13:35:12 -0000 1.227
+++ kern/subr_pool.c 16 Jul 2019 18:02:08 -0000
@@ -815,6 +815,12 @@ pool_put(struct pool *pp, void *v)
  if (freeph != NULL)
  pool_p_free(pp, freeph);

+ pool_wakeup(pp);
+}
+
+void
+pool_wakeup(struct pool *pp)
+{
  if (!TAILQ_EMPTY(&pp->pr_requests)) {
  pl_enter(pp, &pp->pr_requests_lock);
  pool_runqueue(pp, PR_NOWAIT);
Index: kern/uipc_mbuf.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.270
diff -u -p -r1.270 uipc_mbuf.c
--- kern/uipc_mbuf.c 16 Jul 2019 17:39:02 -0000 1.270
+++ kern/uipc_mbuf.c 16 Jul 2019 18:04:33 -0000
@@ -167,8 +167,6 @@ mbinit(void)

  m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;

- error = nmbclust_update(nmbclust);
- KASSERT(error == 0);
  mbuf_mem_alloc = 0;

 #if DIAGNOSTIC
@@ -196,6 +194,9 @@ mbinit(void)
  m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
  }

+ error = nmbclust_update(nmbclust);
+ KASSERT(error == 0);
+
  (void)mextfree_register(m_extfree_pool);
  KASSERT(num_extfree_fns == 1);
 }
@@ -217,11 +218,18 @@ mbcpuinit()
 int
 nmbclust_update(long newval)
 {
+ int i;
+
  if (newval < 0 || newval > LONG_MAX / MCLBYTES)
  return ERANGE;
  /* update the global mbuf memory limit */
  nmbclust = newval;
  mbuf_mem_limit = nmbclust * MCLBYTES;
+
+ pool_wakeup(&mbpool);
+ for (i = 0; i < nitems(mclsizes); i++)
+ pool_wakeup(&mclpools[i]);
+
  return 0;
 }

Index: sys/pool.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/pool.h,v
retrieving revision 1.76
diff -u -p -r1.76 pool.h
--- sys/pool.h 10 Feb 2019 22:45:58 -0000 1.76
+++ sys/pool.h 16 Jul 2019 18:02:08 -0000
@@ -271,6 +271,7 @@ void pool_request_init(struct pool_requ
     void (*)(struct pool *, void *, void *), void *);
 void pool_request(struct pool *, struct pool_request *);
 void pool_put(struct pool *, void *);
+void pool_wakeup(struct pool *);
 int pool_reclaim(struct pool *);
 void pool_reclaim_all(void);
 int pool_prime(struct pool *, int);

Reply | Threaded
Open this post in threaded view
|

Re: mbuf cluster limit pool wakeup

Martin Pieuchot
On 16/07/19(Tue) 21:35, Alexander Bluhm wrote:
> Hi,
>
> When the kernel reaches the sysclt kern.maxclusters limit, operations
> get stuck while holding the net lock.  Increasing the limit does
> not help as there is no wakeup of the pools.  So run through the
> mbuf pool request list when the limit changes.

Should you call pool_wakeup() only if the new limit is greater than the
current one?

> There seem to more problems when recovering from mbuf shortage, but
> this is the most obvious one.

By more problems do you mean "How to give mbuf back to the pool"?

> ok?
>
> bluhm
>
> Index: kern/subr_pool.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_pool.c,v
> retrieving revision 1.227
> diff -u -p -r1.227 subr_pool.c
> --- kern/subr_pool.c 23 Apr 2019 13:35:12 -0000 1.227
> +++ kern/subr_pool.c 16 Jul 2019 18:02:08 -0000
> @@ -815,6 +815,12 @@ pool_put(struct pool *pp, void *v)
>   if (freeph != NULL)
>   pool_p_free(pp, freeph);
>
> + pool_wakeup(pp);
> +}
> +
> +void
> +pool_wakeup(struct pool *pp)
> +{
>   if (!TAILQ_EMPTY(&pp->pr_requests)) {
>   pl_enter(pp, &pp->pr_requests_lock);
>   pool_runqueue(pp, PR_NOWAIT);
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
> retrieving revision 1.270
> diff -u -p -r1.270 uipc_mbuf.c
> --- kern/uipc_mbuf.c 16 Jul 2019 17:39:02 -0000 1.270
> +++ kern/uipc_mbuf.c 16 Jul 2019 18:04:33 -0000
> @@ -167,8 +167,6 @@ mbinit(void)
>
>   m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
>
> - error = nmbclust_update(nmbclust);
> - KASSERT(error == 0);
>   mbuf_mem_alloc = 0;
>
>  #if DIAGNOSTIC
> @@ -196,6 +194,9 @@ mbinit(void)
>   m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
>   }
>
> + error = nmbclust_update(nmbclust);
> + KASSERT(error == 0);
> +
>   (void)mextfree_register(m_extfree_pool);
>   KASSERT(num_extfree_fns == 1);
>  }
> @@ -217,11 +218,18 @@ mbcpuinit()
>  int
>  nmbclust_update(long newval)
>  {
> + int i;
> +
>   if (newval < 0 || newval > LONG_MAX / MCLBYTES)
>   return ERANGE;
>   /* update the global mbuf memory limit */
>   nmbclust = newval;
>   mbuf_mem_limit = nmbclust * MCLBYTES;
> +
> + pool_wakeup(&mbpool);
> + for (i = 0; i < nitems(mclsizes); i++)
> + pool_wakeup(&mclpools[i]);
> +
>   return 0;
>  }
>
> Index: sys/pool.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/pool.h,v
> retrieving revision 1.76
> diff -u -p -r1.76 pool.h
> --- sys/pool.h 10 Feb 2019 22:45:58 -0000 1.76
> +++ sys/pool.h 16 Jul 2019 18:02:08 -0000
> @@ -271,6 +271,7 @@ void pool_request_init(struct pool_requ
>      void (*)(struct pool *, void *, void *), void *);
>  void pool_request(struct pool *, struct pool_request *);
>  void pool_put(struct pool *, void *);
> +void pool_wakeup(struct pool *);
>  int pool_reclaim(struct pool *);
>  void pool_reclaim_all(void);
>  int pool_prime(struct pool *, int);
>

Reply | Threaded
Open this post in threaded view
|

Re: mbuf cluster limit pool wakeup

Alexander Bluhm
On Tue, Jul 16, 2019 at 08:58:43PM -0300, Martin Pieuchot wrote:

> On 16/07/19(Tue) 21:35, Alexander Bluhm wrote:
> > Hi,
> >
> > When the kernel reaches the sysclt kern.maxclusters limit, operations
> > get stuck while holding the net lock.  Increasing the limit does
> > not help as there is no wakeup of the pools.  So run through the
> > mbuf pool request list when the limit changes.
>
> Should you call pool_wakeup() only if the new limit is greater than the
> current one?

Additional wakeups do not hurt, especially from the cold sysctl path.
Note that sysctl KERN_MAXCLUSTERS has a val != nmbclust check
already.  So the value has to change.  I think it is not worth to
add more code to nmbclust_update.

> > There seem to more problems when recovering from mbuf shortage, but
> > this is the most obvious one.
>
> By more problems to you mean "How to give mbuf back to the pool"?

Sorry for being so vague.

dlg@ has some ideas about what could cause starvation in pools.
But I see problems in socket, protocol and interface layer.

The listen queue is full, inetd blocks reading from another socket.
I guess poll/select information was not correctly reported to
userland so it got stuck.

I see half closed TCP connections hanging on loopback forever:

tcp          0      0  127.0.0.1.9            127.0.0.1.18490        ESTABLISHED
tcp          0      0  127.0.0.1.18490        127.0.0.1.9            FIN_WAIT_1

And I have to do ifconfig vio0 down and ifconfig vio0 up to receive
packets on the interface again.

These things have to be investigated and fixed after this diff has
been committed.  In general we behave poorly when hitting resource
limits.  I think nobody is testing this regularly.

bluhm

> > Index: kern/subr_pool.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_pool.c,v
> > retrieving revision 1.227
> > diff -u -p -r1.227 subr_pool.c
> > --- kern/subr_pool.c 23 Apr 2019 13:35:12 -0000 1.227
> > +++ kern/subr_pool.c 16 Jul 2019 18:02:08 -0000
> > @@ -815,6 +815,12 @@ pool_put(struct pool *pp, void *v)
> >   if (freeph != NULL)
> >   pool_p_free(pp, freeph);
> >
> > + pool_wakeup(pp);
> > +}
> > +
> > +void
> > +pool_wakeup(struct pool *pp)
> > +{
> >   if (!TAILQ_EMPTY(&pp->pr_requests)) {
> >   pl_enter(pp, &pp->pr_requests_lock);
> >   pool_runqueue(pp, PR_NOWAIT);
> > Index: kern/uipc_mbuf.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
> > retrieving revision 1.270
> > diff -u -p -r1.270 uipc_mbuf.c
> > --- kern/uipc_mbuf.c 16 Jul 2019 17:39:02 -0000 1.270
> > +++ kern/uipc_mbuf.c 16 Jul 2019 18:04:33 -0000
> > @@ -167,8 +167,6 @@ mbinit(void)
> >
> >   m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
> >
> > - error = nmbclust_update(nmbclust);
> > - KASSERT(error == 0);
> >   mbuf_mem_alloc = 0;
> >
> >  #if DIAGNOSTIC
> > @@ -196,6 +194,9 @@ mbinit(void)
> >   m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
> >   }
> >
> > + error = nmbclust_update(nmbclust);
> > + KASSERT(error == 0);
> > +
> >   (void)mextfree_register(m_extfree_pool);
> >   KASSERT(num_extfree_fns == 1);
> >  }
> > @@ -217,11 +218,18 @@ mbcpuinit()
> >  int
> >  nmbclust_update(long newval)
> >  {
> > + int i;
> > +
> >   if (newval < 0 || newval > LONG_MAX / MCLBYTES)
> >   return ERANGE;
> >   /* update the global mbuf memory limit */
> >   nmbclust = newval;
> >   mbuf_mem_limit = nmbclust * MCLBYTES;
> > +
> > + pool_wakeup(&mbpool);
> > + for (i = 0; i < nitems(mclsizes); i++)
> > + pool_wakeup(&mclpools[i]);
> > +
> >   return 0;
> >  }
> >
> > Index: sys/pool.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/pool.h,v
> > retrieving revision 1.76
> > diff -u -p -r1.76 pool.h
> > --- sys/pool.h 10 Feb 2019 22:45:58 -0000 1.76
> > +++ sys/pool.h 16 Jul 2019 18:02:08 -0000
> > @@ -271,6 +271,7 @@ void pool_request_init(struct pool_requ
> >      void (*)(struct pool *, void *, void *), void *);
> >  void pool_request(struct pool *, struct pool_request *);
> >  void pool_put(struct pool *, void *);
> > +void pool_wakeup(struct pool *);
> >  int pool_reclaim(struct pool *);
> >  void pool_reclaim_all(void);
> >  int pool_prime(struct pool *, int);
> >

Reply | Threaded
Open this post in threaded view
|

Re: mbuf cluster limit pool wakeup

David Gwynne-5


> On 18 Jul 2019, at 7:44 am, Alexander Bluhm <[hidden email]> wrote:
>
> On Tue, Jul 16, 2019 at 08:58:43PM -0300, Martin Pieuchot wrote:
>> On 16/07/19(Tue) 21:35, Alexander Bluhm wrote:
>>> Hi,
>>>
>>> When the kernel reaches the sysclt kern.maxclusters limit, operations
>>> get stuck while holding the net lock.  Increasing the limit does
>>> not help as there is no wakeup of the pools.  So run through the
>>> mbuf pool request list when the limit changes.
>>
>> Should you call pool_wakeup() only if the new limit is greater than the
>> current one?
>
> Additional wakups do not hurt, especially from the cold sysctl path.
> Note that sysctl KERN_MAXCLUSTERS has a val != nmbclust check
> already.  So the value has to change.  I think it is not worth to
> add more code to nmbclust_update.
>
>>> There seem to more problems when recovering from mbuf shortage, but
>>> this is the most obvious one.
>>
>> By more problems to you mean "How to give mbuf back to the pool"?
>
> Sorry for being so vague.
>
> dlg@ has some ideas about what could cause starvation in pools.
> But I see problems in socket, protocol and interface layer.
>
> The listen queue is full, inetd blocks reading from another socket.
> I guess poll/select information was not correctly reported to
> userland so it got stuck.
>
> I see half closed TCP connections hanging on loopback forever:
>
> tcp          0      0  127.0.0.1.9            127.0.0.1.18490        ESTABLISHED
> tcp          0      0  127.0.0.1.18490        127.0.0.1.9            FIN_WAIT_1
>
> And I have to do ifconfig vio0 down and ifconfig vio0 up to receive
> packets on the interface again.

Can you look at systat mb when this happens to check what value vio0 has in the ALIVE column? If ALIVE is 0, it means the ring is empty. It looks like the driver does try to cope with this situation, but maybe it doesn't cope well enough. Let's start with checking if it is an empty ring first.

dlg

>
> These things have to be investigated and fixed after this diff has
> been commited.  In general we behave poorly when hitting resource
> limits.  I think nobody is testing this regulary.
>
> bluhm
>
>>> Index: kern/subr_pool.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_pool.c,v
>>> retrieving revision 1.227
>>> diff -u -p -r1.227 subr_pool.c
>>> --- kern/subr_pool.c 23 Apr 2019 13:35:12 -0000 1.227
>>> +++ kern/subr_pool.c 16 Jul 2019 18:02:08 -0000
>>> @@ -815,6 +815,12 @@ pool_put(struct pool *pp, void *v)
>>> if (freeph != NULL)
>>> pool_p_free(pp, freeph);
>>>
>>> + pool_wakeup(pp);
>>> +}
>>> +
>>> +void
>>> +pool_wakeup(struct pool *pp)
>>> +{
>>> if (!TAILQ_EMPTY(&pp->pr_requests)) {
>>> pl_enter(pp, &pp->pr_requests_lock);
>>> pool_runqueue(pp, PR_NOWAIT);
>>> Index: kern/uipc_mbuf.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
>>> retrieving revision 1.270
>>> diff -u -p -r1.270 uipc_mbuf.c
>>> --- kern/uipc_mbuf.c 16 Jul 2019 17:39:02 -0000 1.270
>>> +++ kern/uipc_mbuf.c 16 Jul 2019 18:04:33 -0000
>>> @@ -167,8 +167,6 @@ mbinit(void)
>>>
>>> m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
>>>
>>> - error = nmbclust_update(nmbclust);
>>> - KASSERT(error == 0);
>>> mbuf_mem_alloc = 0;
>>>
>>> #if DIAGNOSTIC
>>> @@ -196,6 +194,9 @@ mbinit(void)
>>> m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
>>> }
>>>
>>> + error = nmbclust_update(nmbclust);
>>> + KASSERT(error == 0);
>>> +
>>> (void)mextfree_register(m_extfree_pool);
>>> KASSERT(num_extfree_fns == 1);
>>> }
>>> @@ -217,11 +218,18 @@ mbcpuinit()
>>> int
>>> nmbclust_update(long newval)
>>> {
>>> + int i;
>>> +
>>> if (newval < 0 || newval > LONG_MAX / MCLBYTES)
>>> return ERANGE;
>>> /* update the global mbuf memory limit */
>>> nmbclust = newval;
>>> mbuf_mem_limit = nmbclust * MCLBYTES;
>>> +
>>> + pool_wakeup(&mbpool);
>>> + for (i = 0; i < nitems(mclsizes); i++)
>>> + pool_wakeup(&mclpools[i]);
>>> +
>>> return 0;
>>> }
>>>
>>> Index: sys/pool.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/pool.h,v
>>> retrieving revision 1.76
>>> diff -u -p -r1.76 pool.h
>>> --- sys/pool.h 10 Feb 2019 22:45:58 -0000 1.76
>>> +++ sys/pool.h 16 Jul 2019 18:02:08 -0000
>>> @@ -271,6 +271,7 @@ void pool_request_init(struct pool_requ
>>>    void (*)(struct pool *, void *, void *), void *);
>>> void pool_request(struct pool *, struct pool_request *);
>>> void pool_put(struct pool *, void *);
>>> +void pool_wakeup(struct pool *);
>>> int pool_reclaim(struct pool *);
>>> void pool_reclaim_all(void);
>>> int pool_prime(struct pool *, int);
>>>
>

Reply | Threaded
Open this post in threaded view
|

vio0 recover from mbuf limit

Alexander Bluhm
On Thu, Jul 18, 2019 at 11:31:26PM +1000, David Gwynne wrote:
> > And I have to do ifconfig vio0 down and ifconfig vio0 up to receive
> > packets on the interface again.
>
> Can you look at systat mb when this happens to check what value
> vio0 has in the ALIVE column? If ALIVE is 0, it means the ring is
> empty. It looks like the driver does try to cope with this situation,
> but maybe it doesn't cope well enough. Let's start with checking
> if it is an empty ring first.

After a bit of testing it looks like the condition for adding the
timeout is never true.  It is a consequence of converting to
if_rxr_get().

The easiest fix is to add the rx tick timeout unconditionally.
Running this code every second is not too much waste.  Other drivers
also do that.

ok?

bluhm

Index: dev/pv/if_vio.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pv/if_vio.c,v
retrieving revision 1.12
diff -u -p -r1.12 if_vio.c
--- dev/pv/if_vio.c 26 May 2019 15:22:31 -0000 1.12
+++ dev/pv/if_vio.c 5 Aug 2019 06:52:04 -0000
@@ -982,10 +982,7 @@ vio_populate_rx_mbufs(struct vio_softc *

  if (done)
  virtio_notify(vsc, vq);
- if (vq->vq_used_idx != vq->vq_avail_idx)
- timeout_del(&sc->sc_rxtick);
- else
- timeout_add_sec(&sc->sc_rxtick, 1);
+ timeout_add_sec(&sc->sc_rxtick, 1);
 }

 /* dequeue received packets */

Reply | Threaded
Open this post in threaded view
|

Re: vio0 recover from mbuf limit

David Gwynne-5


> On 5 Aug 2019, at 22:38, Alexander Bluhm <[hidden email]> wrote:
>
> On Thu, Jul 18, 2019 at 11:31:26PM +1000, David Gwynne wrote:
>>> And I have to do ifconfig vio0 down and ifconfig vio0 up to receive
>>> packets on the interface again.
>>
>> Can you look at systat mb when this happens to check what value
>> vio0 has in the ALIVE column? If ALIVE is 0, it means the ring is
>> empty. It looks like the driver does try to cope with this situation,
>> but maybe it doesn't cope well enough. Let's start with checking
>> if it is an empty ring first.
>
> After a bit of testing it looks that the codition for adding the
> timeout is never true.  It is a consequence of converting to
> if_rxr_get().
>
> The easiest fix is to add the rx tick timeout unconditionaly.
> Runnning this code every second is not too much waste.  Other drivers
> also do that.
>
> ok?

I don't like how big this hammer is, but it is better than the current situation.

It doesn't look like vio uses MPSAFE interrupts, so the timeout can't run concurrently with the rxeof stuff, so this should be fine.

ok.

>
> bluhm
>
> Index: dev/pv/if_vio.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pv/if_vio.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 if_vio.c
> --- dev/pv/if_vio.c 26 May 2019 15:22:31 -0000 1.12
> +++ dev/pv/if_vio.c 5 Aug 2019 06:52:04 -0000
> @@ -982,10 +982,7 @@ vio_populate_rx_mbufs(struct vio_softc *
>
> if (done)
> virtio_notify(vsc, vq);
> - if (vq->vq_used_idx != vq->vq_avail_idx)
> - timeout_del(&sc->sc_rxtick);
> - else
> - timeout_add_sec(&sc->sc_rxtick, 1);
> + timeout_add_sec(&sc->sc_rxtick, 1);
> }
>
> /* dequeue received packets */