6.3 just died (not for the first time)

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
11 messages Options
Reply | Threaded
Open this post in threaded view
|

6.3 just died (not for the first time)

Harald Dunkel-3
Hi folks,

hopefully it's allowed to repost this message here:

One gateway running 6.3 ran into the debugger last night. Last words:

login: kernel: protection fault trap, code=0
Stopped at      export_sa+0x5c: movl    0(%rcx),%ecx
ddb{0}> show panic
the kernel did not panic
ddb{0}> trace
export_sa(10,ffff800033445e70) at export_sa+0x5c
pfkeyv2_expire(ffff8000013d4c00,ffff8000013d4c00) at pfkeyv2_expire+0x14e
tdb_timeout(ffff800033446020) at tdb_timeout+0x39
softclock_thread(0) at softclock_thread+0xc6
end trace frame: 0x0, count: -4
ddb{0}> show registers
rdi               0xffff800033445e98
rsi               0xffff8000013d4c00
rbp               0xffff800033445e70
rbx               0xffff800033445e98
rdx               0xffffffff81abdff0    cpu_info_full_primary+0x1ff0
rcx               0xdeadbeefdeadbeef
rax               0xffff800001387510
r8                             0x120
r9                0xffffffff81aa58d8    netlock
r10               0xffffffffffffffff
r11               0xffff800033445ea0
r12               0xffff800001387500
r13                              0x3
r14               0xffff8000013d4c00
r15                             0x90
rip               0xffffffff8121fefc    export_sa+0x5c
cs                               0x8
rflags                       0x10282    __ALIGN_SIZE+0xf282
rsp               0xffff800033445e70
ss                              0x10
export_sa+0x5c: movl    0(%rcx),%ecx
ddb{0}> ps
    PID     TID   PPID    UID  S       FLAGS  WAIT          COMMAND
  74371   82200      1      0  3        0x82  ttyopn        getty
  64133  371566      1      0  3    0x100083  ttyin         getty
  73177  400616      1      0  3    0x100083  ttyin         getty
   2198  160363      1      0  3    0x100083  ttyin         getty
  66943   62449      1      0  3    0x100083  ttyin         getty
  77195  409193      1      0  3    0x100083  ttyin         getty
  30152   89639      1      0  3    0x100083  ttyin         getty
  54326   20290      1      0  3    0x100098  poll          cron
  81308    6330      1      0  3        0x80  kqread        apmd
  21604  251912  61088     74  3    0x100092  bpf           pflogd
  61088  386173      1      0  3        0x80  netio         pflogd
  38994  395332  22137    623  3        0x90  nanosleep     zabbix_agentd
  92334  256603  22137    623  3        0x90  select        zabbix_agentd
   5776  303931  22137    623  3        0x90  netcon        zabbix_agentd
  71818  109922  22137    623  3        0x90  select        zabbix_agentd
  28432  430198  22137    623  3        0x90  nanosleep     zabbix_agentd
  55014  131036  54187     74  3    0x100092  bpf           pflogd
  54187  404660      1      0  3        0x80  netio         pflogd
  32954  132161  74424     74  3    0x100092  bpf           pflogd
  74424   72323      1      0  3        0x80  netio         pflogd
  22137  193504      1    623  3        0x90  wait          zabbix_agentd
  23016    6037      1      0  3        0x80  poll          openvpn
  27849  148250      1      0  3        0x80  poll          openvpn
  78572  192037      1      0  3        0x80  poll          openvpn
  83974  209100      1      0  3        0x80  poll          openvpn
   1297  379204      1     99  3    0x100090  poll          sndiod
  72635   52767      1    110  3    0x100090  poll          sndiod
  59204  423537      1     62  3    0x100090  bpf           spamlogd
  51694  290283  46867     62  3    0x100090  piperd        spamd
  76899  369277  46867     62  3    0x100090  poll          spamd
  46867   52758      1     62  3    0x100090  nanosleep     spamd
  51631   64028      1    109  3        0x90  kqread        ftp-proxy
  74489  238300  13002     95  3    0x100092  kqread        smtpd
  69227  383337  13002    103  3    0x100092  kqread        smtpd
  95912  255952  13002     95  3    0x100092  kqread        smtpd
  52092  398675  13002     95  3    0x100092  kqread        smtpd
  15268  264170  13002     95  3    0x100092  kqread        smtpd
  23823   51587  13002     95  3    0x100092  kqread        smtpd
  13002  289905      1      0  3    0x100080  kqread        smtpd
  39875  399764      1      0  3        0x80  select        sshd
  84492   73143  16575     68  7        0x90                sasyncd
  16575  267935      1      0  3        0x80  select        sasyncd
   5600  244082  24905     68  7        0x10                isakmpd
  24905  484997      1      0  3        0x80  netio         isakmpd
  15412  155977      1      0  3    0x100080  poll          ntpd
  71665   62722  35888     83  3    0x100092  poll          ntpd
  35888  382324      1     83  3    0x100092  poll          ntpd
  79699  454922  42559     74  3    0x100092  bpf           pflogd
  42559  472293      1      0  3        0x80  netio         pflogd
  90864  469513  67456     73  3    0x100090  kqread        syslogd
  67456  146341      1      0  3    0x100082  netio         syslogd
  54377  194590  79772    115  3    0x100092  kqread        slaacd
  81742  432607  79772    115  3    0x100092  kqread        slaacd
  79772  398085      1      0  3        0x80  kqread        slaacd
  84892   75254      0      0  3     0x14200  pgzero        zerothread
  63373  100614      0      0  3     0x14200  aiodoned      aiodoned
  72872  341972      0      0  3     0x14200  syncer        update
  38598  248818      0      0  3     0x14200  cleaner       cleaner
  10377  444516      0      0  3     0x14200  reaper        reaper
  45526  458979      0      0  3     0x14200  pgdaemon      pagedaemon
  49336  347074      0      0  3     0x14200  bored         crynlk
  66086   58296      0      0  3     0x14200  bored         crypto
  24070  258491      0      0  3     0x14200  usbtsk        usbtask
  55027  468872      0      0  3     0x14200  usbatsk       usbatsk
  92887  460485      0      0  3     0x14200  bored         i915-hangcheck
  41493  155682      0      0  3     0x14200  bored         i915-dp
  66272   77311      0      0  3     0x14200  bored         i915
  71823  256416      0      0  3  0x40014200  acpi0         acpi0
  20125  439415      0      0  7  0x40014200                idle3
  42176  326787      0      0  3  0x40014200                idle2
  21681  480396      0      0  3  0x40014200                idle1
  39767  314589      0      0  3     0x14200  bored         sensors
  58848  324478      0      0  3     0x14200  netlock       softnet
  87175   58746      0      0  3     0x14200  bored         systqmp
  67735  194118      0      0  3     0x14200  bored         systq
*79929  171386      0      0  7  0x40014200                softclock
  81339  345588      0      0  3  0x40014200                idle0
      1  339913      0      0  3        0x82  wait          init
      0       0     -1      0  3     0x10200  scheduler     swapper
ddb{0}>

I had installed the recent ipsec and libcrypto syspatches from a few days
ago.

gate5a# syspatch -l
001_perl
002_libtls
003_arp
004_gif
005_httpd
006_ipseclen
007_libcrypto

Is there anything more I could try? Are there suggestions about
how to get more information from the debugger session, if this happens
again?

It just happened again, while writing this email! Again it's export_sa+0x5c.

Every helpful comment is highly appreciated.

Harri

dmesg.log (18K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Martin Pieuchot
On 16/05/18(Wed) 08:06, Harald Dunkel wrote:
> Hi folks,

Thanks for the report.
 

> hopefully its allowed to repost this message here:
>
> One gateway running 6.3 ran into the debugger last night. Last words:
>
> login: kernel: protection fault trap, code=0
> Stopped at      export_sa+0x5c: movl    0(%rcx),%ecx
> ddb{0}> show panic
> the kernel did not panic
> ddb{0}> trace
> export_sa(10,ffff800033445e70) at export_sa+0x5c
> pfkeyv2_expire(ffff8000013d4c00,ffff8000013d4c00) at pfkeyv2_expire+0x14e
> tdb_timeout(ffff800033446020) at tdb_timeout+0x39
> softclock_thread(0) at softclock_thread+0xc6
> end trace frame: 0x0, count: -4
> ddb{0}> show registers
> rdi               0xffff800033445e98
> rsi               0xffff8000013d4c00
> rbp               0xffff800033445e70
> rbx               0xffff800033445e98
> rdx               0xffffffff81abdff0    cpu_info_full_primary+0x1ff0
> rcx               0xdeadbeefdeadbeef
                    ^^^^^^^^^^^^^^^^^^
That means that the TDB has already been freed.  This is possible
because the timeout sleeps on the NET_LOCK().  Diff below should prevent
that by introducing a tdb_reaper() function like we do in other parts of
the stack.

Index: netinet/ip_ipsp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.229
diff -u -p -r1.229 ip_ipsp.c
--- netinet/ip_ipsp.c 6 Nov 2017 15:12:43 -0000 1.229
+++ netinet/ip_ipsp.c 16 May 2018 08:17:59 -0000
@@ -79,10 +79,11 @@ void tdb_hashstats(void);
 #endif
 
 void tdb_rehash(void);
-void tdb_timeout(void *v);
-void tdb_firstuse(void *v);
-void tdb_soft_timeout(void *v);
-void tdb_soft_firstuse(void *v);
+void tdb_reaper(void *);
+void tdb_timeout(void *);
+void tdb_firstuse(void *);
+void tdb_soft_timeout(void *);
+void tdb_soft_firstuse(void *);
 int tdb_hash(u_int, u_int32_t, union sockaddr_union *, u_int8_t);
 
 int ipsec_in_use = 0;
@@ -541,14 +542,13 @@ tdb_timeout(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_TIMER))
- return;
-
  NET_LOCK();
- /* If it's an "invalid" TDB do a silent expiration. */
- if (!(tdb->tdb_flags & TDBF_INVALID))
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
- tdb_delete(tdb);
+ if (tdb->tdb_flags & TDBF_TIMER) {
+ /* If it's an "invalid" TDB do a silent expiration. */
+ if (!(tdb->tdb_flags & TDBF_INVALID))
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
+ tdb_delete(tdb);
+ }
  NET_UNLOCK();
 }
 
@@ -557,14 +557,13 @@ tdb_firstuse(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_SOFT_FIRSTUSE))
- return;
-
  NET_LOCK();
- /* If the TDB hasn't been used, don't renew it. */
- if (tdb->tdb_first_use != 0)
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
- tdb_delete(tdb);
+ if (tdb->tdb_flags & TDBF_SOFT_FIRSTUSE) {
+ /* If the TDB hasn't been used, don't renew it. */
+ if (tdb->tdb_first_use != 0)
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
+ tdb_delete(tdb);
+ }
  NET_UNLOCK();
 }
 
@@ -573,13 +572,12 @@ tdb_soft_timeout(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_SOFT_TIMER))
- return;
-
  NET_LOCK();
- /* Soft expirations. */
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
- tdb->tdb_flags &= ~TDBF_SOFT_TIMER;
+ if (tdb->tdb_flags & TDBF_SOFT_TIMER) {
+ /* Soft expirations. */
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
+ tdb->tdb_flags &= ~TDBF_SOFT_TIMER;
+ }
  NET_UNLOCK();
 }
 
@@ -588,14 +586,13 @@ tdb_soft_firstuse(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_SOFT_FIRSTUSE))
- return;
-
  NET_LOCK();
- /* If the TDB hasn't been used, don't renew it. */
- if (tdb->tdb_first_use != 0)
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
- tdb->tdb_flags &= ~TDBF_SOFT_FIRSTUSE;
+ if (tdb->tdb_flags & TDBF_SOFT_FIRSTUSE) {
+ /* If the TDB hasn't been used, don't renew it. */
+ if (tdb->tdb_first_use != 0)
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
+ tdb->tdb_flags &= ~TDBF_SOFT_FIRSTUSE;
+ }
  NET_UNLOCK();
 }
 
@@ -841,14 +838,6 @@ tdb_free(struct tdb *tdbp)
  ipo->ipo_last_searched = 0; /* Force a re-search. */
  }
 
- /* Remove expiration timeouts. */
- tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER |
-    TDBF_SOFT_TIMER);
- timeout_del(&tdbp->tdb_timer_tmo);
- timeout_del(&tdbp->tdb_first_tmo);
- timeout_del(&tdbp->tdb_stimer_tmo);
- timeout_del(&tdbp->tdb_sfirst_tmo);
-
  if (tdbp->tdb_ids) {
  ipsp_ids_free(tdbp->tdb_ids);
  tdbp->tdb_ids = NULL;
@@ -866,6 +855,23 @@ tdb_free(struct tdb *tdbp)
 
  if ((tdbp->tdb_inext) && (tdbp->tdb_inext->tdb_onext == tdbp))
  tdbp->tdb_inext->tdb_onext = NULL;
+
+ /* Remove expiration timeouts. */
+ tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER |
+    TDBF_SOFT_TIMER);
+ timeout_del(&tdbp->tdb_timer_tmo);
+ timeout_del(&tdbp->tdb_first_tmo);
+ timeout_del(&tdbp->tdb_stimer_tmo);
+ timeout_del(&tdbp->tdb_sfirst_tmo);
+
+ timeout_set_proc(&tdbp->tdb_timer_tmo, tdb_reaper, tdbp);
+ timeout_add(&tdbp->tdb_timer_tmo, 0);
+}
+
+void
+tdb_reaper(void *xtdbp)
+{
+ struct tdb *tdbp = xtdbp;
 
  free(tdbp, M_TDB, 0);
 }

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
Thanx for the patch, but I wonder how I can create a syspatch from
it? If I patch, build and install stable from source, then my hosts
are cut off from the syspatch scheme. That would be highly painful.

Every helpful comment is highly appreciated.

Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Kevin Chadwick-4
On Wed, 16 May 2018 12:34:57 +0200


> Thanx for the patch, but I wonder how I can create a syspatch from
> it? If I patch, build and install stable from source, then my hosts
> are cut off from the syspatch scheme. That would be highly painful.
>
> Every helpful comment is highly appreciated.

You are obviously spoiled, ;).

I have no idea if a syspatch will be provided for this to make it super
easy, however it wouldn't be too painful. All you need is to rebuild
the kernel, the other 7 syspatches are provided on
"http://www.openbsd.org/errata63.html" to apply easily. The point of
stable is that it isn't volatile.

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Stuart Henderson
In reply to this post by Harald Dunkel-3
On 2018/05/16 12:34, Harald Dunkel wrote:
> Thanx for the patch, but I wonder how I can create a syspatch from
> it? If I patch, build and install stable from source, then my hosts
> are cut off from the syspatch scheme. That would be highly painful.
>
> Every helpful comment is highly appreciated.
>
> Harri
>

You can't create a syspatch from it.

If you can get it tested as a patch on a self-built -stable kernel
and we can get it committed then there's a *chance* it might be
errata worthy in which case you could go back to syspatches instead.

If you're hitting this as often as it seems, even if it doesn't make
it into errata, "cvs up/make config/make" is probably going to be
more acceptable than the crashes..


Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Theo Buehler-4
In reply to this post by Harald Dunkel-3
On Wed, May 16, 2018 at 12:34:57PM +0200, Harald Dunkel wrote:
> Thanx for the patch, but I wonder how I can create a syspatch from
> it?

You can't. The syspatch utility only installs officially signed
syspatches and you don't have the signing key.

> If I patch, build and install stable from source, then my hosts
> are cut off from the syspatch scheme. That would be highly painful.

You don't need to build all of stable. Apply the patch to a stable tree
(it applies from /usr/src/sys) and build a GENERIC.MP kernel as
described in release(8). This will provide you in particular with fixes
for all the errata that we had so far.

However, don't install this kernel with 'make install', but do a simple

        cp obj/bsd /bsd.tdb

from the kernel build directory instead. The -stable userland you
already have installed will work just fine with your custom kernel.

Reboot into bsd.tdb and run it for a while to see if this fixes your
crash.

Yes, while you're running this kernel, syspatch will not be available to
you and /bsd.tdb will not be randomized at boot time. But having a
running kernel without syspatch surely is preferable over a crashing
kernel with syspatch :)

Once we know that this patch fixes your problem we will consider whether
it should become a syspatch.

Should such a patch be published, you can then reboot into /bsd which
will hopefully be stable long enough for you to install the desired
syspatches and head back to normal. Until then, you will need to
maintain your own -stable kernel, i.e., recompile whenever an erratum is
published.

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Stuart Henderson
On 5/16/18 1:07 PM, Stuart Henderson wrote:
>
> You can't create a syspatch from it.

I am talking about a *private* syspatch. Something that I can
revert later to gain access to the official syspatches again.
That should be possible.

>
> If you can get it tested as a patch on a self-built -stable kernel
> and we can get it committed then there's a *chance* it might be
> errata worthy in which case you could go back to syspatches instead.
>

That's the interesting part. How can I "go back to syspatches"? AFAIR,
when I built stable on 6.1, then it was installed as part of the build
procedure. My system appeared to be "contaminated" to syspatch and it
refused to touch it on further updates. There was no way back.

It would be nice if this had been fixed.


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Stuart Henderson
On 2018/05/16 13:37, Harald Dunkel wrote:
> On 5/16/18 1:07 PM, Stuart Henderson wrote:
> >
> > You can't create a syspatch from it.
>
> I am talking about a *private* syspatch. Something that I can
> revert later to gain access to the official syspatches again.
> That should be possible.

Oh. In that case, see /usr/src/distrib/syspatch/README, but you're not
going to like it.

> >
> > If you can get it tested as a patch on a self-built -stable kernel
> > and we can get it committed then there's a *chance* it might be
> > errata worthy in which case you could go back to syspatches instead.
> >
>
> Thats the interesting part. How can I "go back to syspatches"? AFAIR,
> when I built stable on 6.1, then it was installed as part as the build
> procedure. My system appeared to be "contaminated" to syspatch and it
> refused to touch it on further updates. There was no way back.
>
> It would be nice if this had been fixed.

There might be quicker ways, but this ought to work,

syspatch -R
do an "upgrade" install of the release
syspatch

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Martin Pieuchot
On 5/16/18 10:20 AM, Martin Pieuchot wrote:

> That means that the TDB has already been freed.  This is possible
> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
> that by introducing a tdb_reaper() function like we do in other parts of
> the stack.
>

I have booted the patched kernel on the 2 hosts.


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Alexander Bluhm
In reply to this post by Martin Pieuchot
On Wed, May 16, 2018 at 10:20:49AM +0200, Martin Pieuchot wrote:
> That means that the TDB has already been freed.  This is possible
> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
> that by introducing a tdb_reaper() function like we do in other parts of
> the stack.


> @@ -841,14 +838,6 @@ tdb_free(struct tdb *tdbp)
>   ipo->ipo_last_searched = 0; /* Force a re-search. */
>   }
>  
> - /* Remove expiration timeouts. */
> - tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER |
> -    TDBF_SOFT_TIMER);
> - timeout_del(&tdbp->tdb_timer_tmo);
> - timeout_del(&tdbp->tdb_first_tmo);
> - timeout_del(&tdbp->tdb_stimer_tmo);
> - timeout_del(&tdbp->tdb_sfirst_tmo);
> -
>   if (tdbp->tdb_ids) {
>   ipsp_ids_free(tdbp->tdb_ids);
>   tdbp->tdb_ids = NULL;

Why do you move deleting the timeouts down?  Order does not matter
as everything is protected by netlock.  But the natural way would
be to cancel the timeouts before we start destroying the object.
And at the end, we activate the reaper.

anyway OK bluhm@

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Harald Dunkel-3
Hi folks,

On 5/16/18 3:32 PM, Harald Dunkel wrote:

> On 5/16/18 10:20 AM, Martin Pieuchot wrote:
>
>> That means that the TDB has already been freed.  This is possible
>> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
>> that by introducing a tdb_reaper() function like we do in other parts of
>> the stack.
>>
>
> I have booted the patched kernel on the 2 hosts.
>

Since I booted these 2 hosts with the patched kernel 5 days ago
there were no incidents. Before I had up to 3 failures within 24h.

Load hasn't changed, AFAICT.

Do you think this is worth a syspatch?


Regards
Harri