6.3 just died (not for the first time)

classic Classic list List threaded Threaded
19 messages Options
Reply | Threaded
Open this post in threaded view
|

6.3 just died (not for the first time)

Harald Dunkel-3
Hi folks,

hopefully it's allowed to repost this message here:

One gateway running 6.3 ran into the debugger last night. Last words:

login: kernel: protection fault trap, code=0
Stopped at      export_sa+0x5c: movl    0(%rcx),%ecx
ddb{0}> show panic
the kernel did not panic
ddb{0}> trace
export_sa(10,ffff800033445e70) at export_sa+0x5c
pfkeyv2_expire(ffff8000013d4c00,ffff8000013d4c00) at pfkeyv2_expire+0x14e
tdb_timeout(ffff800033446020) at tdb_timeout+0x39
softclock_thread(0) at softclock_thread+0xc6
end trace frame: 0x0, count: -4
ddb{0}> show registers
rdi               0xffff800033445e98
rsi               0xffff8000013d4c00
rbp               0xffff800033445e70
rbx               0xffff800033445e98
rdx               0xffffffff81abdff0    cpu_info_full_primary+0x1ff0
rcx               0xdeadbeefdeadbeef
rax               0xffff800001387510
r8                             0x120
r9                0xffffffff81aa58d8    netlock
r10               0xffffffffffffffff
r11               0xffff800033445ea0
r12               0xffff800001387500
r13                              0x3
r14               0xffff8000013d4c00
r15                             0x90
rip               0xffffffff8121fefc    export_sa+0x5c
cs                               0x8
rflags                       0x10282    __ALIGN_SIZE+0xf282
rsp               0xffff800033445e70
ss                              0x10
export_sa+0x5c: movl    0(%rcx),%ecx
ddb{0}> ps
    PID     TID   PPID    UID  S       FLAGS  WAIT          COMMAND
  74371   82200      1      0  3        0x82  ttyopn        getty
  64133  371566      1      0  3    0x100083  ttyin         getty
  73177  400616      1      0  3    0x100083  ttyin         getty
   2198  160363      1      0  3    0x100083  ttyin         getty
  66943   62449      1      0  3    0x100083  ttyin         getty
  77195  409193      1      0  3    0x100083  ttyin         getty
  30152   89639      1      0  3    0x100083  ttyin         getty
  54326   20290      1      0  3    0x100098  poll          cron
  81308    6330      1      0  3        0x80  kqread        apmd
  21604  251912  61088     74  3    0x100092  bpf           pflogd
  61088  386173      1      0  3        0x80  netio         pflogd
  38994  395332  22137    623  3        0x90  nanosleep     zabbix_agentd
  92334  256603  22137    623  3        0x90  select        zabbix_agentd
   5776  303931  22137    623  3        0x90  netcon        zabbix_agentd
  71818  109922  22137    623  3        0x90  select        zabbix_agentd
  28432  430198  22137    623  3        0x90  nanosleep     zabbix_agentd
  55014  131036  54187     74  3    0x100092  bpf           pflogd
  54187  404660      1      0  3        0x80  netio         pflogd
  32954  132161  74424     74  3    0x100092  bpf           pflogd
  74424   72323      1      0  3        0x80  netio         pflogd
  22137  193504      1    623  3        0x90  wait          zabbix_agentd
  23016    6037      1      0  3        0x80  poll          openvpn
  27849  148250      1      0  3        0x80  poll          openvpn
  78572  192037      1      0  3        0x80  poll          openvpn
  83974  209100      1      0  3        0x80  poll          openvpn
   1297  379204      1     99  3    0x100090  poll          sndiod
  72635   52767      1    110  3    0x100090  poll          sndiod
  59204  423537      1     62  3    0x100090  bpf           spamlogd
  51694  290283  46867     62  3    0x100090  piperd        spamd
  76899  369277  46867     62  3    0x100090  poll          spamd
  46867   52758      1     62  3    0x100090  nanosleep     spamd
  51631   64028      1    109  3        0x90  kqread        ftp-proxy
  74489  238300  13002     95  3    0x100092  kqread        smtpd
  69227  383337  13002    103  3    0x100092  kqread        smtpd
  95912  255952  13002     95  3    0x100092  kqread        smtpd
  52092  398675  13002     95  3    0x100092  kqread        smtpd
  15268  264170  13002     95  3    0x100092  kqread        smtpd
  23823   51587  13002     95  3    0x100092  kqread        smtpd
  13002  289905      1      0  3    0x100080  kqread        smtpd
  39875  399764      1      0  3        0x80  select        sshd
  84492   73143  16575     68  7        0x90                sasyncd
  16575  267935      1      0  3        0x80  select        sasyncd
   5600  244082  24905     68  7        0x10                isakmpd
  24905  484997      1      0  3        0x80  netio         isakmpd
  15412  155977      1      0  3    0x100080  poll          ntpd
  71665   62722  35888     83  3    0x100092  poll          ntpd
  35888  382324      1     83  3    0x100092  poll          ntpd
  79699  454922  42559     74  3    0x100092  bpf           pflogd
  42559  472293      1      0  3        0x80  netio         pflogd
  90864  469513  67456     73  3    0x100090  kqread        syslogd
  67456  146341      1      0  3    0x100082  netio         syslogd
  54377  194590  79772    115  3    0x100092  kqread        slaacd
  81742  432607  79772    115  3    0x100092  kqread        slaacd
  79772  398085      1      0  3        0x80  kqread        slaacd
  84892   75254      0      0  3     0x14200  pgzero        zerothread
  63373  100614      0      0  3     0x14200  aiodoned      aiodoned
  72872  341972      0      0  3     0x14200  syncer        update
  38598  248818      0      0  3     0x14200  cleaner       cleaner
  10377  444516      0      0  3     0x14200  reaper        reaper
  45526  458979      0      0  3     0x14200  pgdaemon      pagedaemon
  49336  347074      0      0  3     0x14200  bored         crynlk
  66086   58296      0      0  3     0x14200  bored         crypto
  24070  258491      0      0  3     0x14200  usbtsk        usbtask
  55027  468872      0      0  3     0x14200  usbatsk       usbatsk
  92887  460485      0      0  3     0x14200  bored         i915-hangcheck
  41493  155682      0      0  3     0x14200  bored         i915-dp
  66272   77311      0      0  3     0x14200  bored         i915
  71823  256416      0      0  3  0x40014200  acpi0         acpi0
  20125  439415      0      0  7  0x40014200                idle3
  42176  326787      0      0  3  0x40014200                idle2
  21681  480396      0      0  3  0x40014200                idle1
  39767  314589      0      0  3     0x14200  bored         sensors
  58848  324478      0      0  3     0x14200  netlock       softnet
  87175   58746      0      0  3     0x14200  bored         systqmp
  67735  194118      0      0  3     0x14200  bored         systq
*79929  171386      0      0  7  0x40014200                softclock
  81339  345588      0      0  3  0x40014200                idle0
      1  339913      0      0  3        0x82  wait          init
      0       0     -1      0  3     0x10200  scheduler     swapper
ddb{0}>

I had installed the recent ipsec and libcrypto syspatches from a few days
ago.

gate5a# syspatch -l
001_perl
002_libtls
003_arp
004_gif
005_httpd
006_ipseclen
007_libcrypto

Is there anything more I could try? Are there suggestions about
how to get more information from the debugger session, if this happens
again?

It just happened again, while writing this email! Again it's export_sa+0x5c

Every helpful comment is highly appreciated.

Harri

dmesg.log (18K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Martin Pieuchot
On 16/05/18(Wed) 08:06, Harald Dunkel wrote:
> Hi folks,

Thanks for the report.
 

> hopefully its allowed to repost this message here:
>
> One gateway running 6.3 ran into the debugger last night. Last words:
>
> login: kernel: protection fault trap, code=0
> Stopped at      export_sa+0x5c: movl    0(%rcx),%ecx
> ddb{0}> show panic
> the kernel did not panic
> ddb{0}> trace
> export_sa(10,ffff800033445e70) at export_sa+0x5c
> pfkeyv2_expire(ffff8000013d4c00,ffff8000013d4c00) at pfkeyv2_expire+0x14e
> tdb_timeout(ffff800033446020) at tdb_timeout+0x39
> softclock_thread(0) at softclock_thread+0xc6
> end trace frame: 0x0, count: -4
> ddb{0}> show registers
> rdi               0xffff800033445e98
> rsi               0xffff8000013d4c00
> rbp               0xffff800033445e70
> rbx               0xffff800033445e98
> rdx               0xffffffff81abdff0    cpu_info_full_primary+0x1ff0
> rcx               0xdeadbeefdeadbeef
                    ^^^^^^^^^^^^^^^^^^
That means that the TDB has already been freed.  This is possible
because the timeout sleeps on the NET_LOCK().  Diff below should prevent
that by introducing a tdb_reaper() function like we do in other parts of
the stack.

Index: netinet/ip_ipsp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.229
diff -u -p -r1.229 ip_ipsp.c
--- netinet/ip_ipsp.c 6 Nov 2017 15:12:43 -0000 1.229
+++ netinet/ip_ipsp.c 16 May 2018 08:17:59 -0000
@@ -79,10 +79,11 @@ void tdb_hashstats(void);
 #endif
 
 void tdb_rehash(void);
-void tdb_timeout(void *v);
-void tdb_firstuse(void *v);
-void tdb_soft_timeout(void *v);
-void tdb_soft_firstuse(void *v);
+void tdb_reaper(void *);
+void tdb_timeout(void *);
+void tdb_firstuse(void *);
+void tdb_soft_timeout(void *);
+void tdb_soft_firstuse(void *);
 int tdb_hash(u_int, u_int32_t, union sockaddr_union *, u_int8_t);
 
 int ipsec_in_use = 0;
@@ -541,14 +542,13 @@ tdb_timeout(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_TIMER))
- return;
-
  NET_LOCK();
- /* If it's an "invalid" TDB do a silent expiration. */
- if (!(tdb->tdb_flags & TDBF_INVALID))
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
- tdb_delete(tdb);
+ if (tdb->tdb_flags & TDBF_TIMER) {
+ /* If it's an "invalid" TDB do a silent expiration. */
+ if (!(tdb->tdb_flags & TDBF_INVALID))
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
+ tdb_delete(tdb);
+ }
  NET_UNLOCK();
 }
 
@@ -557,14 +557,13 @@ tdb_firstuse(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_SOFT_FIRSTUSE))
- return;
-
  NET_LOCK();
- /* If the TDB hasn't been used, don't renew it. */
- if (tdb->tdb_first_use != 0)
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
- tdb_delete(tdb);
+ if (tdb->tdb_flags & TDBF_SOFT_FIRSTUSE) {
+ /* If the TDB hasn't been used, don't renew it. */
+ if (tdb->tdb_first_use != 0)
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD);
+ tdb_delete(tdb);
+ }
  NET_UNLOCK();
 }
 
@@ -573,13 +572,12 @@ tdb_soft_timeout(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_SOFT_TIMER))
- return;
-
  NET_LOCK();
- /* Soft expirations. */
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
- tdb->tdb_flags &= ~TDBF_SOFT_TIMER;
+ if (tdb->tdb_flags & TDBF_SOFT_TIMER) {
+ /* Soft expirations. */
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
+ tdb->tdb_flags &= ~TDBF_SOFT_TIMER;
+ }
  NET_UNLOCK();
 }
 
@@ -588,14 +586,13 @@ tdb_soft_firstuse(void *v)
 {
  struct tdb *tdb = v;
 
- if (!(tdb->tdb_flags & TDBF_SOFT_FIRSTUSE))
- return;
-
  NET_LOCK();
- /* If the TDB hasn't been used, don't renew it. */
- if (tdb->tdb_first_use != 0)
- pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
- tdb->tdb_flags &= ~TDBF_SOFT_FIRSTUSE;
+ if (tdb->tdb_flags & TDBF_SOFT_FIRSTUSE) {
+ /* If the TDB hasn't been used, don't renew it. */
+ if (tdb->tdb_first_use != 0)
+ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT);
+ tdb->tdb_flags &= ~TDBF_SOFT_FIRSTUSE;
+ }
  NET_UNLOCK();
 }
 
@@ -841,14 +838,6 @@ tdb_free(struct tdb *tdbp)
  ipo->ipo_last_searched = 0; /* Force a re-search. */
  }
 
- /* Remove expiration timeouts. */
- tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER |
-    TDBF_SOFT_TIMER);
- timeout_del(&tdbp->tdb_timer_tmo);
- timeout_del(&tdbp->tdb_first_tmo);
- timeout_del(&tdbp->tdb_stimer_tmo);
- timeout_del(&tdbp->tdb_sfirst_tmo);
-
  if (tdbp->tdb_ids) {
  ipsp_ids_free(tdbp->tdb_ids);
  tdbp->tdb_ids = NULL;
@@ -866,6 +855,23 @@ tdb_free(struct tdb *tdbp)
 
  if ((tdbp->tdb_inext) && (tdbp->tdb_inext->tdb_onext == tdbp))
  tdbp->tdb_inext->tdb_onext = NULL;
+
+ /* Remove expiration timeouts. */
+ tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER |
+    TDBF_SOFT_TIMER);
+ timeout_del(&tdbp->tdb_timer_tmo);
+ timeout_del(&tdbp->tdb_first_tmo);
+ timeout_del(&tdbp->tdb_stimer_tmo);
+ timeout_del(&tdbp->tdb_sfirst_tmo);
+
+ timeout_set_proc(&tdbp->tdb_timer_tmo, tdb_reaper, tdbp);
+ timeout_add(&tdbp->tdb_timer_tmo, 0);
+}
+
+void
+tdb_reaper(void *xtdbp)
+{
+ struct tdb *tdbp = xtdbp;
 
  free(tdbp, M_TDB, 0);
 }

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
Thanx for the patch, but I wonder how I can create a syspatch from
it? If I patch, build and install stable from source, then my hosts
are cut off from the syspatch scheme. That would be highly painful.

Every helpful comment is highly appreciated.

Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Kevin Chadwick-4
On Wed, 16 May 2018 12:34:57 +0200


> Thanx for the patch, but I wonder how I can create a syspatch from
> it? If I patch, build and install stable from source, then my hosts
> are cut off from the syspatch scheme. That would be highly painful.
>
> Every helpful comment is highly appreciated.

You are obviously spoiled, ;).

I have no idea if a syspatch will be provided for this to make it super
easy, however it wouldn't be too painful. All you need is to rebuild
the kernel, the other 7 syspatches are provided on
"http://www.openbsd.org/errata63.html" to apply easily. The point of
stable is that it isn't volatile.

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Stuart Henderson
In reply to this post by Harald Dunkel-3
On 2018/05/16 12:34, Harald Dunkel wrote:
> Thanx for the patch, but I wonder how I can create a syspatch from
> it? If I patch, build and install stable from source, then my hosts
> are cut off from the syspatch scheme. That would be highly painful.
>
> Every helpful comment is highly appreciated.
>
> Harri
>

You can't create a syspatch from it.

If you can get it tested as a patch on a self-built -stable kernel
and we can get it committed then there's a *chance* it might be
errata worthy in which case you could go back to syspatches instead.

If you're hitting this as often as it seems, even if it doesn't make
it into errata, "cvs up/make config/make" is probably going to be
more acceptable than the crashes..


Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Theo Buehler-4
In reply to this post by Harald Dunkel-3
On Wed, May 16, 2018 at 12:34:57PM +0200, Harald Dunkel wrote:
> Thanx for the patch, but I wonder how I can create a syspatch from
> it?

You can't. The syspatch utility only installs officially signed
syspatches and you don't have the signing key.

> If I patch, build and install stable from source, then my hosts
> are cut off from the syspatch scheme. That would be highly painful.

You don't need to build all of stable. Apply the patch to a stable tree
(it applies from /usr/src/sys) and build a GENERIC.MP kernel as
described in release(8). This will provide you in particular with fixes
for all the errata that we had so far.

However, don't install this kernel with 'make install', but do a simple

        cp obj/bsd /bsd.tdb

from the kernel build directory instead. The -stable userland you
already have installed will work just fine with your custom kernel.

Reboot into bsd.tdb and run it for a while to see if this fixes your
crash.

Yes, while you're running this kernel, syspatch will not be available to
you and /bsd.tdb will not be randomized at boot time. But having a
running kernel without syspatch surely is preferable over a crashing
kernel with syspatch :)

Once we know that this patch fixes your problem we will consider whether
it should become a syspatch.

Should such a patch be published, you can then reboot into /bsd which
will hopefully be stable long enough for you to install the desired
syspatches and head back to normal. Until then, you will need to
maintain your own -stable kernel, i.e., recompile whenever an erratum is
published.

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Stuart Henderson
On 5/16/18 1:07 PM, Stuart Henderson wrote:
>
> You can't create a syspatch from it.

I am talking about a *private* syspatch. Something that I can
revert later to gain access to the official syspatches again.
That should be possible.

>
> If you can get it tested as a patch on a self-built -stable kernel
> and we can get it committed then there's a *chance* it might be
> errata worthy in which case you could go back to syspatches instead.
>

That's the interesting part. How can I "go back to syspatches"? AFAIR,
when I built stable on 6.1, then it was installed as part as the build
procedure. My system appeared to be "contaminated" to syspatch and it
refused to touch it on further updates. There was no way back.

It would be nice if this had been fixed.


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Stuart Henderson
On 2018/05/16 13:37, Harald Dunkel wrote:
> On 5/16/18 1:07 PM, Stuart Henderson wrote:
> >
> > You can't create a syspatch from it.
>
> I am talking about a *private* syspatch. Something that I can
> revert later to gain access to the official syspatches again.
> That should be possible.

Oh. In that case, see /usr/src/distrib/syspatch/README, but you're not
going to like it.

> >
> > If you can get it tested as a patch on a self-built -stable kernel
> > and we can get it committed then there's a *chance* it might be
> > errata worthy in which case you could go back to syspatches instead.
> >
>
> Thats the interesting part. How can I "go back to syspatches"? AFAIR,
> when I built stable on 6.1, then it was installed as part as the build
> procedure. My system appeared to be "contaminated" to syspatch and it
> refused to touch it on further updates. There was no way back.
>
> It would be nice if this had been fixed.

There might be quicker ways, but this ought to work,

syspatch -R
do an "upgrade" install of the release
syspatch

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Martin Pieuchot
On 5/16/18 10:20 AM, Martin Pieuchot wrote:

> That means that the TDB has already been freed.  This is possible
> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
> that by introducing a tdb_reaper() function like we do in other parts of
> the stack.
>

I have booted the patched kernel on the 2 hosts.


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Alexander Bluhm
In reply to this post by Martin Pieuchot
On Wed, May 16, 2018 at 10:20:49AM +0200, Martin Pieuchot wrote:
> That means that the TDB has already been freed.  This is possible
> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
> that by introducing a tdb_reaper() function like we do in other parts of
> the stack.


> @@ -841,14 +838,6 @@ tdb_free(struct tdb *tdbp)
>   ipo->ipo_last_searched = 0; /* Force a re-search. */
>   }
>  
> - /* Remove expiration timeouts. */
> - tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER |
> -    TDBF_SOFT_TIMER);
> - timeout_del(&tdbp->tdb_timer_tmo);
> - timeout_del(&tdbp->tdb_first_tmo);
> - timeout_del(&tdbp->tdb_stimer_tmo);
> - timeout_del(&tdbp->tdb_sfirst_tmo);
> -
>   if (tdbp->tdb_ids) {
>   ipsp_ids_free(tdbp->tdb_ids);
>   tdbp->tdb_ids = NULL;

Why do you move deleting the timeouts down?  Order does not matter
as everything is protected by netlock.  But the natural way would
be to cancel the timeouts before we start destroying the object.
And at the end, we activate the reaper.

anyway OK bluhm@

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Harald Dunkel-3
Hi folks,

On 5/16/18 3:32 PM, Harald Dunkel wrote:

> On 5/16/18 10:20 AM, Martin Pieuchot wrote:
>
>> That means that the TDB has already been freed.  This is possible
>> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
>> that by introducing a tdb_reaper() function like we do in other parts of
>> the stack.
>>
>
> I have booted the patched kernel on the 2 hosts.
>

Since I booted these 2 hosts with the patched kernel 5 days ago
there were no incidents. Before I had up to 3 failures within 24h.

Load hasn't changed, AFAICT.

Do you think this is worth a syspatch?


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Stuart Henderson
Hi folks,

On 5/16/18 1:07 PM, Stuart Henderson wrote:
>
> If you can get it tested as a patch on a self-built -stable kernel
> and we can get it committed then there's a *chance* it might be
> errata worthy in which case you could go back to syspatches instead.
>

The patched kernel is running for a week now, without any incidents.
Before I had up to 4 hiccups per day.

> If you're hitting this as often as it seems, even if it doesn't make
> it into errata, "cvs up/make config/make" is probably going to be
> more acceptable than the crashes..
>

AFAIK there is another problem in the loop to fix the panic in
IPsec introduced with syspatch #008. I understand that you are
very cautious about introducing new syspatches, esp. if the
problem is difficult to reproduce, but do you think this fix is
worth yet another syspatch?


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Alexander Bluhm
In reply to this post by Harald Dunkel-3
On Tue, May 22, 2018 at 07:32:29AM +0200, Harald Dunkel wrote:
> Do you think this is worth a syspatch?

You have asked this question three times now.  The answer is no.

Errata and syspatch are for security issues that can be triggered
remotely or by a local non-root user.  If we would fix every panic
in stable, we could not concentrate on developing OpenBSD current.
Doing errata involves at least five people who could spend their
time on creating new features instead.  We provide security fixes,
may handle stability issues, but will not bug fix old releases.

Bugs like this one show lack of release testing.  So instead of
asking to repair old stuff, please help us to make new things better.
By testing snapshots and diffs on production or production-like
systems, the community can improve OpenBSD releases.

bluhm

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Stefan Sperling-5
On Fri, May 25, 2018 at 03:20:50PM +0200, Alexander Bluhm wrote:

> On Tue, May 22, 2018 at 07:32:29AM +0200, Harald Dunkel wrote:
> > Do you think this is worth a syspatch?
>
> You are asking this question for three times.  The answer is no.
>
> Errata and syspatch are for security issues that can be triggered
> remotely or by a local non-root user.  If we would fix every panic
> in stable, we could not concentrate on developing OpenBSD current.
> Doing errata involves at least five people who could spend their
> time on creating new features instead.  We provide security fixes,
> may handle stability issues, but will not bug fix old releases.
>
> Bugs like this one show lack of release testing.  So instead of
> asking to repair old stuff, please help us to make new things better.
> By testing snapshots and diffs on production or production-like
> systems, the community can improve OpenBSD releases.
>
> bluhm
>

Adding to this:

It is impossible for the project to provide syspatches on any sort
of guaranteed timescale because this is a project run by volunteers.
Nagging volunteers repeatedly about a syspatch is counterproductive.
If you rely on syspatches you'll have to respect the schedule and
priorities of the volunteers who create them.

If you have requirements for putting specific fixes into production
as soon as possible, one option is to follow the release(8) man page
and compile your own snapshots from the -stable tree with any patches
from -current you need. Then you can run those snapshots on your
production machines and you are in full control of which bug already
patched in -current gets patched on your -stable machines and when.
I do this for some of the systems I maintain.

Of course, the trade-off is that if something breaks on your
production system with custom changes, you get to keep the pieces.
There are good reasons why we, as volunteers, are asking for bug
reports against -current snapshots, instead of against builds with
custom patches.

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-5
In reply to this post by Alexander Bluhm
On 5/25/18 3:20 PM, Alexander Bluhm wrote:
> On Tue, May 22, 2018 at 07:32:29AM +0200, Harald Dunkel wrote:
>> Do you think this is worth a syspatch?
>
> You are asking this question for three times.  The answer is no.
>

Twice, IIRC. My apologies. Thanx for your answer. I am glad that
I could help to verify the fix.

> Errata and syspatch are for security issues that can be triggered
> remotely or by a local non-root user.  

Surely I have not your insight into the internals, so I am glad
to learn that this problem cannot be triggered remotely.

> If we would fix every panic
> in stable, we could not concentrate on developing OpenBSD current.
> Doing errata involves at least five people who could spend their
> time on creating new features instead. We provide security fixes,
> may handle stability issues, but will not bug fix old releases.
>
> Bugs like this one show lack of release testing.  So instead of
> asking to repair old stuff, please help us to make new things better.
> By testing snapshots and diffs on production or production-like
> systems, the community can improve OpenBSD releases.
>

I am running OpenBSD stable on my gateway at home. Surly I can move
it to the current version. However, I am not allowed to put the
production environment of a mid size company at risk. In this context
OpenBSD 6.3 is not "the old stuff", but the stable version released
just a few weeks ago. It's *the* version I should run.

The syspatches help me to keep downtimes as short as possible. IMU
the syspatch support is a major improvement of OpenBSD.

Of course I highly appreciate the work of all OpenBSD developers and
maintainers.


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

lists-2
Mon, 28 May 2018 20:30:33 +0200 Harald Dunkel <[hidden email]>

> On 5/25/18 3:20 PM, Alexander Bluhm wrote:
> > On Tue, May 22, 2018 at 07:32:29AM +0200, Harald Dunkel wrote:  
> >> Do you think this is worth a syspatch?  
> >
> > You are asking this question for three times.  The answer is no.
> >  
>
> Twice, IIRC. My apologies. Thanx for your answer. I am glad that
> I could help to verify the fix.
>
> > Errata and syspatch are for security issues that can be triggered
> > remotely or by a local non-root user.    
>
> Surely I have not your insight into the internals, so I am glad
> to learn that this problem cannot be triggered remotely.
>
> > If we would fix every panic
> > in stable, we could not concentrate on developing OpenBSD current.
> > Doing errata involves at least five people who could spend their
> > time on creating new features instead. We provide security fixes,
> > may handle stability issues, but will not bug fix old releases.
> >
> > Bugs like this one show lack of release testing.  So instead of
> > asking to repair old stuff, please help us to make new things better.
> > By testing snapshots and diffs on production or production-like
> > systems, the community can improve OpenBSD releases.
> >  
>
> I am running OpenBSD stable on my gateway at home. Surly I can move

Surly, Harri - Harri, Surly.

Now that you know each other, read again the previous posts.  And please
keep testing around release time if you want to run the release, thanks.

We do not need to know how you run your company infrastructure, and this
is by no means an excuse to disregard the fine advice provided for free.

Also, if possible ensure yourself peace by having a test box & rollback.
It's not a surprise: ~99% snapshots are production quality and run fine.

Kind regards,
Anton Lazarov

> it to the current version. However, I am not allowed to put the
> production environment of a mid size company at risk. In this context
> OpenBSD 6.3 is not "the old stuff", but the stable version released
> just a few weeks ago. Its *the* version I should run.
>
> The syspatches help me to keep downtimes as short as possible. IMU
> the syspatch support is a major improvement of OpenBSD.
>
> Of course I highly appreciate the work of all OpenBSD developers and
> maintainers.
>
>
> Regards
> Harri
>

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Martin Pieuchot
Hi folks,

On 5/16/18 10:20 AM, Martin Pieuchot wrote:

> On 16/05/18(Wed) 08:06, Harald Dunkel wrote:
>> Hi folks,
>
> Thanks for the report.
>  
>> hopefully its allowed to repost this message here:
>>
>> One gateway running 6.3 ran into the debugger last night. Last words:
>>
>> login: kernel: protection fault trap, code=0
>> Stopped at      export_sa+0x5c: movl    0(%rcx),%ecx
>> ddb{0}> show panic
>> the kernel did not panic
>> ddb{0}> trace
>> export_sa(10,ffff800033445e70) at export_sa+0x5c
>> pfkeyv2_expire(ffff8000013d4c00,ffff8000013d4c00) at pfkeyv2_expire+0x14e
>> tdb_timeout(ffff800033446020) at tdb_timeout+0x39
>> softclock_thread(0) at softclock_thread+0xc6
>> end trace frame: 0x0, count: -4
>> ddb{0}> show registers
>> rdi               0xffff800033445e98
>> rsi               0xffff8000013d4c00
>> rbp               0xffff800033445e70
>> rbx               0xffff800033445e98
>> rdx               0xffffffff81abdff0    cpu_info_full_primary+0x1ff0
>> rcx               0xdeadbeefdeadbeef
>    ^^^^^^^^^^^^^^^^^^
> That means that the TDB has already been freed.  This is possible
> because the timeout sleeps on the NET_LOCK().  Diff below should prevent
> that by introducing a tdb_reaper() function like we do in other parts of
> the stack.
>

Is it possible that this problem was introduced by syspatch 008_ipsecout ?

Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
On 6/26/18 10:56 AM, Harald Dunkel wrote:
> Hi folks,
>
>
> Is it possible that this problem was introduced by syspatch 008_ipsecout ?
>

AFAICS my hosts without 008_ipsecout did not die, so I take the
silence as a "yes".


Regards
Harri

Reply | Threaded
Open this post in threaded view
|

Re: 6.3 just died (not for the first time)

Harald Dunkel-3
In reply to this post by Martin Pieuchot
Hi Martin and all others involved,

On 5/16/18 10:20 AM, Martin Pieuchot wrote:
> On 16/05/18(Wed) 08:06, Harald Dunkel wrote:
>> Hi folks,
>
> Thanks for the report.
>  

I highly appreciate the syspatch. Thanx very much


Regards
Harri