apmd hangs

classic Classic list List threaded Threaded
11 messages Options
Reply | Threaded
Open this post in threaded view
|

apmd hangs

Mark Kettenis
The more code & documentation I read, the more I'm convinced that
coordinating state changes between logical processors isn't necessary
and actually is responsible for the hangs people have been seeing.

So here is a diff that does away with it all.  I've tested it on a few
laptops here, but it could use testing on a somewhat wider range of
machines.  I'm especially interested in seeing this tested on a dual
socket machine with apmd -A.


Index: i386/i386/mp_setperf.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/mp_setperf.c,v
retrieving revision 1.5
diff -u -p -r1.5 mp_setperf.c
--- i386/i386/mp_setperf.c 29 Jun 2014 01:01:20 -0000 1.5
+++ i386/i386/mp_setperf.c 8 Sep 2014 20:43:50 -0000
@@ -17,13 +17,10 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 
 #include <machine/cpu.h>
-#include <machine/cpufunc.h>
-
 #include <machine/intr.h>
 
 struct mutex setperf_mp_mutex = MUTEX_INITIALIZER(IPL_HIGH);
@@ -31,14 +28,7 @@ struct mutex setperf_mp_mutex = MUTEX_IN
 /* underlying setperf mechanism e.g. k8_powernow_setperf() */
 void (*ul_setperf)(int);
 
-#define MP_SETPERF_STEADY 0 /* steady state - normal operation */
-#define MP_SETPERF_INTRANSIT 1 /* in transition */
-#define MP_SETPERF_PROCEED 2 /* proceed with transition */
-#define MP_SETPERF_FINISH 3 /* return from IPI */
-
-
 /* protected by setperf_mp_mutex */
-volatile int mp_setperf_state = MP_SETPERF_STEADY;
 volatile int mp_perflevel;
 
 void mp_setperf(int);
@@ -46,102 +36,27 @@ void mp_setperf(int);
 void
 mp_setperf(int level)
 {
- CPU_INFO_ITERATOR cii;
- struct cpu_info *ci;
- int notready, s;
-
- if (mp_setperf_state == MP_SETPERF_STEADY) {
- mtx_enter(&setperf_mp_mutex);
- disable_intr();
-
- mp_perflevel = level;
-
- curcpu()->ci_setperf_state = CI_SETPERF_INTRANSIT;
- /* ask all other processors to drop what they are doing */
- CPU_INFO_FOREACH(cii, ci) {
- if (ci->ci_setperf_state != CI_SETPERF_INTRANSIT) {
- ci->ci_setperf_state =
-    CI_SETPERF_SHOULDSTOP;
- i386_send_ipi(ci, I386_IPI_SETPERF);
- }
- }
-
-
- /* Loop until all processors report ready */
- do {
- CPU_INFO_FOREACH(cii, ci) {
- if ((notready = (ci->ci_setperf_state
-    != CI_SETPERF_INTRANSIT)))
- break;
- }
- } while (notready);
-
- mp_setperf_state = MP_SETPERF_PROCEED; /* release the hounds */
-
- s = splipi();
-
- ul_setperf(mp_perflevel);
-
- splx(s);
-
- curcpu()->ci_setperf_state = CI_SETPERF_DONE;
- /* Loop until all processors report done */
- do {
- CPU_INFO_FOREACH(cii, ci) {
- if ((notready = (ci->ci_setperf_state
-    != CI_SETPERF_DONE)))
- break;
- }
- } while (notready);
-
- mp_setperf_state = MP_SETPERF_FINISH;
- /* delay a little for potential straglers */
- DELAY(2);
- curcpu()->ci_setperf_state = CI_SETPERF_READY;
- mp_setperf_state = MP_SETPERF_STEADY; /* restore normallity */
- enable_intr();
- mtx_leave(&setperf_mp_mutex);
- }
+ mtx_enter(&setperf_mp_mutex);
+ mp_perflevel = level;
 
+ ul_setperf(mp_perflevel);
+ i386_broadcast_ipi(I386_IPI_SETPERF);
+ mtx_leave(&setperf_mp_mutex);
 }
 
 void
 i386_setperf_ipi(struct cpu_info *ci)
 {
-
- disable_intr();
-
- if (ci->ci_setperf_state == CI_SETPERF_SHOULDSTOP)
- ci->ci_setperf_state = CI_SETPERF_INTRANSIT;
-
- while (mp_setperf_state != MP_SETPERF_PROCEED)
- ;
-
  ul_setperf(mp_perflevel);
-
- ci->ci_setperf_state = CI_SETPERF_DONE;
-
- while (mp_setperf_state != MP_SETPERF_FINISH)
- ;
- ci->ci_setperf_state = CI_SETPERF_READY;
-
- enable_intr();
 }
 
 void
-mp_setperf_init()
+mp_setperf_init(void)
 {
- CPU_INFO_ITERATOR cii;
- struct cpu_info *ci;
-
  if (!cpu_setperf)
  return;
- ul_setperf = cpu_setperf;
 
+ ul_setperf = cpu_setperf;
  cpu_setperf = mp_setperf;
-
- CPU_INFO_FOREACH(cii, ci) {
- ci->ci_setperf_state = CI_SETPERF_READY;
- }
  mtx_init(&setperf_mp_mutex, IPL_HIGH);
 }
Index: i386/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/i386/include/cpu.h,v
retrieving revision 1.134
diff -u -p -r1.134 cpu.h
--- i386/include/cpu.h 11 Jul 2014 10:53:07 -0000 1.134
+++ i386/include/cpu.h 8 Sep 2014 20:43:50 -0000
@@ -147,12 +147,6 @@ struct cpu_info {
 #define CI_DDB_ENTERDDB 3
 #define CI_DDB_INDDB 4
 
- volatile int ci_setperf_state;
-#define CI_SETPERF_READY 0
-#define CI_SETPERF_SHOULDSTOP 1
-#define CI_SETPERF_INTRANSIT 2
-#define CI_SETPERF_DONE 3
-
  struct ksensordev ci_sensordev;
  struct ksensor ci_sensor;
 #ifdef GPROF
Index: amd64/amd64/mp_setperf.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/mp_setperf.c,v
retrieving revision 1.4
diff -u -p -r1.4 mp_setperf.c
--- amd64/amd64/mp_setperf.c 29 Jun 2014 01:01:20 -0000 1.4
+++ amd64/amd64/mp_setperf.c 8 Sep 2014 20:43:50 -0000
@@ -17,13 +17,10 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 
 #include <machine/cpu.h>
-#include <machine/cpufunc.h>
-
 #include <machine/intr.h>
 
 struct mutex setperf_mp_mutex = MUTEX_INITIALIZER(IPL_HIGH);
@@ -31,14 +28,7 @@ struct mutex setperf_mp_mutex = MUTEX_IN
 /* underlying setperf mechanism e.g. k8_powernow_setperf() */
 void (*ul_setperf)(int);
 
-#define MP_SETPERF_STEADY 0 /* steady state - normal operation */
-#define MP_SETPERF_INTRANSIT 1 /* in transition */
-#define MP_SETPERF_PROCEED 2 /* proceed with transition */
-#define MP_SETPERF_FINISH 3 /* return from IPI */
-
-
 /* protected by setperf_mp_mutex */
-volatile int mp_setperf_state = MP_SETPERF_STEADY;
 volatile int mp_perflevel;
 
 void mp_setperf(int);
@@ -46,101 +36,28 @@ void mp_setperf(int);
 void
 mp_setperf(int level)
 {
- CPU_INFO_ITERATOR cii;
- struct cpu_info *ci;
- int notready, s;
-
- if (mp_setperf_state == MP_SETPERF_STEADY) {
- mtx_enter(&setperf_mp_mutex);
- disable_intr();
- mp_perflevel = level;
-
- curcpu()->ci_setperf_state = CI_SETPERF_INTRANSIT;
- /* ask all other processors to drop what they are doing */
- CPU_INFO_FOREACH(cii, ci) {
- if (ci->ci_setperf_state != CI_SETPERF_INTRANSIT) {
- ci->ci_setperf_state =
-    CI_SETPERF_SHOULDSTOP;
- x86_send_ipi(ci, X86_IPI_SETPERF);
- }
- }
-
-
- /* Loop until all processors report ready */
- do {
- CPU_INFO_FOREACH(cii, ci) {
- if ((notready = (ci->ci_setperf_state
-    != CI_SETPERF_INTRANSIT)))
- break;
- }
- } while (notready);
-
- mp_setperf_state = MP_SETPERF_PROCEED; /* release the hounds */
-
- s = splipi();
-
- ul_setperf(mp_perflevel);
-
- splx(s);
-
- curcpu()->ci_setperf_state = CI_SETPERF_DONE;
- /* Loop until all processors report done */
- do {
- CPU_INFO_FOREACH(cii, ci) {
- if ((notready = (ci->ci_setperf_state
-    != CI_SETPERF_DONE)))
- break;
- }
- } while (notready);
-
- mp_setperf_state = MP_SETPERF_FINISH;
- /* delay a little for potential straglers */
- DELAY(2);
- curcpu()->ci_setperf_state = CI_SETPERF_READY;
- mp_setperf_state = MP_SETPERF_STEADY; /* restore normallity */
- enable_intr();
- mtx_leave(&setperf_mp_mutex);
- }
+ mtx_enter(&setperf_mp_mutex);
+ mp_perflevel = level;
+
+ ul_setperf(mp_perflevel);
+ x86_broadcast_ipi(X86_IPI_SETPERF);
 
+ mtx_leave(&setperf_mp_mutex);
 }
 
 void
 x86_setperf_ipi(struct cpu_info *ci)
 {
-
- disable_intr();
-
- if (ci->ci_setperf_state == CI_SETPERF_SHOULDSTOP)
- ci->ci_setperf_state = CI_SETPERF_INTRANSIT;
-
- while (mp_setperf_state != MP_SETPERF_PROCEED)
- ;
-
  ul_setperf(mp_perflevel);
-
- ci->ci_setperf_state = CI_SETPERF_DONE;
-
- while (mp_setperf_state != MP_SETPERF_FINISH)
- ;
- ci->ci_setperf_state = CI_SETPERF_READY;
-
- enable_intr();
 }
 
 void
-mp_setperf_init()
+mp_setperf_init(void)
 {
- CPU_INFO_ITERATOR cii;
- struct cpu_info *ci;
-
  if (!cpu_setperf)
  return;
- ul_setperf = cpu_setperf;
 
+ ul_setperf = cpu_setperf;
  cpu_setperf = mp_setperf;
-
- CPU_INFO_FOREACH(cii, ci) {
- ci->ci_setperf_state = CI_SETPERF_READY;
- }
  mtx_init(&setperf_mp_mutex, IPL_HIGH);
 }
Index: amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.85
diff -u -p -r1.85 cpu.h
--- amd64/include/cpu.h 11 Jul 2014 10:53:07 -0000 1.85
+++ amd64/include/cpu.h 8 Sep 2014 20:43:50 -0000
@@ -136,12 +136,6 @@ struct cpu_info {
 #define CI_DDB_ENTERDDB 3
 #define CI_DDB_INDDB 4
 
- volatile int ci_setperf_state;
-#define CI_SETPERF_READY 0
-#define CI_SETPERF_SHOULDSTOP 1
-#define CI_SETPERF_INTRANSIT 2
-#define CI_SETPERF_DONE 3
-
  struct ksensordev ci_sensordev;
  struct ksensor ci_sensor;
 #ifdef GPROF

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Stuart Henderson-6
On 2014/09/08 23:35, Mark Kettenis wrote:
> The more code & documentation I read, the more I'm convinced that
> coordinating state changes between logical processors isn't necessary
> and actually is responsible for the hangs people have been seeing.
>
> So here is a diff that does away with it all.  I've tested it on a few
> laptops here, but it could use testing on a somewhat wider range of
> machines.  I'm especially interested in seeing this tested on a dual
> socket machine with apmd -A.

I'm running with this on my amd64 X220 with apm -C, this configuration
used to hang every couple of days. It's a bit soon to say if it fixes
things yet (IIRC some others hit the hangs more easily than me), but
I haven't noticed any regressions.

I've also run a cycle of lots of apm -L / apm -H in a loop while
otherwise stressing the cpu, no problems seen there.

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Ingo Schwarze
In reply to this post by Mark Kettenis
Hi Mark,

Mark Kettenis wrote on Mon, Sep 08, 2014 at 11:35:36PM +0200:

> The more code & documentation I read, the more I'm convinced that
> coordinating state changes between logical processors isn't necessary
> and actually is responsible for the hangs people have been seeing.
>
> So here is a diff that does away with it all.  I've tested it on a few
> laptops here, but it could use testing on a somewhat wider range of
> machines.  I'm especially interested in seeing this tested on a dual
> socket machine with apmd -A.

i'm sorry to say it makes no difference for me (i'm not opposed to the
diff, though).

On my laptop, building ports works fine, running firefox works fine,
but whenever i surf the web with firefox while building ports,
the machine locks up hard.  Sometimes, the lockup already happens
when merely starting firefox while building ports.  Often, it
happens not when requesting a new URI, but when merely scrolling
within the page in firefox.

After the lockup, CapsLk and NmLk still toggle the respective LEDs,
Fn-PgUp still switches on and off the torch, but nothing else has
any effect, not even Ctrl-Alt-Esc, Ctrl-Alt-Delete, Ctrl-Alt-Backspace
or Ctrl-Alt-F1.

Unfortunately, i cannot break into ddb because i don't have a
docking station, hence no serial console, and when going to the
PC virtual console (Ctrl-Alt-F1), setting export DISPLAY=:0,
and starting firefox from the console, i was unable to get any
lockup.  Apparently, it only happens when X (or whatever) is
actually painting something onto the screen.

Whether i run with the defaults or with apm -A doesn't appear to
make a difference.

Yours,
  Ingo


OpenBSD 5.6-current (GENERIC.MP) #4: Tue Sep  9 18:06:19 CEST 2014
    [hidden email]:/usr/src/sys/arch/i386/compile/GENERIC.MP
cpu0: Genuine Intel(R) CPU T2300 @ 1.66GHz ("GenuineIntel" 686-class) 1.67 GHz
cpu0: FPU,V86,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,NXE,SSE3,MWAIT,EST,TM2,xTPR,PDCM,PERF
real mem  = 3211096064 (3062MB)
avail mem = 3146219520 (3000MB)
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: AT/286+ BIOS, date 08/26/09, BIOS32 rev. 0 @ 0xfd6b0, SMBIOS rev. 2.4 @ 0xe0010 (68 entries)
bios0: vendor LENOVO version "7FETA9WW (2.27 )" date 08/26/2009
bios0: LENOVO 94504BG
acpi0 at bios0: rev 2
acpi0: sleep states S0 S3 S4 S5
acpi0: tables DSDT FACP SSDT ECDT TCPA APIC MCFG HPET BOOT SSDT SSDT SSDT SSDT
acpi0: wakeup devices LID_(S3) SLPB(S3) UART(S3) EXP0(S4) EXP1(S4) EXP2(S4) EXP3(S4) PCI1(S4) USB0(S3) USB1(S3) USB3(S3) USB7(S3) HDEF(S4)
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpiec0 at acpi0
acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
cpu0: apic clock running at 166MHz
cpu0: mwait min=64, max=64, C-substates=0.2.2.2.2, IBE
cpu1 at mainbus0: apid 1 (application processor)
cpu1: Genuine Intel(R) CPU T2300 @ 1.66GHz ("GenuineIntel" 686-class) 1.67 GHz
cpu1: FPU,V86,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,NXE,SSE3,MWAIT,EST,TM2,xTPR,PDCM,PERF
ioapic0 at mainbus0: apid 1 pa 0xfec00000, version 20, 24 pins
ioapic0: misconfigured as apic 2, remapped to apid 1
acpimcfg0 at acpi0 addr 0xf0000000, bus 0-63
acpihpet0 at acpi0: 14318179 Hz
acpiprt0 at acpi0: bus 0 (PCI0)
acpiprt1 at acpi0: bus -1 (AGP_)
acpiprt2 at acpi0: bus 2 (EXP0)
acpiprt3 at acpi0: bus 3 (EXP1)
acpiprt4 at acpi0: bus 4 (EXP2)
acpiprt5 at acpi0: bus 12 (EXP3)
acpiprt6 at acpi0: bus 21 (PCI1)
acpicpu0 at acpi0: C3, C2, C1, PSS
acpicpu1 at acpi0: C3, C2, C1, PSS
acpipwrres0 at acpi0: PUBS, resource for USB0, USB1, USB7
acpitz0 at acpi0: critical temperature is 127 degC
acpitz1 at acpi0: critical temperature is 99 degC
acpibtn0 at acpi0: LID_
acpibtn1 at acpi0: SLPB
acpibat0 at acpi0: BAT0 model "92P1129" serial  1896 type LION oem "Panasonic"
acpibat1 at acpi0: BAT1 not present
acpiac0 at acpi0: AC unit online
acpithinkpad0 at acpi0
acpidock0 at acpi0: GDCK not docked (0)
bios0: ROM list: 0xc0000/0xea00! 0xcf000/0x1600 0xd0800/0x1000 0xdc000/0x4000! 0xe0000/0x10000!
cpu0: Enhanced SpeedStep 1663 MHz: speeds: 1667, 1333, 1000 MHz
pci0 at mainbus0 bus 0: configuration mode 1 (bios)
pchb0 at pci0 dev 0 function 0 "Intel 82945GM Host" rev 0x03
vga1 at pci0 dev 2 function 0 "Intel 82945GM Video" rev 0x03
intagp0 at vga1
agp0 at intagp0: aperture at 0xd0000000, size 0x10000000
inteldrm0 at vga1
drm0 at inteldrm0
inteldrm0: 1280x800
wsdisplay0 at vga1 mux 1: console (std, vt100 emulation)
wsdisplay0: screen 1-5 added (std, vt100 emulation)
"Intel 82945GM Video" rev 0x03 at pci0 dev 2 function 1 not configured
azalia0 at pci0 dev 27 function 0 "Intel 82801GB HD Audio" rev 0x02: msi
azalia0: codecs: Analog Devices AD1981HD, Conexant/0x2bfa, using Analog Devices AD1981HD
audio0 at azalia0
ppb0 at pci0 dev 28 function 0 "Intel 82801GB PCIE" rev 0x02: apic 1 int 20
pci1 at ppb0 bus 2
bge0 at pci1 dev 0 function 0 "Broadcom BCM5752M" rev 0x02, BCM5752 A2 (0x6002): msi, address 00:16:36:93:42:23
brgphy0 at bge0 phy 1: BCM5752 10/100/1000baseT PHY, rev. 0
ppb1 at pci0 dev 28 function 1 "Intel 82801GB PCIE" rev 0x02: apic 1 int 21
pci2 at ppb1 bus 3
wpi0 at pci2 dev 0 function 0 "Intel PRO/Wireless 3945ABG" rev 0x02: msi, MoW2, address 00:18:de:89:c8:1b
ppb2 at pci0 dev 28 function 2 "Intel 82801GB PCIE" rev 0x02: apic 1 int 22
pci3 at ppb2 bus 4
ppb3 at pci0 dev 28 function 3 "Intel 82801GB PCIE" rev 0x02: apic 1 int 23
pci4 at ppb3 bus 12
uhci0 at pci0 dev 29 function 0 "Intel 82801GB USB" rev 0x02: apic 1 int 16
uhci1 at pci0 dev 29 function 1 "Intel 82801GB USB" rev 0x02: apic 1 int 17
uhci2 at pci0 dev 29 function 2 "Intel 82801GB USB" rev 0x02: apic 1 int 18
uhci3 at pci0 dev 29 function 3 "Intel 82801GB USB" rev 0x02: apic 1 int 19
ehci0 at pci0 dev 29 function 7 "Intel 82801GB USB" rev 0x02: apic 1 int 19
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 "Intel EHCI root hub" rev 2.00/1.00 addr 1
ppb4 at pci0 dev 30 function 0 "Intel 82801BAM Hub-to-PCI" rev 0xe2
pci5 at ppb4 bus 21
cbb0 at pci5 dev 0 function 0 "TI PCIXX12 CardBus" rev 0x00: apic 1 int 16
"TI PCIXX12 FireWire" rev 0x00 at pci5 dev 0 function 1 not configured
"TI PCIXX12 Multimedia Card Reader" rev 0x00 at pci5 dev 0 function 2 not configured
sdhc0 at pci5 dev 0 function 3 "TI PCIXX12 SD" rev 0x00: apic 1 int 16
sdmmc0 at sdhc0
cardslot0 at cbb0 slot 0 flags 0
cardbus0 at cardslot0: bus 22 device 0 cacheline 0x8, lattimer 0xb0
pcmcia0 at cardslot0
ichpcib0 at pci0 dev 31 function 0 "Intel 82801GBM LPC" rev 0x02: PM disabled
pciide0 at pci0 dev 31 function 1 "Intel 82801GB IDE" rev 0x02: DMA, channel 0 configured to compatibility, channel 1 configured to compatibility
atapiscsi0 at pciide0 channel 0 drive 0
scsibus1 at atapiscsi0: 2 targets
cd0 at scsibus1 targ 0 lun 0: <HL-DT-ST, DVDRAM GMA-4082N, CX08> ATAPI 5/cdrom removable
cd0(pciide0:0:0): using PIO mode 4, Ultra-DMA mode 2
pciide0: channel 1 ignored (disabled)
ahci0 at pci0 dev 31 function 2 "Intel 82801GBM AHCI" rev 0x02: msi, AHCI 1.1
scsibus2 at ahci0: 32 targets
sd0 at scsibus2 targ 0 lun 0: <ATA, C300-CTFDDAC064M, 0006> SCSI3 0/direct fixed naa.500a075103004554
sd0: 61057MB, 512 bytes/sector, 125045424 sectors, thin
ichiic0 at pci0 dev 31 function 3 "Intel 82801GB SMBus" rev 0x02: apic 1 int 23
iic0 at ichiic0
usb1 at uhci0: USB revision 1.0
uhub1 at usb1 "Intel UHCI root hub" rev 1.00/1.00 addr 1
usb2 at uhci1: USB revision 1.0
uhub2 at usb2 "Intel UHCI root hub" rev 1.00/1.00 addr 1
usb3 at uhci2: USB revision 1.0
uhub3 at usb3 "Intel UHCI root hub" rev 1.00/1.00 addr 1
usb4 at uhci3: USB revision 1.0
uhub4 at usb4 "Intel UHCI root hub" rev 1.00/1.00 addr 1
isa0 at ichpcib0
isadma0 at isa0
com1 at isa0 port 0x2f8/8 irq 3: ns16550a, 16 byte fifo
pckbc0 at isa0 port 0x60/5
pckbd0 at pckbc0 (kbd slot)
pckbc0: using irq 1 for kbd slot
wskbd0 at pckbd0: console keyboard, using wsdisplay0
pms0 at pckbc0 (aux slot)
pckbc0: using irq 12 for aux slot
wsmouse0 at pms0 mux 0
wsmouse1 at pms0 mux 0
pms0: Synaptics touchpad, firmware 6.2
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
aps0 at isa0 port 0x1600/31
npx0 at isa0 port 0xf0/16: reported by CPUID; using exception 16
ugen0 at uhub3 port 2 "STMicroelectronics Biometric Coprocessor" rev 1.00/0.01 addr 2
vscsi0 at root
scsibus3 at vscsi0: 256 targets
softraid0 at root
scsibus4 at softraid0: 256 targets
root on sd0a (705bb599d695716b.a) swap on sd0b dump on sd0b
WARNING: / was not properly unmounted

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

David Coppa
On Tue, Sep 9, 2014 at 7:27 PM, Ingo Schwarze <[hidden email]> wrote:

> i'm sorry to say it makes no difference for me (i'm not opposed to the
> diff, though).
>
> On my laptop, building ports works fine, running firefox works fine,
> but whenever i surf the web with firefox while building ports,
> the machine locks up hard.  Sometimes, the lockup already happens
> when merely starting firefox while building ports.  Often, it
> happens not when requesting a new URI, but when merely scrolling
> within the page in firefox.
>
> After the lockup, CapsLk and NmLk still toggle the respective LEDs,
> Fn-PgUp still switches on and off the torch, but nothing else has
> any effect, not even Ctrl-Alt-Esc, Ctrl-Alt-Delete, Ctrl-Alt-Backspace
> or Ctrl-Alt-F1.
>
> Unfortunately, i cannot break into ddb because i don't have a
> docking station, hence no serial console, and when going to the
> PC virtual console (Ctrl-Alt-F1), setting export DISPLAY=:0,
> and starting firefox from the console, i was unable to get any
> lockup.  Apparently, it only happens when X (or whatever) is
> actually painting something onto the screen.
>
> Whether i run with the defaults or with apm -A doesn't appear to
> make a difference.

I'm a bit confused... Is this hang happening without apmd running?

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Mark Kettenis
In reply to this post by Ingo Schwarze
> Date: Tue, 9 Sep 2014 19:27:42 +0200
> From: Ingo Schwarze <[hidden email]>
>
> Hi Mark,
>
> Mark Kettenis wrote on Mon, Sep 08, 2014 at 11:35:36PM +0200:
>
> > The more code & documentation I read, the more I'm convinced that
> > coordinating state changes between logical processors isn't necessary
> > and actually is responsible for the hangs people have been seeing.
> >
> > So here is a diff that does away with it all.  I've tested it on a few
> > laptops here, but it could use testing on a somewhat wider range of
> > machines.  I'm especially interested in seeing this tested on a dual
> > socket machine with apmd -A.
>
> i'm sorry to say it makes no difference for me (i'm not opposed to the
> diff, though).
>
> On my laptop, building ports works fine, running firefox works fine,
> but whenever i surf the web with firefox while building ports,
> the machine locks up hard.  Sometimes, the lockup already happens
> when merely starting firefox while building ports.  Often, it
> happens not when requesting a new URI, but when merely scrolling
> within the page in firefox.
>
> After the lockup, CapsLk and NmLk still toggle the respective LEDs,
> Fn-PgUp still switches on and off the torch, but nothing else has
> any effect, not even Ctrl-Alt-Esc, Ctrl-Alt-Delete, Ctrl-Alt-Backspace
> or Ctrl-Alt-F1.
>
> Unfortunately, i cannot break into ddb because i don't have a
> docking station, hence no serial console, and when going to the
> PC virtual console (Ctrl-Alt-F1), setting export DISPLAY=:0,
> and starting firefox from the console, i was unable to get any
> lockup.  Apparently, it only happens when X (or whatever) is
> actually painting something onto the screen.
>
> Whether i run with the defaults or with apm -A doesn't appear to
> make a difference.

Not sure what you mean with "defaults", but if the crashes happen even
in "manual performance adjustment mode", this diff certainly won't
magically fix things.

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Ingo Schwarze
In reply to this post by David Coppa
Hi David,

David Coppa wrote on Tue, Sep 09, 2014 at 07:44:47PM +0200:
> On Tue, Sep 9, 2014 at 7:27 PM, Ingo Schwarze <[hidden email]> wrote:

>> i'm sorry to say it makes no difference for me (i'm not opposed to the
>> diff, though).
>>
>> On my laptop, building ports works fine, running firefox works fine,
>> but whenever i surf the web with firefox while building ports,
>> the machine locks up hard.  Sometimes, the lockup already happens
>> when merely starting firefox while building ports.  Often, it
>> happens not when requesting a new URI, but when merely scrolling
>> within the page in firefox.
>>
>> After the lockup, CapsLk and NmLk still toggle the respective LEDs,
>> Fn-PgUp still switches on and off the torch, but nothing else has
>> any effect, not even Ctrl-Alt-Esc, Ctrl-Alt-Delete, Ctrl-Alt-Backspace
>> or Ctrl-Alt-F1.
>>
>> Unfortunately, i cannot break into ddb because i don't have a
>> docking station, hence no serial console, and when going to the
>> PC virtual console (Ctrl-Alt-F1), setting export DISPLAY=:0,
>> and starting firefox from the console, i was unable to get any
>> lockup.  Apparently, it only happens when X (or whatever) is
>> actually painting something onto the screen.
>>
>> Whether i run with the defaults or with apm -A doesn't appear to
>> make a difference.

> I'm a bit confused... Is this hang happening without apmd running?

Yes.  That doesn't make a difference, either.

Usually, i run with apmd in default mode:

  ischwarze@isnote $ grep apm /etc/rc.conf.local
  apmd_flags=""

But with apmd_flags="-A" or apmd_flags=NO the hangs happen in
exactly the same way.

Yours,
  Ingo

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

David Coppa
On Tue, Sep 9, 2014 at 7:58 PM, Ingo Schwarze <[hidden email]> wrote:

> Hi David,
>
> David Coppa wrote on Tue, Sep 09, 2014 at 07:44:47PM +0200:
>> On Tue, Sep 9, 2014 at 7:27 PM, Ingo Schwarze <[hidden email]> wrote:
>
>>> i'm sorry to say it makes no difference for me (i'm not opposed to the
>>> diff, though).
>>>
>>> On my laptop, building ports works fine, running firefox works fine,
>>> but whenever i surf the web with firefox while building ports,
>>> the machine locks up hard.  Sometimes, the lockup already happens
>>> when merely starting firefox while building ports.  Often, it
>>> happens not when requesting a new URI, but when merely scrolling
>>> within the page in firefox.
>>>
>>> After the lockup, CapsLk and NmLk still toggle the respective LEDs,
>>> Fn-PgUp still switches on and off the torch, but nothing else has
>>> any effect, not even Ctrl-Alt-Esc, Ctrl-Alt-Delete, Ctrl-Alt-Backspace
>>> or Ctrl-Alt-F1.
>>>
>>> Unfortunately, i cannot break into ddb because i don't have a
>>> docking station, hence no serial console, and when going to the
>>> PC virtual console (Ctrl-Alt-F1), setting export DISPLAY=:0,
>>> and starting firefox from the console, i was unable to get any
>>> lockup.  Apparently, it only happens when X (or whatever) is
>>> actually painting something onto the screen.
>>>
>>> Whether i run with the defaults or with apm -A doesn't appear to
>>> make a difference.
>
>> I'm a bit confused... Is this hang happening without apmd running?
>
> Yes.  That doesn't make a difference, either.
>
> Usually, i run with apmd in default mode:
>
>   ischwarze@isnote $ grep apm /etc/rc.conf.local
>   apmd_flags=""
>
> But with apmd_flags="-A" or apmd_flags=NO the hangs happen in
> exactly the same way.

So I'm with Mark here, I also think your hang is unrelated to this diff.

ciao!
David

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Amit Kulkarni-5
On Tue, Sep 9, 2014 at 2:13 PM, David Coppa <[hidden email]> wrote:

> On Tue, Sep 9, 2014 at 7:58 PM, Ingo Schwarze <[hidden email]> wrote:
> > Hi David,
> >
> > David Coppa wrote on Tue, Sep 09, 2014 at 07:44:47PM +0200:
> >> On Tue, Sep 9, 2014 at 7:27 PM, Ingo Schwarze <[hidden email]> wrote:
> >
> >>> i'm sorry to say it makes no difference for me (i'm not opposed to the
> >>> diff, though).
> >>>
> >>> On my laptop, building ports works fine, running firefox works fine,
> >>> but whenever i surf the web with firefox while building ports,
> >>> the machine locks up hard.  Sometimes, the lockup already happens
> >>> when merely starting firefox while building ports.  Often, it
> >>> happens not when requesting a new URI, but when merely scrolling
> >>> within the page in firefox.
> >>>
> >>> After the lockup, CapsLk and NmLk still toggle the respective LEDs,
> >>> Fn-PgUp still switches on and off the torch, but nothing else has
> >>> any effect, not even Ctrl-Alt-Esc, Ctrl-Alt-Delete, Ctrl-Alt-Backspace
> >>> or Ctrl-Alt-F1.
> >>>
> >>> Unfortunately, i cannot break into ddb because i don't have a
> >>> docking station, hence no serial console, and when going to the
> >>> PC virtual console (Ctrl-Alt-F1), setting export DISPLAY=:0,
> >>> and starting firefox from the console, i was unable to get any
> >>> lockup.  Apparently, it only happens when X (or whatever) is
> >>> actually painting something onto the screen.
> >>>
> >>> Whether i run with the defaults or with apm -A doesn't appear to
> >>> make a difference.
> >
> >> I'm a bit confused... Is this hang happening without apmd running?
> >
> > Yes.  That doesn't make a difference, either.
> >
> > Usually, i run with apmd in default mode:
> >
> >   ischwarze@isnote $ grep apm /etc/rc.conf.local
> >   apmd_flags=""
> >
> > But with apmd_flags="-A" or apmd_flags=NO the hangs happen in
> > exactly the same way.
>
> So I'm with Mark here, I also think your hang is unrelated to this diff.
>
>

+1

Ingo,
A basic rule of thumb when building ports: raise your /etc/login.conf
limits...especially datasize-cur needs to be 2G and datasize-max needs to
be 3G. The reason being there are some ports where the linker blows up to
2G or slightly over. The worst offenders are usually the www/webkit or
chrome or firefox. Though py-py also takes a lot of memory.

There is also another well-known bug in the I/O path which espie@ referred
to a few months ago. But it is as yet undetected? It rears its ugly head
when your machine does a lot of I/O. Try running cvsync, building ports,
run a find/grep over ports tree, and try to browse with firefox all at the
same time. The system feels as if it goes into a hang. But give it a few
seconds and it comes back normally. Is this what is happening with you?
Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Ingo Schwarze
Hi Amit,

Amit Kulkarni wrote on Tue, Sep 09, 2014 at 08:47:22PM -0500:

> A basic rule of thumb when building ports: raise your /etc/login.conf
> limits...especially datasize-cur needs to be 2G and datasize-max needs to
> be 3G. The reason being there are some ports where the linker blows up to
> 2G or slightly over. The worst offenders are usually the www/webkit or
> chrome or firefox. Though py-py also takes a lot of memory.

Sure.  When i see ld(1) dying from signals, i'll bump limits.  But
that's not what i'm talking about.  Memory allocation failure
in ld(1) is hopefully not going to cause a hard kernel lockup.
Besides, almost all the kernel lockups i saw today happened while
trying to build either archivers/gtar or misc/fileutils.  Those are
not going to hit any limits.

> There is also another well-known bug in the I/O path which espie@
> referred to a few months ago.  But it is as yet undetected?  It rears
> its ugly head when your machine does a lot of I/O.  Try running cvsync,
> building ports, run a find/grep over ports tree, and try to browse
> with firefox all at the same time.  The system feels as if it goes
> into a hang.  But give it a few seconds and it comes back normally.
> Is this what is happening with you?

I did see temporary userland lockups caused by firefox in the past,
though not lately, and i'm not sure it was the same you are referring
to.  Besides, those could always be interrupted with Ctrl-Alt-Backspace.
What i'm seeing here locks up the kernel, not merely X, and for good.

Anyway, i'm going to shut up now.  As i can't even provide a
backtrace, tech@ is clearly the wrong list.  At least i should
bisect when asking for help, but i can't afford the time right now.

I'll continue to test floating patches that seem possibly related
(to my naive understanding), and i'm sure kettenis@ will continue
to fondly remind me that a patch for FOO is not going to magically
fix BAR whenever my understanding turns out to be nothing but a
mis- (thanks for that).

Yours,
  Ingo

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Stuart Henderson-6
On 2014/09/10 04:44, Ingo Schwarze wrote:
> Sure.  When i see ld(1) dying from signals, i'll bump limits.  But
> that's not what i'm talking about.  Memory allocation failure
> in ld(1) is hopefully not going to cause a hard kernel lockup.
>
> Besides, almost all the kernel lockups i saw today happened while
> trying to build either archivers/gtar or misc/fileutils.  Those are
> not going to hit any limits.

There are certainly some hard kernel lockups that are hit frequently
on systems where memory is exhausted (limits or not), though a gtar build
(with a 58MB maximum RSS on my laptop) is unlikely to trigger this unless
the machine is otherwise under very high memory pressure..

> Anyway, i'm going to shut up now.  As i can't even provide a
> backtrace, tech@ is clearly the wrong list.

Not necessarily, as some things are resistant to traditional debugging
and people who don't have deep experience of the involved subsystems
are unlikely to know where to start poking to get additional data points
that might help track things down..

Reply | Threaded
Open this post in threaded view
|

Re: apmd hangs

Giovanni Bechis-7
In reply to this post by Mark Kettenis
On 09/08/14 23:35, Mark Kettenis wrote:
> The more code & documentation I read, the more I'm convinced that
> coordinating state changes between logical processors isn't necessary
> and actually is responsible for the hangs people have been seeing.
>
> So here is a diff that does away with it all.  I've tested it on a few
> laptops here, but it could use testing on a somewhat wider range of
> machines.  I'm especially interested in seeing this tested on a dual
> socket machine with apmd -A.
>
finally no more hangs on this machine running with "apmd -A"
 Cheers
  Giovanni

dmesg.txt (7K) Download Attachment