Saving stack frames in dt(4)

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

Saving stack frames in dt(4)

Martin Pieuchot
People are starting to capture kernel stack traces and produce
Flamegraphs.  Those traces currently include the frames used by
dt(4) itself:

    dt_pcb_ring_get+0x11d
    dt_prov_profile_enter+0x6e
    hardclock+0x1a9
    lapic_clockintr+0x3f
    Xresume_lapic_ltimer+0x26
    acpicpu_idle+0x261 <--- CPU was Idle
    sched_idle+0x225
    proc_trampoline+0x1c
    1

Saving the last 5 frames in the example above consumes CPU time and
require some post-processing to cleanup gathered data.  So the diff
below extends the existing stacktrace_save() into and *_at() version
that takes an arbitrary frame as argument.

The fun start when we ask "How many frames do we skip?".  There are
currently two different contexts in which dt(4) can gather data:
 - in hardlock(9), which mean a specialized interrupt context
 - in any thread or interrupt context

Those two contexts require a different number of frames to reach the
recording function: dt_pcb_ring_get().  Now what's fun is that every
architecture has its own way to reach hardclock(9) (possibly multiple
ones), has its own stack unwinder (with specific bugs) and might use
different compilers with different optimizations (yeah!).

So the diff below introduce two per-arch defines that specify at which
frame the unwinding should start.  This has been tested on sparc64 and
amd64 and here's the result:

    # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
    sleep_setup+0x16c
    msleep+0x160
    taskq_next_work+0x5c
    taskq_thread+0x38
    0x1013074
    1

Note that the number of frames for hardclock(9) on amd64 assumes lapic
is used.  That's why on vmm(4) you'll still see an extra frame as below:

  #  btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'          
  Xresume_legacy0+0x1d3
  cpu_idle_cycle+0x1b
  proc_trampoline+0x1c
  1

Whereas on real hardware:

  # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
  acpicpu_idle+0x1d2
  sched_idle+0x225
  proc_trampoline+0x1c
  1

The diff below will break compiling dt(4) on the other archs because
they don't yet provide stacktrace_save_at(), but it shouldn't be hard
to fix ;)

Ok?

Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4
+++ dev/dt/dt_dev.c 19 Mar 2020 14:08:53 -0000
@@ -26,29 +26,40 @@
 #include <dev/dt/dtvar.h>
 
 /*
- * How many frames are used by the profiling code?  For example
- * on amd64:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
  *
- * From syscall provider:
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ *function.  For example the syscall provider:
  *
  * dt_prov_syscall_entry+0x141
  * syscall+0x205 <--- start here
  * Xsyscall+0x128
  *
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture.  For example the profile provider executed
+ * from hardclock(9) on amd64:
  *
  * dt_prov_profile_enter+0x6e
- * hardclock+0x12c
- * clockintr+0x59
- * intr_handler+0x6e
- * Xresume_legacy0+0x1d3
- * cpu_idle_cycle+0x1b <---- start here.
+ * hardclock+0x1a9
+ * lapic_clockintr+0x3f
+ * Xresume_lapic_ltimer+0x26
+ * acpicpu_idle+0x1d2 <---- start here.
+ * sched_idle+0x225
  * proc_trampoline+0x1c
  */
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(1)
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(0)
 #else
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0)
+#define DT_FA_STATIC __builtin_frame_address(0)
+#define DT_FA_PROFILE DT_FA_STATIC
 #endif
 
 #define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
@@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp)
  * Get a reference to the next free event state from the ring.
  */
 struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
 {
  struct proc *p = curproc;
  struct dt_evt *dtev;
@@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
  memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
 
  if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
- stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
- stacktrace_save(&dtev->dtev_kstack);
-#endif
+ if (profiling)
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+ else
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
  }
 
  return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
  if (++dp->dp_nticks < dp->dp_maxtick)
  return;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 1);
  if (dtev == NULL)
  return;
  dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *);
 void dt_pcb_purge(struct dt_pcb_list *);
 int dt_pcb_filter(struct dt_pcb *);
 
-struct dt_evt *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int);
 void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
 
 /*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1
+++ sys/stacktrace.h 19 Mar 2020 14:06:01 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
 
 #ifdef _KERNEL
 void stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void stacktrace_save_at(struct stacktrace *, void *);
 void stacktrace_save(struct stacktrace *);
 #endif
 
Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49
+++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
 {
- struct callframe *frame, *lastframe;
+ struct callframe *lastframe, *frame = xframe;
+
+ if (frame == NULL)
+ frame = __builtin_frame_address(0);
 
- frame = __builtin_frame_address(0);
  st->st_count = 0;
  while (st->st_count < STACKTRACE_MAX) {
  st->st_pc[st->st_count++] = frame->f_retaddr;
@@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st)
  if (!INKERNEL(frame->f_retaddr))
  break;
  }
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
 }
 
 vaddr_t
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20
+++ arch/sparc64/sparc64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
 {
  struct frame64 *f64;
  vaddr_t pc;
@@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st)
 
  write_all_windows();
 
- frame = (vaddr_t)__builtin_frame_address(0) - BIAS;
+ if (xframe == NULL)
+ xframe = __builtin_frame_address(0);
+
+ frame = (vaddr_t)xframe - BIAS;
  if ((frame & 1) == 0)
  return;
 
@@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st)
  }
 }
 
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
+}
 void
 db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
 {

Reply | Threaded
Open this post in threaded view
|

Re: Saving stack frames in dt(4)

Martin Pieuchot
On 19/03/20(Thu) 15:37, Martin Pieuchot wrote:

> People are starting to capture kernel stack traces and produce
> Flamegraphs.  Those traces currently include the frames used by
> dt(4) itself:
>
>     dt_pcb_ring_get+0x11d
>     dt_prov_profile_enter+0x6e
>     hardclock+0x1a9
>     lapic_clockintr+0x3f
>     Xresume_lapic_ltimer+0x26
>     acpicpu_idle+0x261 <--- CPU was Idle
>     sched_idle+0x225
>     proc_trampoline+0x1c
>     1
>
> Saving the last 5 frames in the example above consumes CPU time and
> require some post-processing to cleanup gathered data.  So the diff
> below extends the existing stacktrace_save() into and *_at() version
> that takes an arbitrary frame as argument.
>
> The fun start when we ask "How many frames do we skip?".  There are
> currently two different contexts in which dt(4) can gather data:
>  - in hardlock(9), which mean a specialized interrupt context
>  - in any thread or interrupt context
>
> Those two contexts require a different number of frames to reach the
> recording function: dt_pcb_ring_get().  Now what's fun is that every
> architecture has its own way to reach hardclock(9) (possibly multiple
> ones), has its own stack unwinder (with specific bugs) and might use
> different compilers with different optimizations (yeah!).
>
> So the diff below introduce two per-arch defines that specify at which
> frame the unwinding should start.  This has been tested on sparc64 and
> amd64 and here's the result:
>
>     # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
>     sleep_setup+0x16c
>     msleep+0x160
>     taskq_next_work+0x5c
>     taskq_thread+0x38
>     0x1013074
>     1
>
> Note that the number of frames for hardclock(9) on amd64 assumes lapic
> is used.  That's why on vmm(4) you'll still see an extra frame as below:
>
>   #  btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'          
>   Xresume_legacy0+0x1d3
>   cpu_idle_cycle+0x1b
>   proc_trampoline+0x1c
>   1
>
> Whereas on real hardware:
>
>   # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
>   acpicpu_idle+0x1d2
>   sched_idle+0x225
>   proc_trampoline+0x1c
>   1
>
> The diff below will break compiling dt(4) on the other archs because
> they don't yet provide stacktrace_save_at(), but it shouldn't be hard
> to fix ;)

Updated diff including stacktrace_save_at() for i386, any comment on the
approach or Ok?

Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49
+++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
 {
- struct callframe *frame, *lastframe;
+ struct callframe *lastframe, *frame = xframe;
+
+ if (frame == NULL)
+ frame = __builtin_frame_address(0);
 
- frame = __builtin_frame_address(0);
  st->st_count = 0;
  while (st->st_count < STACKTRACE_MAX) {
  st->st_pc[st->st_count++] = frame->f_retaddr;
@@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st)
  if (!INKERNEL(frame->f_retaddr))
  break;
  }
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
 }
 
 vaddr_t
Index: arch/i386/i386/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.38
diff -u -p -r1.38 db_trace.c
--- arch/i386/i386/db_trace.c 20 Jan 2020 15:58:23 -0000 1.38
+++ arch/i386/i386/db_trace.c 20 Mar 2020 10:50:28 -0000
@@ -263,9 +263,17 @@ db_stack_trace_print(db_expr_t addr, int
 void
 stacktrace_save(struct stacktrace *st)
 {
- struct callframe *frame, *lastframe;
+ return stacktrace_save_at(st, NULL);
+}
+
+void
+stacktrace_save_at(struct stacktrace *st, void *xframe)
+{
+ struct callframe *lastframe, *frame = xframe;
+
+ if (frame == NULL)
+ frame = __builtin_frame_address(0);
 
- frame = __builtin_frame_address(0);
  st->st_count = 0;
  while (st->st_count < STACKTRACE_MAX) {
  st->st_pc[st->st_count++] = frame->f_retaddr;
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20
+++ arch/sparc64/sparc64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
 {
  struct frame64 *f64;
  vaddr_t pc;
@@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st)
 
  write_all_windows();
 
- frame = (vaddr_t)__builtin_frame_address(0) - BIAS;
+ if (xframe == NULL)
+ xframe = __builtin_frame_address(0);
+
+ frame = (vaddr_t)xframe - BIAS;
  if ((frame & 1) == 0)
  return;
 
@@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st)
  }
 }
 
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
+}
 void
 db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
 {
Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4
+++ dev/dt/dt_dev.c 19 Mar 2020 14:08:53 -0000
@@ -26,29 +26,40 @@
 #include <dev/dt/dtvar.h>
 
 /*
- * How many frames are used by the profiling code?  For example
- * on amd64:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
  *
- * From syscall provider:
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ *function.  For example the syscall provider:
  *
  * dt_prov_syscall_entry+0x141
  * syscall+0x205 <--- start here
  * Xsyscall+0x128
  *
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture.  For example the profile provider executed
+ * from hardclock(9) on amd64:
  *
  * dt_prov_profile_enter+0x6e
- * hardclock+0x12c
- * clockintr+0x59
- * intr_handler+0x6e
- * Xresume_legacy0+0x1d3
- * cpu_idle_cycle+0x1b <---- start here.
+ * hardclock+0x1a9
+ * lapic_clockintr+0x3f
+ * Xresume_lapic_ltimer+0x26
+ * acpicpu_idle+0x1d2 <---- start here.
+ * sched_idle+0x225
  * proc_trampoline+0x1c
  */
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(1)
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(0)
 #else
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0)
+#define DT_FA_STATIC __builtin_frame_address(0)
+#define DT_FA_PROFILE DT_FA_STATIC
 #endif
 
 #define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
@@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp)
  * Get a reference to the next free event state from the ring.
  */
 struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
 {
  struct proc *p = curproc;
  struct dt_evt *dtev;
@@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
  memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
 
  if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
- stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
- stacktrace_save(&dtev->dtev_kstack);
-#endif
+ if (profiling)
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+ else
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
  }
 
  return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
  if (++dp->dp_nticks < dp->dp_maxtick)
  return;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 1);
  if (dtev == NULL)
  return;
  dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *);
 void dt_pcb_purge(struct dt_pcb_list *);
 int dt_pcb_filter(struct dt_pcb *);
 
-struct dt_evt *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int);
 void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
 
 /*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1
+++ sys/stacktrace.h 19 Mar 2020 14:06:01 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
 
 #ifdef _KERNEL
 void stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void stacktrace_save_at(struct stacktrace *, void *);
 void stacktrace_save(struct stacktrace *);
 #endif
 

Reply | Threaded
Open this post in threaded view
|

Re: Saving stack frames in dt(4)

Martin Pieuchot
On 23/03/20(Mon) 15:18, Martin Pieuchot wrote:

> On 19/03/20(Thu) 15:37, Martin Pieuchot wrote:
> > People are starting to capture kernel stack traces and produce
> > Flamegraphs.  Those traces currently include the frames used by
> > dt(4) itself:
> >
> >     dt_pcb_ring_get+0x11d
> >     dt_prov_profile_enter+0x6e
> >     hardclock+0x1a9
> >     lapic_clockintr+0x3f
> >     Xresume_lapic_ltimer+0x26
> >     acpicpu_idle+0x261 <--- CPU was Idle
> >     sched_idle+0x225
> >     proc_trampoline+0x1c
> >     1
> >
> > Saving the last 5 frames in the example above consumes CPU time and
> > require some post-processing to cleanup gathered data.  So the diff
> > below extends the existing stacktrace_save() into and *_at() version
> > that takes an arbitrary frame as argument.
> >
> > The fun start when we ask "How many frames do we skip?".  There are
> > currently two different contexts in which dt(4) can gather data:
> >  - in hardlock(9), which mean a specialized interrupt context
> >  - in any thread or interrupt context
> >
> > Those two contexts require a different number of frames to reach the
> > recording function: dt_pcb_ring_get().  Now what's fun is that every
> > architecture has its own way to reach hardclock(9) (possibly multiple
> > ones), has its own stack unwinder (with specific bugs) and might use
> > different compilers with different optimizations (yeah!).
> >
> > So the diff below introduce two per-arch defines that specify at which
> > frame the unwinding should start.  This has been tested on sparc64 and
> > amd64 and here's the result:
> >
> >     # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
> >     sleep_setup+0x16c
> >     msleep+0x160
> >     taskq_next_work+0x5c
> >     taskq_thread+0x38
> >     0x1013074
> >     1
> >
> > Note that the number of frames for hardclock(9) on amd64 assumes lapic
> > is used.  That's why on vmm(4) you'll still see an extra frame as below:
> >
> >   #  btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'          
> >   Xresume_legacy0+0x1d3
> >   cpu_idle_cycle+0x1b
> >   proc_trampoline+0x1c
> >   1
> >
> > Whereas on real hardware:
> >
> >   # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
> >   acpicpu_idle+0x1d2
> >   sched_idle+0x225
> >   proc_trampoline+0x1c
> >   1
> >
> > The diff below will break compiling dt(4) on the other archs because
> > they don't yet provide stacktrace_save_at(), but it shouldn't be hard
> > to fix ;)
>
> Updated diff including stacktrace_save_at() for i386, any comment on the
> approach or Ok?

New diff, new approach suggested by visa@.  Passes a number of frames to
skip to the unwinding function.  This is compatible with the mips64
unwinder and makes sure that the __builtin_frame_address() isn't called
with an index other than 0.

Indexes have been adjusted because __builtin_frame_address(0) is now in
a deeper frame.

Ok?

Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49
+++ arch/amd64/amd64/db_trace.c 24 Mar 2020 13:03:35 -0000
@@ -256,14 +256,17 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
 {
  struct callframe *frame, *lastframe;
 
  frame = __builtin_frame_address(0);
  st->st_count = 0;
  while (st->st_count < STACKTRACE_MAX) {
- st->st_pc[st->st_count++] = frame->f_retaddr;
+ if (skip == 0)
+ st->st_pc[st->st_count++] = frame->f_retaddr;
+ else
+ skip--;
 
  lastframe = frame;
  frame = frame->f_frame;
@@ -275,6 +278,12 @@ stacktrace_save(struct stacktrace *st)
  if (!INKERNEL(frame->f_retaddr))
  break;
  }
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, 0);
 }
 
 vaddr_t
Index: arch/i386/i386/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.38
diff -u -p -r1.38 db_trace.c
--- arch/i386/i386/db_trace.c 20 Jan 2020 15:58:23 -0000 1.38
+++ arch/i386/i386/db_trace.c 24 Mar 2020 12:59:34 -0000
@@ -261,14 +261,17 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
 {
  struct callframe *frame, *lastframe;
 
  frame = __builtin_frame_address(0);
  st->st_count = 0;
  while (st->st_count < STACKTRACE_MAX) {
- st->st_pc[st->st_count++] = frame->f_retaddr;
+ if (skip == 0)
+ st->st_pc[st->st_count++] = frame->f_retaddr;
+ else
+ skip--;
 
  lastframe = frame;
  frame = frame->f_frame;
@@ -280,6 +283,12 @@ stacktrace_save(struct stacktrace *st)
  if (!INKERNEL(frame->f_retaddr))
  break;
  }
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, 0);
 }
 
 vaddr_t
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20
+++ arch/sparc64/sparc64/db_trace.c 24 Mar 2020 13:08:55 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
 {
  struct frame64 *f64;
  vaddr_t pc;
@@ -181,8 +181,17 @@ stacktrace_save(struct stacktrace *st)
  if ((frame & 1) == 0)
  break;
 
- st->st_pc[st->st_count++] = pc;
+ if (skip == 0)
+ st->st_pc[st->st_count++] = pc;
+ else
+ skip--;
  }
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, 0);
 }
 
 void
Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4
+++ dev/dt/dt_dev.c 24 Mar 2020 13:22:04 -0000
@@ -26,29 +26,42 @@
 #include <dev/dt/dtvar.h>
 
 /*
- * How many frames are used by the profiling code?  For example
- * on amd64:
+ * Number of frames to skip in stack traces.
  *
- * From syscall provider:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
+ *
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ * function.  For example the syscall provider:
  *
  * dt_prov_syscall_entry+0x141
  * syscall+0x205 <--- start here
  * Xsyscall+0x128
  *
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture.  For example the profile provider executed
+ * from hardclock(9) on amd64:
  *
  * dt_prov_profile_enter+0x6e
- * hardclock+0x12c
- * clockintr+0x59
- * intr_handler+0x6e
- * Xresume_legacy0+0x1d3
- * cpu_idle_cycle+0x1b <---- start here.
+ * hardclock+0x1a9
+ * lapic_clockintr+0x3f
+ * Xresume_lapic_ltimer+0x26
+ * acpicpu_idle+0x1d2 <---- start here.
+ * sched_idle+0x225
  * proc_trampoline+0x1c
  */
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE 5
+#define DT_FA_STATIC 2
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE 5
+#define DT_FA_STATIC 1
 #else
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0)
+#define DT_FA_STATIC 0
+#define DT_FA_PROFILE 0
 #endif
 
 #define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
@@ -583,7 +596,7 @@ dt_pcb_filter(struct dt_pcb *dp)
  * Get a reference to the next free event state from the ring.
  */
 struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
 {
  struct proc *p = curproc;
  struct dt_evt *dtev;
@@ -617,11 +630,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
  memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
 
  if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
- stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
- stacktrace_save(&dtev->dtev_kstack);
-#endif
+ if (profiling)
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+ else
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
  }
 
  return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
  if (++dp->dp_nticks < dp->dp_maxtick)
  return;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 1);
  if (dtev == NULL)
  return;
  dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
  SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
  struct dt_evt *dtev;
 
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
  if (dtev == NULL)
  continue;
 
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *);
 void dt_pcb_purge(struct dt_pcb_list *);
 int dt_pcb_filter(struct dt_pcb *);
 
-struct dt_evt *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int);
 void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
 
 /*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1
+++ sys/stacktrace.h 24 Mar 2020 13:00:54 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
 
 #ifdef _KERNEL
 void stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void stacktrace_save_at(struct stacktrace *, unsigned int);
 void stacktrace_save(struct stacktrace *);
 #endif