Scheduler improvements, take 1001


Scheduler improvements, take 1001

Gregor Best
(Starting a new thread, by popular request.)

Hi people,

I've tried splitting my scheduler patch into smaller fragments, and
here's the result.

I changed a few things people mentioned over the last few days, such as
the following:

1) sys/proc.h now includes sys/tree.h, which should make libc builds
        work again.
2) deadline generation now takes process priorities into account, as
        suggested by ratchov@. The way it's done now, processes can use their
        sleep priority to lower their effective nice value for short periods
        of time (see the sketch after this list). I didn't notice any real
        changes, but I'd love to hear from people with more demanding
        applications.
3) schedstate_percpu is private to the kernel now, as I couldn't find a
        single occurrence of `struct schedstate_percpu` outside of /usr/src/sys
        and it seemed cleaner not to expose kernel data to userland in such a
        broad way.
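
(Sketch for item 2, condensed from generate_deadline() in Patch 1 below;
the identifiers are the patch's own:)

    /*
     * Lower nice values and lower (= better) sleep priorities give
     * earlier deadlines; a voluntary yield halves the offset by
     * shifting by 2 instead of 3.
     */
    offset_msec = ((p->p_p->ps_nice + (p->p_priority >> 1) + 1)
        * rrticks_init) << (voluntary ? 2 : 3);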

The patches will follow as separate emails.

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 1/5

Gregor Best
diff --git a/kern/kern_clock.c b/kern/kern_clock.c
index 843965b..f598afc 100644
--- a/kern/kern_clock.c
+++ b/kern/kern_clock.c
@@ -233,7 +233,7 @@ hardclock(struct clockframe *frame)
  if (stathz == 0)
  statclock(frame);
 
- if (--ci->ci_schedstate.spc_rrticks <= 0)
+ if (p && (--(p->p_rrticks) <= 0))
  roundrobin(ci);
 
  /*
diff --git a/kern/kern_proc.c b/kern/kern_proc.c
index ad861c8..e0d5536 100644
--- a/kern/kern_proc.c
+++ b/kern/kern_proc.c
@@ -398,8 +398,6 @@ proc_printit(struct proc *p, const char *modif, int (*pr)(const char *, ...))
     p->p_comm, p->p_pid, pst, p->p_flag, P_BITS);
  (*pr)("    pri=%u, usrpri=%u, nice=%d\n",
     p->p_priority, p->p_usrpri, p->p_p->ps_nice);
- (*pr)("    forw=%p, list=%p,%p\n",
-    TAILQ_NEXT(p, p_runq), p->p_list.le_next, p->p_list.le_prev);
  (*pr)("    process=%p user=%p, vmspace=%p\n",
     p->p_p, p->p_addr, p->p_vmspace);
  (*pr)("    estcpu=%u, cpticks=%d, pctcpu=%u.%u, swtime=%u\n",
diff --git a/kern/kern_sched.c b/kern/kern_sched.c
index 253226a..79eb28c 100644
--- a/kern/kern_sched.c
+++ b/kern/kern_sched.c
@@ -24,11 +24,22 @@
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/mutex.h>
+#include <sys/tree.h>
 
 #include <uvm/uvm_extern.h>
 
 #include <sys/malloc.h>
 
+static int
+sched_cmp_proc(struct proc *a, struct proc *b) {
+ if (a == b)
+ return 0;
+ if (timercmp(&(a->p_deadline), &(b->p_deadline), <))
+ return -1;
+ return 1;
+}
+
+RB_GENERATE_STATIC(prochead, proc, p_runq, sched_cmp_proc);
 
 void sched_kthreads_create(void *);
 
@@ -79,10 +90,8 @@ void
 sched_init_cpu(struct cpu_info *ci)
 {
  struct schedstate_percpu *spc = &ci->ci_schedstate;
- int i;
 
- for (i = 0; i < SCHED_NQS; i++)
- TAILQ_INIT(&spc->spc_qs[i]);
+ RB_INIT(&spc->spc_runq);
 
  spc->spc_idleproc = NULL;
 
@@ -158,18 +167,19 @@ sched_idle(void *v)
 
  cpuset_add(&sched_idle_cpus, ci);
  cpu_idle_enter();
- while (spc->spc_whichqs == 0) {
+
+ while (curcpu_is_idle()) {
  if (spc->spc_schedflags & SPCF_SHOULDHALT &&
-    (spc->spc_schedflags & SPCF_HALTED) == 0) {
+ (spc->spc_schedflags & SPCF_HALTED) == 0) {
  cpuset_del(&sched_idle_cpus, ci);
  SCHED_LOCK(s);
- atomic_setbits_int(&spc->spc_schedflags,
-    spc->spc_whichqs ? 0 : SPCF_HALTED);
+ atomic_setbits_int(&spc->spc_schedflags, SPCF_HALTED);
  SCHED_UNLOCK(s);
  wakeup(spc);
  }
  cpu_idle_cycle();
  }
+
  cpu_idle_leave();
  cpuset_del(&sched_idle_cpus, ci);
  }
@@ -222,14 +232,13 @@ void
 setrunqueue(struct proc *p)
 {
  struct schedstate_percpu *spc;
- int queue = p->p_priority >> 2;
 
  SCHED_ASSERT_LOCKED();
  spc = &p->p_cpu->ci_schedstate;
  spc->spc_nrun++;
 
- TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
- spc->spc_whichqs |= (1 << queue);
+ KASSERT(!RB_FIND(prochead, &spc->spc_runq, p));
+ RB_INSERT(prochead, &spc->spc_runq, p);
  cpuset_add(&sched_queued_cpus, p->p_cpu);
 
  if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
@@ -240,38 +249,29 @@ void
 remrunqueue(struct proc *p)
 {
  struct schedstate_percpu *spc;
- int queue = p->p_priority >> 2;
 
  SCHED_ASSERT_LOCKED();
  spc = &p->p_cpu->ci_schedstate;
  spc->spc_nrun--;
 
- TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
- if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
- spc->spc_whichqs &= ~(1 << queue);
- if (spc->spc_whichqs == 0)
- cpuset_del(&sched_queued_cpus, p->p_cpu);
- }
+ KASSERT(RB_REMOVE(prochead, &spc->spc_runq, p));
+ if (RB_EMPTY(&spc->spc_runq))
+ cpuset_del(&sched_queued_cpus, p->p_cpu);
 }
 
 struct proc *
 sched_chooseproc(void)
 {
  struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
- struct proc *p;
- int queue;
+ struct proc *p, *p_tmp = NULL;
 
  SCHED_ASSERT_LOCKED();
 
  if (spc->spc_schedflags & SPCF_SHOULDHALT) {
- if (spc->spc_whichqs) {
- for (queue = 0; queue < SCHED_NQS; queue++) {
- TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
- remrunqueue(p);
- p->p_cpu = sched_choosecpu(p);
- setrunqueue(p);
- }
- }
+ RB_FOREACH_SAFE(p, prochead, &spc->spc_runq, p_tmp) {
+ remrunqueue(p);
+ p->p_cpu = sched_choosecpu(p);
+ setrunqueue(p);
  }
  p = spc->spc_idleproc;
  KASSERT(p);
@@ -280,17 +280,14 @@ sched_chooseproc(void)
  return (p);
  }
 
-again:
- if (spc->spc_whichqs) {
- queue = ffs(spc->spc_whichqs) - 1;
- p = TAILQ_FIRST(&spc->spc_qs[queue]);
+ if (!RB_EMPTY(&spc->spc_runq)) {
+ p = RB_MIN(prochead, &spc->spc_runq);
  remrunqueue(p);
  sched_noidle++;
  KASSERT(p->p_stat == SRUN);
  } else if ((p = sched_steal_proc(curcpu())) == NULL) {
- p = spc->spc_idleproc;
- if (p == NULL) {
-                        int s;
+ while ((p = spc->spc_idleproc) == NULL) {
+ int s;
  /*
  * We get here if someone decides to switch during
  * boot before forking kthreads, bleh.
@@ -302,8 +299,7 @@ again:
  spl0();
  delay(10);
  SCHED_LOCK(s);
- goto again;
-                }
+ }
  KASSERT(p);
  p->p_stat = SRUN;
  }
@@ -441,15 +437,13 @@ sched_steal_proc(struct cpu_info *self)
 
  while ((ci = cpuset_first(&set)) != NULL) {
  struct proc *p;
- int queue;
  int cost;
 
  cpuset_del(&set, ci);
 
  spc = &ci->ci_schedstate;
 
- queue = ffs(spc->spc_whichqs) - 1;
- TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
+ RB_FOREACH(p, prochead, &spc->spc_runq) {
  if (p->p_flag & P_CPUPEG)
  continue;
 
diff --git a/kern/kern_synch.c b/kern/kern_synch.c
index f1db615..43716b3 100644
--- a/kern/kern_synch.c
+++ b/kern/kern_synch.c
@@ -205,7 +205,7 @@ sleep_setup(struct sleep_state *sls, const volatile void *ident, int prio,
  p->p_wmesg = wmesg;
  p->p_slptime = 0;
  p->p_priority = prio & PRIMASK;
- TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
+ TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
 }
 
 void
@@ -342,7 +342,7 @@ void
 unsleep(struct proc *p)
 {
  if (p->p_wchan) {
- TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq);
+ TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
  p->p_wchan = NULL;
  }
 }
@@ -361,7 +361,7 @@ wakeup_n(const volatile void *ident, int n)
  SCHED_LOCK(s);
  qp = &slpque[LOOKUP(ident)];
  for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
- pnext = TAILQ_NEXT(p, p_runq);
+ pnext = TAILQ_NEXT(p, p_slpq);
 #ifdef DIAGNOSTIC
  if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
  panic("wakeup: p_stat is %d", (int)p->p_stat);
@@ -369,7 +369,7 @@ wakeup_n(const volatile void *ident, int n)
  if (p->p_wchan == ident) {
  --n;
  p->p_wchan = 0;
- TAILQ_REMOVE(qp, p, p_runq);
+ TAILQ_REMOVE(qp, p, p_slpq);
  if (p->p_stat == SSLEEP)
  setrunnable(p);
  }
diff --git a/kern/sched_bsd.c b/kern/sched_bsd.c
index 6c5eb63..172bb8f 100644
--- a/kern/sched_bsd.c
+++ b/kern/sched_bsd.c
@@ -89,8 +89,6 @@ roundrobin(struct cpu_info *ci)
 {
  struct schedstate_percpu *spc = &ci->ci_schedstate;
 
- spc->spc_rrticks = rrticks_init;
-
  if (ci->ci_curproc != NULL) {
  if (spc->spc_schedflags & SPCF_SEENRR) {
  /*
@@ -252,8 +250,7 @@ schedcpu(void *arg)
  resetpriority(p);
  if (p->p_priority >= PUSER) {
  if (p->p_stat == SRUN &&
-    (p->p_priority / SCHED_PPQ) !=
-    (p->p_usrpri / SCHED_PPQ)) {
+    p->p_priority != p->p_usrpri) {
  remrunqueue(p);
  p->p_priority = p->p_usrpri;
  setrunqueue(p);
@@ -304,6 +301,7 @@ yield(void)
  SCHED_LOCK(s);
  p->p_priority = p->p_usrpri;
  p->p_stat = SRUN;
+ generate_deadline(p, 1);
  setrunqueue(p);
  p->p_ru.ru_nvcsw++;
  mi_switch();
@@ -332,6 +330,7 @@ preempt(struct proc *newp)
  p->p_priority = p->p_usrpri;
  p->p_stat = SRUN;
  p->p_cpu = sched_choosecpu(p);
+ generate_deadline(p, 0);
  setrunqueue(p);
  p->p_ru.ru_nivcsw++;
  mi_switch();
@@ -531,8 +530,7 @@ resetpriority(struct proc *p)
 
  SCHED_ASSERT_LOCKED();
 
- newpriority = PUSER + p->p_estcpu +
-    NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
+ newpriority = PUSER + p->p_estcpu + (p->p_p->ps_nice - NZERO);
  newpriority = min(newpriority, MAXPRI);
  p->p_usrpri = newpriority;
  resched_proc(p, p->p_usrpri);
@@ -565,3 +563,34 @@ schedclock(struct proc *p)
  p->p_priority = p->p_usrpri;
  SCHED_UNLOCK(s);
 }
+
+void
+generate_deadline(struct proc *p, char voluntary) {
+ /*
+ * For nice values between 0 and 39 inclusive, the offset lies between
+ * 32 and 1280 milliseconds on a machine with hz=100. That means that
+ * processes with nice value 0 (i.e. -20 in userland) will be executed
+ * 32 milliseconds in the future at the latest. Processes with very
+ * low priority will be executed 1.28 seconds in the future at the
+ * very latest. The shift ensures that the lowest possible offset is
+ * larger than the timeslice, so that the scheduler does not
+ * degenerate to round-robin behaviour when more than just a few
+ * high-priority processes are started.
+ *
+ * If the process voluntarily yielded its CPU, we reward it by
+ * halving its deadline offset.
+ */
+
+ unsigned int offset_msec =
+ ((p->p_p->ps_nice + (p->p_priority >> 1) + 1) * rrticks_init) << (voluntary ? 2 : 3);
+ struct timeval offset = {
+ .tv_sec  = offset_msec / 1000,
+ .tv_usec = (offset_msec % 1000) * 1000 /* ms -> us */
+ };
+ struct timeval now;
+ microuptime(&now);
+
+ timeradd(&now, &offset, &(p->p_deadline));
+ if (!voluntary)
+ p->p_rrticks = rrticks_init;
+}
diff --git a/sys/proc.h b/sys/proc.h
index da04d28..69fc0e2 100644
--- a/sys/proc.h
+++ b/sys/proc.h
@@ -48,6 +48,7 @@
 #include <sys/mutex.h> /* For struct mutex */
 #include <sys/resource.h> /* For struct rusage */
 #include <machine/atomic.h>
+#include <sys/tree.h>
 
 #ifdef _KERNEL
 #define __need_process
@@ -247,8 +248,9 @@ struct process {
 #define PS_EXITING _P_EXITING
 
 struct proc {
- TAILQ_ENTRY(proc) p_runq;
+ TAILQ_ENTRY(proc) p_slpq;
  LIST_ENTRY(proc) p_list; /* List of all processes. */
+ RB_ENTRY(proc) p_runq;
 
  struct process *p_p; /* The process of this thread. */
  TAILQ_ENTRY(proc) p_thr_link;/* Threads in a process linkage. */
@@ -280,6 +282,8 @@ struct proc {
  int p_sigwait; /* signal handled by sigwait() */
 
  /* scheduling */
+ int p_rrticks;
+ struct timeval p_deadline;
  u_int p_estcpu; /* Time averaged value of p_cpticks. */
  int p_cpticks; /* Ticks of cpu time. */
  fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */
diff --git a/sys/sched.h b/sys/sched.h
index 1651205..fb01f21 100644
--- a/sys/sched.h
+++ b/sys/sched.h
@@ -70,6 +70,7 @@
 #define _SYS_SCHED_H_
 
 #include <sys/queue.h>
+#include <sys/tree.h>
 
 /*
  * Posix defines a <sched.h> which may want to include <sys/sched.h>
@@ -99,7 +100,6 @@ struct schedstate_percpu {
  u_int spc_schedticks; /* ticks for schedclock() */
  u_int64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
  u_char spc_curpriority; /* usrpri of curproc */
- int spc_rrticks; /* ticks until roundrobin() */
  int spc_pscnt; /* prof/stat counter */
  int spc_psdiv; /* prof/stat divisor */
  struct proc *spc_idleproc; /* idle proc for this cpu */
@@ -107,8 +107,7 @@ struct schedstate_percpu {
  u_int spc_nrun; /* procs on the run queues */
  fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
 
- TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
- volatile uint32_t spc_whichqs;
+ RB_HEAD(prochead, proc) spc_runq;
 
 #ifdef notyet
  struct proc *spc_reaper; /* dead proc reaper */
@@ -125,9 +124,7 @@ struct schedstate_percpu {
 #define SPCF_SHOULDHALT 0x0004 /* CPU should be vacated */
 #define SPCF_HALTED 0x0008 /* CPU has been halted */
 
-#define SCHED_PPQ (128 / SCHED_NQS) /* priorities per queue */
-#define NICE_WEIGHT 2 /* priorities per nice level */
-#define ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ)
+#define ESTCPULIM(e) min((e), PRIO_MAX)
 
 extern int schedhz; /* ideally: 16 */
 extern int rrticks_init; /* ticks per roundrobin() */
@@ -152,13 +149,14 @@ void cpu_idle_enter(void);
 void cpu_idle_cycle(void);
 void cpu_idle_leave(void);
 void sched_peg_curproc(struct cpu_info *ci);
+void generate_deadline(struct proc *, char);
 
 #ifdef MULTIPROCESSOR
 void sched_start_secondary_cpus(void);
 void sched_stop_secondary_cpus(void);
 #endif
 
-#define curcpu_is_idle() (curcpu()->ci_schedstate.spc_whichqs == 0)
+#define curcpu_is_idle() (RB_EMPTY(&curcpu()->ci_schedstate.spc_runq))
 
 void sched_init_runqueues(void);
 void setrunqueue(struct proc *);
--
1.7.6


Re: Scheduler improvements, take 1001, Patch 1/5

Gregor Best
diff --git a/kern/sched_bsd.c b/kern/sched_bsd.c
index 172bb8f..c7121dc 100644
--- a/kern/sched_bsd.c
+++ b/kern/sched_bsd.c
@@ -77,12 +77,12 @@ scheduler_start(void)
 
  timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
 
- rrticks_init = hz / 10;
+ rrticks_init = hz / 20;
  schedcpu(&schedcpu_to);
 }
 
 /*
- * Force switch among equal priority processes every 100ms.
+ * Force switch among equal priority processes every 50ms.
  */
 void
 roundrobin(struct cpu_info *ci)
--
1.7.6


Re: Scheduler improvements, take 1001, Patch 3/5

Gregor Best
diff --git a/arch/amd64/amd64/identcpu.c b/arch/amd64/amd64/identcpu.c
index c597bb0..982c2bb 100644
--- a/arch/amd64/amd64/identcpu.c
+++ b/arch/amd64/amd64/identcpu.c
@@ -210,6 +210,8 @@ void (*setperf_setup)(struct cpu_info *);
 
 void via_nano_setup(struct cpu_info *ci);
 
+void cpu_topology(struct cpu_info *ci);
+
 void
 via_nano_setup(struct cpu_info *ci)
 {
@@ -479,4 +481,123 @@ identifycpu(struct cpu_info *ci)
  sensordev_install(&ci->ci_sensordev);
 #endif
  }
+
+ cpu_topology(ci);
+}
+
+/*
+ * Base 2 logarithm of an int; returns 0 for 0 (yeah, I know).
+ */
+static int
+log2(unsigned int i)
+{
+ int ret = 0;
+
+ while (i >>= 1)
+ ret++;
+
+ return (ret);
+}
+
+static int
+mask_width(u_int x)
+{
+ int bit;
+ int mask;
+ int powerof2;
+
+ powerof2 = ((x - 1) & x) == 0;
+ mask = (x << (1 - powerof2)) - 1;
+
+ /* fls */
+ if (mask == 0)
+ return (0);
+ for (bit = 1; mask != 1; bit++)
+ mask = (unsigned int)mask >> 1;
+
+ return (bit);
+}
+
+/*
+ * Build up cpu topology for given cpu, must run on the core itself.
+ */
+void
+cpu_topology(struct cpu_info *ci)
+{
+ u_int32_t eax, ebx, ecx, edx;
+ u_int32_t apicid, max_apicid, max_coreid;
+ u_int32_t smt_bits, core_bits, pkg_bits;
+ u_int32_t smt_mask, core_mask, pkg_mask;
+
+ /* We need at least apicid at CPUID 1 */
+ CPUID(0, eax, ebx, ecx, edx);
+ if (eax < 1)
+ goto no_topology;
+
+ /* Initial apicid */
+ CPUID(1, eax, ebx, ecx, edx);
+ apicid = (ebx >> 24) & 0xff;
+
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+ /* We need at least apicid at CPUID 0x80000008 */
+ CPUID(0x80000000, eax, ebx, ecx, edx);
+ if (eax < 0x80000008)
+ goto no_topology;
+
+ CPUID(0x80000008, eax, ebx, ecx, edx);
+ core_bits = (ecx >> 12) & 0xf;
+ if (core_bits == 0)
+ goto no_topology;
+ /* So coreidsize 2 gives 3, 3 gives 7... */
+ core_mask = (1 << core_bits) - 1;
+ /* Core id is the least significant considering mask */
+ ci->ci_core_id = apicid & core_mask;
+ /* Pkg id is the upper remaining bits */
+ ci->ci_pkg_id = apicid & ~core_mask;
+ ci->ci_pkg_id >>= core_bits;
+ } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+ /* We only support leaf 1/4 detection */
+ CPUID(0, eax, ebx, ecx, edx);
+ if (eax < 4)
+ goto no_topology;
+ /* Get max_apicid */
+ CPUID(1, eax, ebx, ecx, edx);
+ max_apicid = (ebx >> 16) & 0xff;
+ /* Get max_coreid */
+ CPUID2(4, 0, eax, ebx, ecx, edx);
+ max_coreid = ((eax >> 26) & 0x3f) + 1;
+ /* SMT */
+ smt_bits = mask_width(max_apicid / max_coreid);
+ smt_mask = (1 << smt_bits) - 1;
+ /* Core */
+ core_bits = log2(max_coreid);
+ core_mask = (1 << (core_bits + smt_bits)) - 1;
+ core_mask ^= smt_mask;
+ /* Pkg */
+ pkg_bits = core_bits + smt_bits;
+ pkg_mask = -1 << core_bits;
+
+ ci->ci_smt_id = apicid & smt_mask;
+ ci->ci_core_id = (apicid & core_mask) >> smt_bits;
+ ci->ci_pkg_id = (apicid & pkg_mask) >> pkg_bits;
+ } else
+ goto no_topology;
+#ifdef DEBUG
+ printf("cpu%d: smt %u, core %u, pkg %u "
+ "(apicid 0x%x, max_apicid 0x%x, max_coreid 0x%x, smt_bits 0x%x, smt_mask 0x%x, "
+ "core_bits 0x%x, core_mask 0x%x, pkg_bits 0x%x, pkg_mask 0x%x)\n",
+ ci->ci_cpuid, ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id,
+ apicid, max_apicid, max_coreid, smt_bits, smt_mask, core_bits,
+ core_mask, pkg_bits, pkg_mask);
+#else
+ printf("cpu%d: smt %u, core %u, package %u\n", ci->ci_cpuid,
+ ci->ci_smt_id, ci->ci_core_id, ci->ci_pkg_id);
+
+#endif
+ return;
+ /* We can't map, so consider ci_core_id as ci_cpuid */
+no_topology:
+ ci->ci_smt_id  = 0;
+ ci->ci_core_id = ci->ci_cpuid;
+ ci->ci_pkg_id  = 0;
 }
diff --git a/arch/amd64/include/cpu.h b/arch/amd64/include/cpu.h
index 9ce437a..12e48d6 100644
--- a/arch/amd64/include/cpu.h
+++ b/arch/amd64/include/cpu.h
@@ -102,6 +102,9 @@ struct cpu_info {
  u_int32_t ci_cflushsz;
  u_int64_t ci_tsc_freq;
 
+ u_int32_t ci_smt_id;
+ u_int32_t ci_core_id;
+ u_int32_t ci_pkg_id;
  struct cpu_functions *ci_func;
  void (*cpu_setup)(struct cpu_info *);
  void (*ci_info)(struct cpu_info *);
diff --git a/arch/amd64/include/specialreg.h b/arch/amd64/include/specialreg.h
index 142fbbc..cab0985 100644
--- a/arch/amd64/include/specialreg.h
+++ b/arch/amd64/include/specialreg.h
@@ -218,10 +218,14 @@
 #define CPUID2MODEL(cpuid) (((cpuid) >> 4) & 15)
 #define CPUID2STEPPING(cpuid) ((cpuid) & 15)
 
-#define CPUID(code, eax, ebx, ecx, edx)                         \
+#define CPUID2(eax_code, ecx_code, eax, ebx, ecx, edx)         \
  __asm("cpuid"                                           \
-    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)    \
-    : "a" (code));
+ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)    \
+ : "a" (eax_code), "c" (ecx_code));
+
+#define CPUID(code, eax, ebx, ecx, edx)                                \
+ CPUID2(code, 0, eax, ebx, ecx, edx)
+
 #define CPUID_LEAF(code, leaf, eax, ebx, ecx, edx) \
  __asm("cpuid"                                           \
     : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)    \
--
1.7.6


Re: Scheduler improvements, take 1001, Patch 4/5

Gregor Best
diff --git a/arch/amd64/include/cpu.h b/arch/amd64/include/cpu.h
index 12e48d6..99501a1 100644
--- a/arch/amd64/include/cpu.h
+++ b/arch/amd64/include/cpu.h
@@ -102,9 +102,11 @@ struct cpu_info {
  u_int32_t ci_cflushsz;
  u_int64_t ci_tsc_freq;
 
+#define ARCH_HAVE_CPU_TOPOLOGY
  u_int32_t ci_smt_id;
  u_int32_t ci_core_id;
  u_int32_t ci_pkg_id;
+
  struct cpu_functions *ci_func;
  void (*cpu_setup)(struct cpu_info *);
  void (*ci_info)(struct cpu_info *);
diff --git a/kern/kern_sched.c b/kern/kern_sched.c
index 79eb28c..072ef38 100644
--- a/kern/kern_sched.c
+++ b/kern/kern_sched.c
@@ -496,6 +496,10 @@ int sched_cost_load = 1;
 int sched_cost_priority = 1;
 int sched_cost_runnable = 3;
 int sched_cost_resident = 1;
+#ifdef ARCH_HAVE_CPU_TOPOLOGY
+int sched_cost_diffcore = 2; /* cost for moving to a different core */
+int sched_cost_diffpkg = 3; /* cost for moving to a different package */
+#endif
 
 int
 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
@@ -536,6 +540,13 @@ sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
  cost -= l2resident * sched_cost_resident;
  }
 
+#ifdef ARCH_HAVE_CPU_TOPOLOGY
+ if (p->p_cpu->ci_pkg_id != ci->ci_pkg_id)
+ cost *= sched_cost_diffpkg;
+ else if (p->p_cpu->ci_core_id != ci->ci_core_id)
+ cost *= sched_cost_diffcore;
+#endif
+
  return (cost);
 }
 
--
1.7.6


Re: Scheduler improvements, take 1001, Patch 5/5

Gregor Best
diff --git a/sys/sched.h b/sys/sched.h
index fb01f21..1784ee2 100644
--- a/sys/sched.h
+++ b/sys/sched.h
@@ -69,8 +69,10 @@
 #ifndef _SYS_SCHED_H_
 #define _SYS_SCHED_H_
 
+#ifdef _KERNEL
 #include <sys/queue.h>
 #include <sys/tree.h>
+#endif
 
 /*
  * Posix defines a <sched.h> which may want to include <sys/sched.h>
@@ -88,11 +90,9 @@
 #define CP_IDLE 4
 #define CPUSTATES 5
 
-#define SCHED_NQS 32 /* 32 run queues. */
-
+#ifdef _KERNEL
 /*
  * Per-CPU scheduler state.
- * XXX - expose to userland for now.
  */
 struct schedstate_percpu {
  struct timeval spc_runtime; /* time curproc started running */
@@ -107,15 +107,13 @@ struct schedstate_percpu {
  u_int spc_nrun; /* procs on the run queues */
  fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
 
- RB_HEAD(prochead, proc) spc_runq;
-
 #ifdef notyet
  struct proc *spc_reaper; /* dead proc reaper */
 #endif
  LIST_HEAD(,proc) spc_deadproc;
-};
 
-#ifdef _KERNEL
+ RB_HEAD(prochead, proc) spc_runq;
+};
 
 /* spc_flags */
 #define SPCF_SEENRR             0x0001  /* process has seen roundrobin() */
--
1.7.6


Re: Scheduler improvements, take 1001, Patch 1/5

Gregor Best
As requested, I'll write down a few comments on each patch. So, here
goes:

This is the initial commit; it replaces the multiple FIFO queues that
were used before with one RB-tree per CPU as the runqueue. The RB-tree
is used because it offers operations such as min(), insert() and
remove() in O(log n).
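
To illustrate the idea, here is a stand-alone sketch (mine, not part of
the patch; the fallback to pointer identity for equal deadlines is my
addition so the comparison is a strict total order):

    #include <sys/tree.h>
    #include <sys/time.h>

    struct proc {
        RB_ENTRY(proc)  p_runq;      /* tree linkage */
        struct timeval  p_deadline;  /* sort key */
    };

    static int
    sched_cmp_proc(struct proc *a, struct proc *b)
    {
        if (timercmp(&a->p_deadline, &b->p_deadline, <))
            return (-1);
        if (timercmp(&b->p_deadline, &a->p_deadline, <))
            return (1);
        /* equal deadlines: fall back to the address for a total order */
        return (a == b ? 0 : (a < b ? -1 : 1));
    }

    RB_HEAD(prochead, proc);
    RB_GENERATE_STATIC(prochead, proc, p_runq, sched_cmp_proc);

setrunqueue(), remrunqueue() and sched_chooseproc() in the patch then
map to RB_INSERT(), RB_REMOVE() and RB_MIN() on the per-CPU tree. Note
that the patch's comparator returns 1 for distinct processes with equal
deadlines; the pointer fallback above is only my illustration.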

I remove the NICE_WEIGHT define because it's meaningless with this
patch, as there's only one runqueue.

sys/proc.h now includes sys/tree.h, but that does not seem to be a
problem when building userland; libc builds fine now, for example.

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 2/5

Gregor Best
This patch simply halves the timeslice processes get before they are
preempted: with hz=100, rrticks_init drops from hz/10 = 10 ticks
(100 ms) to hz/20 = 5 ticks (50 ms). The patch is standalone and the
rest of the series does not depend on it, but I figured I'd throw it
in anyway.

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 3/5

Gregor Best
This patch simply imports Christiano's code for detecting CPU topology,
as posted on tech@ a while (more than two months) ago. I took it
verbatim and didn't change anything yet.
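
To make the topology arithmetic concrete, here is a worked example with
invented but plausible values on the Intel leaf 1/4 path, for a
quad-core CPU with hyperthreading (apicid 0x5, max_apicid 16,
max_coreid 8):

    smt_bits  = mask_width(16 / 8) = 1      -> smt_mask  = 0x1
    core_bits = log2(8) = 3                 -> core_mask = 0xe
    pkg_bits  = 3 + 1 = 4                   -> pkg_mask  = 0xfffffff8

    ci_smt_id  = 0x5 & 0x1                  = 1
    ci_core_id = (0x5 & 0xe) >> 1           = 2
    ci_pkg_id  = (0x5 & 0xfffffff8) >> 4    = 0

i.e. apicid 0x5 is thread 1 on core 2 of package 0.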

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 4/5

Gregor Best
This patch uses the previous one to take CPU topology into account when
calculating the cost of moving a process between CPUs. This is only done
on amd64 at the moment, and the cost factors are guesses right now, but
it's a start.
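
As a toy example of what the factors do (the numbers are invented): if
the existing heuristics in sched_proc_to_cpu_cost() yield a cost of 10
for some candidate CPU, the patched code leaves that at 10 when the
candidate shares the process's core, and otherwise scales it:

    same package, different core:  10 * sched_cost_diffcore = 20
    different package:             10 * sched_cost_diffpkg  = 30

so cache-warm, package-local CPUs win ties.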

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 5/5

Gregor Best
This patch moves struct schedstate_percpu to kernel land, which I think
is cleaner than exposing structures for scheduler state to userland,
especially since grepping for 'schedstate' in /usr/src yielded no
results outside of /usr/src/sys.

I have not seen any negative impact from this, but I haven't yet
finished a full userland build (one is running at the moment, but the
machine I'm building on is a bit slower than my laptop).

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 5/5

Philip Guenther-2
On Tue, Oct 9, 2012 at 9:27 AM, Gregor Best <[hidden email]> wrote:
> This patch moves struct schedstate_percpu to kernel land, which I think
> is cleaner than exposing structures for scheduler state to userland,
> especially since grepping for 'schedstate' in /usr/src yielded no
> results outside of /usr/src/sys.

I think this is a misunderstanding about the intent of _KERNEL.  There
are, IMO, two reasons to condition parts of header files on _KERNEL:

1) the header's contents have been specified by a de facto or de jure
standard, but for its own reasons the kernel wants to place additional
info there.

2) the header defines kernel data structures that tools may need (for
crash dump debugging, for example) as well as external symbols like
functions and variables, declarations that can't be used by userspace.

<sys/sched.h> isn't a standardized header and isn't pulled in by a
standardized header, so (1) isn't an issue; only a non-standard
program is going to be pulling in <sys/sched.h> anyway.  Unless
there's a reason otherwise, all types and #defines should be exposed
to it.
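
A sketch of case (2) (my illustration, not from the thread; all names
are made up):

    /* <sys/foo.h> */
    struct foostate {       /* kernel data structure, but visible to  */
        int fs_count;       /* userland tools, e.g. a debugger        */
    };                      /* reading a crash dump                   */

    #ifdef _KERNEL
    extern struct foostate foo_global; /* symbols only the kernel */
    void foo_update(int);              /* can link against        */
    #endif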


(Yeah, the use of CP_* and CPUSTATES by the KERN_CPTIME* sysctl()s is
less than ideal, but this isn't the right way to fix that, IMO.)


Philip Guenther


Re: Scheduler improvements, take 1001, Patch 2/5

David Coppa
On Tue, Oct 9, 2012 at 6:21 PM, Gregor Best <[hidden email]> wrote:
> This patch simply halves the timeslice processes get until they are
> preempted. This patch is standalone and the rest of the patches does not
> depend on it, but I figured I'd throw it in anyway.
>
> --
>     Gregor Best
>

I didn't get this patch. Is it me, or is the patch missing?


Re: Scheduler improvements, take 1001, Patch 2/5

Gregor Best
On Sun, Oct 14, 2012 at 11:05:36AM +0200, David Coppa wrote:

> On Tue, Oct 9, 2012 at 6:21 PM, Gregor Best <[hidden email]> wrote:
> > This patch simply halves the timeslice processes get until they are
> > preempted. This patch is standalone and the rest of the patches does not
> > depend on it, but I figured I'd throw it in anyway.
> >
> > --
> >     Gregor Best
> >
>
> Didn't get this patch. Is it me or the patch is missing?
>

The patch was the message my mail was in reply to. There's no Patch 2/5
subject line because I forgot to change that after sending out Patch
1/5.

--
    Gregor Best


Re: Scheduler improvements, take 1001, Patch 2/5

Bob Beck-2
Gregor, you would perhaps get better feedback if it were easier to discern
where your patches are and what each one is doing.  If you can't be
bothered to keep the subjects matching the diffs and send stuff out
with subjects like "scheduler improvement diff X" instead of something like
"reduce time slice length by 50%", it takes a lot more effort for someone
to read and test them.
On Oct 14, 2012 9:01 AM, "Gregor Best" <[hidden email]> wrote:

> On Sun, Oct 14, 2012 at 11:05:36AM +0200, David Coppa wrote:
> > On Tue, Oct 9, 2012 at 6:21 PM, Gregor Best <[hidden email]> wrote:
> > > This patch simply halves the timeslice processes get until they are
> > > preempted. This patch is standalone and the rest of the patches does
> not
> > > depend on it, but I figured I'd throw it in anyway.
> > >
> > > --
> > >     Gregor Best
> > >
> >
> > Didn't get this patch. Is it me or the patch is missing?
> >
>
> The patch was the message my mail was in reply to. There's no Patch 2/5
> subject line because I forgot to change that after sending out Patch
> 1/5.
>
> --
>     Gregor Best


Re: Scheduler improvements, take 1001, Patch 2/5

David Coppa
On Sun, Oct 14, 2012 at 4:53 PM, Gregor Best <[hidden email]> wrote:

> On Sun, Oct 14, 2012 at 11:05:36AM +0200, David Coppa wrote:
>> On Tue, Oct 9, 2012 at 6:21 PM, Gregor Best <[hidden email]> wrote:
>> > This patch simply halves the timeslice processes get until they are
>> > preempted. This patch is standalone and the rest of the patches does not
>> > depend on it, but I figured I'd throw it in anyway.
>> >
>> > --
>> >     Gregor Best
>> >
>>
>> Didn't get this patch. Is it me or the patch is missing?
>>
>
> The patch was the message my mail was in reply to. There's no Patch 2/5
> subject line because I forgot to change that after sending out Patch
> 1/5.

Can you resend it, please?
I'm confused...

TIA,
David


Re: Scheduler improvements, take 1001, Patch 2/5

Stuart Henderson
On 2012/10/15 16:18, David Coppa wrote:

> On Sun, Oct 14, 2012 at 4:53 PM, Gregor Best <[hidden email]> wrote:
> > On Sun, Oct 14, 2012 at 11:05:36AM +0200, David Coppa wrote:
> >> On Tue, Oct 9, 2012 at 6:21 PM, Gregor Best <[hidden email]> wrote:
> >> > This patch simply halves the timeslice processes get until they are
> >> > preempted. This patch is standalone and the rest of the patches does not
> >> > depend on it, but I figured I'd throw it in anyway.
> >> >
> >> > --
> >> >     Gregor Best
> >> >
> >>
> >> Didn't get this patch. Is it me or the patch is missing?
> >>
> >
> > The patch was the message my mail was in reply to. There's no Patch 2/5
> > subject line because I forgot to change that after sending out Patch
> > 1/5.
>
> Can you resend it, please?
> I'm confused...
>
> TIA,
> David
>

Best to send each diff, with its accompanying text, in the same email,
and make sure they all still apply. I tried testing some of these but
didn't manage to get them to apply (either some conflicting change,
or they were in the wrong order and stacked up on top of each other,
or something).


Re: Scheduler improvements, take 1001, Patch 2/5

Stefan Fritsch
On Monday 15 October 2012, Stuart Henderson wrote:
> Best to send the diff, with the accompanying text, in the same
> email, and make sure they still all apply, I tried testing some of
> these but didn't manage to get them to apply (either some
> conflicting change, or they were in the wrong order and stacked up
> on top of each other or something).

As you mentioned using git in another mail, I would recommend using
git's format-patch to create the mails, maybe with the --no-prefix
option, because that seems to be the preferred patch format here.
Also, interactive rebase (git rebase -i) is nice for adjusting patches
and commit messages (in case you didn't know that one).
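
For instance (the branch and directory names here are made up):

    $ git format-patch --no-prefix --cover-letter -o outgoing/ master..sched
    $ git rebase -i master    # reorder patches, reword commit messages

format-patch accepts diff options such as --no-prefix, which drops the
a/ and b/ path prefixes and so matches the diffs usually posted here.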

Cheers,
Stefan


Re: Scheduler improvements, take 1001, Patch 2/5

Otto Moerbeek
On Mon, Oct 15, 2012 at 04:01:29PM +0100, Stuart Henderson wrote:

> On 2012/10/15 16:18, David Coppa wrote:
> > On Sun, Oct 14, 2012 at 4:53 PM, Gregor Best <[hidden email]> wrote:
> > > On Sun, Oct 14, 2012 at 11:05:36AM +0200, David Coppa wrote:
> > >> On Tue, Oct 9, 2012 at 6:21 PM, Gregor Best <[hidden email]> wrote:
> > >> > This patch simply halves the timeslice processes get until they are
> > >> > preempted. This patch is standalone and the rest of the patches does not
> > >> > depend on it, but I figured I'd throw it in anyway.
> > >> >
> > >> > --
> > >> >     Gregor Best
> > >> >
> > >>
> > >> Didn't get this patch. Is it me or the patch is missing?
> > >>
> > >
> > > The patch was the message my mail was in reply to. There's no Patch 2/5
> > > subject line because I forgot to change that after sending out Patch
> > > 1/5.
> >
> > Can you resend it, please?
> > I'm confused...
> >
> > TIA,
> > David
> >
>
> Best to send the diff, with the accompanying text, in the same email,
> and make sure they still all apply, I tried testing some of these but
> didn't manage to get them to apply (either some conflicting change,
> or they were in the wrong order and stacked up on top of each other
> or something).

This is important. A diff should apply without effort in current.

Also, a step-by-step approach is likely to work better: one diff at a
time, with an explanation, your own test results and/or requests for
tests. That will enable us to discuss the merits of each while
everybody knows which diff we are talking about.

        -Otto