userland clock_gettime proof of concept

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
203 messages Options
1234 ... 11
Reply | Threaded
Open this post in threaded view
|

userland clock_gettime proof of concept

Paul Irofti-4
Hi,

By far one of the most popular and frequently used system calls is
clock_gettime(2). As a result the cost of kernel-userland transitions
outweighs the actual work, thus I am proposing we make the data
available directly from userland without passing through a system call.

This has been a subject of discussion multiple times across the years
and last I heard from it was at the p2k19 hackathon that I hosted in
Bucharest where espie@ sent me a diff from one of his students(?). Being
busy with organization I have not had the time to look at it and
I am thus getting back to it just now due to robert@ prodding me again
on the subject. The proposed diff is mine, not the student's.


The technical bits.

Please keep in mind that this is only proof of concept. I am looking for
ways to improve the current diff. As it is, it requires a flag day
because it makes use of ELF aux vectors to export the data from the
kernel.

I have also played with exposing the data via separate ELF sections and
with kbind-mmap alternatives. The first also involves a flag day and is
more intrusive in my opinion, and the second I could not get to work. I
think that would be the less intrusive way of doing it, possibly without
a flag day, so if anyone knows how, please let me know.

The supported clocks are just those that do not require process specific
data. Those can also be handled later if this diff is decided to be a
good thing.

Clock update inside the kernel is done at the end of tc_windup(). There
might be better places to do it. Let me know where.

The update currently does the work of clock_gettime(), but it can
probably be changed to only update the timehands and move the logic
elsewhere. Note that if we expose only the timehands to userland, most
of the bintime functionality has to also be made available there. Or so
I think.

In userland, I wrapped the clock_gettime(2) syscall in libc. There, I
search for the auxiliary vector and fetch the timespec data from it.
As you can see in the diff, parts from the elf_exec header will have to
be exposed to userland if we do it this way.


Results.

To test this diff you need to do a full release(8). I have tested this
with multiple programs. Test programs, base programs and packages. None
the less, this diff touches many important areas of our tree and is
very fragile. I also probably missed changing some parts that required
change due to libc or elf changes.

If you see regressions, which you probably will, please let me know.

Here is a stress test from robert@:

robert@x202:/home/robert> time ./t && time ./t2
0m00.11s real 0m00.12s user 0m00.00s system
0m09.99s real 0m02.64s user 0m03.36s system
t is clock_gettime() and t2 is SYS_clock_gettime()


Please keep the discussions on the list and let me know what you think
and how we can improve this if we decide this is wanted in the tree.

Paul

diff --git lib/libc/shlib_version lib/libc/shlib_version
index 06f98b01084..5fb0770494f 100644
--- lib/libc/shlib_version
+++ lib/libc/shlib_version
@@ -1,4 +1,4 @@
 major=96
-minor=0
+minor=1
 # note: If changes were made to include/thread_private.h or if system calls
 # were added/changed then librthread/shlib_version must also be updated.
diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
index 34769576ced..607985e8f20 100644
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
 
 # glue to offer userland wrappers for some syscalls
 SRCS+= posix_madvise.c pthread_sigmask.c \
- w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
+ w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
+ w_clock_gettime.c
 
 # glue for compat with old syscall interfaces.
 SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
new file mode 100644
index 00000000000..e955615248f
--- /dev/null
+++ lib/libc/sys/w_clock_gettime.c
@@ -0,0 +1,114 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <err.h>
+
+#include <sys/timekeep.h>
+
+void *elf_aux_timekeep;
+
+
+/*
+ * Needed exec_elf implementation.
+ * To be exposed by the kernel later if needed.
+ */
+
+#include <sys/exec_elf.h>
+
+typedef struct {
+ uint32_t au_id; /* 32-bit id */
+ uint64_t au_v; /* 64-bit value */
+} AuxInfo;
+
+enum AuxID {
+ AUX_null = 0,
+ AUX_ignore = 1,
+ AUX_execfd = 2,
+ AUX_phdr = 3, /* &phdr[0] */
+ AUX_phent = 4, /* sizeof(phdr[0]) */
+ AUX_phnum = 5, /* # phdr entries */
+ AUX_pagesz = 6, /* PAGESIZE */
+ AUX_base = 7, /* ld.so base addr */
+ AUX_flags = 8, /* processor flags */
+ AUX_entry = 9, /* a.out entry */
+ AUX_sun_uid = 2000, /* euid */
+ AUX_sun_ruid = 2001, /* ruid */
+ AUX_sun_gid = 2002, /* egid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
+};
+
+
+/*
+ * Helper functions.
+ */
+
+static int
+find_timekeep(void)
+{
+ Elf_Addr *stackp;
+ AuxInfo *auxv;
+ int found = 0;
+
+ stackp = (Elf_Addr *)environ;
+ while (*stackp++) ; /* pass environment */
+
+ /* look-up timekeep auxv */
+ for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
+ if (auxv->au_id == AUX_openbsd_timekeep) {
+ found = 1;
+ break;
+ }
+ if (found == 0) {
+ warnx("%s", "Could not find auxv!");
+ return -1;
+ }
+
+ elf_aux_timekeep = (void *)auxv->au_v;
+ return 0;
+}
+
+int
+WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
+{
+ struct timekeep *timekeep;
+
+ if (elf_aux_timekeep == NULL && find_timekeep())
+ return clock_gettime(clock_id, tp);
+ timekeep = elf_aux_timekeep;
+
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ *tp = timekeep->tp_realtime;
+ break;
+ case CLOCK_UPTIME:
+ *tp = timekeep->tp_uptime;
+ break;
+ case CLOCK_MONOTONIC:
+ *tp = timekeep->tp_monotonic;
+ break;
+ case CLOCK_BOOTTIME:
+ *tp = timekeep->tp_boottime;
+ break;
+ default:
+ return clock_gettime(clock_id, tp);
+ }
+ return 0;
+}
+DEF_WRAP(clock_gettime);
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 9b5b8eb3acf..59bc923a6fb 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -124,7 +124,7 @@ extern char *syscallnames[];
 /*
  * How many entries are in the AuxInfo array we pass to the process?
  */
-#define ELF_AUX_ENTRIES 8
+#define ELF_AUX_ENTRIES 9
 
 /*
  * This is the OpenBSD ELF emul
@@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
  a->au_v = ap->arg_entry;
  a++;
 
+ a->au_id = AUX_openbsd_timekeep;
+ a->au_v = p->p_p->ps_timekeep;
+ a++;
+
  a->au_id = AUX_null;
  a->au_v = 0;
  a++;
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 20480c2fc28..2496458fde1 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -64,6 +64,11 @@
 #include <uvm/uvm_extern.h>
 #include <machine/tcb.h>
 
+#include <sys/timekeep.h>
+
+struct uvm_object *timekeep_object;
+struct timekeep* timekeep;
+
 void unveil_destroy(struct process *ps);
 
 const struct kmem_va_mode kv_exec = {
@@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
  */
 int exec_sigcode_map(struct process *, struct emul *);
 
+/*
+ * Map the shared timekeep page.
+ */
+int exec_timekeep_map(struct process *);
+
 /*
  * If non-zero, stackgap_random specifies the upper limit of the random gap size
  * added to the fixed stack position. Must be n^2.
@@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
  /* map the process's signal trampoline code */
  if (exec_sigcode_map(pr, pack.ep_emul))
  goto free_pack_abort;
+ /* map the process's timekeep page */
+ if (exec_timekeep_map(pr))
+ goto free_pack_abort;
 
 #ifdef __HAVE_EXEC_MD_MAP
  /* perform md specific mappings that process might need */
@@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
 
  return (0);
 }
+
+int exec_timekeep_map(struct process *pr)
+{
+ size_t timekeep_sz = sizeof(struct timekeep);
+
+ /*
+ * Similar to the sigcode object, except that there is a single timekeep
+ * object, and not one per emulation.
+ */
+ if (timekeep_object == NULL) {
+ vaddr_t va;
+
+ timekeep_object = uao_create(timekeep_sz, 0);
+ uao_reference(timekeep_object);
+
+ if (uvm_map(kernel_map, &va, round_page(timekeep_sz), timekeep_object,
+    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
+    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
+ uao_detach(timekeep_object);
+ return (ENOMEM);
+ }
+
+ timekeep = (struct timekeep *)va;
+ }
+
+ uao_reference(timekeep_object);
+ if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, round_page(timekeep_sz),
+    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
+    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
+ uao_detach(timekeep_object);
+ return (ENOMEM);
+ }
+
+ return (0);
+}
diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
index bcf8f689625..007f1116c4f 100644
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -35,6 +35,7 @@
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <dev/rndvar.h>
+#include <sys/timekeep.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
  BINTIME_TO_TIMEVAL(&bt, tvp);
 }
 
+void
+tc_clock_gettime(void)
+{
+ struct bintime bt;
+
+ if (timekeep == NULL)
+ return;
+
+ /* CLOCK_REALTIME */
+ nanotime(&timekeep->tp_realtime);
+
+ /* CLOCK_UPTIME */
+ binuptime(&bt);
+ bintimesub(&bt, &naptime, &bt);
+ BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
+
+ /* CLOCK_MONOTONIC */
+ nanouptime(&timekeep->tp_monotonic);
+
+ /* CLOCK_BOOTTIME */
+ timekeep->tp_boottime = timekeep->tp_monotonic;
+
+ return;
+}
+
 void
 bintime(struct bintime *bt)
 {
@@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
  time_uptime = th->th_offset.sec;
  membar_producer();
  timehands = th;
+
+ tc_clock_gettime();
 }
 
 /* Report or change the active timecounter hardware. */
diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
index a40e0510273..f55b75f1e84 100644
--- sys/sys/exec_elf.h
+++ sys/sys/exec_elf.h
@@ -691,7 +691,8 @@ enum AuxID {
  AUX_sun_uid = 2000, /* euid */
  AUX_sun_ruid = 2001, /* ruid */
  AUX_sun_gid = 2002, /* egid */
- AUX_sun_rgid = 2003 /* rgid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
 };
 
 struct elf_args {
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..93a79a220db 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -248,6 +248,8 @@ struct process {
  u_int ps_rtableid; /* Process routing table/domain. */
  char ps_nice; /* Process "nice" value. */
 
+ vaddr_t ps_timekeep; /* User pointer to timekeep */
+
  struct uprof { /* profile arguments */
  caddr_t pr_base; /* buffer base */
  size_t  pr_size; /* buffer size */
diff --git sys/sys/timekeep.h sys/sys/timekeep.h
new file mode 100644
index 00000000000..bad25185bc4
--- /dev/null
+++ sys/sys/timekeep.h
@@ -0,0 +1,37 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SYS_TIMEKEEP_H_
+#define _SYS_TIMEKEEP_H_
+
+#include <sys/time.h>
+
+struct timekeep {
+ struct timespec tp_realtime;
+ struct timespec tp_uptime;
+ struct timespec tp_monotonic;
+ struct timespec tp_boottime;
+};
+
+#if defined(_KERNEL)
+#include <uvm/uvm_extern.h>
+
+extern struct uvm_object *timekeep_object;
+extern struct timekeep *timekeep;
+#endif
+
+#endif /* _SYS_TIMEKEEP_H_ */

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Robert Nagy
On 13/05/20 17:03 +0300, Paul Irofti wrote:

> Hi,
>
> By far one of the most popular and frequently used system calls is
> clock_gettime(2). As a result the cost of kernel-userland transitions
> outweighs the actual work, thus I am proposing we make the data
> available directly from userland without passing through a system call.
>
> This has been a subject of discussion multiple times across the years
> and last I heard from it was at the p2k19 hackathon that I hosted in
> Bucharest where espie@ sent me a diff from one of his students(?). Being
> busy with organization I have not had the time to look at it and
> I am thus getting back to it just now due to robert@ prodding me again
> on the subject. The proposed diff is mine, not the student's.
>
>
> The technical bits.
>
> Please keep in mind that this is only proof of concept. I am looking for
> ways to improve the current diff. As it is, it requires a flag day
> because it makes use of ELF aux vectors to export the data from the
> kernel.
>
> I have also played with exposing the data via separate ELF sections and
> with kbind-mmap alternatives. The first also involves a flag day and is
> more intrusive in my opinion, and the second I could not get to work. I
> think that would be the less intrusive way of doing it, possibly without
> a flag day, so if anyone knows how, please let me know.
>
> The supported clocks are just those that do not require process specific
> data. Those can also be handled later if this diff is decided to be a
> good thing.
>
> Clock update inside the kernel is done at the end of tc_windup(). There
> might be better places to do it. Let me know where.
>
> The update currently does the work of clock_gettime(), but it can
> probably be changed to only update the timehands and move the logic
> elsewhere. Note that if we expose only the timehands to userland, most
> of the bintime functionality has to also be made available there. Or so
> I think.
>
> In userland, I wrapped the clock_gettime(2) syscall in libc. There, I
> search for the auxiliary vector and fetch the timespec data from it.
> As you can see in the diff, parts from the elf_exec header will have to
> be exposed to userland if we do it this way.
>
>
> Results.
>
> To test this diff you need to do a full release(8). I have tested this
> with multiple programs. Test programs, base programs and packages. None
> the less, this diff touches many important areas of our tree and is
> very fragile. I also probably missed changing some parts that required
> change due to libc or elf changes.
>
> If you see regressions, which you probably will, please let me know.
>
> Here is a stress test from robert@:
>
> robert@x202:/home/robert> time ./t && time ./t2
> 0m00.11s real 0m00.12s user 0m00.00s system
> 0m09.99s real 0m02.64s user 0m03.36s system
> t is clock_gettime() and t2 is SYS_clock_gettime()

I am in the middle of rebuilding the packages that should gain significant
speedup right now. That small test does 5 million calls to clock_gettime,
so it is a bit over-reaching but still it shows the difference.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Mark Kettenis
In reply to this post by Paul Irofti-4
> Date: Wed, 13 May 2020 17:03:01 +0300
> From: Paul Irofti <[hidden email]>
>
> Hi,
>
> By far one of the most popular and frequently used system calls is
> clock_gettime(2). As a result the cost of kernel-userland transitions
> outweighs the actual work, thus I am proposing we make the data
> available directly from userland without passing through a system call.
>
> This has been a subject of discussion multiple times across the years
> and last I heard from it was at the p2k19 hackathon that I hosted in
> Bucharest where espie@ sent me a diff from one of his students(?). Being
> busy with organization I have not had the time to look at it and
> I am thus getting back to it just now due to robert@ prodding me again
> on the subject. The proposed diff is mine, not the student's.
>
>
> The technical bits.
>
> Please keep in mind that this is only proof of concept. I am looking for
> ways to improve the current diff. As it is, it requires a flag day
> because it makes use of ELF aux vectors to export the data from the
> kernel.

That is not an entirely unreasonable way to pass the information from
the kernel to userland.  Care has to be taken that the right thing
happens for static binaries.  Should probably export this stuff to
userland instead of duplicating the definitions in libc.  But we
should do that in a more standard-compliant way.  See for example what
NetBSD has in <sys/exec_elf.h>.

Doesn't really imply a flag day.  We should simply fall back on using
the system call when the "timekeep" page isn't provided.

> I have also played with exposing the data via separate ELF sections and
> with kbind-mmap alternatives. The first also involves a flag day and is
> more intrusive in my opinion, and the second I could not get to work. I
> think that would be the less intrusive way of doing it, possibly without
> a flag day, so if anyone knows how, please let me know.

Agreed, the Linux-style VDSO stuff is way too complicated.

> The supported clocks are just those that do not require process specific
> data. Those can also be handled later if this diff is decided to be a
> good thing.

Linux definitely doesn't export all the clocks.

> Clock update inside the kernel is done at the end of tc_windup(). There
> might be better places to do it. Let me know where.
>
> The update currently does the work of clock_gettime(), but it can
> probably be changed to only update the timehands and move the logic
> elsewhere. Note that if we expose only the timehands to userland, most
> of the bintime functionality has to also be made available there. Or so
> I think.

Unfortunately what you're doing here isn't good enough.  You're only
exporting low-resolution versions of the clocks.  The equivalent of
what Linux calls CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE.
And I'm fairly certain that isn't what the applications want.  Why
else would they be calling clock_gettime() a gazillion times per
second...

And implementing the non-coarse variants is where things get messy.
For this you basically need a high-resolution clock that you can read
from userland.  For amd64 that probably means the TSC which is a can
of worms of its own.  Ignoring that for the moment, you then need to
store the following bits of information in your shared "timekeeping" page:

1. The clock time.
2. The TSC count corresponding to that clock time.
3. The TSC frequency.

And then let the code in libc extrapolate the time based on that.
Tricky bit is that the kernel may update 1 and 2 while userland is
reading those values.  So you probably need to read 1, then 2 and read
1 again and check that it didn't change.

> In userland, I wrapped the clock_gettime(2) syscall in libc. There, I
> search for the auxiliary vector and fetch the timespec data from it.
> As you can see in the diff, parts from the elf_exec header will have to
> be exposed to userland if we do it this way.

There seems to be a preferred way to do this wrapping, with the true
system call being renamed with two underscores.  Philip can probably
give some hints on how this should be done.

> Results.
>
> To test this diff you need to do a full release(8). I have tested this
> with multiple programs. Test programs, base programs and packages. None
> the less, this diff touches many important areas of our tree and is
> very fragile. I also probably missed changing some parts that required
> change due to libc or elf changes.
>
> If you see regressions, which you probably will, please let me know.
>
> Here is a stress test from robert@:
>
> robert@x202:/home/robert> time ./t && time ./t2
> 0m00.11s real 0m00.12s user 0m00.00s system
> 0m09.99s real 0m02.64s user 0m03.36s system
> t is clock_gettime() and t2 is SYS_clock_gettime()
>
>
> Please keep the discussions on the list and let me know what you think
> and how we can improve this if we decide this is wanted in the tree.
>
> Paul
>
> diff --git lib/libc/shlib_version lib/libc/shlib_version
> index 06f98b01084..5fb0770494f 100644
> --- lib/libc/shlib_version
> +++ lib/libc/shlib_version
> @@ -1,4 +1,4 @@
>  major=96
> -minor=0
> +minor=1
>  # note: If changes were made to include/thread_private.h or if system calls
>  # were added/changed then librthread/shlib_version must also be updated.
> diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
> index 34769576ced..607985e8f20 100644
> --- lib/libc/sys/Makefile.inc
> +++ lib/libc/sys/Makefile.inc
> @@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
>  
>  # glue to offer userland wrappers for some syscalls
>  SRCS+= posix_madvise.c pthread_sigmask.c \
> - w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
> + w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
> + w_clock_gettime.c
>  
>  # glue for compat with old syscall interfaces.
>  SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
> diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
> new file mode 100644
> index 00000000000..e955615248f
> --- /dev/null
> +++ lib/libc/sys/w_clock_gettime.c
> @@ -0,0 +1,114 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2020 Paul Irofti <[hidden email]>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <stdlib.h>
> +#include <time.h>
> +#include <err.h>
> +
> +#include <sys/timekeep.h>
> +
> +void *elf_aux_timekeep;
> +
> +
> +/*
> + * Needed exec_elf implementation.
> + * To be exposed by the kernel later if needed.
> + */
> +
> +#include <sys/exec_elf.h>
> +
> +typedef struct {
> + uint32_t au_id; /* 32-bit id */
> + uint64_t au_v; /* 64-bit value */
> +} AuxInfo;
> +
> +enum AuxID {
> + AUX_null = 0,
> + AUX_ignore = 1,
> + AUX_execfd = 2,
> + AUX_phdr = 3, /* &phdr[0] */
> + AUX_phent = 4, /* sizeof(phdr[0]) */
> + AUX_phnum = 5, /* # phdr entries */
> + AUX_pagesz = 6, /* PAGESIZE */
> + AUX_base = 7, /* ld.so base addr */
> + AUX_flags = 8, /* processor flags */
> + AUX_entry = 9, /* a.out entry */
> + AUX_sun_uid = 2000, /* euid */
> + AUX_sun_ruid = 2001, /* ruid */
> + AUX_sun_gid = 2002, /* egid */
> + AUX_sun_rgid = 2003, /* rgid */
> + AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
> +};
> +
> +
> +/*
> + * Helper functions.
> + */
> +
> +static int
> +find_timekeep(void)
> +{
> + Elf_Addr *stackp;
> + AuxInfo *auxv;
> + int found = 0;
> +
> + stackp = (Elf_Addr *)environ;
> + while (*stackp++) ; /* pass environment */
> +
> + /* look-up timekeep auxv */
> + for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
> + if (auxv->au_id == AUX_openbsd_timekeep) {
> + found = 1;
> + break;
> + }
> + if (found == 0) {
> + warnx("%s", "Could not find auxv!");
> + return -1;
> + }
> +
> + elf_aux_timekeep = (void *)auxv->au_v;
> + return 0;
> +}
> +
> +int
> +WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
> +{
> + struct timekeep *timekeep;
> +
> + if (elf_aux_timekeep == NULL && find_timekeep())
> + return clock_gettime(clock_id, tp);
> + timekeep = elf_aux_timekeep;
> +
> + switch (clock_id) {
> + case CLOCK_REALTIME:
> + *tp = timekeep->tp_realtime;
> + break;
> + case CLOCK_UPTIME:
> + *tp = timekeep->tp_uptime;
> + break;
> + case CLOCK_MONOTONIC:
> + *tp = timekeep->tp_monotonic;
> + break;
> + case CLOCK_BOOTTIME:
> + *tp = timekeep->tp_boottime;
> + break;
> + default:
> + return clock_gettime(clock_id, tp);
> + }
> + return 0;
> +}
> +DEF_WRAP(clock_gettime);
> diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> index 9b5b8eb3acf..59bc923a6fb 100644
> --- sys/kern/exec_elf.c
> +++ sys/kern/exec_elf.c
> @@ -124,7 +124,7 @@ extern char *syscallnames[];
>  /*
>   * How many entries are in the AuxInfo array we pass to the process?
>   */
> -#define ELF_AUX_ENTRIES 8
> +#define ELF_AUX_ENTRIES 9
>  
>  /*
>   * This is the OpenBSD ELF emul
> @@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
>   a->au_v = ap->arg_entry;
>   a++;
>  
> + a->au_id = AUX_openbsd_timekeep;
> + a->au_v = p->p_p->ps_timekeep;
> + a++;
> +
>   a->au_id = AUX_null;
>   a->au_v = 0;
>   a++;
> diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
> index 20480c2fc28..2496458fde1 100644
> --- sys/kern/kern_exec.c
> +++ sys/kern/kern_exec.c
> @@ -64,6 +64,11 @@
>  #include <uvm/uvm_extern.h>
>  #include <machine/tcb.h>
>  
> +#include <sys/timekeep.h>
> +
> +struct uvm_object *timekeep_object;
> +struct timekeep* timekeep;
> +
>  void unveil_destroy(struct process *ps);
>  
>  const struct kmem_va_mode kv_exec = {
> @@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
>   */
>  int exec_sigcode_map(struct process *, struct emul *);
>  
> +/*
> + * Map the shared timekeep page.
> + */
> +int exec_timekeep_map(struct process *);
> +
>  /*
>   * If non-zero, stackgap_random specifies the upper limit of the random gap size
>   * added to the fixed stack position. Must be n^2.
> @@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
>   /* map the process's signal trampoline code */
>   if (exec_sigcode_map(pr, pack.ep_emul))
>   goto free_pack_abort;
> + /* map the process's timekeep page */
> + if (exec_timekeep_map(pr))
> + goto free_pack_abort;
>  
>  #ifdef __HAVE_EXEC_MD_MAP
>   /* perform md specific mappings that process might need */
> @@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
>  
>   return (0);
>  }
> +
> +int exec_timekeep_map(struct process *pr)
> +{
> + size_t timekeep_sz = sizeof(struct timekeep);
> +
> + /*
> + * Similar to the sigcode object, except that there is a single timekeep
> + * object, and not one per emulation.
> + */
> + if (timekeep_object == NULL) {
> + vaddr_t va;
> +
> + timekeep_object = uao_create(timekeep_sz, 0);
> + uao_reference(timekeep_object);
> +
> + if (uvm_map(kernel_map, &va, round_page(timekeep_sz), timekeep_object,
> +    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> +    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
> + uao_detach(timekeep_object);
> + return (ENOMEM);
> + }
> +
> + timekeep = (struct timekeep *)va;
> + }
> +
> + uao_reference(timekeep_object);
> + if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, round_page(timekeep_sz),
> +    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
> +    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
> + uao_detach(timekeep_object);
> + return (ENOMEM);
> + }
> +
> + return (0);
> +}
> diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
> index bcf8f689625..007f1116c4f 100644
> --- sys/kern/kern_tc.c
> +++ sys/kern/kern_tc.c
> @@ -35,6 +35,7 @@
>  #include <sys/queue.h>
>  #include <sys/malloc.h>
>  #include <dev/rndvar.h>
> +#include <sys/timekeep.h>
>  
>  /*
>   * A large step happens on boot.  This constant detects such steps.
> @@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
>   BINTIME_TO_TIMEVAL(&bt, tvp);
>  }
>  
> +void
> +tc_clock_gettime(void)
> +{
> + struct bintime bt;
> +
> + if (timekeep == NULL)
> + return;
> +
> + /* CLOCK_REALTIME */
> + nanotime(&timekeep->tp_realtime);
> +
> + /* CLOCK_UPTIME */
> + binuptime(&bt);
> + bintimesub(&bt, &naptime, &bt);
> + BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
> +
> + /* CLOCK_MONOTONIC */
> + nanouptime(&timekeep->tp_monotonic);
> +
> + /* CLOCK_BOOTTIME */
> + timekeep->tp_boottime = timekeep->tp_monotonic;
> +
> + return;
> +}
> +
>  void
>  bintime(struct bintime *bt)
>  {
> @@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
>   time_uptime = th->th_offset.sec;
>   membar_producer();
>   timehands = th;
> +
> + tc_clock_gettime();
>  }
>  
>  /* Report or change the active timecounter hardware. */
> diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
> index a40e0510273..f55b75f1e84 100644
> --- sys/sys/exec_elf.h
> +++ sys/sys/exec_elf.h
> @@ -691,7 +691,8 @@ enum AuxID {
>   AUX_sun_uid = 2000, /* euid */
>   AUX_sun_ruid = 2001, /* ruid */
>   AUX_sun_gid = 2002, /* egid */
> - AUX_sun_rgid = 2003 /* rgid */
> + AUX_sun_rgid = 2003, /* rgid */
> + AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
>  };
>  
>  struct elf_args {
> diff --git sys/sys/proc.h sys/sys/proc.h
> index 357c0c0d52c..93a79a220db 100644
> --- sys/sys/proc.h
> +++ sys/sys/proc.h
> @@ -248,6 +248,8 @@ struct process {
>   u_int ps_rtableid; /* Process routing table/domain. */
>   char ps_nice; /* Process "nice" value. */
>  
> + vaddr_t ps_timekeep; /* User pointer to timekeep */
> +
>   struct uprof { /* profile arguments */
>   caddr_t pr_base; /* buffer base */
>   size_t  pr_size; /* buffer size */
> diff --git sys/sys/timekeep.h sys/sys/timekeep.h
> new file mode 100644
> index 00000000000..bad25185bc4
> --- /dev/null
> +++ sys/sys/timekeep.h
> @@ -0,0 +1,37 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2020 Paul Irofti <[hidden email]>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#ifndef _SYS_TIMEKEEP_H_
> +#define _SYS_TIMEKEEP_H_
> +
> +#include <sys/time.h>
> +
> +struct timekeep {
> + struct timespec tp_realtime;
> + struct timespec tp_uptime;
> + struct timespec tp_monotonic;
> + struct timespec tp_boottime;
> +};
> +
> +#if defined(_KERNEL)
> +#include <uvm/uvm_extern.h>
> +
> +extern struct uvm_object *timekeep_object;
> +extern struct timekeep *timekeep;
> +#endif
> +
> +#endif /* _SYS_TIMEKEEP_H_ */
>
>

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Robert Nagy
On 13/05/20 17:05 +0200, Mark Kettenis wrote:

> > The update currently does the work of clock_gettime(), but it can
> > probably be changed to only update the timehands and move the logic
> > elsewhere. Note that if we expose only the timehands to userland, most
> > of the bintime functionality has to also be made available there. Or so
> > I think.
>
> Unfortunately what you're doing here isn't good enough.  You're only
> exporting low-resolution versions of the clocks.  The equivalent of
> what Linux calls CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE.
> And I'm fairly certain that isn't what the applications want.  Why
> else would they be calling clock_gettime() a gazillion times per
> second...


Most of the big programs use CLOCK_MONOTONIC.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Benjamin Baier
In reply to this post by Robert Nagy
On Wed, 13 May 2020 16:09:57 +0200
Robert Nagy <[hidden email]> wrote:

> On 13/05/20 17:03 +0300, Paul Irofti wrote:
> > Here is a stress test from robert@:
> >
> > robert@x202:/home/robert> time ./t && time ./t2
> > 0m00.11s real 0m00.12s user 0m00.00s system
> > 0m09.99s real 0m02.64s user 0m03.36s system
> > t is clock_gettime() and t2 is SYS_clock_gettime()
>
> I am in the middle of rebuilding the packages that should gain significant
> speedup right now. That small test does 5 million calls to clock_gettime,
> so it is a bit over-reaching but still it shows the difference.

Well, it's pretty close to a real world desktop system (4mio in 27sec)

root# time btrace calling_clock_gettime.bt  
^C@num[chrome]: 2476715
@num[java]: 1429533
@num[Xorg]: 308404
@num[sndiod]: 126398
@num[conky]: 116
@num[sh]: 106
@num[syslogd]: 2
    0m27.12s real     0m04.64s user     0m01.89s system

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Stuart Henderson
In reply to this post by Robert Nagy
Thanks for looking at this Paul!

On 2020/05/13 17:15, Robert Nagy wrote:

> On 13/05/20 17:05 +0200, Mark Kettenis wrote:
> > > The update currently does the work of clock_gettime(), but it can
> > > probably be changed to only update the timehands and move the logic
> > > elsewhere. Note that if we expose only the timehands to userland, most
> > > of the bintime functionality has to also be made available there. Or so
> > > I think.
> >
> > Unfortunately what you're doing here isn't good enough.  You're only
> > exporting low-resolution versions of the clocks.  The equivalent of
> > what Linux class CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE.
> > And I'm fairly certain that isn't what the applications want.  Why
> > else would they be calling clock_gettime() a gazillion times per
> > second...
>
>
> Most of the big programs use CLOCK_MONOTONIC.
>

Agreed.

Quick counts from a dumb search with codesearch.debian:

CLOCK_REALTIME_COARSE 376
CLOCK_MONOTONIC_COARSE 639
CLOCK_REALTIME 8756
CLOCK_MONOTONIC 10776

I have looked over ports source and almost everything I see prefers
CLOCK_MONOTONIC if available then falls back to CLOCK_REALTIME.
Occasionally you have things using only CLOCK_REALTIME but not many.
So I think it's fair to say most of the latter two are overlapping
cases.

In Linux the vDSO handles CLOCK_{REALTIME,MONOTONIC}{,_COARSE}.
Depending on the clock source it may still use syscalls, though; people
got bitten by this on EC2, where some machine types default to a source
that still needs syscalls.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Mark Kettenis
> Date: Wed, 13 May 2020 16:55:24 +0100
> From: Stuart Henderson <[hidden email]>
>
> Thanks for looking at this Paul!
>
> On 2020/05/13 17:15, Robert Nagy wrote:
> > On 13/05/20 17:05 +0200, Mark Kettenis wrote:
> > > > The update currently does the work of clock_gettime(), but it can
> > > > probably be changed to only update the timehands and move the logic
> > > > elsewhere. Note that if we expose only the timehands to userland, most
> > > > of the bintime functionality has to also be made available there. Or so
> > > > I think.
> > >
> > > Unfortunately what you're doing here isn't good enough.  You're only
> > > exporting low-resolution versions of the clocks.  The equivalent of
> > > what Linux class CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE.
> > > And I'm fairly certain that isn't what the applications want.  Why
> > > else would they be calling clock_gettime() a gazillion times per
> > > second...
> >
> >
> > Most of the big programs use CLOCK_MONOTONIC.
> >
>
> Agreed.
>
> Quick counts from a dumb search with codesearch.debian:
>
> CLOCK_REALTIME_COARSE 376
> CLOCK_MONOTONIC_COARSE 639
> CLOCK_REALTIME 8756
> CLOCK_MONOTONIC 10776
>
> I have looked over ports source and almost everything I see prefers
> CLOCK_MONOTONIC if available then falls back to CLOCK_REALTIME.
> Occasionally you have things using only CLOCK_REALTIME but not many.
> So I think it's fair to say most of the latter two are overlapping
> cases.
>
> In linux the vdso handles CLOCK_{REALTIME,MONOTONIC}{,_COARSE}.
> Depending on the clock source it may still use syscalls though, people
> got bitten by this on ec2 where some machine types default to a source
> that still needed syscalls.

Given the shitshow that the TSC is on amd64, it is unavoidable that we
end up in that same situation.  In theory we could support alternative
hardware clocks as well by memory mapping the HPET read-only into
userland, but that may have unintended side-effects.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Scott Cheloha
On Wed, May 13, 2020 at 06:09:52PM +0200, Mark Kettenis wrote:

> > Date: Wed, 13 May 2020 16:55:24 +0100
> > From: Stuart Henderson <[hidden email]>
> >
> > Thanks for looking at this Paul!
> >
> > On 2020/05/13 17:15, Robert Nagy wrote:
> > > On 13/05/20 17:05 +0200, Mark Kettenis wrote:
> > > > > The update currently does the work of clock_gettime(), but it can
> > > > > probably be changed to only update the timehands and move the logic
> > > > > elsewhere. Note that if we expose only the timehands to userland, most
> > > > > of the bintime functionality has to also be made available there. Or so
> > > > > I think.
> > > >
> > > > Unfortunately what you're doing here isn't good enough.  You're only
> > > > exporting low-resolution versions of the clocks.  The equivalent of
> > > > what Linux class CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE.
> > > > And I'm fairly certain that isn't what the applications want.  Why
> > > > else would they be calling clock_gettime() a gazillion times per
> > > > second...
> > >
> > >
> > > Most of the big programs use CLOCK_MONOTONIC.
> > >
> >
> > Agreed.
> >
> > Quick counts from a dumb search with codesearch.debian:
> >
> > CLOCK_REALTIME_COARSE 376
> > CLOCK_MONOTONIC_COARSE 639
> > CLOCK_REALTIME 8756
> > CLOCK_MONOTONIC 10776
> >
> > I have looked over ports source and almost everything I see prefers
> > CLOCK_MONOTONIC if available then falls back to CLOCK_REALTIME.
> > Occasionally you have things using only CLOCK_REALTIME but not many.
> > So I think it's fair to say most of the latter two are overlapping
> > cases.
> >
> > In linux the vdso handles CLOCK_{REALTIME,MONOTONIC}{,_COARSE}.
> > Depending on the clock source it may still use syscalls though, people
> > got bitten by this on ec2 where some machine types default to a source
> > that still needed syscalls.
>
> Given the shitshow that the TSC is on amd64, it is unavoidable that we
> end up in that same situation.  In theory we could support alternative
> hardware clocks as well by memory mapping the HPET read-only into
> userland, but that may have unintended side-effects.

The overhead of actually reading the HPET is larger than the overhead
of the syscall itself.  Ditto that for the ACPI timer.  At least on
amd64, this userland approach is only a net savings with the TSC and
all the complexity it entails to keep it monotonic.

Reading the TSC is only slightly more expensive than reading the low-res
timestamps we provide via e.g. getnanouptime(9).  Switch to the HPET and
you've lost your upside.

Attached is a patch to add CLOCK_MONOTONIC_COARSE/CLOCK_REALTIME_COARSE
to the system.

Below is a test program I use to fuss around with comparing timecounters.
I've added a -c flag to switch to the COARSE clocks.

Via SYS_clock_gettime() I'm seeing output like this:

$ doas sysctl kern.timecounter.hardware=tsc
kern.timecounter.hardware: tsc -> tsc
$ command time doas nice -n -20 ./gettime mono
        1.23 real         0.78 user         0.45 sys
$ command time doas nice -n -20 ./gettime -c mono
        1.18 real         0.74 user         0.45 sys
$ doas sysctl kern.timecounter.hardware=acpihpet0
kern.timecounter.hardware: tsc -> acpihpet0
$ command time doas nice -n -20 ./gettime mono    
        4.73 real         0.82 user         3.91 sys
$ command time doas nice -n -20 ./gettime -c mono
        1.19 real         0.73 user         0.46 sys

... so the clock_gettime(2) overhead as a syscall on my machine is
substantially smaller than the HPET overhead, unless you have a way to
make reading the HPET faster than it is right now.

--

#include <err.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

void usage(void);

int
main(int argc, char *argv[])
{
        struct timespec now;
        const char *clockname, *errstr;
        long long i, num;
        clockid_t clock;
        int ch, coarse;

        num = 1000000;
        coarse = 0;

        while ((ch = getopt(argc, argv, "c")) != -1) {
                switch (ch) {
                case 'c':
                        coarse = 1;
                        break;
                case 'n':
                        num = strtonum(optarg, 1, LLONG_MAX, &errstr);
                        if (errstr != NULL)
                                errx(1, "num is %s: %s", errstr, optarg);
                        break;
                default:
                        usage();
                }
        }
        argc -= optind;
        argv += optind;

        if (argc != 1)
                usage();

        clockname = argv[0];

        if (strcmp(clockname, "mono") == 0)
                clock = (coarse) ? CLOCK_MONOTONIC_COARSE : CLOCK_MONOTONIC;
        else if (strcmp(clockname, "real") == 0)
                clock = (coarse) ? CLOCK_REALTIME_COARSE : CLOCK_REALTIME;
        else
                errx(1, "invalid clock: %s", clockname);

        for (i = 0; i < num; i++)
                clock_gettime(clock, &now);

        return 0;
}

/*
 * Print the command-line synopsis to standard error and terminate the
 * program with exit status 1.  Does not return.
 */
void
usage(void)
{
        const char *progname = getprogname();

        fprintf(stderr, "usage: %s [-c] [-n num] mono | real\n", progname);
        exit(1);
}

--

Index: kern/kern_time.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.127
diff -u -p -r1.127 kern_time.c
--- kern/kern_time.c 20 Mar 2020 04:08:25 -0000 1.127
+++ kern/kern_time.c 13 May 2020 19:17:34 -0000
@@ -114,6 +114,9 @@ clock_gettime(struct proc *p, clockid_t
  case CLOCK_REALTIME:
  nanotime(tp);
  break;
+ case CLOCK_REALTIME_COARSE:
+ getnanotime(tp);
+ break;
  case CLOCK_UPTIME:
  binuptime(&bt);
  bintimesub(&bt, &naptime, &bt);
@@ -122,6 +125,9 @@ clock_gettime(struct proc *p, clockid_t
  case CLOCK_MONOTONIC:
  case CLOCK_BOOTTIME:
  nanouptime(tp);
+ break;
+ case CLOCK_MONOTONIC_COARSE:
+ getnanouptime(tp);
  break;
  case CLOCK_PROCESS_CPUTIME_ID:
  nanouptime(tp);
Index: sys/_time.h
===================================================================
RCS file: /cvs/src/sys/sys/_time.h,v
retrieving revision 1.9
diff -u -p -r1.9 _time.h
--- sys/_time.h 18 Dec 2017 05:51:53 -0000 1.9
+++ sys/_time.h 13 May 2020 19:17:34 -0000
@@ -38,6 +38,8 @@
 #define CLOCK_THREAD_CPUTIME_ID 4
 #define CLOCK_UPTIME 5
 #define CLOCK_BOOTTIME 6
+#define CLOCK_REALTIME_COARSE 7
+#define CLOCK_MONOTONIC_COARSE 8
 
 #if __BSD_VISIBLE
 /*

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Solene Rapenne
In reply to this post by Paul Irofti-4
Le Wed, 13 May 2020 17:03:01 +0300,
Paul Irofti <[hidden email]> a écrit :

> Hi,
>
> By far one of the most popular and frequently used system calls is
> clock_gettime(2). As a result the cost of kernel-userland transitions
> out weight the actual work, thus I am proposing we make the data
> available directly from userland without passing through a system
> call.
>
> This has been a subject of discussion multiple times across the years
> and last I heard from it was at the p2k19 hackthon that I hosted in
> Bucharest where espie@ sent me a diff from one of his students(?).
> Being busy with organization I have not had the time to look at it and
> I am thus getting back to it just now due to robert@ prodding me again
> on the subject. The proposed diff is mine, not the student's.
>
>
> The technical bits.
>
> Please keep in mind that this is only proof of concept. I am looking
> for ways to improve the current diff. As it is, it requires a flag day
> because it makes use of ELF aux vectors to export the data from the
> kernel.
>
> I have also played with exposing the data via separate ELF sections
> and with kbind-mmap alternatives. The frist also involves a flag day
> and is more intrusive in my opinion, and the second I could not get
> to work. I think that would be the less intrusive way of doing it,
> possibly without a flag day, so if anyone knows how, please let me
> know.
>
> The supported clocks are just those that do not require process
> specific data. Those can also be handled later if this diff is
> decided to be a good thing.
>
> Clock update inside the kernel is done at the end of tc_windup().
> There might be better places to do it. Let me know where.
>
> The update currently does the work of clock_gettime(), but it can
> probably be changed to only update the timehands and move the logic
> elsewhere. Note that if we expose only the timehands to userland, most
> of the bintime functionality has to also be made available there. Or
> so I think.
>
> In userland, I wrapped the clock_gettime(2) syscall in libc. There, I
> search for the auxiliary vector and fetch the timespec data from it.
> As you can see in the diff, parts from the elf_exec header will have
> to be exposed to userland if we do it this way.
>
>
> Results.
>
> To test this diff you need to do a full release(8). I have tested this
> with multiple programs. Test programs, base programs and packages.
> None the less, this diff touches many important areas of our tree and
> is very fragile. I also probably missed changing some parts that
> required change due to libc or elf changes.
>
> If you see regressions, which you probably will, please let me know.

With the patch, the system crashes reliably at boot when prompting for login.

I followed release(8) instructions, did I miss something?

cd /sys/arch/$(machine)/compile/GENERIC.MP
make obj
make config
make && make install
reboot

cd /usr/src
make obj
make build
sysmerge
cd /dev && ./MAKEDEV all

cd /usr/xenocara
make bootstrap
make obj
make build
reboot

I got a first panic like « panic init died (signal 0, exit 11) »
when I typed reboot.

Now, if I start the system with the new kernel (the old kernel.sp
still works), I get either an "init died" panic or I end up in ddb but
can't type in it. This happens after the full boot sequence, when
I'm prompted for login. I tried disabling all pkg_services
and xdm and it still crashes at this step; I can't log in.

Two screenshots of the crash errors:

https://perso.pw/IMG_20200514_110104.jpg 
https://perso.pw/IMG_20200514_110451.jpg 


dmesg output (from bsd.sp kernel which still boots)

OpenBSD 6.7 (GENERIC) #179: Thu May  7 11:02:37 MDT 2020
    [hidden email]:/usr/src/sys/arch/amd64/compile/GENERIC
real mem = 8033624064 (7661MB)
avail mem = 7777611776 (7417MB)
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.7 @ 0xec070 (76 entries)
bios0: vendor American Megatrends Inc. version "FB" date 06/25/2014
bios0: Gigabyte Technology Co., Ltd. H81M-D2V
acpi0 at bios0: ACPI 5.0
acpi0: sleep states S0 S3 S4 S5
acpi0: tables DSDT FACP APIC FPDT SSDT SSDT MCFG HPET SSDT SSDT
acpi0: wakeup devices RP01(S4) PXSX(S4) PXSX(S4) RP03(S4) PXSX(S4)
RP04(S4) PXSX(S4) PXSX(S4) PXSX(S4) PXSX(S4) PXSX(S4) GLAN(S4) EHC1(S4)
EHC2(S4) XHC_(S4) HDEF(S4) [...] acpitimer0 at acpi0: 3579545 Hz, 24
bits acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat cpu0 at mainbus0:
apid 0 (boot processor) cpu0: Intel(R) Core(TM) i3-4160 CPU @ 3.60GHz,
3592.14 MHz, 06-3c-03 cpu0:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,PERF,ITSC,FSGSBASE,TSC_ADJUST,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,MELTDOWN
cpu0: 256KB 64b/line 8-way L2 cache cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
cpu0: apic clock running at 99MHz
cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4, IBE
cpu at mainbus0: not configured
cpu at mainbus0: not configured
cpu at mainbus0: not configured
ioapic0 at mainbus0: apid 8 pa 0xfec00000, version 20, 24 pins
acpimcfg0 at acpi0
acpimcfg0: addr 0xf8000000, bus 0-63
acpihpet0 at acpi0: 14318179 Hz
acpiprt0 at acpi0: bus 0 (PCI0)
acpiprt1 at acpi0: bus 2 (RP01)
acpiprt2 at acpi0: bus 3 (RP03)
acpiprt3 at acpi0: bus 4 (RP04)
acpiprt4 at acpi0: bus 1 (PEG0)
acpiprt5 at acpi0: bus -1 (PEG1)
acpiprt6 at acpi0: bus -1 (PEG2)
acpiec0 at acpi0: not present
acpicpu0 at acpi0: C2(500@67 mwait.1@0x10), C1(1000@1 mwait.1), PSS
acpipwrres0 at acpi0: FN00, resource for FAN0
acpipwrres1 at acpi0: FN01, resource for FAN1
acpipwrres2 at acpi0: FN02, resource for FAN2
acpipwrres3 at acpi0: FN03, resource for FAN3
acpipwrres4 at acpi0: FN04, resource for FAN4
acpitz0 at acpi0: critical temperature is 105 degC
acpitz1 at acpi0: critical temperature is 105 degC
acpipci0 at acpi0 PCI0: 0x00000010 0x00000011 0x00000000
acpicmos0 at acpi0
acpibtn0 at acpi0: PWRB
"PNP0C0B" at acpi0 not configured
"PNP0C0B" at acpi0 not configured
"PNP0C0B" at acpi0 not configured
"PNP0C0B" at acpi0 not configured
"PNP0C0B" at acpi0 not configured
acpivideo0 at acpi0: GFX0
acpivout0 at acpivideo0: DD1F
cpu0: using VERW MDS workaround (except on vmm entry)
cpu0: Enhanced SpeedStep 3592 MHz: speeds: 3600, 3400, 3200, 3000,
2900, 2700, 2500, 2300, 2100, 1900, 1700, 1500, 1400, 1200, 1000, 800
MHz pci0 at mainbus0 bus 0 pchb0 at pci0 dev 0 function 0 "Intel Core
4G Host" rev 0x06 ppb0 at pci0 dev 1 function 0 "Intel Core 4G PCIE"
rev 0x06: msi pci1 at ppb0 bus 1
vendor "NVIDIA", unknown product 0x1c03 (class display subclass VGA,
rev 0xa1) at pci1 dev 0 function 0 not configured azalia0 at pci1 dev 0
function 1 vendor "NVIDIA", unknown product 0x10f1 rev 0xa1: msi
azalia0: no supported codecs inteldrm0 at pci0 dev 2 function 0 "Intel
HD Graphics 4600" rev 0x06 drm0 at inteldrm0
inteldrm0: msi, HASWELL, gen 7
azalia1 at pci0 dev 3 function 0 "Intel Core 4G HD Audio" rev 0x06: msi
azalia1: No codecs found
xhci0 at pci0 dev 20 function 0 "Intel 8 Series xHCI" rev 0x05: msi,
xHCI 1.0 usb0 at xhci0: USB revision 3.0
uhub0 at usb0 configuration 1 interface 0 "Intel xHCI root hub" rev
3.00/1.00 addr 1 "Intel 8 Series MEI" rev 0x04 at pci0 dev 22 function
0 not configured ehci0 at pci0 dev 26 function 0 "Intel 8 Series USB"
rev 0x05: apic 8 int 16 usb1 at ehci0: USB revision 2.0
uhub1 at usb1 configuration 1 interface 0 "Intel EHCI root hub" rev
2.00/1.00 addr 1 azalia2 at pci0 dev 27 function 0 "Intel 8 Series HD
Audio" rev 0x05: msi azalia2: codecs: Realtek/0x0887
audio0 at azalia2
ppb1 at pci0 dev 28 function 0 "Intel 8 Series PCIE" rev 0xd5: msi
pci2 at ppb1 bus 2
ppb2 at pci0 dev 28 function 2 "Intel 8 Series PCIE" rev 0xd5: msi
pci3 at ppb2 bus 3
re0 at pci3 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G
(0x4c00), msi, address fc:aa:14:68:75:64 rgephy0 at re0 phy 7: RTL8251
PHY, rev. 0 ppb3 at pci0 dev 28 function 3 "Intel 8 Series PCIE" rev
0xd5: msi pci4 at ppb3 bus 4
xhci1 at pci4 dev 0 function 0 "VIA VL805 xHCI" rev 0x01: msi, xHCI 1.0
usb2 at xhci1: USB revision 3.0
uhub2 at usb2 configuration 1 interface 0 "VIA xHCI root hub" rev
3.00/1.00 addr 1 ehci1 at pci0 dev 29 function 0 "Intel 8 Series USB"
rev 0x05: apic 8 int 23 usb3 at ehci1: USB revision 2.0
uhub3 at usb3 configuration 1 interface 0 "Intel EHCI root hub" rev
2.00/1.00 addr 1 pcib0 at pci0 dev 31 function 0 "Intel H81 LPC" rev
0x05 ahci0 at pci0 dev 31 function 2 "Intel 8 Series AHCI" rev 0x05:
msi, AHCI 1.3 ahci0: port 4: 3.0Gb/s
scsibus1 at ahci0: 32 targets
sd0 at scsibus1 targ 4 lun 0: <ATA, WDC WD5000AAKX-0, 15.0>
naa.50014ee0ae3f2d32 sd0: 476940MB, 512 bytes/sector, 976773168 sectors
ichiic0 at pci0 dev 31 function 3 "Intel 8 Series SMBus" rev 0x05: apic
8 int 18 iic0 at ichiic0
spdmem0 at iic0 addr 0x50: 4GB DDR3 SDRAM PC3-12800
spdmem1 at iic0 addr 0x52: 4GB DDR3 SDRAM PC3-12800
isa0 at pcib0
isadma0 at isa0
com0 at isa0 port 0x3f8/8 irq 4: ns16550a, 16 byte fifo
pckbc0 at isa0 port 0x60/5 irq 1 irq 12
pckbd0 at pckbc0 (kbd slot)
wskbd0 at pckbd0: console keyboard
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
lpt0 at isa0 port 0x378/4 irq 7
vmm0 at mainbus0: VMX/EPT
uhub4 at uhub1 port 1 configuration 1 interface 0 "Intel Rate Matching
Hub" rev 2.00/0.05 addr 2 uhub5 at uhub2 port 1 configuration 1
interface 0 "VIA Labs USB2.0 Hub" rev 2.10/4.20 addr 2 uhub6 at uhub5
port 1 configuration 1 interface 0 "ALCOR Generic USB Hub" rev
1.10/3.12 addr 3 uhidev0 at uhub6 port 1 configuration 1 interface 0
"CHERRY Mechanical Keyboard" rev 2.00/0.02 addr 4 uhidev0: iclass 3/1
ukbd0 at uhidev0: 8 variable keys, 6 key codes wskbd1 at ukbd0 mux 1
uhidev1 at uhub6 port 1 configuration 1 interface 1 "CHERRY Mechanical
Keyboard" rev 2.00/0.02 addr 4 uhidev1: iclass 3/0, 3 report ids
uhid0 at uhidev1 reportid 1: input=2, output=0, feature=0
ukbd1 at uhidev1 reportid 3: 120 variable keys, 0 key codes
wskbd2 at ukbd1 mux 1
uhidev2 at uhub6 port 1 configuration 1 interface 2 "CHERRY Mechanical
Keyboard" rev 2.00/0.02 addr 4 uhidev2: iclass 3/0
uhid1 at uhidev2: input=64, output=64, feature=64
uhidev3 at uhub6 port 2 configuration 1 interface 0 "SteelSeries Sensei
Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev3: iclass 3/0
uhid2 at uhidev3: input=32, output=32, feature=255
uhidev4 at uhub6 port 2 configuration 1 interface 1 "SteelSeries Sensei
Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev4: iclass 3/1
ums0 at uhidev4: 8 buttons, Z dir
wsmouse0 at ums0 mux 0
uhidev5 at uhub6 port 2 configuration 1 interface 2 "SteelSeries Sensei
Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev5: iclass 3/1
ukbd2 at uhidev5: 8 variable keys, 6 key codes
wskbd3 at ukbd2 mux 1
uhidev6 at uhub6 port 2 configuration 1 interface 3 "SteelSeries Sensei
Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev6: iclass 3/0
uhid3 at uhidev6: input=4, output=0, feature=0
uhidev7 at uhub6 port 3 configuration 1 interface 0 "Device 2Port
KVMSwitcher" rev 1.10/0.01 addr 6 uhidev7: iclass 3/0
ukbd3 at uhidev7: 8 variable keys, 6 key codes
wskbd4 at ukbd3 mux 1
uhidev8 at uhub6 port 3 configuration 1 interface 1 "Device 2Port
KVMSwitcher" rev 1.10/0.01 addr 6 uhidev8: no input interrupt endpoint
uhub7 at uhub3 port 1 configuration 1 interface 0 "Intel Rate Matching
Hub" rev 2.00/0.05 addr 2 vscsi0 at root
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (9b5c1232128549ea.a) swap on sd0b dump on sd0b
WARNING: / was not properly unmounted
inteldrm0: 1920x1080, 32bpp
wsdisplay0 at inteldrm0 mux 1: console (std, vt100 emulation), using
wskbd0 wskbd1: connecting to wsdisplay0
wskbd2: connecting to wsdisplay0
wskbd3: connecting to wsdisplay0
wskbd4: connecting to wsdisplay0
wsdisplay0: screen 1-5 added (std, vt100 emulation)


Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Paul Irofti-4
Hi Solene,

Robert is also seeing this on one of his systems. I am currently
investigating this and will come back with a fix.

Thank you for the report,
Paul

On 2020-05-14 12:31, Solene Rapenne wrote:

> Le Wed, 13 May 2020 17:03:01 +0300,
> Paul Irofti <[hidden email]> a écrit :
>
>> Hi,
>>
>> By far one of the most popular and frequently used system calls is
>> clock_gettime(2). As a result the cost of kernel-userland transitions
>> out weight the actual work, thus I am proposing we make the data
>> available directly from userland without passing through a system
>> call.
>>
>> This has been a subject of discussion multiple times across the years
>> and last I heard from it was at the p2k19 hackthon that I hosted in
>> Bucharest where espie@ sent me a diff from one of his students(?).
>> Being busy with organization I have not had the time to look at it and
>> I am thus getting back to it just now due to robert@ prodding me again
>> on the subject. The proposed diff is mine, not the student's.
>>
>>
>> The technical bits.
>>
>> Please keep in mind that this is only proof of concept. I am looking
>> for ways to improve the current diff. As it is, it requires a flag day
>> because it makes use of ELF aux vectors to export the data from the
>> kernel.
>>
>> I have also played with exposing the data via separate ELF sections
>> and with kbind-mmap alternatives. The frist also involves a flag day
>> and is more intrusive in my opinion, and the second I could not get
>> to work. I think that would be the less intrusive way of doing it,
>> possibly without a flag day, so if anyone knows how, please let me
>> know.
>>
>> The supported clocks are just those that do not require process
>> specific data. Those can also be handled later if this diff is
>> decided to be a good thing.
>>
>> Clock update inside the kernel is done at the end of tc_windup().
>> There might be better places to do it. Let me know where.
>>
>> The update currently does the work of clock_gettime(), but it can
>> probably be changed to only update the timehands and move the logic
>> elsewhere. Note that if we expose only the timehands to userland, most
>> of the bintime functionality has to also be made available there. Or
>> so I think.
>>
>> In userland, I wrapped the clock_gettime(2) syscall in libc. There, I
>> search for the auxiliary vector and fetch the timespec data from it.
>> As you can see in the diff, parts from the elf_exec header will have
>> to be exposed to userland if we do it this way.
>>
>>
>> Results.
>>
>> To test this diff you need to do a full release(8). I have tested this
>> with multiple programs. Test programs, base programs and packages.
>> None the less, this diff touches many important areas of our tree and
>> is very fragile. I also probably missed changing some parts that
>> required change due to libc or elf changes.
>>
>> If you see regressions, which you probably will, please let me know.
>
> With the patch, system crashes reliably at boot when prompting for login
>
> I followed release(8) instructions, did I miss something?
>
> cd /sys/arch/$(machine)/compile/GENERIC.MP
> make obj
> make config
> make && make install
> reboot
>
> cd /usrc/src
> make obj
> make build
> sysmerge
> cd /dev && ./MAKEDEV all
>
> cd /usr/xenocara
> make bootstrap
> make obj
> make build
> reboot
>
> I got a first panic like « panic init died (signal 0, exit 11)
> when I typed reboot.
>
> Now, if I start the system with the new kernel (old kernel.sp
> still work), I get either a panic init died or I have ddb but
> can't type in it. This happens after full boot sequence when
> I'm prompted for login: I tried to disable all pkg_services
> and xdm and it still crash at this step, I can't login.
>
> 2 screenshots of crashes errors
>
> https://perso.pw/IMG_20200514_110104.jpg
> https://perso.pw/IMG_20200514_110451.jpg
>
>
> dmesg output (from bsd.sp kernel which still boots)
>
> OpenBSD 6.7 (GENERIC) #179: Thu May  7 11:02:37 MDT 2020
>      [hidden email]:/usr/src/sys/arch/amd64/compile/GENERIC
> real mem = 8033624064 (7661MB)
> avail mem = 7777611776 (7417MB)
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 2.7 @ 0xec070 (76 entries)
> bios0: vendor American Megatrends Inc. version "FB" date 06/25/2014
> bios0: Gigabyte Technology Co., Ltd. H81M-D2V
> acpi0 at bios0: ACPI 5.0
> acpi0: sleep states S0 S3 S4 S5
> acpi0: tables DSDT FACP APIC FPDT SSDT SSDT MCFG HPET SSDT SSDT
> acpi0: wakeup devices RP01(S4) PXSX(S4) PXSX(S4) RP03(S4) PXSX(S4)
> RP04(S4) PXSX(S4) PXSX(S4) PXSX(S4) PXSX(S4) PXSX(S4) GLAN(S4) EHC1(S4)
> EHC2(S4) XHC_(S4) HDEF(S4) [...] acpitimer0 at acpi0: 3579545 Hz, 24
> bits acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat cpu0 at mainbus0:
> apid 0 (boot processor) cpu0: Intel(R) Core(TM) i3-4160 CPU @ 3.60GHz,
> 3592.14 MHz, 06-3c-03 cpu0:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,PERF,ITSC,FSGSBASE,TSC_ADJUST,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,MELTDOWN
> cpu0: 256KB 64b/line 8-way L2 cache cpu0: smt 0, core 0, package 0
> mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
> cpu0: apic clock running at 99MHz
> cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4, IBE
> cpu at mainbus0: not configured
> cpu at mainbus0: not configured
> cpu at mainbus0: not configured
> ioapic0 at mainbus0: apid 8 pa 0xfec00000, version 20, 24 pins
> acpimcfg0 at acpi0
> acpimcfg0: addr 0xf8000000, bus 0-63
> acpihpet0 at acpi0: 14318179 Hz
> acpiprt0 at acpi0: bus 0 (PCI0)
> acpiprt1 at acpi0: bus 2 (RP01)
> acpiprt2 at acpi0: bus 3 (RP03)
> acpiprt3 at acpi0: bus 4 (RP04)
> acpiprt4 at acpi0: bus 1 (PEG0)
> acpiprt5 at acpi0: bus -1 (PEG1)
> acpiprt6 at acpi0: bus -1 (PEG2)
> acpiec0 at acpi0: not present
> acpicpu0 at acpi0: C2(500@67 mwait.1@0x10), C1(1000@1 mwait.1), PSS
> acpipwrres0 at acpi0: FN00, resource for FAN0
> acpipwrres1 at acpi0: FN01, resource for FAN1
> acpipwrres2 at acpi0: FN02, resource for FAN2
> acpipwrres3 at acpi0: FN03, resource for FAN3
> acpipwrres4 at acpi0: FN04, resource for FAN4
> acpitz0 at acpi0: critical temperature is 105 degC
> acpitz1 at acpi0: critical temperature is 105 degC
> acpipci0 at acpi0 PCI0: 0x00000010 0x00000011 0x00000000
> acpicmos0 at acpi0
> acpibtn0 at acpi0: PWRB
> "PNP0C0B" at acpi0 not configured
> "PNP0C0B" at acpi0 not configured
> "PNP0C0B" at acpi0 not configured
> "PNP0C0B" at acpi0 not configured
> "PNP0C0B" at acpi0 not configured
> acpivideo0 at acpi0: GFX0
> acpivout0 at acpivideo0: DD1F
> cpu0: using VERW MDS workaround (except on vmm entry)
> cpu0: Enhanced SpeedStep 3592 MHz: speeds: 3600, 3400, 3200, 3000,
> 2900, 2700, 2500, 2300, 2100, 1900, 1700, 1500, 1400, 1200, 1000, 800
> MHz pci0 at mainbus0 bus 0 pchb0 at pci0 dev 0 function 0 "Intel Core
> 4G Host" rev 0x06 ppb0 at pci0 dev 1 function 0 "Intel Core 4G PCIE"
> rev 0x06: msi pci1 at ppb0 bus 1
> vendor "NVIDIA", unknown product 0x1c03 (class display subclass VGA,
> rev 0xa1) at pci1 dev 0 function 0 not configured azalia0 at pci1 dev 0
> function 1 vendor "NVIDIA", unknown product 0x10f1 rev 0xa1: msi
> azalia0: no supported codecs inteldrm0 at pci0 dev 2 function 0 "Intel
> HD Graphics 4600" rev 0x06 drm0 at inteldrm0
> inteldrm0: msi, HASWELL, gen 7
> azalia1 at pci0 dev 3 function 0 "Intel Core 4G HD Audio" rev 0x06: msi
> azalia1: No codecs found
> xhci0 at pci0 dev 20 function 0 "Intel 8 Series xHCI" rev 0x05: msi,
> xHCI 1.0 usb0 at xhci0: USB revision 3.0
> uhub0 at usb0 configuration 1 interface 0 "Intel xHCI root hub" rev
> 3.00/1.00 addr 1 "Intel 8 Series MEI" rev 0x04 at pci0 dev 22 function
> 0 not configured ehci0 at pci0 dev 26 function 0 "Intel 8 Series USB"
> rev 0x05: apic 8 int 16 usb1 at ehci0: USB revision 2.0
> uhub1 at usb1 configuration 1 interface 0 "Intel EHCI root hub" rev
> 2.00/1.00 addr 1 azalia2 at pci0 dev 27 function 0 "Intel 8 Series HD
> Audio" rev 0x05: msi azalia2: codecs: Realtek/0x0887
> audio0 at azalia2
> ppb1 at pci0 dev 28 function 0 "Intel 8 Series PCIE" rev 0xd5: msi
> pci2 at ppb1 bus 2
> ppb2 at pci0 dev 28 function 2 "Intel 8 Series PCIE" rev 0xd5: msi
> pci3 at ppb2 bus 3
> re0 at pci3 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G
> (0x4c00), msi, address fc:aa:14:68:75:64 rgephy0 at re0 phy 7: RTL8251
> PHY, rev. 0 ppb3 at pci0 dev 28 function 3 "Intel 8 Series PCIE" rev
> 0xd5: msi pci4 at ppb3 bus 4
> xhci1 at pci4 dev 0 function 0 "VIA VL805 xHCI" rev 0x01: msi, xHCI 1.0
> usb2 at xhci1: USB revision 3.0
> uhub2 at usb2 configuration 1 interface 0 "VIA xHCI root hub" rev
> 3.00/1.00 addr 1 ehci1 at pci0 dev 29 function 0 "Intel 8 Series USB"
> rev 0x05: apic 8 int 23 usb3 at ehci1: USB revision 2.0
> uhub3 at usb3 configuration 1 interface 0 "Intel EHCI root hub" rev
> 2.00/1.00 addr 1 pcib0 at pci0 dev 31 function 0 "Intel H81 LPC" rev
> 0x05 ahci0 at pci0 dev 31 function 2 "Intel 8 Series AHCI" rev 0x05:
> msi, AHCI 1.3 ahci0: port 4: 3.0Gb/s
> scsibus1 at ahci0: 32 targets
> sd0 at scsibus1 targ 4 lun 0: <ATA, WDC WD5000AAKX-0, 15.0>
> naa.50014ee0ae3f2d32 sd0: 476940MB, 512 bytes/sector, 976773168 sectors
> ichiic0 at pci0 dev 31 function 3 "Intel 8 Series SMBus" rev 0x05: apic
> 8 int 18 iic0 at ichiic0
> spdmem0 at iic0 addr 0x50: 4GB DDR3 SDRAM PC3-12800
> spdmem1 at iic0 addr 0x52: 4GB DDR3 SDRAM PC3-12800
> isa0 at pcib0
> isadma0 at isa0
> com0 at isa0 port 0x3f8/8 irq 4: ns16550a, 16 byte fifo
> pckbc0 at isa0 port 0x60/5 irq 1 irq 12
> pckbd0 at pckbc0 (kbd slot)
> wskbd0 at pckbd0: console keyboard
> pcppi0 at isa0 port 0x61
> spkr0 at pcppi0
> lpt0 at isa0 port 0x378/4 irq 7
> vmm0 at mainbus0: VMX/EPT
> uhub4 at uhub1 port 1 configuration 1 interface 0 "Intel Rate Matching
> Hub" rev 2.00/0.05 addr 2 uhub5 at uhub2 port 1 configuration 1
> interface 0 "VIA Labs USB2.0 Hub" rev 2.10/4.20 addr 2 uhub6 at uhub5
> port 1 configuration 1 interface 0 "ALCOR Generic USB Hub" rev
> 1.10/3.12 addr 3 uhidev0 at uhub6 port 1 configuration 1 interface 0
> "CHERRY Mechanical Keyboard" rev 2.00/0.02 addr 4 uhidev0: iclass 3/1
> ukbd0 at uhidev0: 8 variable keys, 6 key codes wskbd1 at ukbd0 mux 1
> uhidev1 at uhub6 port 1 configuration 1 interface 1 "CHERRY Mechanical
> Keyboard" rev 2.00/0.02 addr 4 uhidev1: iclass 3/0, 3 report ids
> uhid0 at uhidev1 reportid 1: input=2, output=0, feature=0
> ukbd1 at uhidev1 reportid 3: 120 variable keys, 0 key codes
> wskbd2 at ukbd1 mux 1
> uhidev2 at uhub6 port 1 configuration 1 interface 2 "CHERRY Mechanical
> Keyboard" rev 2.00/0.02 addr 4 uhidev2: iclass 3/0
> uhid1 at uhidev2: input=64, output=64, feature=64
> uhidev3 at uhub6 port 2 configuration 1 interface 0 "SteelSeries Sensei
> Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev3: iclass 3/0
> uhid2 at uhidev3: input=32, output=32, feature=255
> uhidev4 at uhub6 port 2 configuration 1 interface 1 "SteelSeries Sensei
> Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev4: iclass 3/1
> ums0 at uhidev4: 8 buttons, Z dir
> wsmouse0 at ums0 mux 0
> uhidev5 at uhub6 port 2 configuration 1 interface 2 "SteelSeries Sensei
> Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev5: iclass 3/1
> ukbd2 at uhidev5: 8 variable keys, 6 key codes
> wskbd3 at ukbd2 mux 1
> uhidev6 at uhub6 port 2 configuration 1 interface 3 "SteelSeries Sensei
> Raw Gaming Mouse" rev 1.10/1.12 addr 5 uhidev6: iclass 3/0
> uhid3 at uhidev6: input=4, output=0, feature=0
> uhidev7 at uhub6 port 3 configuration 1 interface 0 "Device 2Port
> KVMSwitcher" rev 1.10/0.01 addr 6 uhidev7: iclass 3/0
> ukbd3 at uhidev7: 8 variable keys, 6 key codes
> wskbd4 at ukbd3 mux 1
> uhidev8 at uhub6 port 3 configuration 1 interface 1 "Device 2Port
> KVMSwitcher" rev 1.10/0.01 addr 6 uhidev8: no input interrupt endpoint
> uhub7 at uhub3 port 1 configuration 1 interface 0 "Intel Rate Matching
> Hub" rev 2.00/0.05 addr 2 vscsi0 at root
> scsibus2 at vscsi0: 256 targets
> softraid0 at root
> scsibus3 at softraid0: 256 targets
> root on sd0a (9b5c1232128549ea.a) swap on sd0b dump on sd0b
> WARNING: / was not properly unmounted
> inteldrm0: 1920x1080, 32bpp
> wsdisplay0 at inteldrm0 mux 1: console (std, vt100 emulation), using
> wskbd0 wskbd1: connecting to wsdisplay0
> wskbd2: connecting to wsdisplay0
> wskbd3: connecting to wsdisplay0
> wskbd4: connecting to wsdisplay0
> wsdisplay0: screen 1-5 added (std, vt100 emulation)
>
>

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Paul Irofti-4
In reply to this post by Paul Irofti-4
Here is an updated diff that addresses the following points mentioned by
kettenis@:

  - syscall fallback was implemented from the first version

  - the low resolution clock argument, I think, was shown not to be a
    problem

  - TSC and HPET alternatives were discussed, and if we decide to add
    them, I think that should be done by a separate diff

  - I think this version does proper wrapping (at least according to the
    README); of course Philip's input would be greatly appreciated!

  - I will export the ELF bits after the diff gets into committable shape

  - proper auxv number instead of 2004: I see that NetBSD
    has taken 2004 for AT_SUN_LDELF; should I take 2015 which seems the
    next one free?

Hopefully this version also fixes the init bug solene@ was seeing.

Paul

diff --git lib/libc/asr/asr.c lib/libc/asr/asr.c
index cd056c85719..2b25d49f32a 100644
--- lib/libc/asr/asr.c
+++ lib/libc/asr/asr.c
@@ -196,11 +196,11 @@ poll_intrsafe(struct pollfd *fds, nfds_t nfds, int timeout)
  struct timespec pollstart, pollend, elapsed;
  int r;
 
- if (clock_gettime(CLOCK_MONOTONIC, &pollstart))
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollstart))
  return -1;
 
  while ((r = poll(fds, 1, timeout)) == -1 && errno == EINTR) {
- if (clock_gettime(CLOCK_MONOTONIC, &pollend))
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollend))
  return -1;
  timespecsub(&pollend, &pollstart, &elapsed);
  timeout -= elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
@@ -418,7 +418,7 @@ asr_check_reload(struct asr *asr)
  asr->a_rtime = 0;
  }
 
- if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
  return;
 
  if ((ts.tv_sec - asr->a_rtime) < RELOAD_DELAY && asr->a_rtime != 0)
diff --git lib/libc/crypt/bcrypt.c lib/libc/crypt/bcrypt.c
index 82de8fa33b7..63edde9072e 100644
--- lib/libc/crypt/bcrypt.c
+++ lib/libc/crypt/bcrypt.c
@@ -31,6 +31,7 @@
  *
  */
 
+#include <sys/time.h>
 #include <sys/types.h>
 #include <blf.h>
 #include <ctype.h>
@@ -248,9 +249,9 @@ _bcrypt_autorounds(void)
  char buf[_PASSWORD_LEN];
  int duration;
 
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &before);
+ WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &before);
  bcrypt_newhash("testpassword", r, buf, sizeof(buf));
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &after);
+ WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &after);
 
  duration = after.tv_sec - before.tv_sec;
  duration *= 1000000;
diff --git lib/libc/dlfcn/dlfcn_stubs.c lib/libc/dlfcn/dlfcn_stubs.c
index 78d728f66cb..7b75ec4582a 100644
--- lib/libc/dlfcn/dlfcn_stubs.c
+++ lib/libc/dlfcn/dlfcn_stubs.c
@@ -80,10 +80,14 @@ dlerror(void)
  return "Wrong dl symbols!\n";
 }
 
+extern void *elf_aux_timekeep;
+extern int find_timekeep(void);
+
 int
 dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
  void *data)
 {
+ find_timekeep();
  if (_dl_cb != NULL && _dl_cb->dl_iterate_phdr != NULL)
  return _dl_cb->dl_iterate_phdr(callback, data);
 #ifndef PIC
diff --git lib/libc/dlfcn/init.c lib/libc/dlfcn/init.c
index 270f54aada5..0238bb50b0b 100644
--- lib/libc/dlfcn/init.c
+++ lib/libc/dlfcn/init.c
@@ -69,6 +69,9 @@ extern Elf_Ehdr __executable_start[] __attribute__((weak));
 /* provide definitions for these */
 const dl_cb *_dl_cb __relro = NULL;
 
+extern void *elf_aux_timekeep;
+extern int find_timekeep(void);
+
 void _libc_preinit(int, char **, char **, dl_cb_cb *) __dso_hidden;
 void
 _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
@@ -126,6 +129,7 @@ _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
  if (cb == NULL)
  setup_static_tib(phdr, phnum);
 #endif /* !PIC */
+ find_timekeep();
 }
 
 /* ARM just had to be different... */
diff --git lib/libc/gen/times.c lib/libc/gen/times.c
index 02e4dd44b5c..36841810d1b 100644
--- lib/libc/gen/times.c
+++ lib/libc/gen/times.c
@@ -52,7 +52,7 @@ times(struct tms *tp)
  return ((clock_t)-1);
  tp->tms_cutime = CONVTCK(ru.ru_utime);
  tp->tms_cstime = CONVTCK(ru.ru_stime);
- if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
  return ((clock_t)-1);
  return (ts.tv_sec * CLK_TCK + ts.tv_nsec / (1000000000 / CLK_TCK));
 }
diff --git lib/libc/gen/timespec_get.c lib/libc/gen/timespec_get.c
index 520a5954025..b2bdcd15a4d 100644
--- lib/libc/gen/timespec_get.c
+++ lib/libc/gen/timespec_get.c
@@ -30,6 +30,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <sys/time.h>
 #include <time.h>
 
 int
@@ -37,7 +38,7 @@ timespec_get(struct timespec *ts, int base)
 {
  switch (base) {
  case TIME_UTC:
- if (clock_gettime(CLOCK_REALTIME, ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_REALTIME, ts) == -1)
  return 0;
  break;
  default:
diff --git lib/libc/hidden/sys/time.h lib/libc/hidden/sys/time.h
index ed112320fa2..7f59daa0107 100644
--- lib/libc/hidden/sys/time.h
+++ lib/libc/hidden/sys/time.h
@@ -22,6 +22,7 @@
 
 PROTO_NORMAL(adjfreq);
 PROTO_NORMAL(adjtime);
+PROTO_WRAP(clock_gettime);
 PROTO_NORMAL(futimes);
 PROTO_NORMAL(getitimer);
 PROTO_NORMAL(gettimeofday);
diff --git lib/libc/net/res_random.c lib/libc/net/res_random.c
index 763e420bb88..9babb28470a 100644
--- lib/libc/net/res_random.c
+++ lib/libc/net/res_random.c
@@ -219,7 +219,7 @@ res_initid(void)
  if (ru_prf != NULL)
  arc4random_buf(ru_prf, sizeof(*ru_prf));
 
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
  ru_reseed = ts.tv_sec + RU_OUT;
  ru_msb = ru_msb == 0x8000 ? 0 : 0x8000;
 }
@@ -232,7 +232,7 @@ __res_randomid(void)
  u_int r;
  static void *randomid_mutex;
 
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
  pid = getpid();
 
  _MUTEX_LOCK(&randomid_mutex);
diff --git lib/libc/rpc/clnt_tcp.c lib/libc/rpc/clnt_tcp.c
index 8e6ef515b0e..927b4bf2028 100644
--- lib/libc/rpc/clnt_tcp.c
+++ lib/libc/rpc/clnt_tcp.c
@@ -393,12 +393,12 @@ readtcp(struct ct_data *ct, caddr_t buf, int len)
  pfd[0].events = POLLIN;
  TIMEVAL_TO_TIMESPEC(&ct->ct_wait, &wait);
  delta = wait;
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  for (;;) {
  r = ppoll(pfd, 1, &delta, NULL);
  save_errno = errno;
 
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&start, &after, &duration);
  timespecsub(&wait, &duration, &delta);
  if (delta.tv_sec < 0 || !timespecisset(&delta))
diff --git lib/libc/shlib_version lib/libc/shlib_version
index 06f98b01084..5fb0770494f 100644
--- lib/libc/shlib_version
+++ lib/libc/shlib_version
@@ -1,4 +1,4 @@
 major=96
-minor=0
+minor=1
 # note: If changes were made to include/thread_private.h or if system calls
 # were added/changed then librthread/shlib_version must also be updated.
diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
index 34769576ced..d0b5dd1bdcd 100644
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
 
 # glue to offer userland wrappers for some syscalls
 SRCS+= posix_madvise.c pthread_sigmask.c \
- w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
+ w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
+ w_clock_gettime.c
 
 # glue for compat with old syscall interfaces.
 SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
@@ -43,7 +44,7 @@ SRCS+= ${CANCEL:%=w_%.c} w_pread.c w_preadv.c w_pwrite.c w_pwritev.c
 ASM= __semctl.o __syscall.o __thrsigdivert.o \
  access.o acct.o adjfreq.o adjtime.o \
  bind.o chdir.o chflags.o chflagsat.o chmod.o chown.o chroot.o \
- clock_getres.o clock_gettime.o clock_settime.o \
+ clock_getres.o clock_settime.o \
  dup.o dup2.o dup3.o \
  execve.o \
  faccessat.o fchdir.o fchflags.o fchmod.o fchmodat.o fchown.o \
@@ -109,7 +110,7 @@ PPSEUDO_NOERR=${PSEUDO_NOERR:.o=.po}
 SPSEUDO_NOERR=${PSEUDO_NOERR:.o=.so}
 DPSEUDO_NOERR=${PSEUDO_NOERR:.o=.do}
 
-HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o}
+HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o} clock_gettime.o
 PHIDDEN=${HIDDEN:.o=.po}
 SHIDDEN=${HIDDEN:.o=.so}
 DHIDDEN=${HIDDEN:.o=.do}
diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
new file mode 100644
index 00000000000..04850fbda32
--- /dev/null
+++ lib/libc/sys/w_clock_gettime.c
@@ -0,0 +1,109 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <err.h>
+
+#include <sys/timekeep.h>
+
+void *elf_aux_timekeep;
+
+
+/*
+ * Needed exec_elf implementation.
+ * To be exposed by the kernel later if needed.
+ */
+
+#include <sys/exec_elf.h>
+
+typedef struct {
+ uint32_t au_id; /* 32-bit id */
+ uint64_t au_v; /* 64-bit value */
+} AuxInfo;
+
+enum AuxID {
+ AUX_null = 0,
+ AUX_ignore = 1,
+ AUX_execfd = 2,
+ AUX_phdr = 3, /* &phdr[0] */
+ AUX_phent = 4, /* sizeof(phdr[0]) */
+ AUX_phnum = 5, /* # phdr entries */
+ AUX_pagesz = 6, /* PAGESIZE */
+ AUX_base = 7, /* ld.so base addr */
+ AUX_flags = 8, /* processor flags */
+ AUX_entry = 9, /* a.out entry */
+ AUX_sun_uid = 2000, /* euid */
+ AUX_sun_ruid = 2001, /* ruid */
+ AUX_sun_gid = 2002, /* egid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
+};
+
+
+/*
+ * Helper functions.
+ */
+
+int
+find_timekeep(void)
+{
+ Elf_Addr *stackp;
+ AuxInfo *auxv;
+
+ stackp = (Elf_Addr *)environ;
+ while (*stackp++) ; /* pass environment */
+
+ /* look-up timekeep auxv */
+ for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
+ if (auxv->au_id == AUX_openbsd_timekeep) {
+ elf_aux_timekeep = (void *)auxv->au_v;
+ return 0;
+ }
+
+ warnx("%s", "Could not find auxv!");
+ return -1;
+}
+
+int
+WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
+{
+ struct timekeep *timekeep;
+
+ if (elf_aux_timekeep == NULL && find_timekeep())
+ return clock_gettime(clock_id, tp);
+ timekeep = elf_aux_timekeep;
+
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ *tp = timekeep->tp_realtime;
+ break;
+ case CLOCK_UPTIME:
+ *tp = timekeep->tp_uptime;
+ break;
+ case CLOCK_MONOTONIC:
+ *tp = timekeep->tp_monotonic;
+ break;
+ case CLOCK_BOOTTIME:
+ *tp = timekeep->tp_boottime;
+ break;
+ default:
+ return clock_gettime(clock_id, tp);
+ }
+ return 0;
+}
+DEF_WRAP(clock_gettime);
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 9b5b8eb3acf..59bc923a6fb 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -124,7 +124,7 @@ extern char *syscallnames[];
 /*
  * How many entries are in the AuxInfo array we pass to the process?
  */
-#define ELF_AUX_ENTRIES 8
+#define ELF_AUX_ENTRIES 9
 
 /*
  * This is the OpenBSD ELF emul
@@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
  a->au_v = ap->arg_entry;
  a++;
 
+ a->au_id = AUX_openbsd_timekeep;
+ a->au_v = p->p_p->ps_timekeep;
+ a++;
+
  a->au_id = AUX_null;
  a->au_v = 0;
  a++;
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 20480c2fc28..2496458fde1 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -64,6 +64,11 @@
 #include <uvm/uvm_extern.h>
 #include <machine/tcb.h>
 
+#include <sys/timekeep.h>
+
+struct uvm_object *timekeep_object;
+struct timekeep* timekeep;
+
 void unveil_destroy(struct process *ps);
 
 const struct kmem_va_mode kv_exec = {
@@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
  */
 int exec_sigcode_map(struct process *, struct emul *);
 
+/*
+ * Map the shared timekeep page.
+ */
+int exec_timekeep_map(struct process *);
+
 /*
  * If non-zero, stackgap_random specifies the upper limit of the random gap size
  * added to the fixed stack position. Must be n^2.
@@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
  /* map the process's signal trampoline code */
  if (exec_sigcode_map(pr, pack.ep_emul))
  goto free_pack_abort;
+ /* map the process's timekeep page */
+ if (exec_timekeep_map(pr))
+ goto free_pack_abort;
 
 #ifdef __HAVE_EXEC_MD_MAP
  /* perform md specific mappings that process might need */
@@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
 
  return (0);
 }
+
+int exec_timekeep_map(struct process *pr)
+{
+ size_t timekeep_sz = sizeof(struct timekeep);
+
+ /*
+ * Similar to the sigcode object, except that there is a single timekeep
+ * object, and not one per emulation.
+ */
+ if (timekeep_object == NULL) {
+ vaddr_t va;
+
+ timekeep_object = uao_create(timekeep_sz, 0);
+ uao_reference(timekeep_object);
+
+ if (uvm_map(kernel_map, &va, round_page(timekeep_sz), timekeep_object,
+    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
+    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
+ uao_detach(timekeep_object);
+ return (ENOMEM);
+ }
+
+ timekeep = (struct timekeep *)va;
+ }
+
+ uao_reference(timekeep_object);
+ if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, round_page(timekeep_sz),
+    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
+    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
+ uao_detach(timekeep_object);
+ return (ENOMEM);
+ }
+
+ return (0);
+}
diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
index bcf8f689625..007f1116c4f 100644
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -35,6 +35,7 @@
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <dev/rndvar.h>
+#include <sys/timekeep.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
  BINTIME_TO_TIMEVAL(&bt, tvp);
 }
 
+void
+tc_clock_gettime(void)
+{
+ struct bintime bt;
+
+ if (timekeep == NULL)
+ return;
+
+ /* CLOCK_REALTIME */
+ nanotime(&timekeep->tp_realtime);
+
+ /* CLOCK_UPTIME */
+ binuptime(&bt);
+ bintimesub(&bt, &naptime, &bt);
+ BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
+
+ /* CLOCK_MONOTONIC */
+ nanouptime(&timekeep->tp_monotonic);
+
+ /* CLOCK_BOOTTIME */
+ timekeep->tp_boottime = timekeep->tp_monotonic;
+
+ return;
+}
+
 void
 bintime(struct bintime *bt)
 {
@@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
  time_uptime = th->th_offset.sec;
  membar_producer();
  timehands = th;
+
+ tc_clock_gettime();
 }
 
 /* Report or change the active timecounter hardware. */
diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
index a40e0510273..f55b75f1e84 100644
--- sys/sys/exec_elf.h
+++ sys/sys/exec_elf.h
@@ -691,7 +691,8 @@ enum AuxID {
  AUX_sun_uid = 2000, /* euid */
  AUX_sun_ruid = 2001, /* ruid */
  AUX_sun_gid = 2002, /* egid */
- AUX_sun_rgid = 2003 /* rgid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
 };
 
 struct elf_args {
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..93a79a220db 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -248,6 +248,8 @@ struct process {
  u_int ps_rtableid; /* Process routing table/domain. */
  char ps_nice; /* Process "nice" value. */
 
+ vaddr_t ps_timekeep; /* User pointer to timekeep */
+
  struct uprof { /* profile arguments */
  caddr_t pr_base; /* buffer base */
  size_t  pr_size; /* buffer size */
diff --git sys/sys/timekeep.h sys/sys/timekeep.h
new file mode 100644
index 00000000000..bad25185bc4
--- /dev/null
+++ sys/sys/timekeep.h
@@ -0,0 +1,37 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SYS_TIMEKEEP_H_
+#define _SYS_TIMEKEEP_H_
+
+#include <sys/time.h>
+
+struct timekeep {
+ struct timespec tp_realtime;
+ struct timespec tp_uptime;
+ struct timespec tp_monotonic;
+ struct timespec tp_boottime;
+};
+
+#if defined(_KERNEL)
+#include <uvm/uvm_extern.h>
+
+extern struct uvm_object *timekeep_object;
+extern struct timekeep *timekeep;
+#endif
+
+#endif /* _SYS_TIMEKEEP_H_ */

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Paul Irofti-4
> Hopefully this version also fixes the init bug solene@ was seeing.

Not according to robert@, sorry. I'll look into it more and get back with
a fix.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Paul Irofti-4
In reply to this post by Paul Irofti-4
Here is a third version of the diff. With this robert@ is able to use
chrome with almost all the calls to kernel clock_gettime gone. I
think the number dropped from 600,000 to 400. Robert can give you more
details. But the idea is that it is very fast now. Zoom zoom.

The diff includes a temporary hack for /sbin/init, the only known issue
remaining, to call the kernel syscall directly.

Make sure you clean your /usr/share/relink directory before compiling
and installing this! Robert's machine was picking up old objects and
creating a broken library on every reboot because his directory was not
clean.


Responding to the feedback received from deraadt@:

        - moved timekeep.h inside sys/time.h

        - ignoring the ELF auxv numbering as it is not an issue;
          deraadt@ says we do not run binaries across BSDs so no need to
          worry

        - the clock quality issue was brought back; afaics the current
          diff does exactly what the syscall is doing and the
          information is updated inside every tc_windup() which makes this
          almost an exact replica of the syscall for the clocks in libc;
          for the other clocks we go to the syscall and everything is
          the same as before

I thought that is what Scott said about the last bit as well. If I
misunderstood and clock quality is lost somewhere, I would appreciate it
if someone with a better technical understanding could point out the
exact issues in the code. Perhaps again Scott?

Thank you,
Paul


diff --git lib/libc/asr/asr.c lib/libc/asr/asr.c
index cd056c85719..2b25d49f32a 100644
--- lib/libc/asr/asr.c
+++ lib/libc/asr/asr.c
@@ -196,11 +196,11 @@ poll_intrsafe(struct pollfd *fds, nfds_t nfds, int timeout)
  struct timespec pollstart, pollend, elapsed;
  int r;
 
- if (clock_gettime(CLOCK_MONOTONIC, &pollstart))
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollstart))
  return -1;
 
  while ((r = poll(fds, 1, timeout)) == -1 && errno == EINTR) {
- if (clock_gettime(CLOCK_MONOTONIC, &pollend))
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollend))
  return -1;
  timespecsub(&pollend, &pollstart, &elapsed);
  timeout -= elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
@@ -418,7 +418,7 @@ asr_check_reload(struct asr *asr)
  asr->a_rtime = 0;
  }
 
- if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
  return;
 
  if ((ts.tv_sec - asr->a_rtime) < RELOAD_DELAY && asr->a_rtime != 0)
diff --git lib/libc/crypt/bcrypt.c lib/libc/crypt/bcrypt.c
index 82de8fa33b7..02fd3013cc1 100644
--- lib/libc/crypt/bcrypt.c
+++ lib/libc/crypt/bcrypt.c
@@ -248,9 +248,9 @@ _bcrypt_autorounds(void)
  char buf[_PASSWORD_LEN];
  int duration;
 
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &before);
+ WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &before);
  bcrypt_newhash("testpassword", r, buf, sizeof(buf));
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &after);
+ WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &after);
 
  duration = after.tv_sec - before.tv_sec;
  duration *= 1000000;
diff --git lib/libc/dlfcn/dlfcn_stubs.c lib/libc/dlfcn/dlfcn_stubs.c
index 78d728f66cb..7b75ec4582a 100644
--- lib/libc/dlfcn/dlfcn_stubs.c
+++ lib/libc/dlfcn/dlfcn_stubs.c
@@ -80,10 +80,14 @@ dlerror(void)
  return "Wrong dl symbols!\n";
 }
 
+extern void *elf_aux_timekeep;
+extern int find_timekeep(void);
+
 int
 dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
  void *data)
 {
+ find_timekeep();
  if (_dl_cb != NULL && _dl_cb->dl_iterate_phdr != NULL)
  return _dl_cb->dl_iterate_phdr(callback, data);
 #ifndef PIC
diff --git lib/libc/dlfcn/init.c lib/libc/dlfcn/init.c
index 270f54aada5..0238bb50b0b 100644
--- lib/libc/dlfcn/init.c
+++ lib/libc/dlfcn/init.c
@@ -69,6 +69,9 @@ extern Elf_Ehdr __executable_start[] __attribute__((weak));
 /* provide definitions for these */
 const dl_cb *_dl_cb __relro = NULL;
 
+extern void *elf_aux_timekeep;
+extern int find_timekeep(void);
+
 void _libc_preinit(int, char **, char **, dl_cb_cb *) __dso_hidden;
 void
 _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
@@ -126,6 +129,7 @@ _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
  if (cb == NULL)
  setup_static_tib(phdr, phnum);
 #endif /* !PIC */
+ find_timekeep();
 }
 
 /* ARM just had to be different... */
diff --git lib/libc/gen/times.c lib/libc/gen/times.c
index 02e4dd44b5c..36841810d1b 100644
--- lib/libc/gen/times.c
+++ lib/libc/gen/times.c
@@ -52,7 +52,7 @@ times(struct tms *tp)
  return ((clock_t)-1);
  tp->tms_cutime = CONVTCK(ru.ru_utime);
  tp->tms_cstime = CONVTCK(ru.ru_stime);
- if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
  return ((clock_t)-1);
  return (ts.tv_sec * CLK_TCK + ts.tv_nsec / (1000000000 / CLK_TCK));
 }
diff --git lib/libc/gen/timespec_get.c lib/libc/gen/timespec_get.c
index 520a5954025..845cbe80356 100644
--- lib/libc/gen/timespec_get.c
+++ lib/libc/gen/timespec_get.c
@@ -37,7 +37,7 @@ timespec_get(struct timespec *ts, int base)
 {
  switch (base) {
  case TIME_UTC:
- if (clock_gettime(CLOCK_REALTIME, ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_REALTIME, ts) == -1)
  return 0;
  break;
  default:
diff --git lib/libc/hidden/time.h lib/libc/hidden/time.h
index 18c49f8fcb9..d8e1e0caf64 100644
--- lib/libc/hidden/time.h
+++ lib/libc/hidden/time.h
@@ -29,7 +29,7 @@ PROTO_NORMAL(asctime_r);
 PROTO_STD_DEPRECATED(clock);
 PROTO_DEPRECATED(clock_getcpuclockid);
 PROTO_NORMAL(clock_getres);
-PROTO_NORMAL(clock_gettime);
+PROTO_WRAP(clock_gettime);
 PROTO_NORMAL(clock_settime);
 PROTO_STD_DEPRECATED(ctime);
 PROTO_DEPRECATED(ctime_r);
diff --git lib/libc/net/res_random.c lib/libc/net/res_random.c
index 763e420bb88..9babb28470a 100644
--- lib/libc/net/res_random.c
+++ lib/libc/net/res_random.c
@@ -219,7 +219,7 @@ res_initid(void)
  if (ru_prf != NULL)
  arc4random_buf(ru_prf, sizeof(*ru_prf));
 
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
  ru_reseed = ts.tv_sec + RU_OUT;
  ru_msb = ru_msb == 0x8000 ? 0 : 0x8000;
 }
@@ -232,7 +232,7 @@ __res_randomid(void)
  u_int r;
  static void *randomid_mutex;
 
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
  pid = getpid();
 
  _MUTEX_LOCK(&randomid_mutex);
diff --git lib/libc/rpc/clnt_tcp.c lib/libc/rpc/clnt_tcp.c
index 8e6ef515b0e..927b4bf2028 100644
--- lib/libc/rpc/clnt_tcp.c
+++ lib/libc/rpc/clnt_tcp.c
@@ -393,12 +393,12 @@ readtcp(struct ct_data *ct, caddr_t buf, int len)
  pfd[0].events = POLLIN;
  TIMEVAL_TO_TIMESPEC(&ct->ct_wait, &wait);
  delta = wait;
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  for (;;) {
  r = ppoll(pfd, 1, &delta, NULL);
  save_errno = errno;
 
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&start, &after, &duration);
  timespecsub(&wait, &duration, &delta);
  if (delta.tv_sec < 0 || !timespecisset(&delta))
diff --git lib/libc/rpc/clnt_udp.c lib/libc/rpc/clnt_udp.c
index 68d01674410..92e1d5c350d 100644
--- lib/libc/rpc/clnt_udp.c
+++ lib/libc/rpc/clnt_udp.c
@@ -265,7 +265,7 @@ send_again:
  reply_msg.acpted_rply.ar_results.where = resultsp;
  reply_msg.acpted_rply.ar_results.proc = xresults;
 
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  for (;;) {
  switch (ppoll(pfd, 1, &wait, NULL)) {
  case 0:
@@ -283,7 +283,7 @@ send_again:
  /* FALLTHROUGH */
  case -1:
  if (errno == EINTR) {
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&after, &start, &duration);
  timespecadd(&time_waited, &duration, &time_waited);
  if (timespeccmp(&time_waited, &timeout, <))
diff --git lib/libc/rpc/svc_tcp.c lib/libc/rpc/svc_tcp.c
index f9d7a70938f..6c99db84359 100644
--- lib/libc/rpc/svc_tcp.c
+++ lib/libc/rpc/svc_tcp.c
@@ -342,7 +342,7 @@ readtcp(SVCXPRT *xprt, caddr_t buf, int len)
  * A timeout is fatal for the connection.
  */
  delta = wait_per_try;
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  pfd[0].fd = sock;
  pfd[0].events = POLLIN;
  do {
@@ -351,7 +351,7 @@ readtcp(SVCXPRT *xprt, caddr_t buf, int len)
  case -1:
  if (errno != EINTR)
  goto fatal_err;
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&after, &start, &duration);
  timespecsub(&wait_per_try, &duration, &delta);
  if (delta.tv_sec < 0 || !timespecisset(&delta))
diff --git lib/libc/shlib_version lib/libc/shlib_version
index 06f98b01084..5fb0770494f 100644
--- lib/libc/shlib_version
+++ lib/libc/shlib_version
@@ -1,4 +1,4 @@
 major=96
-minor=0
+minor=1
 # note: If changes were made to include/thread_private.h or if system calls
 # were added/changed then librthread/shlib_version must also be updated.
diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
index 34769576ced..d0b5dd1bdcd 100644
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
 
 # glue to offer userland wrappers for some syscalls
 SRCS+= posix_madvise.c pthread_sigmask.c \
- w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
+ w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
+ w_clock_gettime.c
 
 # glue for compat with old syscall interfaces.
 SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
@@ -43,7 +44,7 @@ SRCS+= ${CANCEL:%=w_%.c} w_pread.c w_preadv.c w_pwrite.c w_pwritev.c
 ASM= __semctl.o __syscall.o __thrsigdivert.o \
  access.o acct.o adjfreq.o adjtime.o \
  bind.o chdir.o chflags.o chflagsat.o chmod.o chown.o chroot.o \
- clock_getres.o clock_gettime.o clock_settime.o \
+ clock_getres.o clock_settime.o \
  dup.o dup2.o dup3.o \
  execve.o \
  faccessat.o fchdir.o fchflags.o fchmod.o fchmodat.o fchown.o \
@@ -109,7 +110,7 @@ PPSEUDO_NOERR=${PSEUDO_NOERR:.o=.po}
 SPSEUDO_NOERR=${PSEUDO_NOERR:.o=.so}
 DPSEUDO_NOERR=${PSEUDO_NOERR:.o=.do}
 
-HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o}
+HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o} clock_gettime.o
 PHIDDEN=${HIDDEN:.o=.po}
 SHIDDEN=${HIDDEN:.o=.so}
 DHIDDEN=${HIDDEN:.o=.do}
diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
new file mode 100644
index 00000000000..061dcd47dce
--- /dev/null
+++ lib/libc/sys/w_clock_gettime.c
@@ -0,0 +1,109 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <err.h>
+
+#include <sys/time.h>
+
+void *elf_aux_timekeep;
+
+
+/*
+ * Needed exec_elf implementation.
+ * To be exposed by the kernel later if needed.
+ */
+
+#include <sys/exec_elf.h>
+
+typedef struct {
+ uint32_t au_id; /* 32-bit id */
+ uint64_t au_v; /* 64-bit value */
+} AuxInfo;
+
+enum AuxID {
+ AUX_null = 0,
+ AUX_ignore = 1,
+ AUX_execfd = 2,
+ AUX_phdr = 3, /* &phdr[0] */
+ AUX_phent = 4, /* sizeof(phdr[0]) */
+ AUX_phnum = 5, /* # phdr entries */
+ AUX_pagesz = 6, /* PAGESIZE */
+ AUX_base = 7, /* ld.so base addr */
+ AUX_flags = 8, /* processor flags */
+ AUX_entry = 9, /* a.out entry */
+ AUX_sun_uid = 2000, /* euid */
+ AUX_sun_ruid = 2001, /* ruid */
+ AUX_sun_gid = 2002, /* egid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
+};
+
+
+/*
+ * Helper functions.
+ */
+
+int
+find_timekeep(void)
+{
+ Elf_Addr *stackp;
+ AuxInfo *auxv;
+
+ stackp = (Elf_Addr *)environ;
+ while (*stackp++) ; /* pass environment */
+
+ /* look-up timekeep auxv */
+ for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
+ if (auxv->au_id == AUX_openbsd_timekeep) {
+ elf_aux_timekeep = (void *)auxv->au_v;
+ return 0;
+ }
+
+ warnx("%s", "Could not find auxv!");
+ return -1;
+}
+
+int
+WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
+{
+ struct timekeep *timekeep;
+
+ if (elf_aux_timekeep == NULL && find_timekeep())
+ return clock_gettime(clock_id, tp);
+ timekeep = elf_aux_timekeep;
+
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ *tp = timekeep->tp_realtime;
+ break;
+ case CLOCK_UPTIME:
+ *tp = timekeep->tp_uptime;
+ break;
+ case CLOCK_MONOTONIC:
+ *tp = timekeep->tp_monotonic;
+ break;
+ case CLOCK_BOOTTIME:
+ *tp = timekeep->tp_boottime;
+ break;
+ default:
+ return clock_gettime(clock_id, tp);
+ }
+ return 0;
+}
+DEF_WRAP(clock_gettime);
diff --git lib/libc/thread/synch.h lib/libc/thread/synch.h
index 788890add89..df2239438d2 100644
--- lib/libc/thread/synch.h
+++ lib/libc/thread/synch.h
@@ -33,7 +33,7 @@ _twait(volatile uint32_t *p, int val, clockid_t clockid, const struct timespec *
  if (abs == NULL)
  return futex(p, FUTEX_WAIT_PRIVATE, val, NULL, NULL);
 
- if (abs->tv_nsec >= 1000000000 || clock_gettime(clockid, &rel))
+ if (abs->tv_nsec >= 1000000000 || WRAP(clock_gettime)(clockid, &rel))
  return (EINVAL);
 
  rel.tv_sec = abs->tv_sec - rel.tv_sec;
diff --git sbin/init/init.c sbin/init/init.c
index 72d929706d3..c595d33bfac 100644
--- sbin/init/init.c
+++ sbin/init/init.c
@@ -38,6 +38,7 @@
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/tree.h>
+#include <sys/syscall.h>
 #include <sys/wait.h>
 #include <machine/cpu.h>
 
@@ -1039,7 +1040,7 @@ start_getty(session_t *sp)
  }
 
  if (timespecisset(&sp->se_started)) {
- clock_gettime(CLOCK_MONOTONIC, &current_time);
+ syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &current_time);
  timespecsub(&current_time, &sp->se_started, &elapsed);
  if (elapsed.tv_sec < GETTY_SPACING) {
  warning(
@@ -1103,7 +1104,7 @@ collect_child(pid_t pid)
  }
 
  sp->se_process = pid;
- clock_gettime(CLOCK_MONOTONIC, &sp->se_started);
+ syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sp->se_started);
  add_session(sp);
 }
 
@@ -1170,7 +1171,7 @@ f_multi_user(void)
  break;
  }
  sp->se_process = pid;
- clock_gettime(CLOCK_MONOTONIC, &sp->se_started);
+ syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sp->se_started);
  add_session(sp);
  }
 
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 9b5b8eb3acf..59bc923a6fb 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -124,7 +124,7 @@ extern char *syscallnames[];
 /*
  * How many entries are in the AuxInfo array we pass to the process?
  */
-#define ELF_AUX_ENTRIES 8
+#define ELF_AUX_ENTRIES 9
 
 /*
  * This is the OpenBSD ELF emul
@@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
  a->au_v = ap->arg_entry;
  a++;
 
+ a->au_id = AUX_openbsd_timekeep;
+ a->au_v = p->p_p->ps_timekeep;
+ a++;
+
  a->au_id = AUX_null;
  a->au_v = 0;
  a++;
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 20480c2fc28..ee34c86d05b 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -64,6 +64,11 @@
 #include <uvm/uvm_extern.h>
 #include <machine/tcb.h>
 
+#include <sys/time.h>
+
+struct uvm_object *timekeep_object;
+struct timekeep* timekeep;
+
 void unveil_destroy(struct process *ps);
 
 const struct kmem_va_mode kv_exec = {
@@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
  */
 int exec_sigcode_map(struct process *, struct emul *);
 
+/*
+ * Map the shared timekeep page.
+ */
+int exec_timekeep_map(struct process *);
+
 /*
  * If non-zero, stackgap_random specifies the upper limit of the random gap size
  * added to the fixed stack position. Must be n^2.
@@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
  /* map the process's signal trampoline code */
  if (exec_sigcode_map(pr, pack.ep_emul))
  goto free_pack_abort;
+ /* map the process's timekeep page */
+ if (exec_timekeep_map(pr))
+ goto free_pack_abort;
 
 #ifdef __HAVE_EXEC_MD_MAP
  /* perform md specific mappings that process might need */
@@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
 
  return (0);
 }
+
+int exec_timekeep_map(struct process *pr)
+{
+ size_t timekeep_sz = sizeof(struct timekeep);
+
+ /*
+ * Similar to the sigcode object, except that there is a single timekeep
+ * object, and not one per emulation.
+ */
+ if (timekeep_object == NULL) {
+ vaddr_t va;
+
+ timekeep_object = uao_create(timekeep_sz, 0);
+ uao_reference(timekeep_object);
+
+ if (uvm_map(kernel_map, &va, round_page(timekeep_sz), timekeep_object,
+    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
+    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
+ uao_detach(timekeep_object);
+ return (ENOMEM);
+ }
+
+ timekeep = (struct timekeep *)va;
+ }
+
+ uao_reference(timekeep_object);
+ if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, round_page(timekeep_sz),
+    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
+    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
+ uao_detach(timekeep_object);
+ return (ENOMEM);
+ }
+
+ return (0);
+}
diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
index bcf8f689625..5f3ba524042 100644
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -35,6 +35,7 @@
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <dev/rndvar.h>
+#include <sys/time.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
  BINTIME_TO_TIMEVAL(&bt, tvp);
 }
 
+void
+tc_clock_gettime(void)
+{
+ struct bintime bt;
+
+ if (timekeep == NULL)
+ return;
+
+ /* CLOCK_REALTIME */
+ nanotime(&timekeep->tp_realtime);
+
+ /* CLOCK_UPTIME */
+ binuptime(&bt);
+ bintimesub(&bt, &naptime, &bt);
+ BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
+
+ /* CLOCK_MONOTONIC */
+ nanouptime(&timekeep->tp_monotonic);
+
+ /* CLOCK_BOOTTIME */
+ timekeep->tp_boottime = timekeep->tp_monotonic;
+
+ return;
+}
+
 void
 bintime(struct bintime *bt)
 {
@@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
  time_uptime = th->th_offset.sec;
  membar_producer();
  timehands = th;
+
+ tc_clock_gettime();
 }
 
 /* Report or change the active timecounter hardware. */
diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
index a40e0510273..f55b75f1e84 100644
--- sys/sys/exec_elf.h
+++ sys/sys/exec_elf.h
@@ -691,7 +691,8 @@ enum AuxID {
  AUX_sun_uid = 2000, /* euid */
  AUX_sun_ruid = 2001, /* ruid */
  AUX_sun_gid = 2002, /* egid */
- AUX_sun_rgid = 2003 /* rgid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
 };
 
 struct elf_args {
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..93a79a220db 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -248,6 +248,8 @@ struct process {
  u_int ps_rtableid; /* Process routing table/domain. */
  char ps_nice; /* Process "nice" value. */
 
+ vaddr_t ps_timekeep; /* User pointer to timekeep */
+
  struct uprof { /* profile arguments */
  caddr_t pr_base; /* buffer base */
  size_t  pr_size; /* buffer size */
diff --git sys/sys/time.h sys/sys/time.h
index 564bae30b48..aab80121743 100644
--- sys/sys/time.h
+++ sys/sys/time.h
@@ -163,6 +163,13 @@ struct clockinfo {
 };
 #endif /* __BSD_VISIBLE */
 
+struct timekeep {
+ struct timespec tp_realtime;
+ struct timespec tp_uptime;
+ struct timespec tp_monotonic;
+ struct timespec tp_boottime;
+};
+
 #if defined(_KERNEL) || defined(_STANDALONE)
 #include <sys/_time.h>
 
@@ -393,6 +400,8 @@ TIMESPEC_TO_NSEC(const struct timespec *ts)
  return ts->tv_sec * 1000000000ULL + ts->tv_nsec;
 }
 
+extern struct uvm_object *timekeep_object;
+extern struct timekeep *timekeep;
 #else /* !_KERNEL */
 #include <time.h>
 

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Robert Nagy
On 16/05/20 20:14 +0300, Paul Irofti wrote:
> Make sure you clean your /usr/share/relink directory before compiling
> and installing this! Robert's machine was picking up old objects and
> creating a broken library on every reboot because his directory was not
> clean.

It was actually an old .so inside libc obj dir. So better clean everything
and just do a make build :)

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Theo de Raadt-2
In reply to this post by Paul Irofti-4
Paul Irofti <[hidden email]> wrote:

> Here is a third version of the diff. With this robert@ is able to use
> chrome with almost all the calls to kernel clock_gettime gone. I
> think the number dropped from 600,000 to 400. Robert can give you more
> details. But the idea is that it is very fast now. Zoom zoom.

But the


> The diff includes a temporary hack for /sbin/init, the only known issue
> remaining, to call the kernel syscall directly.

This is wrong.  It means you haven't found the function start_init()
in init_main.c yet.

> - moved timekeep.h inside sys/time.h

Namespace issues must be considered.  That structure could collide.
The name is probably poorly chosen, and the structure may need _ or __
naming to avoid potential collision.

> - ignoring the ELF auxv numbering as it is not an issue;
>  deraadt@ says we do not run binaries across BSDs so no need to
>  worry

That is my thought.  People who have knowledge of cross-compile should
speak up.

> - the clock quality issue was brought back; afaics the current
>  diff does exactly what the syscall is doing and the
>  information is updated inside every tc_windup() which makes this
>  almost an exact replica of the syscall for the clocks in libc;
>  for the other clocks we go to the syscall and everything is
>  the same as before
>
> I thought that is what Scott said about the last bit as well. If I
> misunderstood and clock quality is lost somewhere, I would appreciate
> someone with better technical understanding point out the exact issues
> in the code. Perhaps again Scott?

Let me make it clear this is very important.  Changing this structure
later will be a very painful ABI break.  The high-resolution issue brought
up by kettenis MUST be considered in the first commit.  There is no point
committing drafts of this which simply result in painful ABI breaks in the
short term.  Meaning, this must not be rushed.

But secondly, committing a design which *blocks work* on resolution
improvement is nasty.  This design is looking at "something changed
slowly", and there is no way to improve it.

Final point:

I think tc_clock_gettime() in the kernel and userland reading in
WRAP(clock_gettime) are very non-atomic.  What prevents the structure
assignment from reading half of an old timespec, and half of a new timespec?
As higher bytes in the timeval increase, lower bytes could be read which
decrease.  It seems this is depending on consistent cache behaviour...
which might not be the case.


> diff --git lib/libc/asr/asr.c lib/libc/asr/asr.c
> index cd056c85719..2b25d49f32a 100644
> --- lib/libc/asr/asr.c
> +++ lib/libc/asr/asr.c
> @@ -196,11 +196,11 @@ poll_intrsafe(struct pollfd *fds, nfds_t nfds, int timeout)
>   struct timespec pollstart, pollend, elapsed;
>   int r;
>  
> - if (clock_gettime(CLOCK_MONOTONIC, &pollstart))
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollstart))
>   return -1;
>  
>   while ((r = poll(fds, 1, timeout)) == -1 && errno == EINTR) {
> - if (clock_gettime(CLOCK_MONOTONIC, &pollend))
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollend))
>   return -1;
>   timespecsub(&pollend, &pollstart, &elapsed);
>   timeout -= elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
> @@ -418,7 +418,7 @@ asr_check_reload(struct asr *asr)
>   asr->a_rtime = 0;
>   }
>  
> - if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
>   return;
>  
>   if ((ts.tv_sec - asr->a_rtime) < RELOAD_DELAY && asr->a_rtime != 0)
> diff --git lib/libc/crypt/bcrypt.c lib/libc/crypt/bcrypt.c
> index 82de8fa33b7..02fd3013cc1 100644
> --- lib/libc/crypt/bcrypt.c
> +++ lib/libc/crypt/bcrypt.c
> @@ -248,9 +248,9 @@ _bcrypt_autorounds(void)
>   char buf[_PASSWORD_LEN];
>   int duration;
>  
> - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &before);
> + WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &before);
>   bcrypt_newhash("testpassword", r, buf, sizeof(buf));
> - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &after);
> + WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &after);
>  
>   duration = after.tv_sec - before.tv_sec;
>   duration *= 1000000;
> diff --git lib/libc/dlfcn/dlfcn_stubs.c lib/libc/dlfcn/dlfcn_stubs.c
> index 78d728f66cb..7b75ec4582a 100644
> --- lib/libc/dlfcn/dlfcn_stubs.c
> +++ lib/libc/dlfcn/dlfcn_stubs.c
> @@ -80,10 +80,14 @@ dlerror(void)
>   return "Wrong dl symbols!\n";
>  }
>  
> +extern void *elf_aux_timekeep;
> +extern int find_timekeep(void);
> +
>  int
>  dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
>   void *data)
>  {
> + find_timekeep();
>   if (_dl_cb != NULL && _dl_cb->dl_iterate_phdr != NULL)
>   return _dl_cb->dl_iterate_phdr(callback, data);
>  #ifndef PIC
> diff --git lib/libc/dlfcn/init.c lib/libc/dlfcn/init.c
> index 270f54aada5..0238bb50b0b 100644
> --- lib/libc/dlfcn/init.c
> +++ lib/libc/dlfcn/init.c
> @@ -69,6 +69,9 @@ extern Elf_Ehdr __executable_start[] __attribute__((weak));
>  /* provide definitions for these */
>  const dl_cb *_dl_cb __relro = NULL;
>  
> +extern void *elf_aux_timekeep;
> +extern int find_timekeep(void);
> +
>  void _libc_preinit(int, char **, char **, dl_cb_cb *) __dso_hidden;
>  void
>  _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
> @@ -126,6 +129,7 @@ _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
>   if (cb == NULL)
>   setup_static_tib(phdr, phnum);
>  #endif /* !PIC */
> + find_timekeep();
>  }
>  
>  /* ARM just had to be different... */
> diff --git lib/libc/gen/times.c lib/libc/gen/times.c
> index 02e4dd44b5c..36841810d1b 100644
> --- lib/libc/gen/times.c
> +++ lib/libc/gen/times.c
> @@ -52,7 +52,7 @@ times(struct tms *tp)
>   return ((clock_t)-1);
>   tp->tms_cutime = CONVTCK(ru.ru_utime);
>   tp->tms_cstime = CONVTCK(ru.ru_stime);
> - if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
>   return ((clock_t)-1);
>   return (ts.tv_sec * CLK_TCK + ts.tv_nsec / (1000000000 / CLK_TCK));
>  }
> diff --git lib/libc/gen/timespec_get.c lib/libc/gen/timespec_get.c
> index 520a5954025..845cbe80356 100644
> --- lib/libc/gen/timespec_get.c
> +++ lib/libc/gen/timespec_get.c
> @@ -37,7 +37,7 @@ timespec_get(struct timespec *ts, int base)
>  {
>   switch (base) {
>   case TIME_UTC:
> - if (clock_gettime(CLOCK_REALTIME, ts) == -1)
> + if (WRAP(clock_gettime)(CLOCK_REALTIME, ts) == -1)
>   return 0;
>   break;
>   default:
> diff --git lib/libc/hidden/time.h lib/libc/hidden/time.h
> index 18c49f8fcb9..d8e1e0caf64 100644
> --- lib/libc/hidden/time.h
> +++ lib/libc/hidden/time.h
> @@ -29,7 +29,7 @@ PROTO_NORMAL(asctime_r);
>  PROTO_STD_DEPRECATED(clock);
>  PROTO_DEPRECATED(clock_getcpuclockid);
>  PROTO_NORMAL(clock_getres);
> -PROTO_NORMAL(clock_gettime);
> +PROTO_WRAP(clock_gettime);
>  PROTO_NORMAL(clock_settime);
>  PROTO_STD_DEPRECATED(ctime);
>  PROTO_DEPRECATED(ctime_r);
> diff --git lib/libc/net/res_random.c lib/libc/net/res_random.c
> index 763e420bb88..9babb28470a 100644
> --- lib/libc/net/res_random.c
> +++ lib/libc/net/res_random.c
> @@ -219,7 +219,7 @@ res_initid(void)
>   if (ru_prf != NULL)
>   arc4random_buf(ru_prf, sizeof(*ru_prf));
>  
> - clock_gettime(CLOCK_MONOTONIC, &ts);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
>   ru_reseed = ts.tv_sec + RU_OUT;
>   ru_msb = ru_msb == 0x8000 ? 0 : 0x8000;
>  }
> @@ -232,7 +232,7 @@ __res_randomid(void)
>   u_int r;
>   static void *randomid_mutex;
>  
> - clock_gettime(CLOCK_MONOTONIC, &ts);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
>   pid = getpid();
>  
>   _MUTEX_LOCK(&randomid_mutex);
> diff --git lib/libc/rpc/clnt_tcp.c lib/libc/rpc/clnt_tcp.c
> index 8e6ef515b0e..927b4bf2028 100644
> --- lib/libc/rpc/clnt_tcp.c
> +++ lib/libc/rpc/clnt_tcp.c
> @@ -393,12 +393,12 @@ readtcp(struct ct_data *ct, caddr_t buf, int len)
>   pfd[0].events = POLLIN;
>   TIMEVAL_TO_TIMESPEC(&ct->ct_wait, &wait);
>   delta = wait;
> - clock_gettime(CLOCK_MONOTONIC, &start);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
>   for (;;) {
>   r = ppoll(pfd, 1, &delta, NULL);
>   save_errno = errno;
>  
> - clock_gettime(CLOCK_MONOTONIC, &after);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
>   timespecsub(&start, &after, &duration);
>   timespecsub(&wait, &duration, &delta);
>   if (delta.tv_sec < 0 || !timespecisset(&delta))
> diff --git lib/libc/rpc/clnt_udp.c lib/libc/rpc/clnt_udp.c
> index 68d01674410..92e1d5c350d 100644
> --- lib/libc/rpc/clnt_udp.c
> +++ lib/libc/rpc/clnt_udp.c
> @@ -265,7 +265,7 @@ send_again:
>   reply_msg.acpted_rply.ar_results.where = resultsp;
>   reply_msg.acpted_rply.ar_results.proc = xresults;
>  
> - clock_gettime(CLOCK_MONOTONIC, &start);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
>   for (;;) {
>   switch (ppoll(pfd, 1, &wait, NULL)) {
>   case 0:
> @@ -283,7 +283,7 @@ send_again:
>   /* FALLTHROUGH */
>   case -1:
>   if (errno == EINTR) {
> - clock_gettime(CLOCK_MONOTONIC, &after);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
>   timespecsub(&after, &start, &duration);
>   timespecadd(&time_waited, &duration, &time_waited);
>   if (timespeccmp(&time_waited, &timeout, <))
> diff --git lib/libc/rpc/svc_tcp.c lib/libc/rpc/svc_tcp.c
> index f9d7a70938f..6c99db84359 100644
> --- lib/libc/rpc/svc_tcp.c
> +++ lib/libc/rpc/svc_tcp.c
> @@ -342,7 +342,7 @@ readtcp(SVCXPRT *xprt, caddr_t buf, int len)
>   * A timeout is fatal for the connection.
>   */
>   delta = wait_per_try;
> - clock_gettime(CLOCK_MONOTONIC, &start);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
>   pfd[0].fd = sock;
>   pfd[0].events = POLLIN;
>   do {
> @@ -351,7 +351,7 @@ readtcp(SVCXPRT *xprt, caddr_t buf, int len)
>   case -1:
>   if (errno != EINTR)
>   goto fatal_err;
> - clock_gettime(CLOCK_MONOTONIC, &after);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
>   timespecsub(&after, &start, &duration);
>   timespecsub(&wait_per_try, &duration, &delta);
>   if (delta.tv_sec < 0 || !timespecisset(&delta))
> diff --git lib/libc/shlib_version lib/libc/shlib_version
> index 06f98b01084..5fb0770494f 100644
> --- lib/libc/shlib_version
> +++ lib/libc/shlib_version
> @@ -1,4 +1,4 @@
>  major=96
> -minor=0
> +minor=1
>  # note: If changes were made to include/thread_private.h or if system calls
>  # were added/changed then librthread/shlib_version must also be updated.
> diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
> index 34769576ced..d0b5dd1bdcd 100644
> --- lib/libc/sys/Makefile.inc
> +++ lib/libc/sys/Makefile.inc
> @@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
>  
>  # glue to offer userland wrappers for some syscalls
>  SRCS+= posix_madvise.c pthread_sigmask.c \
> - w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
> + w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
> + w_clock_gettime.c
>  
>  # glue for compat with old syscall interfaces.
>  SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
> @@ -43,7 +44,7 @@ SRCS+= ${CANCEL:%=w_%.c} w_pread.c w_preadv.c w_pwrite.c w_pwritev.c
>  ASM= __semctl.o __syscall.o __thrsigdivert.o \
>   access.o acct.o adjfreq.o adjtime.o \
>   bind.o chdir.o chflags.o chflagsat.o chmod.o chown.o chroot.o \
> - clock_getres.o clock_gettime.o clock_settime.o \
> + clock_getres.o clock_settime.o \
>   dup.o dup2.o dup3.o \
>   execve.o \
>   faccessat.o fchdir.o fchflags.o fchmod.o fchmodat.o fchown.o \
> @@ -109,7 +110,7 @@ PPSEUDO_NOERR=${PSEUDO_NOERR:.o=.po}
>  SPSEUDO_NOERR=${PSEUDO_NOERR:.o=.so}
>  DPSEUDO_NOERR=${PSEUDO_NOERR:.o=.do}
>  
> -HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o}
> +HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o} clock_gettime.o
>  PHIDDEN=${HIDDEN:.o=.po}
>  SHIDDEN=${HIDDEN:.o=.so}
>  DHIDDEN=${HIDDEN:.o=.do}
> diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
> new file mode 100644
> index 00000000000..061dcd47dce
> --- /dev/null
> +++ lib/libc/sys/w_clock_gettime.c
> @@ -0,0 +1,109 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2020 Paul Irofti <[hidden email]>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <stdlib.h>
> +#include <time.h>
> +#include <err.h>
> +
> +#include <sys/time.h>
> +
> +void *elf_aux_timekeep;
> +
> +
> +/*
> + * Needed exec_elf implementation.
> + * To be exposed by the kernel later if needed.
> + */
> +
> +#include <sys/exec_elf.h>
> +
> +typedef struct {
> + uint32_t au_id; /* 32-bit id */
> + uint64_t au_v; /* 64-bit value */
> +} AuxInfo;
> +
> +enum AuxID {
> + AUX_null = 0,
> + AUX_ignore = 1,
> + AUX_execfd = 2,
> + AUX_phdr = 3, /* &phdr[0] */
> + AUX_phent = 4, /* sizeof(phdr[0]) */
> + AUX_phnum = 5, /* # phdr entries */
> + AUX_pagesz = 6, /* PAGESIZE */
> + AUX_base = 7, /* ld.so base addr */
> + AUX_flags = 8, /* processor flags */
> + AUX_entry = 9, /* a.out entry */
> + AUX_sun_uid = 2000, /* euid */
> + AUX_sun_ruid = 2001, /* ruid */
> + AUX_sun_gid = 2002, /* egid */
> + AUX_sun_rgid = 2003, /* rgid */
> + AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
> +};
> +
> +
> +/*
> + * Helper functions.
> + */
> +
> +int
> +find_timekeep(void)
> +{
> + Elf_Addr *stackp;
> + AuxInfo *auxv;
> +
> + stackp = (Elf_Addr *)environ;
> + while (*stackp++) ; /* pass environment */
> +
> + /* look-up timekeep auxv */
> + for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
> + if (auxv->au_id == AUX_openbsd_timekeep) {
> + elf_aux_timekeep = (void *)auxv->au_v;
> + return 0;
> + }
> +
> + warnx("%s", "Could not find auxv!");
> + return -1;
> +}
> +
> +int
> +WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
> +{
> + struct timekeep *timekeep;
> +
> + if (elf_aux_timekeep == NULL && find_timekeep())
> + return clock_gettime(clock_id, tp);
> + timekeep = elf_aux_timekeep;
> +
> + switch (clock_id) {
> + case CLOCK_REALTIME:
> + *tp = timekeep->tp_realtime;
> + break;
> + case CLOCK_UPTIME:
> + *tp = timekeep->tp_uptime;
> + break;
> + case CLOCK_MONOTONIC:
> + *tp = timekeep->tp_monotonic;
> + break;
> + case CLOCK_BOOTTIME:
> + *tp = timekeep->tp_boottime;
> + break;
> + default:
> + return clock_gettime(clock_id, tp);
> + }
> + return 0;
> +}
> +DEF_WRAP(clock_gettime);
> diff --git lib/libc/thread/synch.h lib/libc/thread/synch.h
> index 788890add89..df2239438d2 100644
> --- lib/libc/thread/synch.h
> +++ lib/libc/thread/synch.h
> @@ -33,7 +33,7 @@ _twait(volatile uint32_t *p, int val, clockid_t clockid, const struct timespec *
>   if (abs == NULL)
>   return futex(p, FUTEX_WAIT_PRIVATE, val, NULL, NULL);
>  
> - if (abs->tv_nsec >= 1000000000 || clock_gettime(clockid, &rel))
> + if (abs->tv_nsec >= 1000000000 || WRAP(clock_gettime)(clockid, &rel))
>   return (EINVAL);
>  
>   rel.tv_sec = abs->tv_sec - rel.tv_sec;
> diff --git sbin/init/init.c sbin/init/init.c
> index 72d929706d3..c595d33bfac 100644
> --- sbin/init/init.c
> +++ sbin/init/init.c
> @@ -38,6 +38,7 @@
>  #include <sys/sysctl.h>
>  #include <sys/time.h>
>  #include <sys/tree.h>
> +#include <sys/syscall.h>
>  #include <sys/wait.h>
>  #include <machine/cpu.h>
>  
> @@ -1039,7 +1040,7 @@ start_getty(session_t *sp)
>   }
>  
>   if (timespecisset(&sp->se_started)) {
> - clock_gettime(CLOCK_MONOTONIC, &current_time);
> + syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &current_time);
>   timespecsub(&current_time, &sp->se_started, &elapsed);
>   if (elapsed.tv_sec < GETTY_SPACING) {
>   warning(
> @@ -1103,7 +1104,7 @@ collect_child(pid_t pid)
>   }
>  
>   sp->se_process = pid;
> - clock_gettime(CLOCK_MONOTONIC, &sp->se_started);
> + syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sp->se_started);
>   add_session(sp);
>  }
>  
> @@ -1170,7 +1171,7 @@ f_multi_user(void)
>   break;
>   }
>   sp->se_process = pid;
> - clock_gettime(CLOCK_MONOTONIC, &sp->se_started);
> + syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sp->se_started);
>   add_session(sp);
>   }
>  
> diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> index 9b5b8eb3acf..59bc923a6fb 100644
> --- sys/kern/exec_elf.c
> +++ sys/kern/exec_elf.c
> @@ -124,7 +124,7 @@ extern char *syscallnames[];
>  /*
>   * How many entries are in the AuxInfo array we pass to the process?
>   */
> -#define ELF_AUX_ENTRIES 8
> +#define ELF_AUX_ENTRIES 9
>  
>  /*
>   * This is the OpenBSD ELF emul
> @@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
>   a->au_v = ap->arg_entry;
>   a++;
>  
> + a->au_id = AUX_openbsd_timekeep;
> + a->au_v = p->p_p->ps_timekeep;
> + a++;
> +
>   a->au_id = AUX_null;
>   a->au_v = 0;
>   a++;
> diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
> index 20480c2fc28..ee34c86d05b 100644
> --- sys/kern/kern_exec.c
> +++ sys/kern/kern_exec.c
> @@ -64,6 +64,11 @@
>  #include <uvm/uvm_extern.h>
>  #include <machine/tcb.h>
>  
> +#include <sys/time.h>
> +
> +struct uvm_object *timekeep_object;
> +struct timekeep* timekeep;
> +
>  void unveil_destroy(struct process *ps);
>  
>  const struct kmem_va_mode kv_exec = {
> @@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
>   */
>  int exec_sigcode_map(struct process *, struct emul *);
>  
> +/*
> + * Map the shared timekeep page.
> + */
> +int exec_timekeep_map(struct process *);
> +
>  /*
>   * If non-zero, stackgap_random specifies the upper limit of the random gap size
>   * added to the fixed stack position. Must be n^2.
> @@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
>   /* map the process's signal trampoline code */
>   if (exec_sigcode_map(pr, pack.ep_emul))
>   goto free_pack_abort;
> + /* map the process's timekeep page */
> + if (exec_timekeep_map(pr))
> + goto free_pack_abort;
>  
>  #ifdef __HAVE_EXEC_MD_MAP
>   /* perform md specific mappings that process might need */
> @@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
>  
>   return (0);
>  }
> +
> +int exec_timekeep_map(struct process *pr)
> +{
> + size_t timekeep_sz = sizeof(struct timekeep);
> +
> + /*
> + * Similar to the sigcode object, except that there is a single timekeep
> + * object, and not one per emulation.
> + */
> + if (timekeep_object == NULL) {
> + vaddr_t va;
> +
> + timekeep_object = uao_create(timekeep_sz, 0);
> + uao_reference(timekeep_object);
> +
> + if (uvm_map(kernel_map, &va, round_page(timekeep_sz), timekeep_object,
> +    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> +    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
> + uao_detach(timekeep_object);
> + return (ENOMEM);
> + }
> +
> + timekeep = (struct timekeep *)va;
> + }
> +
> + uao_reference(timekeep_object);
> + if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, round_page(timekeep_sz),
> +    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
> +    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
> + uao_detach(timekeep_object);
> + return (ENOMEM);
> + }
> +
> + return (0);
> +}
> diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
> index bcf8f689625..5f3ba524042 100644
> --- sys/kern/kern_tc.c
> +++ sys/kern/kern_tc.c
> @@ -35,6 +35,7 @@
>  #include <sys/queue.h>
>  #include <sys/malloc.h>
>  #include <dev/rndvar.h>
> +#include <sys/time.h>
>  
>  /*
>   * A large step happens on boot.  This constant detects such steps.
> @@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
>   BINTIME_TO_TIMEVAL(&bt, tvp);
>  }
>  
> +void
> +tc_clock_gettime(void)
> +{
> + struct bintime bt;
> +
> + if (timekeep == NULL)
> + return;
> +
> + /* CLOCK_REALTIME */
> + nanotime(&timekeep->tp_realtime);
> +
> + /* CLOCK_UPTIME */
> + binuptime(&bt);
> + bintimesub(&bt, &naptime, &bt);
> + BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
> +
> + /* CLOCK_MONOTONIC */
> + nanouptime(&timekeep->tp_monotonic);
> +
> + /* CLOCK_BOOTTIME */
> + timekeep->tp_boottime = timekeep->tp_monotonic;
> +
> + return;
> +}
> +
>  void
>  bintime(struct bintime *bt)
>  {
> @@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
>   time_uptime = th->th_offset.sec;
>   membar_producer();
>   timehands = th;
> +
> + tc_clock_gettime();
>  }
>  
>  /* Report or change the active timecounter hardware. */
> diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
> index a40e0510273..f55b75f1e84 100644
> --- sys/sys/exec_elf.h
> +++ sys/sys/exec_elf.h
> @@ -691,7 +691,8 @@ enum AuxID {
>   AUX_sun_uid = 2000, /* euid */
>   AUX_sun_ruid = 2001, /* ruid */
>   AUX_sun_gid = 2002, /* egid */
> - AUX_sun_rgid = 2003 /* rgid */
> + AUX_sun_rgid = 2003, /* rgid */
> + AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
>  };
>  
>  struct elf_args {
> diff --git sys/sys/proc.h sys/sys/proc.h
> index 357c0c0d52c..93a79a220db 100644
> --- sys/sys/proc.h
> +++ sys/sys/proc.h
> @@ -248,6 +248,8 @@ struct process {
>   u_int ps_rtableid; /* Process routing table/domain. */
>   char ps_nice; /* Process "nice" value. */
>  
> + vaddr_t ps_timekeep; /* User pointer to timekeep */
> +
>   struct uprof { /* profile arguments */
>   caddr_t pr_base; /* buffer base */
>   size_t  pr_size; /* buffer size */
> diff --git sys/sys/time.h sys/sys/time.h
> index 564bae30b48..aab80121743 100644
> --- sys/sys/time.h
> +++ sys/sys/time.h
> @@ -163,6 +163,13 @@ struct clockinfo {
>  };
>  #endif /* __BSD_VISIBLE */
>  
> +struct timekeep {
> + struct timespec tp_realtime;
> + struct timespec tp_uptime;
> + struct timespec tp_monotonic;
> + struct timespec tp_boottime;
> +};
> +
>  #if defined(_KERNEL) || defined(_STANDALONE)
>  #include <sys/_time.h>
>  
> @@ -393,6 +400,8 @@ TIMESPEC_TO_NSEC(const struct timespec *ts)
>   return ts->tv_sec * 1000000000ULL + ts->tv_nsec;
>  }
>  
> +extern struct uvm_object *timekeep_object;
> +extern struct timekeep *timekeep;
>  #else /* !_KERNEL */
>  #include <time.h>
>  
>

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Theo de Raadt-2
It seems very wrong that this find_timekeep() function is called 3
times, and EACH TIME it walks the same two arrays, AND I expect bad
things to happen if a program messes with its environment early on,
meaning this stops pointing to the right place on the stack:

> +     stackp = (Elf_Addr *)environ;

Whatever find_timekeep() is, it should be done only once, and I don't
think you have found the right place to perform the initialization
and store it into a global variable which all future consumers can
hinge off.

>  int
>  dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
>       void *data)
>  {
> +     find_timekeep();
>       if (_dl_cb != NULL && _dl_cb->dl_iterate_phdr != NULL)
>               return _dl_cb->dl_iterate_phdr(callback, data);

...

>  void _libc_preinit(int, char **, char **, dl_cb_cb *) __dso_hidden;
>  void
>  _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
> @@ -126,6 +129,7 @@ _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
>       if (cb == NULL)
>               setup_static_tib(phdr, phnum);
>  #endif /* !PIC */
> +     find_timekeep();
>  }

....

> +int
> +WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
> +{
> +     struct timekeep *timekeep;
> +
> +     if (elf_aux_timekeep == NULL && find_timekeep())
> +             return clock_gettime(clock_id, tp);
> +     timekeep = elf_aux_timekeep;

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Mark Kettenis
In reply to this post by Paul Irofti-4
> Date: Sat, 16 May 2020 02:40:02 +0300
> From: Paul Irofti <[hidden email]>
>
> Here is an updated diff that addresses the following points mentioned by
> kettenis@:
>
>   - syscall fallback was implemented from the first version
>
>   - the low resolution clock argument, I think, was shown not to be a
>     problem

There will be code for which this will be an unacceptable regression.
Things like benchmarking tools and things that care about video/audio
synchronization.

>   - TSC and HPET alternatives were discussed, and if we decide to add
>     them, I think that should be done by a separate diff
>
>   - I think this version does proper wrapping (at least according to the
>     README); of course Philip's input would be greatly appreciated!

Looks reasonable.  You could consider not wrapping the
CLOCK_THREAD_CPUTIME_ID calls as we won't be able to optimize those
with this approach.  That would make them slightly faster.

>   - I will export the ELF bits after the diff gets in committable shape

It really needs TSC support for that.  And you also need to find a
solution for proper synchronization between userland and the kernel.
Currently they can race each other in the sense that userland can read
the time at the same time the kernel is updating it.  A generation
mechanism like the timehands use may be appropriate.  You can't use locks!

Also, I think we need a versioning mechanism for the shared page, such
that libc can verify that it understands the interface offered by the
kernel.  Maybe a major/minor number at the start of the page.  If the
version check fails we can fall back on the system call.  That allows
updates without a flag day.

If you really don't want to implement the TSC approach now, we could
consider introducing CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
and use your implementation just for those two.

>   - proper auxv number instead of 2004: I see that NetBSD
>     has taken 2004 for AT_SUN_LDELF; should I take 2015 which seems the
>     next one free?

The AT_SUN_XXX defines came from Solaris.  They don't matter to us, so
I'd start the OpenBSD-specific values at 2000.

> Hopefully this version also fixes the init bug solene@ was seeing.

_dl_iterate_phdr() should not call find_timekeep().

>
> Paul
>
> diff --git lib/libc/asr/asr.c lib/libc/asr/asr.c
> index cd056c85719..2b25d49f32a 100644
> --- lib/libc/asr/asr.c
> +++ lib/libc/asr/asr.c
> @@ -196,11 +196,11 @@ poll_intrsafe(struct pollfd *fds, nfds_t nfds, int timeout)
>   struct timespec pollstart, pollend, elapsed;
>   int r;
>  
> - if (clock_gettime(CLOCK_MONOTONIC, &pollstart))
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollstart))
>   return -1;
>  
>   while ((r = poll(fds, 1, timeout)) == -1 && errno == EINTR) {
> - if (clock_gettime(CLOCK_MONOTONIC, &pollend))
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollend))
>   return -1;
>   timespecsub(&pollend, &pollstart, &elapsed);
>   timeout -= elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
> @@ -418,7 +418,7 @@ asr_check_reload(struct asr *asr)
>   asr->a_rtime = 0;
>   }
>  
> - if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
>   return;
>  
>   if ((ts.tv_sec - asr->a_rtime) < RELOAD_DELAY && asr->a_rtime != 0)
> diff --git lib/libc/crypt/bcrypt.c lib/libc/crypt/bcrypt.c
> index 82de8fa33b7..63edde9072e 100644
> --- lib/libc/crypt/bcrypt.c
> +++ lib/libc/crypt/bcrypt.c
> @@ -31,6 +31,7 @@
>   *
>   */
>  
> +#include <sys/time.h>
>  #include <sys/types.h>
>  #include <blf.h>
>  #include <ctype.h>
> @@ -248,9 +249,9 @@ _bcrypt_autorounds(void)
>   char buf[_PASSWORD_LEN];
>   int duration;
>  
> - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &before);
> + WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &before);
>   bcrypt_newhash("testpassword", r, buf, sizeof(buf));
> - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &after);
> + WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &after);
>  
>   duration = after.tv_sec - before.tv_sec;
>   duration *= 1000000;
> diff --git lib/libc/dlfcn/dlfcn_stubs.c lib/libc/dlfcn/dlfcn_stubs.c
> index 78d728f66cb..7b75ec4582a 100644
> --- lib/libc/dlfcn/dlfcn_stubs.c
> +++ lib/libc/dlfcn/dlfcn_stubs.c
> @@ -80,10 +80,14 @@ dlerror(void)
>   return "Wrong dl symbols!\n";
>  }
>  
> +extern void *elf_aux_timekeep;
> +extern int find_timekeep(void);
> +
>  int
>  dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
>   void *data)
>  {
> + find_timekeep();
>   if (_dl_cb != NULL && _dl_cb->dl_iterate_phdr != NULL)
>   return _dl_cb->dl_iterate_phdr(callback, data);
>  #ifndef PIC
> diff --git lib/libc/dlfcn/init.c lib/libc/dlfcn/init.c
> index 270f54aada5..0238bb50b0b 100644
> --- lib/libc/dlfcn/init.c
> +++ lib/libc/dlfcn/init.c
> @@ -69,6 +69,9 @@ extern Elf_Ehdr __executable_start[] __attribute__((weak));
>  /* provide definitions for these */
>  const dl_cb *_dl_cb __relro = NULL;
>  
> +extern void *elf_aux_timekeep;
> +extern int find_timekeep(void);
> +
>  void _libc_preinit(int, char **, char **, dl_cb_cb *) __dso_hidden;
>  void
>  _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
> @@ -126,6 +129,7 @@ _libc_preinit(int argc, char **argv, char **envp, dl_cb_cb *cb)
>   if (cb == NULL)
>   setup_static_tib(phdr, phnum);
>  #endif /* !PIC */
> + find_timekeep();
>  }
>  
>  /* ARM just had to be different... */
> diff --git lib/libc/gen/times.c lib/libc/gen/times.c
> index 02e4dd44b5c..36841810d1b 100644
> --- lib/libc/gen/times.c
> +++ lib/libc/gen/times.c
> @@ -52,7 +52,7 @@ times(struct tms *tp)
>   return ((clock_t)-1);
>   tp->tms_cutime = CONVTCK(ru.ru_utime);
>   tp->tms_cstime = CONVTCK(ru.ru_stime);
> - if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
> + if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
>   return ((clock_t)-1);
>   return (ts.tv_sec * CLK_TCK + ts.tv_nsec / (1000000000 / CLK_TCK));
>  }
> diff --git lib/libc/gen/timespec_get.c lib/libc/gen/timespec_get.c
> index 520a5954025..b2bdcd15a4d 100644
> --- lib/libc/gen/timespec_get.c
> +++ lib/libc/gen/timespec_get.c
> @@ -30,6 +30,7 @@
>   * POSSIBILITY OF SUCH DAMAGE.
>   */
>  
> +#include <sys/time.h>
>  #include <time.h>
>  
>  int
> @@ -37,7 +38,7 @@ timespec_get(struct timespec *ts, int base)
>  {
>   switch (base) {
>   case TIME_UTC:
> - if (clock_gettime(CLOCK_REALTIME, ts) == -1)
> + if (WRAP(clock_gettime)(CLOCK_REALTIME, ts) == -1)
>   return 0;
>   break;
>   default:
> diff --git lib/libc/hidden/sys/time.h lib/libc/hidden/sys/time.h
> index ed112320fa2..7f59daa0107 100644
> --- lib/libc/hidden/sys/time.h
> +++ lib/libc/hidden/sys/time.h
> @@ -22,6 +22,7 @@
>  
>  PROTO_NORMAL(adjfreq);
>  PROTO_NORMAL(adjtime);
> +PROTO_WRAP(clock_gettime);
>  PROTO_NORMAL(futimes);
>  PROTO_NORMAL(getitimer);
>  PROTO_NORMAL(gettimeofday);
> diff --git lib/libc/net/res_random.c lib/libc/net/res_random.c
> index 763e420bb88..9babb28470a 100644
> --- lib/libc/net/res_random.c
> +++ lib/libc/net/res_random.c
> @@ -219,7 +219,7 @@ res_initid(void)
>   if (ru_prf != NULL)
>   arc4random_buf(ru_prf, sizeof(*ru_prf));
>  
> - clock_gettime(CLOCK_MONOTONIC, &ts);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
>   ru_reseed = ts.tv_sec + RU_OUT;
>   ru_msb = ru_msb == 0x8000 ? 0 : 0x8000;
>  }
> @@ -232,7 +232,7 @@ __res_randomid(void)
>   u_int r;
>   static void *randomid_mutex;
>  
> - clock_gettime(CLOCK_MONOTONIC, &ts);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
>   pid = getpid();
>  
>   _MUTEX_LOCK(&randomid_mutex);
> diff --git lib/libc/rpc/clnt_tcp.c lib/libc/rpc/clnt_tcp.c
> index 8e6ef515b0e..927b4bf2028 100644
> --- lib/libc/rpc/clnt_tcp.c
> +++ lib/libc/rpc/clnt_tcp.c
> @@ -393,12 +393,12 @@ readtcp(struct ct_data *ct, caddr_t buf, int len)
>   pfd[0].events = POLLIN;
>   TIMEVAL_TO_TIMESPEC(&ct->ct_wait, &wait);
>   delta = wait;
> - clock_gettime(CLOCK_MONOTONIC, &start);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
>   for (;;) {
>   r = ppoll(pfd, 1, &delta, NULL);
>   save_errno = errno;
>  
> - clock_gettime(CLOCK_MONOTONIC, &after);
> + WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
>   timespecsub(&start, &after, &duration);
>   timespecsub(&wait, &duration, &delta);
>   if (delta.tv_sec < 0 || !timespecisset(&delta))
> diff --git lib/libc/shlib_version lib/libc/shlib_version
> index 06f98b01084..5fb0770494f 100644
> --- lib/libc/shlib_version
> +++ lib/libc/shlib_version
> @@ -1,4 +1,4 @@
>  major=96
> -minor=0
> +minor=1
>  # note: If changes were made to include/thread_private.h or if system calls
>  # were added/changed then librthread/shlib_version must also be updated.
> diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
> index 34769576ced..d0b5dd1bdcd 100644
> --- lib/libc/sys/Makefile.inc
> +++ lib/libc/sys/Makefile.inc
> @@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
>  
>  # glue to offer userland wrappers for some syscalls
>  SRCS+= posix_madvise.c pthread_sigmask.c \
> - w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
> + w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
> + w_clock_gettime.c
>  
>  # glue for compat with old syscall interfaces.
>  SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
> @@ -43,7 +44,7 @@ SRCS+= ${CANCEL:%=w_%.c} w_pread.c w_preadv.c w_pwrite.c w_pwritev.c
>  ASM= __semctl.o __syscall.o __thrsigdivert.o \
>   access.o acct.o adjfreq.o adjtime.o \
>   bind.o chdir.o chflags.o chflagsat.o chmod.o chown.o chroot.o \
> - clock_getres.o clock_gettime.o clock_settime.o \
> + clock_getres.o clock_settime.o \
>   dup.o dup2.o dup3.o \
>   execve.o \
>   faccessat.o fchdir.o fchflags.o fchmod.o fchmodat.o fchown.o \
> @@ -109,7 +110,7 @@ PPSEUDO_NOERR=${PSEUDO_NOERR:.o=.po}
>  SPSEUDO_NOERR=${PSEUDO_NOERR:.o=.so}
>  DPSEUDO_NOERR=${PSEUDO_NOERR:.o=.do}
>  
> -HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o}
> +HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o} clock_gettime.o
>  PHIDDEN=${HIDDEN:.o=.po}
>  SHIDDEN=${HIDDEN:.o=.so}
>  DHIDDEN=${HIDDEN:.o=.do}
> diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
> new file mode 100644
> index 00000000000..04850fbda32
> --- /dev/null
> +++ lib/libc/sys/w_clock_gettime.c
> @@ -0,0 +1,109 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2020 Paul Irofti <[hidden email]>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <stdlib.h>
> +#include <time.h>
> +#include <err.h>
> +
> +#include <sys/timekeep.h>
> +
> +void *elf_aux_timekeep;
> +
> +
> +/*
> + * Needed exec_elf implementation.
> + * To be exposed by the kernel later if needed.
> + */
> +
> +#include <sys/exec_elf.h>
> +
> +typedef struct {
> + uint32_t au_id; /* 32-bit id */
> + uint64_t au_v; /* 64-bit value */
> +} AuxInfo;
> +
> +enum AuxID {
> + AUX_null = 0,
> + AUX_ignore = 1,
> + AUX_execfd = 2,
> + AUX_phdr = 3, /* &phdr[0] */
> + AUX_phent = 4, /* sizeof(phdr[0]) */
> + AUX_phnum = 5, /* # phdr entries */
> + AUX_pagesz = 6, /* PAGESIZE */
> + AUX_base = 7, /* ld.so base addr */
> + AUX_flags = 8, /* processor flags */
> + AUX_entry = 9, /* a.out entry */
> + AUX_sun_uid = 2000, /* euid */
> + AUX_sun_ruid = 2001, /* ruid */
> + AUX_sun_gid = 2002, /* egid */
> + AUX_sun_rgid = 2003, /* rgid */
> + AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
> +};
> +
> +
> +/*
> + * Helper functions.
> + */
> +
> +int
> +find_timekeep(void)
> +{
> + Elf_Addr *stackp;
> + AuxInfo *auxv;
> +
> + stackp = (Elf_Addr *)environ;
> + while (*stackp++) ; /* pass environment */
> +
> + /* look-up timekeep auxv */
> + for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
> + if (auxv->au_id == AUX_openbsd_timekeep) {
> + elf_aux_timekeep = (void *)auxv->au_v;
> + return 0;
> + }
> +
> + warnx("%s", "Could not find auxv!");
> + return -1;
> +}
> +
> +int
> +WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
> +{
> + struct timekeep *timekeep;
> +
> + if (elf_aux_timekeep == NULL && find_timekeep())
> + return clock_gettime(clock_id, tp);
> + timekeep = elf_aux_timekeep;
> +
> + switch (clock_id) {
> + case CLOCK_REALTIME:
> + *tp = timekeep->tp_realtime;
> + break;
> + case CLOCK_UPTIME:
> + *tp = timekeep->tp_uptime;
> + break;
> + case CLOCK_MONOTONIC:
> + *tp = timekeep->tp_monotonic;
> + break;
> + case CLOCK_BOOTTIME:
> + *tp = timekeep->tp_boottime;
> + break;
> + default:
> + return clock_gettime(clock_id, tp);
> + }
> + return 0;
> +}
> +DEF_WRAP(clock_gettime);
> diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> index 9b5b8eb3acf..59bc923a6fb 100644
> --- sys/kern/exec_elf.c
> +++ sys/kern/exec_elf.c
> @@ -124,7 +124,7 @@ extern char *syscallnames[];
>  /*
>   * How many entries are in the AuxInfo array we pass to the process?
>   */
> -#define ELF_AUX_ENTRIES 8
> +#define ELF_AUX_ENTRIES 9
>  
>  /*
>   * This is the OpenBSD ELF emul
> @@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
>   a->au_v = ap->arg_entry;
>   a++;
>  
> + a->au_id = AUX_openbsd_timekeep;
> + a->au_v = p->p_p->ps_timekeep;
> + a++;
> +
>   a->au_id = AUX_null;
>   a->au_v = 0;
>   a++;
> diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
> index 20480c2fc28..2496458fde1 100644
> --- sys/kern/kern_exec.c
> +++ sys/kern/kern_exec.c
> @@ -64,6 +64,11 @@
>  #include <uvm/uvm_extern.h>
>  #include <machine/tcb.h>
>  
> +#include <sys/timekeep.h>
> +
> +struct uvm_object *timekeep_object;
> +struct timekeep* timekeep;
> +
>  void unveil_destroy(struct process *ps);
>  
>  const struct kmem_va_mode kv_exec = {
> @@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
>   */
>  int exec_sigcode_map(struct process *, struct emul *);
>  
> +/*
> + * Map the shared timekeep page.
> + */
> +int exec_timekeep_map(struct process *);
> +
>  /*
>   * If non-zero, stackgap_random specifies the upper limit of the random gap size
>   * added to the fixed stack position. Must be n^2.
> @@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
>   /* map the process's signal trampoline code */
>   if (exec_sigcode_map(pr, pack.ep_emul))
>   goto free_pack_abort;
> + /* map the process's timekeep page */
> + if (exec_timekeep_map(pr))
> + goto free_pack_abort;
>  
>  #ifdef __HAVE_EXEC_MD_MAP
>   /* perform md specific mappings that process might need */
> @@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
>  
>   return (0);
>  }
> +
> +int exec_timekeep_map(struct process *pr)
> +{
> + size_t timekeep_sz = sizeof(struct timekeep);
> +
> + /*
> + * Similar to the sigcode object, except that there is a single timekeep
> + * object, and not one per emulation.
> + */
> + if (timekeep_object == NULL) {
> + vaddr_t va;
> +
> + timekeep_object = uao_create(timekeep_sz, 0);
> + uao_reference(timekeep_object);
> +
> + if (uvm_map(kernel_map, &va, round_page(timekeep_sz), timekeep_object,
> +    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> +    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
> + uao_detach(timekeep_object);
> + return (ENOMEM);
> + }
> +
> + timekeep = (struct timekeep *)va;
> + }
> +
> + uao_reference(timekeep_object);
> + if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, round_page(timekeep_sz),
> +    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
> +    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
> + uao_detach(timekeep_object);
> + return (ENOMEM);
> + }
> +
> + return (0);
> +}
> diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
> index bcf8f689625..007f1116c4f 100644
> --- sys/kern/kern_tc.c
> +++ sys/kern/kern_tc.c
> @@ -35,6 +35,7 @@
>  #include <sys/queue.h>
>  #include <sys/malloc.h>
>  #include <dev/rndvar.h>
> +#include <sys/timekeep.h>
>  
>  /*
>   * A large step happens on boot.  This constant detects such steps.
> @@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
>   BINTIME_TO_TIMEVAL(&bt, tvp);
>  }
>  
> +void
> +tc_clock_gettime(void)
> +{
> + struct bintime bt;
> +
> + if (timekeep == NULL)
> + return;
> +
> + /* CLOCK_REALTIME */
> + nanotime(&timekeep->tp_realtime);
> +
> + /* CLOCK_UPTIME */
> + binuptime(&bt);
> + bintimesub(&bt, &naptime, &bt);
> + BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
> +
> + /* CLOCK_MONOTONIC */
> + nanouptime(&timekeep->tp_monotonic);
> +
> + /* CLOCK_BOOTTIME */
> + timekeep->tp_boottime = timekeep->tp_monotonic;
> +
> + return;
> +}
> +
>  void
>  bintime(struct bintime *bt)
>  {
> @@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
>   time_uptime = th->th_offset.sec;
>   membar_producer();
>   timehands = th;
> +
> + tc_clock_gettime();
>  }
>  
>  /* Report or change the active timecounter hardware. */
> diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
> index a40e0510273..f55b75f1e84 100644
> --- sys/sys/exec_elf.h
> +++ sys/sys/exec_elf.h
> @@ -691,7 +691,8 @@ enum AuxID {
>   AUX_sun_uid = 2000, /* euid */
>   AUX_sun_ruid = 2001, /* ruid */
>   AUX_sun_gid = 2002, /* egid */
> - AUX_sun_rgid = 2003 /* rgid */
> + AUX_sun_rgid = 2003, /* rgid */
> + AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
>  };
>  
>  struct elf_args {
> diff --git sys/sys/proc.h sys/sys/proc.h
> index 357c0c0d52c..93a79a220db 100644
> --- sys/sys/proc.h
> +++ sys/sys/proc.h
> @@ -248,6 +248,8 @@ struct process {
>   u_int ps_rtableid; /* Process routing table/domain. */
>   char ps_nice; /* Process "nice" value. */
>  
> + vaddr_t ps_timekeep; /* User pointer to timekeep */
> +
>   struct uprof { /* profile arguments */
>   caddr_t pr_base; /* buffer base */
>   size_t  pr_size; /* buffer size */
> diff --git sys/sys/timekeep.h sys/sys/timekeep.h
> new file mode 100644
> index 00000000000..bad25185bc4
> --- /dev/null
> +++ sys/sys/timekeep.h
> @@ -0,0 +1,37 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2020 Paul Irofti <[hidden email]>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#ifndef _SYS_TIMEKEEP_H_
> +#define _SYS_TIMEKEEP_H_
> +
> +#include <sys/time.h>
> +
> +struct timekeep {
> + struct timespec tp_realtime;
> + struct timespec tp_uptime;
> + struct timespec tp_monotonic;
> + struct timespec tp_boottime;
> +};
> +
> +#if defined(_KERNEL)
> +#include <uvm/uvm_extern.h>
> +
> +extern struct uvm_object *timekeep_object;
> +extern struct timekeep *timekeep;
> +#endif
> +
> +#endif /* _SYS_TIMEKEEP_H_ */
>
>

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Theo de Raadt-2
Mark Kettenis <[hidden email]> wrote:

> [...]  And you also need to find a
> solution for proper synchronization between userland and the kernel.
> Currently they can race each other in the sense that userland can read
> the time at the same time the kernel is updating it.  A generation
> mechanism like the timehands use may be appropriate.  You can't use locks!

Absolutely.  This design is totally broken.  You cannot just structure
copy.

Consider separately what happens when big-endian and little-endian
systems copy the two integers in this structure.  In both cases, userland
can read a region of bytes which is partially the old time, and partially
the future.  The read value is garbage.   It is a false value in the future
or the past.  Monotonicity is out the window.

> Also, I think we need a versioning mechanism for the shared page, such
> that libc can verify that it understands the interface offered by the
> kernel.  Maybe a major/minor number at the start of the page.  If the
> version check fails we can fall back on the system call.  That allows
> updates without a flag day.

Yep.

> If you really don't want to implement the TSC approach now, we could
> consider introducing CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
> and use your implementation just for those two.

Which likely doesn't improve any of the applications being targeted.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Mark Kettenis
In reply to this post by Theo de Raadt-2
> From: "Theo de Raadt" <[hidden email]>
> Date: Sat, 16 May 2020 11:34:22 -0600
>
> Paul Irofti <[hidden email]> wrote:
>
> > - the clock quality issue was brought back; afaics the current
> >  diff does exactly what the syscall is doing and the
> >  information is updated inside every tc_windup() which makes this
> >  almost an exact replica of the syscall for the clocks in libc;
> >  for the other clocks we go to the syscall and everything is
> >  the same as before
> >
> > I thought that is what Scott said about the last bit as well. If I
> > misunderstood and clock quality is lost somewhere, I would appreciate
> > someone with better technical understanding pointing out the exact issues
> > in the code. Perhaps again Scott?

Let's take CLOCK_MONOTONIC as an example.  The clock_gettime() system
call ends up calling nanouptime(9).  This is the function that returns
a precise result (as opposed to getnanotime(9) which
returns the less precise result).

If you look at the implementation of nanouptime(9) you see it calls
binuptime(9).  Inside the loop it has:

    bintimeaddfrac(bt, th->th_scale * tc_delta(th), bt);

This is the bit that actually accesses the hardware to calculate the
precise result adding the elapsed time since the last time the
timehands were updated (when tc_windup() was called).

It is this bit that has to be reproduced in userland.

> Let me make it clear this is very important.  Changing this structure
> later will be a very painful ABI break.  The high-resolution issue brought
> up by kettenis MUST be considered in the first commit.  There is no point
> committing drafts of this which simply result in painful ABI breaks in the
> short term.  Meaning, this must not be rushed.

See my proposal in the other mail I sent.

Reply | Threaded
Open this post in threaded view
|

Re: userland clock_gettime proof of concept

Paul Irofti-4
In reply to this post by Paul Irofti-4
Hi,

Here is another iteration of the diff. It addresses some of the issues
and opens a discussion for others.

Fixed.

  - find_timekeep() called 3 times, call only once (deraadt@)
     -> this was leftover code, removed from everywhere except the
        wrapper

  - atomic read time (deraadt@)
     o generation mechanism like the timehands (kettenis@)
       SOLUTION:
          kernel: gen mechanism is already used by the functions called,
                  added seq timekeep counter
          userland: added gen-like mechanism based on seq
       PROBLEMS:
          might need to change a bit  after high-res clock addition

  - versioning mechanism for shared page (kettenis@)
    -> added two bytes for major and minor at page start

  - structure may need _ or __ to avoid potential collision (deraadt@)
    -> prefixed with __ as found in other sys headers

Discussions.

  - /sbin/init init_main.c!start_init() map page? (deraadt@)
    -> that is not the problem, the page should be mapped even there
       by the sys_execve() call

  - bikeshedding
    o struct timekeep naming (deraadt@)
    o AT_TIMEKEEP at 2000 (kettenis@)
    -> I will leave that for last

  - I do not understand the endianness issues when copying structs but
    that might no longer be part of the next diff, see below (deraadt@)

Main topic.

High resolution time sources are mandatory with this diff (deraadt@)
and CLOCK_MONOTONIC > nanouptime > binuptime > bintimeaddfrac >
tc_delta(th) should be reproduced in userland (kettenis@).
     
I know about the need for high resolution time sources, I was the one
that added back TSC after it was disabled in favor of HPET last year...

The way I see it, we would need to either duplicate or expose some of
the microtime(9) functions to userland plus the CPU skews in order to
achieve our goal.

Basically we would replace the timespec structs in the shared page with
a timehands struct and get that updated through tc_windup() and then
proceed to adjust based on rdtsc offsets until we get the next update.

This will require a bit of tweaking to the current "locking" mechanism,
but I think that will be done quickly.

Does that sound good? Do you want it done another way?

Thank you for the feedback so far,
Paul


diff --git lib/libc/asr/asr.c lib/libc/asr/asr.c
index cd056c85719..2b25d49f32a 100644
--- lib/libc/asr/asr.c
+++ lib/libc/asr/asr.c
@@ -196,11 +196,11 @@ poll_intrsafe(struct pollfd *fds, nfds_t nfds, int timeout)
  struct timespec pollstart, pollend, elapsed;
  int r;
 
- if (clock_gettime(CLOCK_MONOTONIC, &pollstart))
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollstart))
  return -1;
 
  while ((r = poll(fds, 1, timeout)) == -1 && errno == EINTR) {
- if (clock_gettime(CLOCK_MONOTONIC, &pollend))
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &pollend))
  return -1;
  timespecsub(&pollend, &pollstart, &elapsed);
  timeout -= elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
@@ -418,7 +418,7 @@ asr_check_reload(struct asr *asr)
  asr->a_rtime = 0;
  }
 
- if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
  return;
 
  if ((ts.tv_sec - asr->a_rtime) < RELOAD_DELAY && asr->a_rtime != 0)
diff --git lib/libc/crypt/bcrypt.c lib/libc/crypt/bcrypt.c
index 82de8fa33b7..02fd3013cc1 100644
--- lib/libc/crypt/bcrypt.c
+++ lib/libc/crypt/bcrypt.c
@@ -248,9 +248,9 @@ _bcrypt_autorounds(void)
  char buf[_PASSWORD_LEN];
  int duration;
 
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &before);
+ WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &before);
  bcrypt_newhash("testpassword", r, buf, sizeof(buf));
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &after);
+ WRAP(clock_gettime)(CLOCK_THREAD_CPUTIME_ID, &after);
 
  duration = after.tv_sec - before.tv_sec;
  duration *= 1000000;
diff --git lib/libc/gen/times.c lib/libc/gen/times.c
index 02e4dd44b5c..36841810d1b 100644
--- lib/libc/gen/times.c
+++ lib/libc/gen/times.c
@@ -52,7 +52,7 @@ times(struct tms *tp)
  return ((clock_t)-1);
  tp->tms_cutime = CONVTCK(ru.ru_utime);
  tp->tms_cstime = CONVTCK(ru.ru_stime);
- if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts) == -1)
  return ((clock_t)-1);
  return (ts.tv_sec * CLK_TCK + ts.tv_nsec / (1000000000 / CLK_TCK));
 }
diff --git lib/libc/gen/timespec_get.c lib/libc/gen/timespec_get.c
index 520a5954025..845cbe80356 100644
--- lib/libc/gen/timespec_get.c
+++ lib/libc/gen/timespec_get.c
@@ -37,7 +37,7 @@ timespec_get(struct timespec *ts, int base)
 {
  switch (base) {
  case TIME_UTC:
- if (clock_gettime(CLOCK_REALTIME, ts) == -1)
+ if (WRAP(clock_gettime)(CLOCK_REALTIME, ts) == -1)
  return 0;
  break;
  default:
diff --git lib/libc/hidden/time.h lib/libc/hidden/time.h
index 18c49f8fcb9..d8e1e0caf64 100644
--- lib/libc/hidden/time.h
+++ lib/libc/hidden/time.h
@@ -29,7 +29,7 @@ PROTO_NORMAL(asctime_r);
 PROTO_STD_DEPRECATED(clock);
 PROTO_DEPRECATED(clock_getcpuclockid);
 PROTO_NORMAL(clock_getres);
-PROTO_NORMAL(clock_gettime);
+PROTO_WRAP(clock_gettime);
 PROTO_NORMAL(clock_settime);
 PROTO_STD_DEPRECATED(ctime);
 PROTO_DEPRECATED(ctime_r);
diff --git lib/libc/net/res_random.c lib/libc/net/res_random.c
index 763e420bb88..9babb28470a 100644
--- lib/libc/net/res_random.c
+++ lib/libc/net/res_random.c
@@ -219,7 +219,7 @@ res_initid(void)
  if (ru_prf != NULL)
  arc4random_buf(ru_prf, sizeof(*ru_prf));
 
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
  ru_reseed = ts.tv_sec + RU_OUT;
  ru_msb = ru_msb == 0x8000 ? 0 : 0x8000;
 }
@@ -232,7 +232,7 @@ __res_randomid(void)
  u_int r;
  static void *randomid_mutex;
 
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &ts);
  pid = getpid();
 
  _MUTEX_LOCK(&randomid_mutex);
diff --git lib/libc/rpc/clnt_tcp.c lib/libc/rpc/clnt_tcp.c
index 8e6ef515b0e..927b4bf2028 100644
--- lib/libc/rpc/clnt_tcp.c
+++ lib/libc/rpc/clnt_tcp.c
@@ -393,12 +393,12 @@ readtcp(struct ct_data *ct, caddr_t buf, int len)
  pfd[0].events = POLLIN;
  TIMEVAL_TO_TIMESPEC(&ct->ct_wait, &wait);
  delta = wait;
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  for (;;) {
  r = ppoll(pfd, 1, &delta, NULL);
  save_errno = errno;
 
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&start, &after, &duration);
  timespecsub(&wait, &duration, &delta);
  if (delta.tv_sec < 0 || !timespecisset(&delta))
diff --git lib/libc/rpc/clnt_udp.c lib/libc/rpc/clnt_udp.c
index 68d01674410..92e1d5c350d 100644
--- lib/libc/rpc/clnt_udp.c
+++ lib/libc/rpc/clnt_udp.c
@@ -265,7 +265,7 @@ send_again:
  reply_msg.acpted_rply.ar_results.where = resultsp;
  reply_msg.acpted_rply.ar_results.proc = xresults;
 
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  for (;;) {
  switch (ppoll(pfd, 1, &wait, NULL)) {
  case 0:
@@ -283,7 +283,7 @@ send_again:
  /* FALLTHROUGH */
  case -1:
  if (errno == EINTR) {
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&after, &start, &duration);
  timespecadd(&time_waited, &duration, &time_waited);
  if (timespeccmp(&time_waited, &timeout, <))
diff --git lib/libc/rpc/svc_tcp.c lib/libc/rpc/svc_tcp.c
index f9d7a70938f..6c99db84359 100644
--- lib/libc/rpc/svc_tcp.c
+++ lib/libc/rpc/svc_tcp.c
@@ -342,7 +342,7 @@ readtcp(SVCXPRT *xprt, caddr_t buf, int len)
  * A timeout is fatal for the connection.
  */
  delta = wait_per_try;
- clock_gettime(CLOCK_MONOTONIC, &start);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &start);
  pfd[0].fd = sock;
  pfd[0].events = POLLIN;
  do {
@@ -351,7 +351,7 @@ readtcp(SVCXPRT *xprt, caddr_t buf, int len)
  case -1:
  if (errno != EINTR)
  goto fatal_err;
- clock_gettime(CLOCK_MONOTONIC, &after);
+ WRAP(clock_gettime)(CLOCK_MONOTONIC, &after);
  timespecsub(&after, &start, &duration);
  timespecsub(&wait_per_try, &duration, &delta);
  if (delta.tv_sec < 0 || !timespecisset(&delta))
diff --git lib/libc/shlib_version lib/libc/shlib_version
index 06f98b01084..5fb0770494f 100644
--- lib/libc/shlib_version
+++ lib/libc/shlib_version
@@ -1,4 +1,4 @@
 major=96
-minor=0
+minor=1
 # note: If changes were made to include/thread_private.h or if system calls
 # were added/changed then librthread/shlib_version must also be updated.
diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
index 34769576ced..d0b5dd1bdcd 100644
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -12,7 +12,8 @@ SRCS+= Ovfork.S brk.S ${CERROR} \
 
 # glue to offer userland wrappers for some syscalls
 SRCS+= posix_madvise.c pthread_sigmask.c \
- w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
+ w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
+ w_clock_gettime.c
 
 # glue for compat with old syscall interfaces.
 SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
@@ -43,7 +44,7 @@ SRCS+= ${CANCEL:%=w_%.c} w_pread.c w_preadv.c w_pwrite.c w_pwritev.c
 ASM= __semctl.o __syscall.o __thrsigdivert.o \
  access.o acct.o adjfreq.o adjtime.o \
  bind.o chdir.o chflags.o chflagsat.o chmod.o chown.o chroot.o \
- clock_getres.o clock_gettime.o clock_settime.o \
+ clock_getres.o clock_settime.o \
  dup.o dup2.o dup3.o \
  execve.o \
  faccessat.o fchdir.o fchflags.o fchmod.o fchmodat.o fchown.o \
@@ -109,7 +110,7 @@ PPSEUDO_NOERR=${PSEUDO_NOERR:.o=.po}
 SPSEUDO_NOERR=${PSEUDO_NOERR:.o=.so}
 DPSEUDO_NOERR=${PSEUDO_NOERR:.o=.do}
 
-HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o}
+HIDDEN= ___realpath.o ___getcwd.o fork.o sigaction.o _ptrace.o ${CANCEL:=.o} clock_gettime.o
 PHIDDEN=${HIDDEN:.o=.po}
 SHIDDEN=${HIDDEN:.o=.so}
 DHIDDEN=${HIDDEN:.o=.do}
diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
new file mode 100644
index 00000000000..858308e91c4
--- /dev/null
+++ lib/libc/sys/w_clock_gettime.c
@@ -0,0 +1,126 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <err.h>
+
+#include <sys/time.h>
+
+void *elf_aux_timekeep; /* timekeep address taken from the aux vector; NULL until find_timekeep() stores it */
+
+/*
+ * Needed exec_elf implementation.
+ * To be exposed by the kernel later if needed.
+ */
+
+#include <sys/exec_elf.h>
+
+typedef struct { /* one aux vector entry, as walked by find_timekeep() */
+ uint32_t au_id; /* entry id, compared against enum AuxID values */
+ uint64_t au_v; /* entry value; for AUX_openbsd_timekeep it is used as a pointer */
+} AuxInfo; /* NOTE(review): fixed 32/64-bit widths -- confirm they match the kernel's entry layout on 32-bit ELF targets */
+
+enum AuxID { /* aux vector entry ids; mirrors enum AuxID in <sys/exec_elf.h> -- keep in sync */
+ AUX_null = 0, /* terminates the aux vector */
+ AUX_ignore = 1,
+ AUX_execfd = 2,
+ AUX_phdr = 3, /* &phdr[0] */
+ AUX_phent = 4, /* sizeof(phdr[0]) */
+ AUX_phnum = 5, /* # phdr entries */
+ AUX_pagesz = 6, /* PAGESIZE */
+ AUX_base = 7, /* ld.so base addr */
+ AUX_flags = 8, /* processor flags */
+ AUX_entry = 9, /* a.out entry */
+ AUX_sun_uid = 2000, /* euid */
+ AUX_sun_ruid = 2001, /* ruid */
+ AUX_sun_gid = 2002, /* egid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime: address of the timekeep data */
+};
+
+
+/*
+ * Helper functions.
+ */
+
+/*
+ * Look up AUX_openbsd_timekeep in the ELF aux vector and cache its value in
+ * elf_aux_timekeep.  Returns 0 on success, -1 when there is no usable entry.
+ * NOTE(review): assumes environ still points at the initial stack vector.
+ */
+static int
+find_timekeep(void)
+{
+	Elf_Addr *stackp = (Elf_Addr *)environ;
+	AuxInfo *auxv;
+
+	while (*stackp++) ; /* pass environment; the scan resumes after its NULL */
+
+	for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
+		if (auxv->au_id == AUX_openbsd_timekeep)
+			break;
+	/* A zero address is unusable too: the caller would dereference it. */
+	if (auxv->au_id != AUX_openbsd_timekeep || auxv->au_v == 0) {
+		warnx("%s", "Could not find auxv!");
+		return -1;
+	}
+	elf_aux_timekeep = (void *)auxv->au_v;
+	return 0;
+}
+
+/*
+ * Userland clock_gettime(2).  Clocks published in the shared timekeep data are
+ * copied out without a system call; any other clock id, or a timekeep address
+ * that cannot be located, falls back to the underlying clock_gettime().
+ */
+int
+WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
+{
+	struct __timekeep *timekeep;
+	const struct timespec *src;
+	unsigned int seq;
+
+	if (elf_aux_timekeep == NULL && find_timekeep())
+		return clock_gettime(clock_id, tp);
+	timekeep = elf_aux_timekeep;
+	switch (clock_id) {
+	case CLOCK_REALTIME:
+		src = &timekeep->tp_realtime;
+		break;
+	case CLOCK_UPTIME:
+		src = &timekeep->tp_uptime;
+		break;
+	case CLOCK_MONOTONIC:
+		src = &timekeep->tp_monotonic;
+		break;
+	case CLOCK_BOOTTIME:
+		src = &timekeep->tp_boottime;
+		break;
+	default:
+		return clock_gettime(clock_id, tp);
+	}
+	/* Retry while seq is 0 or changed underneath the copy. */
+	do {
+		seq = timekeep->seq;
+		__sync_synchronize();	/* load seq before the timespec */
+		*tp = *src;
+		__sync_synchronize();	/* load the timespec before re-checking seq */
+	} while (seq == 0 || seq != timekeep->seq);
+	return 0;
+}
+DEF_WRAP(clock_gettime);
diff --git lib/libc/thread/synch.h lib/libc/thread/synch.h
index 788890add89..df2239438d2 100644
--- lib/libc/thread/synch.h
+++ lib/libc/thread/synch.h
@@ -33,7 +33,7 @@ _twait(volatile uint32_t *p, int val, clockid_t clockid, const struct timespec *
  if (abs == NULL)
  return futex(p, FUTEX_WAIT_PRIVATE, val, NULL, NULL);
 
- if (abs->tv_nsec >= 1000000000 || clock_gettime(clockid, &rel))
+ if (abs->tv_nsec >= 1000000000 || WRAP(clock_gettime)(clockid, &rel))
  return (EINVAL);
 
  rel.tv_sec = abs->tv_sec - rel.tv_sec;
diff --git sbin/init/init.c sbin/init/init.c
index 72d929706d3..c595d33bfac 100644
--- sbin/init/init.c
+++ sbin/init/init.c
@@ -38,6 +38,7 @@
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/tree.h>
+#include <sys/syscall.h>
 #include <sys/wait.h>
 #include <machine/cpu.h>
 
@@ -1039,7 +1040,7 @@ start_getty(session_t *sp)
  }
 
  if (timespecisset(&sp->se_started)) {
- clock_gettime(CLOCK_MONOTONIC, &current_time);
+ syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &current_time);
  timespecsub(&current_time, &sp->se_started, &elapsed);
  if (elapsed.tv_sec < GETTY_SPACING) {
  warning(
@@ -1103,7 +1104,7 @@ collect_child(pid_t pid)
  }
 
  sp->se_process = pid;
- clock_gettime(CLOCK_MONOTONIC, &sp->se_started);
+ syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sp->se_started);
  add_session(sp);
 }
 
@@ -1170,7 +1171,7 @@ f_multi_user(void)
  break;
  }
  sp->se_process = pid;
- clock_gettime(CLOCK_MONOTONIC, &sp->se_started);
+ syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sp->se_started);
  add_session(sp);
  }
 
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 9b5b8eb3acf..59bc923a6fb 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -124,7 +124,7 @@ extern char *syscallnames[];
 /*
  * How many entries are in the AuxInfo array we pass to the process?
  */
-#define ELF_AUX_ENTRIES 8
+#define ELF_AUX_ENTRIES 9
 
 /*
  * This is the OpenBSD ELF emul
@@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
  a->au_v = ap->arg_entry;
  a++;
 
+ a->au_id = AUX_openbsd_timekeep;
+ a->au_v = p->p_p->ps_timekeep;
+ a++;
+
  a->au_id = AUX_null;
  a->au_v = 0;
  a++;
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 20480c2fc28..15bf4db6fbd 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -64,6 +64,11 @@
 #include <uvm/uvm_extern.h>
 #include <machine/tcb.h>
 
+#include <sys/time.h>
+
+struct uvm_object *timekeep_object;
+struct __timekeep* timekeep;
+
 void unveil_destroy(struct process *ps);
 
 const struct kmem_va_mode kv_exec = {
@@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
  */
 int exec_sigcode_map(struct process *, struct emul *);
 
+/*
+ * Map the shared timekeep page.
+ */
+int exec_timekeep_map(struct process *);
+
 /*
  * If non-zero, stackgap_random specifies the upper limit of the random gap size
  * added to the fixed stack position. Must be n^2.
@@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
  /* map the process's signal trampoline code */
  if (exec_sigcode_map(pr, pack.ep_emul))
  goto free_pack_abort;
+ /* map the process's timekeep page */
+ if (exec_timekeep_map(pr))
+ goto free_pack_abort;
 
 #ifdef __HAVE_EXEC_MD_MAP
  /* perform md specific mappings that process might need */
@@ -863,3 +876,64 @@ exec_sigcode_map(struct process *pr, struct emul *e)
 
  return (0);
 }
+
+/*
+ * Map the shared timekeep page read-only into the process being exec'd and
+ * record its user address in pr->ps_timekeep.  The first call also creates
+ * the single, system-wide backing object and maps it, wired and writable,
+ * into the kernel so the clock code can update it through `timekeep'.
+ * Returns 0 on success and ENOMEM if any mapping step fails.
+ *
+ * NOTE(review): exec_elf_fixup() copies ps_timekeep into the aux vector;
+ * confirm this function runs before the fixup in sys_execve().
+ */
+int
+exec_timekeep_map(struct process *pr)
+{
+	size_t timekeep_sz = round_page(sizeof(struct __timekeep));
+
+	/*
+	 * Similar to the sigcode object, except that there is a single timekeep
+	 * object, and not one per emulation.
+	 */
+	if (timekeep_object == NULL) {
+		vaddr_t va = 0;	/* no hint */
+
+		timekeep_object = uao_create(timekeep_sz, 0);
+		if (uvm_map(kernel_map, &va, timekeep_sz, timekeep_object,
+		    0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
+		    MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
+			uao_detach(timekeep_object);
+			timekeep_object = NULL;	/* let a later exec retry */
+			return (ENOMEM);
+		}
+		/*
+		 * The page is written from tc_windup(); wire it so the
+		 * update never has to fault the page in.
+		 */
+		if (uvm_fault_wire(kernel_map, va, va + timekeep_sz,
+		    PROT_READ | PROT_WRITE)) {
+			uvm_unmap(kernel_map, va, va + timekeep_sz);
+			timekeep_object = NULL;	/* mapping held the only ref */
+			return (ENOMEM);
+		}
+		uao_reference(timekeep_object);	/* permanent reference */
+
+		timekeep = (struct __timekeep *)va;
+		timekeep->major = 0;
+		timekeep->minor = 0;
+
+		timekeep->seq = 0;
+	}
+
+	pr->ps_timekeep = 0;	/* no hint */
+	uao_reference(timekeep_object);
+	if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, timekeep_sz,
+	    timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
+	    MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
+		uao_detach(timekeep_object);
+		return (ENOMEM);
+	}
+
+	return (0);
+}
diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
index 4b9eedf50b9..9c67cb738de 100644
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -35,6 +35,7 @@
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <dev/rndvar.h>
+#include <sys/time.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -480,6 +481,29 @@ tc_setclock(const struct timespec *ts)
 #endif
 }
 
+/*
+ * Publish the clocks in the shared timekeep page (called from tc_windup()).
+ * seq stays 0 during the update so readers cannot accept a torn snapshot.
+ */
+void
+tc_clock_gettime(void)
+{
+	unsigned int seq;
+
+	if (timekeep == NULL)
+		return;
+
+	seq = timekeep->seq;
+	timekeep->seq = 0;
+	membar_producer();		/* mark busy before touching the clocks */
+	nanotime(&timekeep->tp_realtime);	/* CLOCK_REALTIME */
+	nanoruntime(&timekeep->tp_uptime);	/* CLOCK_UPTIME */
+	nanouptime(&timekeep->tp_monotonic);	/* CLOCK_MONOTONIC */
+	timekeep->tp_boottime = timekeep->tp_monotonic;	/* CLOCK_BOOTTIME */
+	membar_producer();		/* clocks visible before the new seq */
+	timekeep->seq = (seq + 1 != 0) ? seq + 1 : 1;	/* never publish 0 */
+}
+
 /*
  * Initialize the next struct timehands in the ring and make
  * it the active timehands.  Along the way we might switch to a different
@@ -632,6 +656,8 @@ tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
  time_uptime = th->th_offset.sec;
  membar_producer();
  timehands = th;
+
+ tc_clock_gettime();
 }
 
 /* Report or change the active timecounter hardware. */
diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
index a40e0510273..f55b75f1e84 100644
--- sys/sys/exec_elf.h
+++ sys/sys/exec_elf.h
@@ -691,7 +691,8 @@ enum AuxID {
  AUX_sun_uid = 2000, /* euid */
  AUX_sun_ruid = 2001, /* ruid */
  AUX_sun_gid = 2002, /* egid */
- AUX_sun_rgid = 2003 /* rgid */
+ AUX_sun_rgid = 2003, /* rgid */
+ AUX_openbsd_timekeep = 2004, /* userland clock_gettime */
 };
 
 struct elf_args {
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..93a79a220db 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -248,6 +248,8 @@ struct process {
  u_int ps_rtableid; /* Process routing table/domain. */
  char ps_nice; /* Process "nice" value. */
 
+ vaddr_t ps_timekeep; /* User pointer to timekeep */
+
  struct uprof { /* profile arguments */
  caddr_t pr_base; /* buffer base */
  size_t  pr_size; /* buffer size */
diff --git sys/sys/time.h sys/sys/time.h
index e758a64ce07..be762be15e4 100644
--- sys/sys/time.h
+++ sys/sys/time.h
@@ -163,6 +163,17 @@ struct clockinfo {
 };
 #endif /* __BSD_VISIBLE */
 
+struct __timekeep { /* clock snapshot written by the kernel, mapped read-only into processes */
+ uint8_t major; /* version major number */
+ uint8_t minor; /* version minor number */
+
+ volatile unsigned int seq; /* synchronization: readers retry when it is 0 or changes during a copy */
+ struct timespec tp_realtime; /* CLOCK_REALTIME, filled by nanotime() */
+ struct timespec tp_uptime; /* CLOCK_UPTIME, filled by nanoruntime() */
+ struct timespec tp_monotonic; /* CLOCK_MONOTONIC, filled by nanouptime() */
+ struct timespec tp_boottime; /* CLOCK_BOOTTIME, copy of tp_monotonic */
+};
+
 #if defined(_KERNEL) || defined(_STANDALONE)
 #include <sys/_time.h>
 
@@ -396,6 +407,8 @@ TIMESPEC_TO_NSEC(const struct timespec *ts)
  return ts->tv_sec * 1000000000ULL + ts->tv_nsec;
 }
 
+extern struct uvm_object *timekeep_object;
+extern struct __timekeep *timekeep;
 #else /* !_KERNEL */
 #include <time.h>
 

1234 ... 11