Slow gettimeofday/clock_gettime

classic Classic list List threaded Threaded
2 messages Options
Reply | Threaded
Open this post in threaded view
|

Slow gettimeofday/clock_gettime

William Ahern-2
Are there plans/wishes for gettimeofday to be sped up? I was playing around
with the RDTSC instruction.... Anyhow, to make a long story short it seems
that Linux is anywhere from 40X (@400 cycles, Linux 2.4.21, Athlon 1.8GHZ)
to ~10X (@1500 cycles, Linux 2.6.17, Athlon 1.8GHZ) faster, which could be
significant for heavily loaded applications which are constantly
re-calculating time deltas (e.g. network servers). My 2.2GHZ Opteron 148
takes about 11000 cycles.

Here's the code I ended up throwing together to get a feel for cost. This
proof-of-concept code puts my fast-path at 200 cycles, and the goalpost
benchmark at 34x improvement, with a 10-millisecond granularity. (Note, this
code doesn't solve the harder problems of guaranteeing monotonic operation;
for instance a gettimeofday() based on this might jump backwards temporarily
if the CPU lowered its frequency, though for some architectures there are
easy solutions, like AMD's RDTSCP).


#include <stdio.h>

#include <stdint.h>

#include <sys/param.h>

#include <time.h>

#include <sys/time.h>

#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC CLOCK_REALTIME
#endif

#include <unistd.h>


#define DURATION 5        /* seconds per timed benchmark pass */

#define SIMULATE_JITTER 0 /* inject artificial TSC jitter in rdtsc() */

/*
 * Cache window in nanoseconds. Parenthesized so the expansion survives
 * surrounding operators (e.g. `x / FUZZ' would otherwise parse as
 * `x / 1000000000 / 100').
 */
#define FUZZ (1000000000 / 100) /* 10 milliseconds */



/*
 * Read the CPU's 64-bit timestamp counter.
 *
 * See http://www.flipcode.com/cgi-bin/fcarticles.cgi?show=63830
 * and http://en.wikipedia.org/wiki/RDTSC
 *
 * RDTSC deposits the counter split across EDX:EAX. The "=A" constraint
 * previously used here is only correct on 32-bit x86; on x86-64 it binds
 * a single 64-bit register and silently discards the high half of the
 * counter. Read the two 32-bit halves explicitly and stitch them together.
 */
static inline unsigned long long rdtsc() {
        uint32_t lo, hi;
        uint64_t tsc;

        asm volatile ("rdtsc\n\t" : "=a" (lo), "=d" (hi));

        tsc = ((uint64_t)hi << 32) | lo;

#if 0
#if SIMULATE_JITTER
        /* Throw wrench into the works after counter has been tuned. */
        if (tsc % 13)
                tsc += 65536;
#endif
#endif

        return tsc;
} /* rdtsc() */


/* Convert a struct timespec to a 64-bit nanosecond count. */
#define TIMESPEC_TO_NANOSECOND(ts) ((unsigned long long)(ts)->tv_sec * 1000000000ULL + (unsigned long long)(ts)->tv_nsec)


/*
 * Rolling per-benchmark statistics used to detect abnormal deviations in
 * a sample stream (here: raw TSC readings).
 *
 * NOTE(review): the initializer sets `strength' to -1, which wraps to
 * ULLONG_MAX for the unsigned field -- presumably so the first in-range
 * hit rolls it over to 0; confirm intent.
 */
#define CLOCK_GETTIME_INITIALIZER { { -1, 0, }, }

static const struct clock_gettime {
        struct clock_gettime_benchmark {
                unsigned long long strength;          /* count of in-range averages */
                unsigned long long average;           /* midpoint of lomark/himark */

                unsigned long long sample, lomark, himark;
        } bm[1];
} clock_gettime_initializer = CLOCK_GETTIME_INITIALIZER;


/*
 * Fold a new sample into benchmark `benchmark' of *cst.
 *
 * Returns 0 when the smoothed interval fell strictly inside the
 * historical (lomark, himark) range, non-zero on an abnormal deviation.
 */
static inline int clock_gettime_tune(struct clock_gettime *cst, unsigned benchmark, unsigned long long sample) {
        struct clock_gettime_benchmark *bm = &cst->bm[benchmark];
        /*
         * Distance from the previous sample. The original code used the
         * raw previous sample (not a difference) when the counter stepped
         * backwards; use the absolute difference so the tuner tracks the
         * magnitude of the step in either direction.
         */
        unsigned long long interval = (sample > bm->sample)? sample - bm->sample : bm->sample - sample;
        unsigned long long average = ((bm->average? bm->average : interval) + interval) / 2;
        int inrange;

        if ((inrange = average > bm->lomark && average < bm->himark))
                bm->strength++;

        if (bm->lomark == 0 || average < bm->lomark)
                bm->lomark = average;

        if (bm->himark == 0 || average > bm->himark)
                bm->himark = average;

        bm->average = (bm->lomark + bm->himark) / 2;
        bm->sample = sample;

        return !inrange;
} /* clock_gettime_tune() */


/*
 * Fast clock source: a raw cycle-counter read.
 *
 * RDTSC is broken -- nearly impossible (though tantalizingly close) to
 * keep monotonic. RDTSCP should work fine, though.
 */
unsigned long long clock_gettime_tsc() {
        unsigned long long ticks;

        ticks = rdtsc();

        return ticks;
} /* clock_gettime_tsc() */


/*
 * clock_gettime() wrapper that replays a cached timestamp whenever the
 * TSC indicates fewer than `nanofuzz' nanoseconds have elapsed since the
 * last real syscall, and falls through to the syscall otherwise.
 *
 * Returns 0 on success, otherwise the failing clock_gettime() result.
 *
 * NOTE(review): timespeccmp() is a BSD macro (via <sys/param.h> /
 * <sys/time.h>); confirm availability before porting off BSD.
 */
int clock_gettime_fuzzy(clockid_t clk, struct timespec *ts, unsigned nanofuzz) {
        /* Not async-signal safe or MT-safe. __thread would be nice. */
        static struct {
                struct clock_gettime tsc_hist;  /* deviation detector fed raw TSC samples */

                unsigned tsc_scale;             /* multiplier keeping one tick > 1 ns */
                unsigned long long tsc_per_ns;  /* calibrated scaled-ticks per nanosecond */

                unsigned long long last_tsc;    /* scaled TSC at last real syscall */
                struct timespec last_ts;        /* timestamp from last real syscall */
        } s = { CLOCK_GETTIME_INITIALIZER, 1, 1, -1 }, s_init = { CLOCK_GETTIME_INITIALIZER, 1, 1, -1 };
        unsigned long long tsc, ratio, tsc_diff, ts_diff;
        unsigned scale;
        int retval;

        tsc = clock_gettime_tsc() * s.tsc_scale;

        /* Counter went backwards (wrap, frequency step): restart calibration.
         * last_tsc starts at -1 (ULLONG_MAX), so the first call always resets. */
        if (tsc < s.last_tsc)
                s = s_init;

        tsc_diff = tsc - s.last_tsc;

        /* Abnormal TSC deviation detected: don't trust the cached time. */
        if (0 != clock_gettime_tune(&s.tsc_hist, 0, tsc))
                goto syscall;

        /* Uncalibrated, or possibly more than `nanofuzz' ns elapsed. */
        if (s.tsc_per_ns == 1 || tsc_diff <= s.tsc_per_ns || tsc_diff / s.tsc_per_ns >= nanofuzz)
                goto syscall;

        /* Fast path: replay the cached timestamp. */
        *ts = s.last_ts;

        return 0;
syscall:
        if (0 != (retval = clock_gettime(clk, ts)))
                return retval;

        /* System clock stepped backwards: recalibrate and retry. */
        if (timespeccmp(ts, &s.last_ts, <)) {
                s = s_init;

                goto syscall;
        }

        /* Re-derive ticks-per-nanosecond from this syscall's real elapsed time. */
        tsc_diff = (tsc - s.last_tsc);
        ts_diff = TIMESPEC_TO_NANOSECOND(ts) - TIMESPEC_TO_NANOSECOND(&s.last_ts);

        if (ts_diff == 0 || 1 >= (ratio = tsc_diff / ts_diff)) {
                /*
                 * Keep the counter tick longer than one nanosecond so our
                 * calculations are meaningful. Plus, don't want to divide by
                 * zero.
                 */
                scale = s.tsc_scale + 1;
                s = s_init;
                s.tsc_scale = scale;
        } else if (ts_diff > nanofuzz && s.last_ts.tv_sec > 0) {
                /* Too long between samples for the ratio to be trustworthy. */
                s = s_init;
        } else
                s.tsc_per_ns = ratio;

        s.last_tsc = tsc;
        s.last_ts = *ts;

        return 0;
} /* clock_gettime_fuzzy() */


int main(void) {
        int fuzzy, i, j;
        struct timespec prev, now;
        int count[3] = { 1, 1 };
        unsigned long long a, b, c, p;

        setvbuf(stdout, NULL, _IONBF, 0);

        for (i = 0, a = 0; i < 1 << 20; i++) {
                b = rdtsc();

#if 1
                clock_gettime(CLOCK_MONOTONIC, &now);
#elif 0
                gettimeofday((struct timeval *)&now, 0);
#elif 0
                time(&now.tv_sec);
#else
                /* NOOP */
#endif
                c = rdtsc();

                a = (a + (c - b)) / 2;
        }

        printf("clock_gettime takes %llu cycles\n", a);

        for (i = 0, p = a, a = 0; i < 1 << 20; i++) {
                b = rdtsc();

                clock_gettime_fuzzy(CLOCK_MONOTONIC, &now, FUZZ);

                c = rdtsc();

                a = (a + (c - b)) / 2;
        }

        printf("clock_gettime_fuzzy takes %llu cycles\n", a);

        printf("speedup to goalpost %lluX\n", p / a);


        for (fuzzy = 0, i = 0; fuzzy < 2; fuzzy++) {
                clock_gettime(CLOCK_MONOTONIC, &prev);

                do {
                        if (fuzzy)
                                clock_gettime_fuzzy(CLOCK_MONOTONIC, &now, FUZZ); /* 1/4 second fuzz */
                        else
                                clock_gettime(CLOCK_MONOTONIC, &now);

                        i++;
                } while (TIMESPEC_TO_NANOSECOND(&now) - TIMESPEC_TO_NANOSECOND(&prev) < DURATION * 1000000000ULL);

                printf("%d iterations in %d seconds (fuzzy:%d)\n", i, now.tv_sec - prev.tv_sec, fuzzy);

                count[fuzzy] = i;
        }

        printf("speedup by elapsed time %dX\n", count[1] / count[0]);

        return 0;
}

Reply | Threaded
Open this post in threaded view
|

Re: Slow gettimeofday/clock_gettime

William Ahern-2
On Sun, Jan 14, 2007 at 01:25:27AM +0100, Marc Balmer wrote:

> * William Ahern wrote:
> > Are there plans/wishes for gettimeofday to be sped up? I was playing around
> > with the RDTSC instruction.... Anyhow, to make a long story short it seems
> > that Linux is anywhere from 40X (@400 cycles, Linux 2.4.21, Athlon 1.8GHZ)
> > to ~10X (@1500 cycles, Linux 2.6.17, Athlon 1.8GHZ) faster, which could be
> > significant for heavily loaded applications which are constantly
> > re-calculating time deltas (e.g. network servers). My 2.2GHZ Opteron 148
> > takes about 11000 cycles.
> >
> > Here's the code I ended up throwing together to get a feel for cost. This
> > proof-of-concept code puts my fast-path at 200 cycles, and the goalpost
> > benchmark at 34x improvement, with a 10-millisecond granularity. (Note, this
> > code doesn't solve the harder problems of guaranteeing monotonic operation;
> > for instance a gettimeofday() based on this might jump backwards temporarily
> > if the CPU lowered its frequency, though for some architectures there are
> > easy solutions, like AMD's RDTSCP).
>
> hmm, what about a (unified) diff against current?
>

How 'bout this:

        --- /usr/src/sys/arch/amd64/compile/GENERIC/lapic.h
        +++ /usr/src/sys/arch/amd64/compile/GENERIC/lapic.h
        @@ -1 +1 @@
        -#define NLAPIC 0
        +#define NLAPIC 1

So, turns out for AMD64 the APIC timer isn't being used. On plain-i386 GTOD
is fast-enough (not optimized to the hilt, but not abysmal, either; in fact,
equivalent to my user-land caching code--though admittedly it was focused on
the caching, not on squeezing cycles on a per-instruction basis).

Who can/should I speak with about enabling this? I can only assume it was
disabled on purpose.

Also, any idea where I can find where/how this file is being generated from?
Recursive grep for NLAPIC doesn't help.