hardware packet timestamping for mbufs, then sockets and bpf

classic Classic list List threaded Threaded
5 messages Options
Reply | Threaded
Open this post in threaded view
|

hardware packet timestamping for mbufs, then sockets and bpf

David Gwynne-5
nics are starting to offer the ability to timestamp packets when
they're received. other systems (eg linux and freebsd) have support
for recording that timestamp on mbufs and then using it as the
backend for at least the SO_TIMESTAMP socket option instead of a
call to microtime().

this implements the above, and additionally supports using the hw
timestamp in bpf too. other systems may do the bpf thing too, but i
didn't look closely enough to find out.

timestamps are recorded as the uptime of the system in nanoseconds
in the ph_timestamp field in mbufs. this mirrors the use of
ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
im using another bit in m_pkthdr.csum_flags to say whether the timestamp
is valid or not (M_TIMESTAMP). im arguing that it's another offloading
feature and therefore appropriate for the csum offload flags field.

this adds some inline functions to time.h for turning ns into a timeval
and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
i originally wanted ns_to_timeval and ns_to_timespec, but the linux
compat stuff in drm already uses those names and ruined the idea.
especially since they return the time{val,spec}s as values.

the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
and if so turns ph_timestamp into a timeval before adding it to boottime
(which is the wall clock time that uptime starts at), before using it
instead of microtime().

the mcx changes are based on what freebsd did to their driver, but
simplified a bit.

i want this because we're being asked to look at recording network
traffic for possible audit use. part of that is having accurate
timestamps on received packets, and hopefully it will mitigate against
chunks of packets getting reordered or delayed significantly when the
box is busy.

thoughts? ok?

Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.242
diff -u -p -r1.242 mbuf.h
--- sys/mbuf.h 11 Feb 2019 00:25:33 -0000 1.242
+++ sys/mbuf.h 7 Jun 2019 03:27:41 -0000
@@ -226,13 +226,14 @@ struct mbuf {
 #define M_ICMP_CSUM_IN_OK 0x0400 /* ICMP/ICMPv6 checksum verified */
 #define M_ICMP_CSUM_IN_BAD 0x0800 /* ICMP/ICMPv6 checksum bad */
 #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */
+#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */
 
 #ifdef _KERNEL
 #define MCS_BITS \
     ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
     "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
     "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
-    "\15IPV6_NODF_OUT")
+    "\15IPV6_NODF_OUT" "\16TIMESTAMP")
 #endif
 
 /* mbuf types */
Index: sys/time.h
===================================================================
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.41
diff -u -p -r1.41 time.h
--- sys/time.h 3 Jun 2019 01:27:30 -0000 1.41
+++ sys/time.h 7 Jun 2019 03:27:41 -0000
@@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct
 /* Traditional POSIX base year */
 #define POSIX_BASE_YEAR 1970
 
+static __inline void
+ns_to_microtime(struct timeval *tv, uint64_t ns)
+{
+ tv->tv_sec = ns / 1000000000L;
+ tv->tv_usec = (ns % 1000000000L) / 1000;
+}
+
+static __inline void
+ns_to_nanotime(struct timespec *tv, uint64_t ns)
+{
+ tv->tv_sec = ns / 1000000000L;
+ tv->tv_nsec = ns % 1000000000L;
+}
+
 #else /* !_KERNEL */
 #include <time.h>
 
Index: net/bpf.c
===================================================================
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.175
diff -u -p -r1.175 bpf.c
--- net/bpf.c 18 May 2019 12:59:32 -0000 1.175
+++ net/bpf.c 7 Jun 2019 03:27:41 -0000
@@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf
  fcode = bps->bps_bf.bf_insns;
  slen = bpf_mfilter(fcode, m, pktlen);
 
- if (slen == 0)
+ if (slen == 0)
  continue;
  if (d->bd_fildrop != BPF_FILDROP_PASS)
  drop = 1;
  if (d->bd_fildrop != BPF_FILDROP_DROP) {
- if (!gottime++)
- microtime(&tv);
+ if (!gottime) {
+ if (ISSET(m->m_flags, M_PKTHDR) &&
+    ISSET(m->m_pkthdr.csum_flags,
+     M_TIMESTAMP)) {
+ struct timeval btv;
+ ns_to_microtime(&tv,
+    m->m_pkthdr.ph_timestamp);
+ microboottime(&btv);
+ timeradd(&tv, &btv, &tv);
+ } else
+ microtime(&tv);
+
+ gottime = 1;
+ }
 
  mtx_enter(&d->bd_mtx);
  bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.342
diff -u -p -r1.342 ip_input.c
--- netinet/ip_input.c 13 Oct 2018 18:36:01 -0000 1.342
+++ netinet/ip_input.c 7 Jun 2019 03:27:41 -0000
@@ -1712,7 +1712,14 @@ ip_savecontrol(struct inpcb *inp, struct
  if (inp->inp_socket->so_options & SO_TIMESTAMP) {
  struct timeval tv;
 
- microtime(&tv);
+ if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) {
+ struct timeval btv;
+ ns_to_microtime(&tv, m->m_pkthdr.ph_timestamp);
+ microboottime(&btv);
+ timeradd(&tv, &btv, &tv);
+ } else
+ microtime(&tv);
+
  *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
     SCM_TIMESTAMP, SOL_SOCKET);
  if (*mp)
Index: dev/pci/if_mcx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.16
diff -u -p -r1.16 if_mcx.c
--- dev/pci/if_mcx.c 4 Jun 2019 05:29:30 -0000 1.16
+++ dev/pci/if_mcx.c 7 Jun 2019 03:27:41 -0000
@@ -1243,20 +1245,22 @@ struct mcx_cmd_destroy_cq_out {
 } __packed __aligned(4);
 
 struct mcx_cq_entry {
- uint32_t cq_reserved1;
+ uint32_t __reserved__;
  uint32_t cq_lro;
  uint32_t cq_lro_ack_seq_num;
  uint32_t cq_rx_hash;
- uint32_t cq_rx_hash_type;
+ uint8_t cq_rx_hash_type;
+ uint8_t cq_ml_path;
+ uint16_t __reserved__;
  uint32_t cq_checksum;
- uint32_t cq_reserved2;
+ uint32_t __reserved__;
  uint32_t cq_flags;
  uint32_t cq_lro_srqn;
- uint32_t cq_reserved3[2];
+ uint32_t __reserved__[2];
  uint32_t cq_byte_cnt;
- uint32_t cq_lro_ts_value;
- uint32_t cq_lro_ts_echo;
- uint32_t cq_flow_tag;
+ uint64_t cq_timestamp;
+#define MCX_CQ_ENTRY_TIMESTAMP_PTP (1 << 63)
+ uint32_t cq_rx_drops;
  uint16_t cq_wqe_count;
  uint8_t cq_signature;
  uint8_t cq_opcode_owner;
@@ -1895,6 +1899,18 @@ struct mcx_cq {
  int cq_count;
 };
 
+struct mcx_calibration {
+ uint64_t c_timestamp; /* previous mcx chip time */
+ uint64_t c_uptime; /* previous kernel nanouptime */
+ uint64_t c_tbase; /* mcx chip time */
+ uint64_t c_ubase; /* kernel nanouptime */
+ uint64_t c_tdiff;
+ uint64_t c_udiff;
+};
+
+#define MCX_CALIBRATE_FIRST 2
+#define MCX_CALIBRATE_NORMAL 30
+
 struct mcx_softc {
  struct device sc_dev;
  struct arpcom sc_ac;
@@ -1948,6 +1964,11 @@ struct mcx_softc {
  int sc_extra_mcast;
  uint8_t sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN];
 
+ struct mcx_calibration sc_calibration[2];
+ unsigned int sc_calibration_gen;
+ uint32_t sc_freq;
+ struct timeout sc_calibrate;
+
  struct mcx_cq sc_cq[MCX_MAX_CQS];
  int sc_num_cq;
 
@@ -2039,8 +2060,8 @@ static void mcx_cmdq_dump(const struct m
 static void mcx_cmdq_mbox_dump(struct mcx_dmamem *, int);
 */
 static void mcx_refill(void *);
-static void mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
-    struct mbuf_list *, int *);
+static int mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
+    struct mbuf_list *, const struct mcx_calibration *);
 static void mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *,
     int *);
 static void mcx_process_cq(struct mcx_softc *, struct mcx_cq *);
@@ -2060,6 +2081,9 @@ static void mcx_media_status(struct ifne
 static int mcx_media_change(struct ifnet *);
 static int mcx_get_sffpage(struct ifnet *, struct if_sffpage *);
 
+static void mcx_calibrate_first(struct mcx_softc *);
+static void mcx_calibrate(void *);
+
 static inline uint32_t
  mcx_rd(struct mcx_softc *, bus_size_t);
 static inline void
@@ -2067,6 +2091,8 @@ static inline void
 static inline void
  mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int);
 
+static uint64_t mcx_timer(struct mcx_softc *);
+
 static int mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *,
     bus_size_t, u_int align);
 static void mcx_dmamem_zero(struct mcx_dmamem *);
@@ -2338,6 +2364,7 @@ mcx_attach(struct device *parent, struct
  ether_ifattach(ifp);
 
  timeout_set(&sc->sc_rx_refill, mcx_refill, sc);
+ timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
 
  sc->sc_flow_table_id = -1;
  for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) {
@@ -5557,9 +5590,65 @@ mcx_process_txeof(struct mcx_softc *sc,
  ms->ms_m = NULL;
 }
 
-void
+static uint64_t
+mcx_uptime(void)
+{
+ struct timespec ts;
+
+ nanouptime(&ts);
+
+ return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec);
+}
+
+static void
+mcx_calibrate_first(struct mcx_softc *sc)
+{
+ struct mcx_calibration *c = &sc->sc_calibration[0];
+
+ sc->sc_calibration_gen = 0;
+
+ c->c_ubase = mcx_uptime();
+ c->c_tbase = mcx_timer(sc);
+ c->c_tdiff = 0;
+
+ timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST);
+}
+
+#define MCX_TIMESTAMP_SHIFT 10
+
+static void
+mcx_calibrate(void *arg)
+{
+ struct mcx_softc *sc = arg;
+ struct mcx_calibration *nc, *pc;
+ unsigned int gen;
+
+ if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+ return;
+
+ timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL);
+
+ gen = sc->sc_calibration_gen;
+ pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+ gen++;
+ nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+
+ nc->c_uptime = pc->c_ubase;
+ nc->c_timestamp = pc->c_tbase;
+
+ nc->c_ubase = mcx_uptime();
+ nc->c_tbase = mcx_timer(sc);
+
+ nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT;
+ nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT;
+
+ membar_producer();
+ sc->sc_calibration_gen = gen;
+}
+
+static int
 mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe,
-    struct mbuf_list *ml, int *slots)
+    struct mbuf_list *ml, const struct mcx_calibration *c)
 {
  struct mcx_slot *ms;
  struct mbuf *m;
@@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
 
  m = ms->ms_m;
  ms->ms_m = NULL;
- m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt);
- (*slots)++;
+
+ m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
+
+ if (cqe->cq_rx_hash_type) {
+ m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
+    bemtoh32(&cqe->cq_rx_hash);
+ }
+
+ if (c->c_tdiff) {
+ uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
+ t *= c->c_udiff;
+ t /= c->c_tdiff;
+
+ m->m_pkthdr.ph_timestamp = c->c_uptime + t;
+ SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
+ }
 
  ml_enqueue(ml, m);
+
+ return (1);
 }
 
 static struct mcx_cq_entry *
@@ -5624,11 +5729,17 @@ void
 mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
 {
  struct ifnet *ifp = &sc->sc_ac.ac_if;
+ const struct mcx_calibration *c;
+ unsigned int gen;
  struct mcx_cq_entry *cqe;
  uint8_t *cqp;
  struct mbuf_list ml = MBUF_LIST_INITIALIZER();
  int rxfree, txfree;
 
+ gen = sc->sc_calibration_gen;
+ membar_consumer();
+ c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+
  rxfree = 0;
  txfree = 0;
  while ((cqe = mcx_next_cq_entry(sc, cq))) {
@@ -5639,7 +5750,7 @@ mcx_process_cq(struct mcx_softc *sc, str
  mcx_process_txeof(sc, cqe, &txfree);
  break;
  case MCX_CQ_ENTRY_OPCODE_SEND:
- mcx_process_rx(sc, cqe, &ml, &rxfree);
+ rxfree += mcx_process_rx(sc, cqe, &ml, c);
  break;
  case MCX_CQ_ENTRY_OPCODE_REQ_ERR:
  case MCX_CQ_ENTRY_OPCODE_SEND_ERR:
@@ -5882,6 +5993,8 @@ mcx_up(struct mcx_softc *sc)
  sc->sc_rx_prod = 0;
  mcx_rx_fill(sc);
 
+ mcx_calibrate_first(sc);
+
  SET(ifp->if_flags, IFF_RUNNING);
 
  sc->sc_tx_cons = 0;
@@ -5922,6 +6035,8 @@ mcx_down(struct mcx_softc *sc)
  intr_barrier(&sc->sc_ih);
  ifq_barrier(&ifp->if_snd);
 
+ timeout_del_barrier(&sc->sc_calibrate);
+
  for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) {
  if (sc->sc_flow_group_id[group] != -1)
  mcx_destroy_flow_group(sc,
@@ -6435,6 +6550,26 @@ static inline void
 mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f)
 {
  bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f);
+}
+
+static uint64_t
+mcx_timer(struct mcx_softc *sc)
+{
+ uint32_t hi, lo, ni;
+
+ hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
+ for (;;) {
+ lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
+ mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
+ ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
+
+ if (ni == hi)
+ break;
+
+ hi = ni;
+ }
+
+ return (((uint64_t)hi << 32) | (uint64_t)lo);
 }
 
 static int

Reply | Threaded
Open this post in threaded view
|

Re: hardware packet timestamping for mbufs, then sockets and bpf

Scott Cheloha
On Fri, Jun 07, 2019 at 02:34:20PM +1000, David Gwynne wrote:

> nics are starting to offer the ability to timestamp packets when
> they're received. other systems (eg linux and freebsd) have support
> for recording that timestamp on mbufs and then using it as the
> backend for at least the SO_TIMESTAMP socket option instead of a
> call to microtime().
>
> this implements the above, and additionally supports using the hw
> timestamp in bpf too. other systems may do the bpf thing too, but i
> didn't look closely enough to find out.
>
> timestamps are recorded as the uptime of the system in nanoseconds
> in the ph_timestamp field in mbufs. this mirrors the use of
> ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
> im using another bit in m_pkthdr.csum_flags to say whether the timestamp
> is valid or not (M_TIMESTAMP). im arguing that it's another offloading
> feature and therefore appropriate for the csum offload flags field.
>
> this adds some inline functions to time.h for turning ns into a timeval
> and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
> i originally wanted ns_to_timeval and ns_to_timespec, but the linux
> compat stuff in drm already uses those names and ruined the idea.
> especially since they return the time{val,spec}s as values.

If we're going to add more conversion stuff to sys/time.h I want it to
resemble the stuff that's already there.  I think NSEC_TO_TIMESPEC and
NSEC_TO_TIMEVAL would be consistent names, so in sys/time.h I'd strongly
prefer the following:

static __inline void
NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
{
        tv->tv_sec = ns / 1000000000LL;
        tv->tv_usec = (ns % 1000000000L) / 1000L;
}

static __inline void
NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *ts)
{
        ts->tv_sec = ns / 1000000000LL;
        ts->tv_nsec = ns % 1000000000L;
}

> the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
> and if so turns ph_timestamp into a timeval before adding it to boottime
> (which is the wall clock time that uptime starts at), before using it
> instead of microtime().
>
> the mcx changes are based on what freebsd did to their driver, but
> simplified a bit.
>
> i want this because we're being asked to look at recording network
> traffic for possible audit use. part of that is having accurate
> timestamps on received packets, and hopefully it will mitigate against
> chunks of packets getting reordered or delayed significantly when the
> box is busy.
>
> thoughts? ok?

Thoughts inline.

> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> retrieving revision 1.242
> diff -u -p -r1.242 mbuf.h
> --- sys/mbuf.h 11 Feb 2019 00:25:33 -0000 1.242
> +++ sys/mbuf.h 7 Jun 2019 03:27:41 -0000
> @@ -226,13 +226,14 @@ struct mbuf {
>  #define M_ICMP_CSUM_IN_OK 0x0400 /* ICMP/ICMPv6 checksum verified */
>  #define M_ICMP_CSUM_IN_BAD 0x0800 /* ICMP/ICMPv6 checksum bad */
>  #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */
> +#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */
>  
>  #ifdef _KERNEL
>  #define MCS_BITS \
>      ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
>      "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
>      "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
> -    "\15IPV6_NODF_OUT")
> +    "\15IPV6_NODF_OUT" "\16TIMESTAMP")
>  #endif
>  
>  /* mbuf types */
> Index: sys/time.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/time.h,v
> retrieving revision 1.41
> diff -u -p -r1.41 time.h
> --- sys/time.h 3 Jun 2019 01:27:30 -0000 1.41
> +++ sys/time.h 7 Jun 2019 03:27:41 -0000
> @@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct
>  /* Traditional POSIX base year */
>  #define POSIX_BASE_YEAR 1970
>  
> +static __inline void
> +ns_to_microtime(struct timeval *tv, uint64_t ns)
> +{
> + tv->tv_sec = ns / 1000000000L;
> + tv->tv_usec = (ns % 1000000000L) / 1000;
> +}
> +
> +static __inline void
> +ns_to_nanotime(struct timespec *tv, uint64_t ns)
> +{
> + tv->tv_sec = ns / 1000000000L;
> + tv->tv_nsec = ns % 1000000000L;
> +}
> +

Same thoughts from before about NSEC_TO_TIMEVAL etc.

Also, unless I've missed something, I don't see you use ns_to_nanotime
in this diff.

However I see people doing this conversion elsewhere, so maybe
we could use it anyway.

>  #else /* !_KERNEL */
>  #include <time.h>
>  
> Index: net/bpf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.175
> diff -u -p -r1.175 bpf.c
> --- net/bpf.c 18 May 2019 12:59:32 -0000 1.175
> +++ net/bpf.c 7 Jun 2019 03:27:41 -0000
> @@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>   fcode = bps->bps_bf.bf_insns;
>   slen = bpf_mfilter(fcode, m, pktlen);
>  
> - if (slen == 0)
> + if (slen == 0)
>   continue;
>   if (d->bd_fildrop != BPF_FILDROP_PASS)
>   drop = 1;
>   if (d->bd_fildrop != BPF_FILDROP_DROP) {
> - if (!gottime++)
> - microtime(&tv);
> + if (!gottime) {
> + if (ISSET(m->m_flags, M_PKTHDR) &&
> +    ISSET(m->m_pkthdr.csum_flags,
> +     M_TIMESTAMP)) {
> + struct timeval btv;
> + ns_to_microtime(&tv,
> +    m->m_pkthdr.ph_timestamp);
> + microboottime(&btv);
> + timeradd(&tv, &btv, &tv);

You could probably put this dance into a function.

void
pkthdr_microtime(const struct pkthdr *hdr, struct timeval *tv)
{
        struct timeval boottime, uptime;

        microboottime(&boottime);
        NSEC_TO_TIMEVAL(hdr->ph_timestamp, &uptime);
        timeradd(&boottime, &uptime, &tv);
}

> + } else
> + microtime(&tv);
> +
> + gottime = 1;
> + }
>  
>   mtx_enter(&d->bd_mtx);
>   bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.342
> diff -u -p -r1.342 ip_input.c
> --- netinet/ip_input.c 13 Oct 2018 18:36:01 -0000 1.342
> +++ netinet/ip_input.c 7 Jun 2019 03:27:41 -0000
> @@ -1712,7 +1712,14 @@ ip_savecontrol(struct inpcb *inp, struct
>   if (inp->inp_socket->so_options & SO_TIMESTAMP) {
>   struct timeval tv;
>  
> - microtime(&tv);
> + if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) {
> + struct timeval btv;
> + ns_to_microtime(&tv, m->m_pkthdr.ph_timestamp);
> + microboottime(&btv);
> + timeradd(&tv, &btv, &tv);

Yeah see, you do it here too.  Dunno where to actually put that
function though.

> + } else
> + microtime(&tv);
> +
>   *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
>      SCM_TIMESTAMP, SOL_SOCKET);
>   if (*mp)
> Index: dev/pci/if_mcx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
> retrieving revision 1.16
> diff -u -p -r1.16 if_mcx.c
> --- dev/pci/if_mcx.c 4 Jun 2019 05:29:30 -0000 1.16
> +++ dev/pci/if_mcx.c 7 Jun 2019 03:27:41 -0000
> @@ -1243,20 +1245,22 @@ struct mcx_cmd_destroy_cq_out {
>  } __packed __aligned(4);
>  
>  struct mcx_cq_entry {
> - uint32_t cq_reserved1;
> + uint32_t __reserved__;
>   uint32_t cq_lro;
>   uint32_t cq_lro_ack_seq_num;
>   uint32_t cq_rx_hash;
> - uint32_t cq_rx_hash_type;
> + uint8_t cq_rx_hash_type;
> + uint8_t cq_ml_path;
> + uint16_t __reserved__;
>   uint32_t cq_checksum;
> - uint32_t cq_reserved2;
> + uint32_t __reserved__;
>   uint32_t cq_flags;
>   uint32_t cq_lro_srqn;
> - uint32_t cq_reserved3[2];
> + uint32_t __reserved__[2];
>   uint32_t cq_byte_cnt;
> - uint32_t cq_lro_ts_value;
> - uint32_t cq_lro_ts_echo;
> - uint32_t cq_flow_tag;
> + uint64_t cq_timestamp;
> +#define MCX_CQ_ENTRY_TIMESTAMP_PTP (1 << 63)
> + uint32_t cq_rx_drops;
>   uint16_t cq_wqe_count;
>   uint8_t cq_signature;
>   uint8_t cq_opcode_owner;
> @@ -1895,6 +1899,18 @@ struct mcx_cq {
>   int cq_count;
>  };
>  
> +struct mcx_calibration {
> + uint64_t c_timestamp; /* previous mcx chip time */
> + uint64_t c_uptime; /* previous kernel nanouptime */
> + uint64_t c_tbase; /* mcx chip time */
> + uint64_t c_ubase; /* kernel nanouptime */
> + uint64_t c_tdiff;
> + uint64_t c_udiff;
> +};
> +
> +#define MCX_CALIBRATE_FIRST 2
> +#define MCX_CALIBRATE_NORMAL 30
> +
>  struct mcx_softc {
>   struct device sc_dev;
>   struct arpcom sc_ac;
> @@ -1948,6 +1964,11 @@ struct mcx_softc {
>   int sc_extra_mcast;
>   uint8_t sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN];
>  
> + struct mcx_calibration sc_calibration[2];
> + unsigned int sc_calibration_gen;
> + uint32_t sc_freq;
> + struct timeout sc_calibrate;
> +
>   struct mcx_cq sc_cq[MCX_MAX_CQS];
>   int sc_num_cq;
>  
> @@ -2039,8 +2060,8 @@ static void mcx_cmdq_dump(const struct m
>  static void mcx_cmdq_mbox_dump(struct mcx_dmamem *, int);
>  */
>  static void mcx_refill(void *);
> -static void mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
> -    struct mbuf_list *, int *);
> +static int mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
> +    struct mbuf_list *, const struct mcx_calibration *);
>  static void mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *,
>      int *);
>  static void mcx_process_cq(struct mcx_softc *, struct mcx_cq *);
> @@ -2060,6 +2081,9 @@ static void mcx_media_status(struct ifne
>  static int mcx_media_change(struct ifnet *);
>  static int mcx_get_sffpage(struct ifnet *, struct if_sffpage *);
>  
> +static void mcx_calibrate_first(struct mcx_softc *);
> +static void mcx_calibrate(void *);
> +
>  static inline uint32_t
>   mcx_rd(struct mcx_softc *, bus_size_t);
>  static inline void
> @@ -2067,6 +2091,8 @@ static inline void
>  static inline void
>   mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int);
>  
> +static uint64_t mcx_timer(struct mcx_softc *);
> +
>  static int mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *,
>      bus_size_t, u_int align);
>  static void mcx_dmamem_zero(struct mcx_dmamem *);
> @@ -2338,6 +2364,7 @@ mcx_attach(struct device *parent, struct
>   ether_ifattach(ifp);
>  
>   timeout_set(&sc->sc_rx_refill, mcx_refill, sc);
> + timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
>  
>   sc->sc_flow_table_id = -1;
>   for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) {
> @@ -5557,9 +5590,65 @@ mcx_process_txeof(struct mcx_softc *sc,
>   ms->ms_m = NULL;
>  }
>  
> -void
> +static uint64_t
> +mcx_uptime(void)
> +{
> + struct timespec ts;
> +
> + nanouptime(&ts);
> +
> + return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec);
> +}
>
> +
> +static void
> +mcx_calibrate_first(struct mcx_softc *sc)
> +{
> + struct mcx_calibration *c = &sc->sc_calibration[0];
> +
> + sc->sc_calibration_gen = 0;
> +
> + c->c_ubase = mcx_uptime();
> + c->c_tbase = mcx_timer(sc);
> + c->c_tdiff = 0;
> +
> + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST);

should this be MCX_CALIBRATE_NORMAL?

Also, maybe "mcx_start_calibration"?

> +}
> +
> +#define MCX_TIMESTAMP_SHIFT 10
> +
> +static void
> +mcx_calibrate(void *arg)
> +{
> + struct mcx_softc *sc = arg;
> + struct mcx_calibration *nc, *pc;
> + unsigned int gen;
> +
> + if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
> + return;
> +
> + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL);
> +
> + gen = sc->sc_calibration_gen;
> + pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> + gen++;
> + nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +
> + nc->c_uptime = pc->c_ubase;
> + nc->c_timestamp = pc->c_tbase;
> +
> + nc->c_ubase = mcx_uptime();
> + nc->c_tbase = mcx_timer(sc);
> +
> + nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT;
> + nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT;
> +
> + membar_producer();
> + sc->sc_calibration_gen = gen;
> +}
> +
> +static int
>  mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe,
> -    struct mbuf_list *ml, int *slots)
> +    struct mbuf_list *ml, const struct mcx_calibration *c)

Would you ever use a mcx_calibration struct in this function that
wasn't in the mcx_softc?

>  {
>   struct mcx_slot *ms;
>   struct mbuf *m;
> @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
>  
>   m = ms->ms_m;
>   ms->ms_m = NULL;
> - m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt);
> - (*slots)++;
> +
> + m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
> +
> + if (cqe->cq_rx_hash_type) {
> + m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
> +    bemtoh32(&cqe->cq_rx_hash);
> + }
> +
> + if (c->c_tdiff) {
> + uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
> + t *= c->c_udiff;
> + t /= c->c_tdiff;
> +
> + m->m_pkthdr.ph_timestamp = c->c_uptime + t;
> + SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
> + }
>  
>   ml_enqueue(ml, m);
> +
> + return (1);
>  }
>  
>  static struct mcx_cq_entry *
> @@ -5624,11 +5729,17 @@ void
>  mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
>  {
>   struct ifnet *ifp = &sc->sc_ac.ac_if;
> + const struct mcx_calibration *c;
> + unsigned int gen;
>   struct mcx_cq_entry *cqe;
>   uint8_t *cqp;
>   struct mbuf_list ml = MBUF_LIST_INITIALIZER();
>   int rxfree, txfree;
>  
> + gen = sc->sc_calibration_gen;
> + membar_consumer();
> + c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +

What IPL does this run at?  Couldn't the calibration struct be modified
out from under you?

>   rxfree = 0;
>   txfree = 0;
>   while ((cqe = mcx_next_cq_entry(sc, cq))) {
> @@ -5639,7 +5750,7 @@ mcx_process_cq(struct mcx_softc *sc, str
>   mcx_process_txeof(sc, cqe, &txfree);
>   break;
>   case MCX_CQ_ENTRY_OPCODE_SEND:
> - mcx_process_rx(sc, cqe, &ml, &rxfree);
> + rxfree += mcx_process_rx(sc, cqe, &ml, c);
>   break;
>   case MCX_CQ_ENTRY_OPCODE_REQ_ERR:
>   case MCX_CQ_ENTRY_OPCODE_SEND_ERR:
> @@ -5882,6 +5993,8 @@ mcx_up(struct mcx_softc *sc)
>   sc->sc_rx_prod = 0;
>   mcx_rx_fill(sc);
>  
> + mcx_calibrate_first(sc);
> +
>   SET(ifp->if_flags, IFF_RUNNING);
>  
>   sc->sc_tx_cons = 0;
> @@ -5922,6 +6035,8 @@ mcx_down(struct mcx_softc *sc)
>   intr_barrier(&sc->sc_ih);
>   ifq_barrier(&ifp->if_snd);
>  
> + timeout_del_barrier(&sc->sc_calibrate);
> +
>   for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) {
>   if (sc->sc_flow_group_id[group] != -1)
>   mcx_destroy_flow_group(sc,
> @@ -6435,6 +6550,26 @@ static inline void
>  mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f)
>  {
>   bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f);
> +}
> +
> +static uint64_t
> +mcx_timer(struct mcx_softc *sc)
> +{
> + uint32_t hi, lo, ni;
> +
> + hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> + for (;;) {
> + lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
> + mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
> + ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> +
> + if (ni == hi)
> + break;
> +
> + hi = ni;
> + }
> +
> + return (((uint64_t)hi << 32) | (uint64_t)lo);
>  }

You could actually truncate this loop:

        for (ni = hi + 1; ni != hi; hi = ni) {
                lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
                mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
                ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
        }

but maybe that's too clever :)

Reply | Threaded
Open this post in threaded view
|

Re: hardware packet timestamping for mbufs, then sockets and bpf

Scott Cheloha
On Fri, Jun 07, 2019 at 07:36:36PM -0500, Scott Cheloha wrote:

> [...]
>
> > +
> > +static uint64_t
> > +mcx_timer(struct mcx_softc *sc)
> > +{
> > + uint32_t hi, lo, ni;
> > +
> > + hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> > + for (;;) {
> > + lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
> > + mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
> > + ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> > +
> > + if (ni == hi)
> > + break;
> > +
> > + hi = ni;
> > + }
> > +
> > + return (((uint64_t)hi << 32) | (uint64_t)lo);
> >  }
>
> You could actually truncate this loop:
>
> for (ni = hi + 1; ni != hi; hi = ni) {
> lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
> mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
> ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> }
>
> but maybe that's too clever :)

this is wrong, ignore this part, your loop is fine.

too clever indeed...

Reply | Threaded
Open this post in threaded view
|

Re: hardware packet timestamping for mbufs, then sockets and bpf

David Gwynne-5
In reply to this post by Scott Cheloha
On Fri, Jun 07, 2019 at 07:36:36PM -0500, Scott Cheloha wrote:

> On Fri, Jun 07, 2019 at 02:34:20PM +1000, David Gwynne wrote:
> > nics are starting to offer the ability to timestamp packets when
> > they're received. other systems (eg linux and freebsd) have support
> > for recording that timestamp on mbufs and then using it as the
> > backend for at least the SO_TIMESTAMP socket option instead of a
> > call to microtime().
> >
> > this implements the above, and additionally supports using the hw
> > timestamp in bpf too. other systems may do the bpf thing too, but i
> > didn't look closely enough to find out.
> >
> > timestamps are recorded as the uptime of the system in nanoseconds
> > in the ph_timestamp field in mbufs. this mirrors the use of
> > ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
> > im using another bit in m_pkthdr.csum_flags to say whether the timestamp
> > is valid or not (M_TIMESTAMP). im arguing that it's another offloading
> > feature and therefore appropriate for the csum offload flags field.
> >
> > this adds some inline functions to time.h for turning ns into a timeval
> > and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
> > i originally wanted ns_to_timeval and ns_to_timespec, but the linux
> > compat stuff in drm already uses those names and ruined the idea.
> > especially since they return the time{val,spec}s as values.
>
> If we're going to add more conversion stuff to sys/time.h I want it to
> resemble the stuff that's already there.  I think NSEC_TO_TIMESPEC and
> NSEC_TO_TIMEVAL would be consistent names, so in sys/time.h I'd strongly
> prefer the following:
>
> static __inline void
> NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
> {
> tv->tv_sec = ns / 1000000000LL;
> tv->tv_usec = (ns % 1000000000L) / 1000L;
> }
>
> static __inline void
> NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *ts)
> {
> ts->tv_sec = ns / 1000000000LL;
> ts->tv_nsec = ns % 1000000000L;
> }

ok. i update that, and added a nsmicrotime() to go from a uint64_t nsec
offset to a wall clock time in kern_tc.c too.

> > the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
> > and if so turns ph_timestamp into a timeval before adding it to boottime
> > (which is the wall clock time that uptime starts at), before using it
> > instead of microtime().
> >
> > the mcx changes are based on what freebsd did to their driver, but
> > simplified a bit.
> >
> > i want this because we're being asked to look at recording network
> > traffic for possible audit use. part of that is having accurate
> > timestamps on received packets, and hopefully it will mitigate against
> > chunks of packets getting reordered or delayed significantly when the
> > box is busy.
> >
> > thoughts? ok?
>
> Thoughts inline.

cool.

>
> Same thoughts from before about NSEC_TO_TIMEVAL etc.
>
> Also, unless I've missed something, I don't see you use ns_to_nanotime
> in this diff.
>
> However I see people doing this conversion elsewhere, so maybe
> we could use it anyway.

those are my thoughts too..

> > + struct timeval btv;
> > + ns_to_microtime(&tv,
> > +    m->m_pkthdr.ph_timestamp);
> > + microboottime(&btv);
> > + timeradd(&tv, &btv, &tv);
>
> You could probably put this dance into a function.
>
> void
> pkthdr_microtime(const struct pkthdr *hdr, struct timeval *tv)
> {
> struct timeval boottime, uptime;
>
> microboottime(&boottime);
> NSEC_TO_TIMEVAL(hdr->ph_timestamp, &uptime);
> timeradd(&boottime, &uptime, &tv);
> }

this is nsmicrotime() in my new diff.

>
> should this be MCX_CALIBRATE_NORMAL?
>
> Also, maybe "mcx_start_calibration"?

now we're bikeshedding :)

i want to start hw timestamping "real soon now", which is why i schedule
the first timeout in a few seconds time. after that i don't want to
calibrate too often (to amortise the cost of calibrating), so i use a
longer timeout.

> Would you ever use a mcx_calibration struct in this function that
> wasn't in the mcx_softc?

no, i want to avoid the overhead of finding a usable set of calibration
values on every packet. once per interrupt is good enough.

>
> >  {
> >   struct mcx_slot *ms;
> >   struct mbuf *m;
> > @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
> >  
> >   m = ms->ms_m;
> >   ms->ms_m = NULL;
> > - m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt);
> > - (*slots)++;
> > +
> > + m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
> > +
> > + if (cqe->cq_rx_hash_type) {
> > + m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
> > +    bemtoh32(&cqe->cq_rx_hash);
> > + }
> > +
> > + if (c->c_tdiff) {
> > + uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
> > + t *= c->c_udiff;
> > + t /= c->c_tdiff;
> > +
> > + m->m_pkthdr.ph_timestamp = c->c_uptime + t;
> > + SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
> > + }
> >  
> >   ml_enqueue(ml, m);
> > +
> > + return (1);
> >  }
> >  
> >  static struct mcx_cq_entry *
> > @@ -5624,11 +5729,17 @@ void
> >  mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
> >  {
> >   struct ifnet *ifp = &sc->sc_ac.ac_if;
> > + const struct mcx_calibration *c;
> > + unsigned int gen;
> >   struct mcx_cq_entry *cqe;
> >   uint8_t *cqp;
> >   struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> >   int rxfree, txfree;
> >  
> > + gen = sc->sc_calibration_gen;
> > + membar_consumer();
> > + c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> > +
>
> What IPL does this run at?  Couldn't the calibration struct be modified
> out from under you?

IPL_NET. however, unless the processing of a single interrupt takes
more than a couple of seconds there can't be an inconsistent view
of the calibration data.

here's the updated diff. im going to put the mbuf.h and mcx bits in
tomorrow unless anyone objects. hopefully the time.h side of things will
get your ok and i can put them in too.

Index: sys/time.h
===================================================================
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.41
diff -u -p -r1.41 time.h
--- sys/time.h 3 Jun 2019 01:27:30 -0000 1.41
+++ sys/time.h 10 Jun 2019 04:24:35 -0000
@@ -294,6 +294,8 @@ void getmicrouptime(struct timeval *);
 void binboottime(struct bintime *);
 void microboottime(struct timeval *);
 
+void nsmicrotime(uint64_t, struct timeval *);
+
 struct proc;
 int clock_gettime(struct proc *, clockid_t, struct timespec *);
 
@@ -332,6 +334,20 @@ void clock_secs_to_ymdhms(time_t, struct
 
 /* Traditional POSIX base year */
 #define POSIX_BASE_YEAR 1970
+
+static __inline void
+NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
+{
+ tv->tv_sec = ns / 1000000000L;
+ tv->tv_usec = (ns % 1000000000L) / 1000;
+}
+
+static __inline void
+NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *tv)
+{
+ tv->tv_sec = ns / 1000000000L;
+ tv->tv_nsec = ns % 1000000000L;
+}
 
 #else /* !_KERNEL */
 #include <time.h>
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.242
diff -u -p -r1.242 mbuf.h
--- sys/mbuf.h 11 Feb 2019 00:25:33 -0000 1.242
+++ sys/mbuf.h 10 Jun 2019 04:24:35 -0000
@@ -226,13 +226,14 @@ struct mbuf {
 #define M_ICMP_CSUM_IN_OK 0x0400 /* ICMP/ICMPv6 checksum verified */
 #define M_ICMP_CSUM_IN_BAD 0x0800 /* ICMP/ICMPv6 checksum bad */
 #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */
+#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */
 
 #ifdef _KERNEL
 #define MCS_BITS \
     ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
     "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
     "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
-    "\15IPV6_NODF_OUT")
+    "\15IPV6_NODF_OUT" "\16TIMESTAMP")
 #endif
 
 /* mbuf types */
Index: net/bpf.c
===================================================================
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.175
diff -u -p -r1.175 bpf.c
--- net/bpf.c 18 May 2019 12:59:32 -0000 1.175
+++ net/bpf.c 10 Jun 2019 04:24:35 -0000
@@ -1284,13 +1284,22 @@ _bpf_mtap(caddr_t arg, const struct mbuf
  fcode = bps->bps_bf.bf_insns;
  slen = bpf_mfilter(fcode, m, pktlen);
 
- if (slen == 0)
+ if (slen == 0)
  continue;
  if (d->bd_fildrop != BPF_FILDROP_PASS)
  drop = 1;
  if (d->bd_fildrop != BPF_FILDROP_DROP) {
- if (!gottime++)
- microtime(&tv);
+ if (!gottime) {
+ if (ISSET(m->m_flags, M_PKTHDR) &&
+    ISSET(m->m_pkthdr.csum_flags,
+     M_TIMESTAMP)) {
+ nsmicrotime(m->m_pkthdr.ph_timestamp,
+    &tv);
+ } else
+ microtime(&tv);
+
+ gottime = 1;
+ }
 
  mtx_enter(&d->bd_mtx);
  bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.342
diff -u -p -r1.342 ip_input.c
--- netinet/ip_input.c 13 Oct 2018 18:36:01 -0000 1.342
+++ netinet/ip_input.c 10 Jun 2019 04:24:35 -0000
@@ -1712,7 +1712,11 @@ ip_savecontrol(struct inpcb *inp, struct
  if (inp->inp_socket->so_options & SO_TIMESTAMP) {
  struct timeval tv;
 
- microtime(&tv);
+ if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP))
+ nsmicrotime(m->m_pkthdr.ph_timestamp, &tv);
+ else
+ microtime(&tv);
+
  *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
     SCM_TIMESTAMP, SOL_SOCKET);
  if (*mp)
Index: netinet6/ip6_input.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.216
diff -u -p -r1.216 ip6_input.c
--- netinet6/ip6_input.c 9 Nov 2018 14:14:32 -0000 1.216
+++ netinet6/ip6_input.c 10 Jun 2019 04:24:35 -0000
@@ -947,7 +947,11 @@ ip6_savecontrol(struct inpcb *in6p, stru
  if (in6p->inp_socket->so_options & SO_TIMESTAMP) {
  struct timeval tv;
 
- microtime(&tv);
+ if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP))
+ nsmicrotime(m->m_pkthdr.ph_timestamp, &tv);
+ else
+ microtime(&tv);
+
  *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
     SCM_TIMESTAMP, SOL_SOCKET);
  if (*mp)
Index: dev/pci/if_mcx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.21
diff -u -p -r1.21 if_mcx.c
--- dev/pci/if_mcx.c 7 Jun 2019 06:53:15 -0000 1.21
+++ dev/pci/if_mcx.c 10 Jun 2019 04:24:35 -0000
@@ -1892,6 +1892,18 @@ struct mcx_cq {
  int cq_count;
 };
 
+struct mcx_calibration {
+ uint64_t c_timestamp; /* previous mcx chip time */
+ uint64_t c_uptime; /* previous kernel nanouptime */
+ uint64_t c_tbase; /* mcx chip time */
+ uint64_t c_ubase; /* kernel nanouptime */
+ uint64_t c_tdiff;
+ uint64_t c_udiff;
+};
+
+#define MCX_CALIBRATE_FIRST    2
+#define MCX_CALIBRATE_NORMAL   30
+
 struct mcx_softc {
  struct device sc_dev;
  struct arpcom sc_ac;
@@ -1945,6 +1957,10 @@ struct mcx_softc {
  int sc_extra_mcast;
  uint8_t sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN];
 
+ struct mcx_calibration sc_calibration[2];
+ unsigned int sc_calibration_gen;
+ struct timeout sc_calibrate;
+
  struct mcx_cq sc_cq[MCX_MAX_CQS];
  int sc_num_cq;
 
@@ -2037,7 +2053,7 @@ static void mcx_cmdq_mbox_dump(struct mc
 */
 static void mcx_refill(void *);
 static int mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
-    struct mbuf_list *);
+    struct mbuf_list *, const struct mcx_calibration *);
 static void mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *,
     int *);
 static void mcx_process_cq(struct mcx_softc *, struct mcx_cq *);
@@ -2057,6 +2073,9 @@ static void mcx_media_status(struct ifne
 static int mcx_media_change(struct ifnet *);
 static int mcx_get_sffpage(struct ifnet *, struct if_sffpage *);
 
+static void mcx_calibrate_first(struct mcx_softc *);
+static void mcx_calibrate(void *);
+
 static inline uint32_t
  mcx_rd(struct mcx_softc *, bus_size_t);
 static inline void
@@ -2064,6 +2083,8 @@ static inline void
 static inline void
  mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int);
 
+static uint64_t mcx_timer(struct mcx_softc *);
+
 static int mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *,
     bus_size_t, u_int align);
 static void mcx_dmamem_zero(struct mcx_dmamem *);
@@ -2338,6 +2359,7 @@ mcx_attach(struct device *parent, struct
  ether_ifattach(ifp);
 
  timeout_set(&sc->sc_rx_refill, mcx_refill, sc);
+ timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
 
  sc->sc_flow_table_id = -1;
  for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) {
@@ -5555,9 +5577,65 @@ mcx_process_txeof(struct mcx_softc *sc,
  ms->ms_m = NULL;
 }
 
+static uint64_t
+mcx_uptime(void)
+{
+ struct timespec ts;
+
+ nanouptime(&ts);
+
+ return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec);
+}
+
+static void
+mcx_calibrate_first(struct mcx_softc *sc)
+{
+ struct mcx_calibration *c = &sc->sc_calibration[0];
+
+ sc->sc_calibration_gen = 0;
+
+ c->c_ubase = mcx_uptime();
+ c->c_tbase = mcx_timer(sc);
+ c->c_tdiff = 0;
+
+ timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST);
+}
+
+#define MCX_TIMESTAMP_SHIFT 10
+
+static void
+mcx_calibrate(void *arg)
+{
+ struct mcx_softc *sc = arg;
+ struct mcx_calibration *nc, *pc;
+ unsigned int gen;
+
+ if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+ return;
+
+ timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL);
+
+ gen = sc->sc_calibration_gen;
+ pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+ gen++;
+ nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+
+ nc->c_uptime = pc->c_ubase;
+ nc->c_timestamp = pc->c_tbase;
+
+ nc->c_ubase = mcx_uptime();
+ nc->c_tbase = mcx_timer(sc);
+
+ nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT;
+ nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT;
+
+ membar_producer();
+ sc->sc_calibration_gen = gen;
+}
+
 static int
 mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe,
-    struct mbuf_list *ml)
+    struct mbuf_list *ml, const struct mcx_calibration *c)
 {
  struct mcx_slot *ms;
  struct mbuf *m;
@@ -5575,6 +5653,15 @@ mcx_process_rx(struct mcx_softc *sc, str
 
  m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
 
+ if (c->c_tdiff) {
+ uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
+ t *= c->c_udiff;
+ t /= c->c_tdiff;
+
+ m->m_pkthdr.ph_timestamp = c->c_uptime + t;
+ SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
+ }
+
  ml_enqueue(ml, m);
 
  return (1);
@@ -5624,11 +5711,17 @@ void
 mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
 {
  struct ifnet *ifp = &sc->sc_ac.ac_if;
+ const struct mcx_calibration *c;
+ unsigned int gen;
  struct mcx_cq_entry *cqe;
  uint8_t *cqp;
  struct mbuf_list ml = MBUF_LIST_INITIALIZER();
  int rxfree, txfree;
 
+ gen = sc->sc_calibration_gen;
+ membar_consumer();
+ c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+
  rxfree = 0;
  txfree = 0;
  while ((cqe = mcx_next_cq_entry(sc, cq))) {
@@ -5639,7 +5732,7 @@ mcx_process_cq(struct mcx_softc *sc, str
  mcx_process_txeof(sc, cqe, &txfree);
  break;
  case MCX_CQ_ENTRY_OPCODE_SEND:
- rxfree += mcx_process_rx(sc, cqe, &ml);
+ rxfree += mcx_process_rx(sc, cqe, &ml, c);
  break;
  case MCX_CQ_ENTRY_OPCODE_REQ_ERR:
  case MCX_CQ_ENTRY_OPCODE_SEND_ERR:
@@ -5882,6 +5975,8 @@ mcx_up(struct mcx_softc *sc)
  sc->sc_rx_prod = 0;
  mcx_rx_fill(sc);
 
+ mcx_calibrate_first(sc);
+
  SET(ifp->if_flags, IFF_RUNNING);
 
  sc->sc_tx_cons = 0;
@@ -5930,6 +6025,8 @@ mcx_down(struct mcx_softc *sc)
  intr_barrier(&sc->sc_ih);
  ifq_barrier(&ifp->if_snd);
 
+ timeout_del_barrier(&sc->sc_calibrate);
+
  for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) {
  if (sc->sc_flow_group_id[group] != -1)
  mcx_destroy_flow_group(sc,
@@ -6443,6 +6540,26 @@ static inline void
 mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f)
 {
  bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f);
+}
+
+static uint64_t
+mcx_timer(struct mcx_softc *sc)
+{
+ uint32_t hi, lo, ni;
+
+ hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
+ for (;;) {
+ lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
+ mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
+ ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
+
+ if (ni == hi)
+ break;
+
+ hi = ni;
+ }
+
+ return (((uint64_t)hi << 32) | (uint64_t)lo);
 }
 
 static int


Reply | Threaded
Open this post in threaded view
|

Re: hardware packet timestamping for mbufs, then sockets and bpf

Scott Cheloha
On Mon, Jun 10, 2019 at 02:45:03PM +1000, David Gwynne wrote:

> On Fri, Jun 07, 2019 at 07:36:36PM -0500, Scott Cheloha wrote:
> > On Fri, Jun 07, 2019 at 02:34:20PM +1000, David Gwynne wrote:
> > > nics are starting to offer the ability to timestamp packets when
> > > they're received. other systems (eg linux and freebsd) have support
> > > for recording that timestamp on mbufs and then using it as the
> > > backend for at least the SO_TIMESTAMP socket option instead of a
> > > call to microtime().
> > >
> > > this implements the above, and additionally supports using the hw
> > > timestamp in bpf too. other systems may do the bpf thing too, but i
> > > didn't look closely enough to find out.
> > >
> > > timestamps are recorded as the uptime of the system in nanoseconds
> > > in the ph_timestamp field in mbufs. this mirrors the use of
> > > ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
> > > im using another bit in m_pkthdr.csum_flags to say whether the timestamp
> > > is valid or not (M_TIMESTAMP). im arguing that it's another offloading
> > > feature and therefore appropriate for the csum offload flags field.
> > >
> > > this adds some inline functions to time.h for turning ns into a timeval
> > > and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
> > > i originally wanted ns_to_timeval and ns_to_timespec, but the linux
> > > compat stuff in drm already uses those names and ruined the idea.
> > > especially since they return the time{val,spec}s as values.
> >
> > If we're going to add more conversion stuff to sys/time.h I want it to
> > resemble the stuff that's already there.  I think NSEC_TO_TIMESPEC and
> > NSEC_TO_TIMEVAL would be consistent names, so in sys/time.h I'd strongly
> > prefer the following:
> >
> > static __inline void
> > NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
> > {
> > tv->tv_sec = ns / 1000000000LL;
> > tv->tv_usec = (ns % 1000000000L) / 1000L;
> > }
> >
> > static __inline void
> > NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *ts)
> > {
> > ts->tv_sec = ns / 1000000000LL;
> > ts->tv_nsec = ns % 1000000000L;
> > }
>
> ok. i update that, and added a nsmicrotime() to go from a uint64_t nsec
> offset to a wall clock time in kern_tc.c too.
>
> > > the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
> > > and if so turns ph_timestamp into a timeval before adding it to boottime
> > > (which is the wall clock time that uptime starts at), before using it
> > > instead of microtime().
> > >
> > > the mcx changes are based on what freebsd did to their driver, but
> > > simplified a bit.
> > >
> > > i want this because we're being asked to look at recording network
> > > traffic for possible audit use. part of that is having accurate
> > > timestamps on received packets, and hopefully it will mitigate against
> > > chunks of packets getting reordered or delayed significantly when the
> > > box is busy.
> > >
> > > thoughts? ok?
> >
> > Thoughts inline.
>
> cool.
>
> >
> > Same thoughts from before about NSEC_TO_TIMEVAL etc.
> >
> > Also, unless I've missed something, I don't see you use ns_to_nanotime
> > in this diff.
> >
> > However I see people doing this conversion elsewhere, so maybe
> > we could use it anyway.
>
> those are my thoughts too..
>
> > > + struct timeval btv;
> > > + ns_to_microtime(&tv,
> > > +    m->m_pkthdr.ph_timestamp);
> > > + microboottime(&btv);
> > > + timeradd(&tv, &btv, &tv);
> >
> > You could probably put this dance into a function.
> >
> > void
> > pkthdr_microtime(const struct pkthdr *hdr, struct timeval *tv)
> > {
> > struct timeval boottime, uptime;
> >
> > microboottime(&boottime);
> > NSEC_TO_TIMEVAL(hdr->ph_timestamp, &uptime);
> > timeradd(&boottime, &uptime, &tv);
> > }
>
> this is nsmicrotime() in my new diff.
>
> >
> > should this be MCX_CALIBRATE_NORMAL?
> >
> > Also, maybe "mcx_start_calibration"?
>
> now we're bikeshedding :)

Can't help it...

> i want to start hw timestamping "real soon now", which is why i schedule
> the first timeout in a few seconds time. after that i don't want to
> calibrate too often (to amortise the cost of calibrating), so i use a
> longer timeout.

Alright, makes sense, just making sure.

> > Would you ever use a mcx_calibration struct in this function that
> > wasn't in the mcx_softc?
>
> no, i want to avoid the overhead of finding a usable set of calibration
> values on every packet. once per interrupt is good enough.

Gotcha.

> > >  {
> > >   struct mcx_slot *ms;
> > >   struct mbuf *m;
> > > @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
> > >  
> > >   m = ms->ms_m;
> > >   ms->ms_m = NULL;
> > > - m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt);
> > > - (*slots)++;
> > > +
> > > + m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
> > > +
> > > + if (cqe->cq_rx_hash_type) {
> > > + m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
> > > +    bemtoh32(&cqe->cq_rx_hash);
> > > + }
> > > +
> > > + if (c->c_tdiff) {
> > > + uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
> > > + t *= c->c_udiff;
> > > + t /= c->c_tdiff;
> > > +
> > > + m->m_pkthdr.ph_timestamp = c->c_uptime + t;
> > > + SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
> > > + }
> > >  
> > >   ml_enqueue(ml, m);
> > > +
> > > + return (1);
> > >  }
> > >  
> > >  static struct mcx_cq_entry *
> > > @@ -5624,11 +5729,17 @@ void
> > >  mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
> > >  {
> > >   struct ifnet *ifp = &sc->sc_ac.ac_if;
> > > + const struct mcx_calibration *c;
> > > + unsigned int gen;
> > >   struct mcx_cq_entry *cqe;
> > >   uint8_t *cqp;
> > >   struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> > >   int rxfree, txfree;
> > >  
> > > + gen = sc->sc_calibration_gen;
> > > + membar_consumer();
> > > + c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> > > +
> >
> > What IPL does this run at?  Couldn't the calibration struct be modified
> > out from under you?
>
> IPL_NET. however, unless the processing of a single interrupt takes
> more than a couple of seconds there can't be an inconsistent view
> of the calibration data.

ack.

> here's the updated diff. im going to put the mbuf.h and mcx bits in
> tomorrow unless anyone objects. hopefully the time.h side of things will
> get your ok and i can put them in too.

The time.h header NSEC* stuff is ok cheloha@.

*deep breath*

However, for the moment I would prefer to keep nsmicrotime() out of
kern_tc.c and sys/time.h.  My issue is that the only thing that makes
the active timecounter "special" is that it provides a canonical
offset from boot.  This supercedes that in order to get an offset
closer to when the packet hits the hardware.  This is fine, but this
works around the timecounter, so it belongs elsewhere.

If more NICs get support for this timestamp offloading in the future
there is definitely room for standardizing the calibration of a slave
timecounting device with the master timecounter as you've done here.
Dunno how we would even standardize any aspect of it, but that's the
idea rattling around in my head.

And if other subsystems outside of network device drivers develop an
analogous need for deriving a UTC timestamp given some sort of offset
from a non-canonical timecounter, *then* yes, nsmicrotime() or an
equivalent would be appropriate for kern_tc.c or sys/time.h.

Until then I think it belongs in one of the networking headers.  Given
that the offset is coming out of the packet header my suggestion was
putting it in mbuf.h and calling it "pkthdr_microtime", but those
aspects are up to you.

and FWIW, if you're worried about the length, calling

        pkthdr_microtime(&m->m_pkthdr, &tv);

is actually briefer and sort-of more descriptive sounding (to me, a
sys/net noob) than

        nsmicrotime(m->m_pkthdr.ph_timestamp, &tv);

... but there I go bikeshedding again...

> Index: sys/time.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/time.h,v
> retrieving revision 1.41
> diff -u -p -r1.41 time.h
> --- sys/time.h 3 Jun 2019 01:27:30 -0000 1.41
> +++ sys/time.h 10 Jun 2019 04:24:35 -0000
> @@ -294,6 +294,8 @@ void getmicrouptime(struct timeval *);
>  void binboottime(struct bintime *);
>  void microboottime(struct timeval *);
>  
> +void nsmicrotime(uint64_t, struct timeval *);
> +
>  struct proc;
>  int clock_gettime(struct proc *, clockid_t, struct timespec *);
>  
> @@ -332,6 +334,20 @@ void clock_secs_to_ymdhms(time_t, struct
>  
>  /* Traditional POSIX base year */
>  #define POSIX_BASE_YEAR 1970
> +
> +static __inline void
> +NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
> +{
> + tv->tv_sec = ns / 1000000000L;
> + tv->tv_usec = (ns % 1000000000L) / 1000;
> +}
> +
> +static __inline void
> +NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *tv)
> +{
> + tv->tv_sec = ns / 1000000000L;
> + tv->tv_nsec = ns % 1000000000L;
> +}
>  
>  #else /* !_KERNEL */
>  #include <time.h>
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> retrieving revision 1.242
> diff -u -p -r1.242 mbuf.h
> --- sys/mbuf.h 11 Feb 2019 00:25:33 -0000 1.242
> +++ sys/mbuf.h 10 Jun 2019 04:24:35 -0000
> @@ -226,13 +226,14 @@ struct mbuf {
>  #define M_ICMP_CSUM_IN_OK 0x0400 /* ICMP/ICMPv6 checksum verified */
>  #define M_ICMP_CSUM_IN_BAD 0x0800 /* ICMP/ICMPv6 checksum bad */
>  #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */
> +#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */
>  
>  #ifdef _KERNEL
>  #define MCS_BITS \
>      ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
>      "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
>      "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
> -    "\15IPV6_NODF_OUT")
> +    "\15IPV6_NODF_OUT" "\16TIMESTAMP")
>  #endif
>  
>  /* mbuf types */
> Index: net/bpf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.175
> diff -u -p -r1.175 bpf.c
> --- net/bpf.c 18 May 2019 12:59:32 -0000 1.175
> +++ net/bpf.c 10 Jun 2019 04:24:35 -0000
> @@ -1284,13 +1284,22 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>   fcode = bps->bps_bf.bf_insns;
>   slen = bpf_mfilter(fcode, m, pktlen);
>  
> - if (slen == 0)
> + if (slen == 0)
>   continue;
>   if (d->bd_fildrop != BPF_FILDROP_PASS)
>   drop = 1;
>   if (d->bd_fildrop != BPF_FILDROP_DROP) {
> - if (!gottime++)
> - microtime(&tv);
> + if (!gottime) {
> + if (ISSET(m->m_flags, M_PKTHDR) &&
> +    ISSET(m->m_pkthdr.csum_flags,
> +     M_TIMESTAMP)) {
> + nsmicrotime(m->m_pkthdr.ph_timestamp,
> +    &tv);
> + } else
> + microtime(&tv);
> +
> + gottime = 1;
> + }
>  
>   mtx_enter(&d->bd_mtx);
>   bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.342
> diff -u -p -r1.342 ip_input.c
> --- netinet/ip_input.c 13 Oct 2018 18:36:01 -0000 1.342
> +++ netinet/ip_input.c 10 Jun 2019 04:24:35 -0000
> @@ -1712,7 +1712,11 @@ ip_savecontrol(struct inpcb *inp, struct
>   if (inp->inp_socket->so_options & SO_TIMESTAMP) {
>   struct timeval tv;
>  
> - microtime(&tv);
> + if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP))
> + nsmicrotime(m->m_pkthdr.ph_timestamp, &tv);
> + else
> + microtime(&tv);
> +
>   *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
>      SCM_TIMESTAMP, SOL_SOCKET);
>   if (*mp)
> Index: netinet6/ip6_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.216
> diff -u -p -r1.216 ip6_input.c
> --- netinet6/ip6_input.c 9 Nov 2018 14:14:32 -0000 1.216
> +++ netinet6/ip6_input.c 10 Jun 2019 04:24:35 -0000
> @@ -947,7 +947,11 @@ ip6_savecontrol(struct inpcb *in6p, stru
>   if (in6p->inp_socket->so_options & SO_TIMESTAMP) {
>   struct timeval tv;
>  
> - microtime(&tv);
> + if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP))
> + nsmicrotime(m->m_pkthdr.ph_timestamp, &tv);
> + else
> + microtime(&tv);
> +
>   *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
>      SCM_TIMESTAMP, SOL_SOCKET);
>   if (*mp)
> Index: dev/pci/if_mcx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 if_mcx.c
> --- dev/pci/if_mcx.c 7 Jun 2019 06:53:15 -0000 1.21
> +++ dev/pci/if_mcx.c 10 Jun 2019 04:24:35 -0000
> @@ -1892,6 +1892,18 @@ struct mcx_cq {
>   int cq_count;
>  };
>  
> +struct mcx_calibration {
> + uint64_t c_timestamp; /* previous mcx chip time */
> + uint64_t c_uptime; /* previous kernel nanouptime */
> + uint64_t c_tbase; /* mcx chip time */
> + uint64_t c_ubase; /* kernel nanouptime */
> + uint64_t c_tdiff;
> + uint64_t c_udiff;
> +};
> +
> +#define MCX_CALIBRATE_FIRST    2
> +#define MCX_CALIBRATE_NORMAL   30
> +
>  struct mcx_softc {
>   struct device sc_dev;
>   struct arpcom sc_ac;
> @@ -1945,6 +1957,10 @@ struct mcx_softc {
>   int sc_extra_mcast;
>   uint8_t sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN];
>  
> + struct mcx_calibration sc_calibration[2];
> + unsigned int sc_calibration_gen;
> + struct timeout sc_calibrate;
> +
>   struct mcx_cq sc_cq[MCX_MAX_CQS];
>   int sc_num_cq;
>  
> @@ -2037,7 +2053,7 @@ static void mcx_cmdq_mbox_dump(struct mc
>  */
>  static void mcx_refill(void *);
>  static int mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
> -    struct mbuf_list *);
> +    struct mbuf_list *, const struct mcx_calibration *);
>  static void mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *,
>      int *);
>  static void mcx_process_cq(struct mcx_softc *, struct mcx_cq *);
> @@ -2057,6 +2073,9 @@ static void mcx_media_status(struct ifne
>  static int mcx_media_change(struct ifnet *);
>  static int mcx_get_sffpage(struct ifnet *, struct if_sffpage *);
>  
> +static void mcx_calibrate_first(struct mcx_softc *);
> +static void mcx_calibrate(void *);
> +
>  static inline uint32_t
>   mcx_rd(struct mcx_softc *, bus_size_t);
>  static inline void
> @@ -2064,6 +2083,8 @@ static inline void
>  static inline void
>   mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int);
>  
> +static uint64_t mcx_timer(struct mcx_softc *);
> +
>  static int mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *,
>      bus_size_t, u_int align);
>  static void mcx_dmamem_zero(struct mcx_dmamem *);
> @@ -2338,6 +2359,7 @@ mcx_attach(struct device *parent, struct
>   ether_ifattach(ifp);
>  
>   timeout_set(&sc->sc_rx_refill, mcx_refill, sc);
> + timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
>  
>   sc->sc_flow_table_id = -1;
>   for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) {
> @@ -5555,9 +5577,65 @@ mcx_process_txeof(struct mcx_softc *sc,
>   ms->ms_m = NULL;
>  }
>  
> +static uint64_t
> +mcx_uptime(void)
> +{
> + struct timespec ts;
> +
> + nanouptime(&ts);
> +
> + return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec);
> +}
> +
> +static void
> +mcx_calibrate_first(struct mcx_softc *sc)
> +{
> + struct mcx_calibration *c = &sc->sc_calibration[0];
> +
> + sc->sc_calibration_gen = 0;
> +
> + c->c_ubase = mcx_uptime();
> + c->c_tbase = mcx_timer(sc);
> + c->c_tdiff = 0;
> +
> + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST);
> +}
> +
> +#define MCX_TIMESTAMP_SHIFT 10
> +
> +static void
> +mcx_calibrate(void *arg)
> +{
> + struct mcx_softc *sc = arg;
> + struct mcx_calibration *nc, *pc;
> + unsigned int gen;
> +
> + if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
> + return;
> +
> + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL);
> +
> + gen = sc->sc_calibration_gen;
> + pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> + gen++;
> + nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +
> + nc->c_uptime = pc->c_ubase;
> + nc->c_timestamp = pc->c_tbase;
> +
> + nc->c_ubase = mcx_uptime();
> + nc->c_tbase = mcx_timer(sc);
> +
> + nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT;
> + nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT;
> +
> + membar_producer();
> + sc->sc_calibration_gen = gen;
> +}
> +
>  static int
>  mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe,
> -    struct mbuf_list *ml)
> +    struct mbuf_list *ml, const struct mcx_calibration *c)
>  {
>   struct mcx_slot *ms;
>   struct mbuf *m;
> @@ -5575,6 +5653,15 @@ mcx_process_rx(struct mcx_softc *sc, str
>  
>   m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
>  
> + if (c->c_tdiff) {
> + uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
> + t *= c->c_udiff;
> + t /= c->c_tdiff;
> +
> + m->m_pkthdr.ph_timestamp = c->c_uptime + t;
> + SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
> + }
> +
>   ml_enqueue(ml, m);
>  
>   return (1);
> @@ -5624,11 +5711,17 @@ void
>  mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
>  {
>   struct ifnet *ifp = &sc->sc_ac.ac_if;
> + const struct mcx_calibration *c;
> + unsigned int gen;
>   struct mcx_cq_entry *cqe;
>   uint8_t *cqp;
>   struct mbuf_list ml = MBUF_LIST_INITIALIZER();
>   int rxfree, txfree;
>  
> + gen = sc->sc_calibration_gen;
> + membar_consumer();
> + c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +
>   rxfree = 0;
>   txfree = 0;
>   while ((cqe = mcx_next_cq_entry(sc, cq))) {
> @@ -5639,7 +5732,7 @@ mcx_process_cq(struct mcx_softc *sc, str
>   mcx_process_txeof(sc, cqe, &txfree);
>   break;
>   case MCX_CQ_ENTRY_OPCODE_SEND:
> - rxfree += mcx_process_rx(sc, cqe, &ml);
> + rxfree += mcx_process_rx(sc, cqe, &ml, c);
>   break;
>   case MCX_CQ_ENTRY_OPCODE_REQ_ERR:
>   case MCX_CQ_ENTRY_OPCODE_SEND_ERR:
> @@ -5882,6 +5975,8 @@ mcx_up(struct mcx_softc *sc)
>   sc->sc_rx_prod = 0;
>   mcx_rx_fill(sc);
>  
> + mcx_calibrate_first(sc);
> +
>   SET(ifp->if_flags, IFF_RUNNING);
>  
>   sc->sc_tx_cons = 0;
> @@ -5930,6 +6025,8 @@ mcx_down(struct mcx_softc *sc)
>   intr_barrier(&sc->sc_ih);
>   ifq_barrier(&ifp->if_snd);
>  
> + timeout_del_barrier(&sc->sc_calibrate);
> +
>   for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) {
>   if (sc->sc_flow_group_id[group] != -1)
>   mcx_destroy_flow_group(sc,
> @@ -6443,6 +6540,26 @@ static inline void
>  mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f)
>  {
>   bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f);
> +}
> +
> +static uint64_t
> +mcx_timer(struct mcx_softc *sc)
> +{
> + uint32_t hi, lo, ni;
> +
> + hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> + for (;;) {
> + lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
> + mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
> + ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> +
> + if (ni == hi)
> + break;
> +
> + hi = ni;
> + }
> +
> + return (((uint64_t)hi << 32) | (uint64_t)lo);
>  }
>  
>  static int
>
>