move to interface rx queue backpressure for if_rxr livelock detection

classic Classic list List threaded Threaded
2 messages Options
Reply | Threaded
Open this post in threaded view
|

move to interface rx queue backpressure for if_rxr livelock detection

David Gwynne-5
interface rx queue processing includes detection of when the stack
becomes too busy to process packets.

there are three stages to this mechanism. firstly, everything is fine
and the packets are simply queued for processing. the second is the
"pressure_return" stage where the interface has queued a few times,
but the stack hasn't run to process them. ifiq_input returns 1 in
this situation to notify the nic that it should start to slow down.
the last stage is the "pressure_drop" stage where the nic has
continued to queue packets and the stack still hasnt run. in this
stage it drops the packets and returns 1.

independently, the stack looks for lost clock ticks (because the stack
traditionally blocked softclock ticks) as a livelock detection
mechanism. this no longer works that well now we're in an MP world.
firstly, the stack could be running on a different cpu to the clock and
therefore wont block it. secondly, the stack runs in a thread and doesnt
raise the spl, so it shouldnt be blocking clock interrupts even if it is
sharing a cpu now.

therefore the traditional livelock detection mechanism doesnt work and
should be moved away from. the replacement is getting nics that
implement rx ring moderation to look at the return value of the rx queue
input function and telling the rings to slow down. that is what this
diff does.

i've compiled it on amd64, which covers most of the drivers, but there's
a few in fdt that i did blind and havent tested. ive tested a couple of
the interfaces, but more testing would be appreciated.

ok?

Index: ./dev/fdt/if_dwxe.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_dwxe.c,v
retrieving revision 1.11
diff -u -p -r1.11 if_dwxe.c
--- ./dev/fdt/if_dwxe.c 3 Jan 2019 00:59:58 -0000 1.11
+++ ./dev/fdt/if_dwxe.c 1 Jul 2019 00:26:07 -0000
@@ -962,13 +962,14 @@ dwxe_rx_proc(struct dwxe_softc *sc)
  sc->sc_rx_cons++;
  }
 
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rx_ring);
+
  dwxe_fill_rx_ring(sc);
 
  bus_dmamap_sync(sc->sc_dmat, DWXE_DMA_MAP(sc->sc_rxring), 0,
     DWXE_DMA_LEN(sc->sc_rxring),
     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-
- if_input(ifp, &ml);
 }
 
 void
Index: ./dev/fdt/if_fec.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_fec.c,v
retrieving revision 1.8
diff -u -p -r1.8 if_fec.c
--- ./dev/fdt/if_fec.c 6 Feb 2019 22:59:06 -0000 1.8
+++ ./dev/fdt/if_fec.c 1 Jul 2019 00:26:07 -0000
@@ -1123,6 +1123,9 @@ fec_rx_proc(struct fec_softc *sc)
  sc->sc_rx_cons++;
  }
 
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rx_ring);
+
  fec_fill_rx_ring(sc);
 
  bus_dmamap_sync(sc->sc_dmat, ENET_DMA_MAP(sc->sc_rxring), 0,
@@ -1131,8 +1134,6 @@ fec_rx_proc(struct fec_softc *sc)
 
  /* rx descriptors are ready */
  HWRITE4(sc, ENET_RDAR, ENET_RDAR_RDAR);
-
- if_input(ifp, &ml);
 }
 
 void
Index: ./dev/fdt/if_mvneta.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_mvneta.c,v
retrieving revision 1.7
diff -u -p -r1.7 if_mvneta.c
--- ./dev/fdt/if_mvneta.c 30 Apr 2019 20:26:02 -0000 1.7
+++ ./dev/fdt/if_mvneta.c 1 Jul 2019 00:26:07 -0000
@@ -1369,9 +1369,10 @@ mvneta_rx_proc(struct mvneta_softc *sc)
  sc->sc_rx_cons = MVNETA_RX_RING_NEXT(idx);
  }
 
- mvneta_fill_rx_ring(sc);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rx_ring);
 
- if_input(ifp, &ml);
+ mvneta_fill_rx_ring(sc);
 }
 
 void
Index: ./dev/pci/if_em.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.342
diff -u -p -r1.342 if_em.c
--- ./dev/pci/if_em.c 1 Mar 2019 10:02:44 -0000 1.342
+++ ./dev/pci/if_em.c 1 Jul 2019 00:26:07 -0000
@@ -2922,7 +2922,8 @@ em_rxeof(struct em_softc *sc)
 
  sc->sc_rx_desc_tail = i;
 
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rx_ring);
 
  return (rv);
 }
Index: ./dev/pci/if_bge.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bge.c,v
retrieving revision 1.388
diff -u -p -r1.388 if_bge.c
--- ./dev/pci/if_bge.c 9 Nov 2018 14:14:31 -0000 1.388
+++ ./dev/pci/if_bge.c 1 Jul 2019 00:26:07 -0000
@@ -3561,6 +3561,13 @@ bge_rxeof(struct bge_softc *sc)
  ml_enqueue(&ml, m);
  }
 
+ if (ifiq_input(&ifp->if_rcv, &ml)) {
+ if (stdcnt)
+ if_rxr_livelocked(&sc->bge_std_ring);
+ if (jumbocnt)
+ if_rxr_livelocked(&sc->bge_jumbo_ring);
+ }
+
  sc->bge_rx_saved_considx = rx_cons;
  bge_writembx(sc, BGE_MBX_RX_CONS0_LO, sc->bge_rx_saved_considx);
  if (stdcnt) {
@@ -3571,8 +3578,6 @@ bge_rxeof(struct bge_softc *sc)
  if_rxr_put(&sc->bge_jumbo_ring, jumbocnt);
  bge_fill_rx_ring_jumbo(sc);
  }
-
- if_input(ifp, &ml);
 }
 
 void
Index: ./dev/pci/if_bnx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bnx.c,v
retrieving revision 1.125
diff -u -p -r1.125 if_bnx.c
--- ./dev/pci/if_bnx.c 10 Mar 2018 10:51:46 -0000 1.125
+++ ./dev/pci/if_bnx.c 1 Jul 2019 00:26:07 -0000
@@ -4466,6 +4466,9 @@ bnx_rx_int_next_rx:
     BUS_SPACE_BARRIER_READ);
  }
 
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->rx_ring);
+
  /* No new packets to process.  Refill the RX chain and exit. */
  sc->rx_cons = sw_cons;
  if (!bnx_fill_rx_chain(sc))
@@ -4476,8 +4479,6 @@ bnx_rx_int_next_rx:
     sc->rx_bd_chain_map[i], 0,
     sc->rx_bd_chain_map[i]->dm_mapsize,
     BUS_DMASYNC_PREWRITE);
-
- if_input(ifp, &ml);
 
  DBPRINT(sc, BNX_INFO_RECV, "%s(exit): rx_prod = 0x%04X, "
     "rx_cons = 0x%04X, rx_prod_bseq = 0x%08X\n",
Index: ./dev/pci/if_bnxt.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v
retrieving revision 1.20
diff -u -p -r1.20 if_bnxt.c
--- ./dev/pci/if_bnxt.c 24 Apr 2019 10:09:49 -0000 1.20
+++ ./dev/pci/if_bnxt.c 1 Jul 2019 00:26:07 -0000
@@ -1300,12 +1300,17 @@ bnxt_intr(void *xsc)
  if_rxr_put(&sc->sc_rxr[0], rxfree);
  if_rxr_put(&sc->sc_rxr[1], agfree);
 
+ if (ifiq_input(&sc->sc_ac.ac_if.if_rcv, &ml)) {
+ if (rxfree)
+ if_rxr_livelocked(&sc->sc_rxr[0]);
+ if (agfree)
+ if_rxr_livelocked(&sc->sc_rxr[1]);
+ }
+
  bnxt_rx_fill(sc);
  if ((sc->sc_rx_cons == sc->sc_rx_prod) ||
     (sc->sc_rx_ag_cons == sc->sc_rx_ag_prod))
  timeout_add(&sc->sc_rx_refill, 0);
-
- if_input(&sc->sc_ac.ac_if, &ml);
  }
  if (txfree != 0) {
  if (ifq_is_oactive(&ifp->if_snd))
Index: ./dev/pci/if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.157
diff -u -p -r1.157 if_ix.c
--- ./dev/pci/if_ix.c 10 Apr 2019 09:55:02 -0000 1.157
+++ ./dev/pci/if_ix.c 1 Jul 2019 00:26:07 -0000
@@ -2959,7 +2959,8 @@ next_desc:
  }
  rxr->next_to_check = i;
 
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&rxr->rx_ring);
 
  if (!(staterr & IXGBE_RXD_STAT_DD))
  return FALSE;
Index: ./dev/pci/if_ixl.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.38
diff -u -p -r1.38 if_ixl.c
--- ./dev/pci/if_ixl.c 4 May 2019 13:42:12 -0000 1.38
+++ ./dev/pci/if_ixl.c 1 Jul 2019 00:26:07 -0000
@@ -3518,11 +3518,17 @@ ixl_sff_get_byte(struct ixl_softc *sc, u
  iaq->iaq_opcode = htole16(IXL_AQ_OP_PHY_GET_REGISTER);
  param = (struct ixl_aq_phy_reg_access *)iaq->iaq_param;
  param->phy_iface = IXL_AQ_PHY_IF_MODULE;
- param->dev_addr = dev;
+ //param->dev_addr = dev;
  htolem32(&param->reg, reg);
 
  ixl_atq_exec(sc, &iatq, "ixlsffget");
 
+ if (ISSET(sc->sc_ac.ac_if.if_flags, IFF_DEBUG)) {
+ printf("%s: %s(dev 0x%02x, reg 0x%02x) -> %04x\n",
+    DEVNAME(sc), __func__,
+    dev, reg, lemtoh16(&iaq->iaq_retval));
+ }
+
  switch (iaq->iaq_retval) {
  case htole16(IXL_AQ_RC_OK):
  break;
@@ -3554,11 +3560,17 @@ ixl_sff_set_byte(struct ixl_softc *sc, u
  iaq->iaq_opcode = htole16(IXL_AQ_OP_PHY_SET_REGISTER);
  param = (struct ixl_aq_phy_reg_access *)iaq->iaq_param;
  param->phy_iface = IXL_AQ_PHY_IF_MODULE;
- param->dev_addr = dev;
+ //param->dev_addr = dev;
  htolem32(&param->reg, reg);
  htolem32(&param->val, v);
 
  ixl_atq_exec(sc, &iatq, "ixlsffset");
+
+ if (ISSET(sc->sc_ac.ac_if.if_flags, IFF_DEBUG)) {
+ printf("%s: %s(dev 0x%02x, reg 0x%02x, val 0x%02x) -> %04x\n",
+    DEVNAME(sc), __func__,
+    dev, reg, v, lemtoh16(&iaq->iaq_retval));
+ }
 
  switch (iaq->iaq_retval) {
  case htole16(IXL_AQ_RC_OK):
Index: ./dev/pci/if_msk.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_msk.c,v
retrieving revision 1.131
diff -u -p -r1.131 if_msk.c
--- ./dev/pci/if_msk.c 6 Jan 2018 03:11:04 -0000 1.131
+++ ./dev/pci/if_msk.c 1 Jul 2019 00:26:07 -0000
@@ -1641,7 +1641,8 @@ msk_rxeof(struct sk_if_softc *sc_if, uin
  m->m_pkthdr.len = m->m_len = len;
 
  ml_enqueue(&ml, m);
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc_if->sk_cdata.sk_rx_ring);
 }
 
 void
Index: ./dev/pci/if_myx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
retrieving revision 1.107
diff -u -p -r1.107 if_myx.c
--- ./dev/pci/if_myx.c 16 Apr 2019 11:42:56 -0000 1.107
+++ ./dev/pci/if_myx.c 1 Jul 2019 00:26:07 -0000
@@ -1743,6 +1743,7 @@ myx_rxeof(struct myx_softc *sc)
  int ring;
  u_int rxfree[2] = { 0 , 0 };
  u_int len;
+ int livelocked;
 
  bus_dmamap_sync(sc->sc_dmat, sc->sc_intrq_dma.mxm_map, 0,
     sc->sc_intrq_dma.mxm_map->dm_mapsize, BUS_DMASYNC_POSTREAD);
@@ -1778,19 +1779,21 @@ myx_rxeof(struct myx_softc *sc)
  bus_dmamap_sync(sc->sc_dmat, sc->sc_intrq_dma.mxm_map, 0,
     sc->sc_intrq_dma.mxm_map->dm_mapsize, BUS_DMASYNC_PREREAD);
 
+ livelocked = ifiq_input(&ifp->if_rcv, &ml);
  for (ring = MYX_RXSMALL; ring <= MYX_RXBIG; ring++) {
  if (rxfree[ring] == 0)
  continue;
 
  mrr = &sc->sc_rx_ring[ring];
 
+ if (livelocked)
+ if_rxr_livelocked(&mrr->mrr_rxr);
+
  if_rxr_put(&mrr->mrr_rxr, rxfree[ring]);
  myx_rx_fill(sc, mrr);
  if (mrr->mrr_prod == mrr->mrr_cons)
  timeout_add(&mrr->mrr_refill, 0);
  }
-
- if_input(ifp, &ml);
 }
 
 static int
Index: ./dev/pci/if_nep.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_nep.c,v
retrieving revision 1.31
diff -u -p -r1.31 if_nep.c
--- ./dev/pci/if_nep.c 9 Nov 2018 14:14:31 -0000 1.31
+++ ./dev/pci/if_nep.c 1 Jul 2019 00:26:07 -0000
@@ -1049,7 +1049,8 @@ nep_rx_proc(struct nep_softc *sc)
  bus_dmamap_sync(sc->sc_dmat, NEP_DMA_MAP(sc->sc_rcring), 0,
     NEP_DMA_LEN(sc->sc_rcring), BUS_DMASYNC_PREREAD);
 
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rx_ring);
 
  nep_fill_rx_ring(sc);
 
Index: ./dev/pci/if_oce.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_oce.c,v
retrieving revision 1.100
diff -u -p -r1.100 if_oce.c
--- ./dev/pci/if_oce.c 27 Nov 2017 16:53:04 -0000 1.100
+++ ./dev/pci/if_oce.c 1 Jul 2019 00:26:07 -0000
@@ -1639,7 +1639,8 @@ oce_rxeof(struct oce_rq *rq, struct oce_
  ml_enqueue(&ml, m);
  }
 exit:
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&rq->rxring);
 }
 
 void
Index: ./dev/pci/if_sis.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_sis.c,v
retrieving revision 1.135
diff -u -p -r1.135 if_sis.c
--- ./dev/pci/if_sis.c 22 Jan 2017 10:17:38 -0000 1.135
+++ ./dev/pci/if_sis.c 1 Jul 2019 00:26:07 -0000
@@ -1447,7 +1447,8 @@ sis_rxeof(struct sis_softc *sc)
  ml_enqueue(&ml, m);
  }
 
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sis_cdata.sis_rx_ring);
 
  sis_fill_rx_ring(sc);
 }
Index: ./dev/pci/if_sk.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_sk.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_sk.c
--- ./dev/pci/if_sk.c 4 Jun 2017 04:29:23 -0000 1.189
+++ ./dev/pci/if_sk.c 1 Jul 2019 00:26:07 -0000
@@ -1637,9 +1637,10 @@ sk_rxeof(struct sk_if_softc *sc_if)
  }
  sc_if->sk_cdata.sk_rx_cons = cur;
 
- sk_fill_rx_ring(sc_if);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(rxr);
 
- if_input(ifp, &ml);
+ sk_fill_rx_ring(sc_if);
 }
 
 void
Index: ./dev/pci/if_vic.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vic.c,v
retrieving revision 1.98
diff -u -p -r1.98 if_vic.c
--- ./dev/pci/if_vic.c 12 Jul 2017 14:25:36 -0000 1.98
+++ ./dev/pci/if_vic.c 1 Jul 2019 00:26:07 -0000
@@ -867,7 +867,8 @@ vic_rx_proc(struct vic_softc *sc, int q)
  VIC_INC(sc->sc_data->vd_rx[q].nextidx, sc->sc_nrxbuf);
  }
 
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rxq[q].ring);
  vic_rx_fill(sc, q);
 
  bus_dmamap_sync(sc->sc_dmat, sc->sc_dma_map, 0, sc->sc_dma_size,
Index: ./dev/pci/if_vmx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
retrieving revision 1.45
diff -u -p -r1.45 if_vmx.c
--- ./dev/pci/if_vmx.c 22 Jan 2017 10:17:38 -0000 1.45
+++ ./dev/pci/if_vmx.c 1 Jul 2019 00:26:07 -0000
@@ -785,10 +785,12 @@ skip_buffer:
  }
  }
 
- if_input(ifp, &ml);
-
  /* XXX Should we (try to) allocate buffers for ring 2 too? */
  ring = &rq->cmd_ring[0];
+
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&ring->rxr);
+
  for (slots = if_rxr_get(&ring->rxr, NRXDESC); slots > 0; slots--) {
  if (vmxnet3_getbuf(sc, ring))
  break;
Index: ./dev/pci/if_vr.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_vr.c,v
retrieving revision 1.153
diff -u -p -r1.153 if_vr.c
--- ./dev/pci/if_vr.c 22 Jan 2017 10:17:38 -0000 1.153
+++ ./dev/pci/if_vr.c 1 Jul 2019 00:26:07 -0000
@@ -933,13 +933,14 @@ vr_rxeof(struct vr_softc *sc)
  ml_enqueue(&ml, m);
  }
 
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rxring);
+
  vr_fill_rx_ring(sc);
 
  bus_dmamap_sync(sc->sc_dmat, sc->sc_listmap.vrm_map,
     0, sc->sc_listmap.vrm_map->dm_mapsize,
     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-
- if_input(ifp, &ml);
 }
 
 void
Index: ./dev/pv/if_vio.c
===================================================================
RCS file: /cvs/src/sys/dev/pv/if_vio.c,v
retrieving revision 1.12
diff -u -p -r1.12 if_vio.c
--- ./dev/pv/if_vio.c 26 May 2019 15:22:31 -0000 1.12
+++ ./dev/pv/if_vio.c 1 Jul 2019 00:26:07 -0000
@@ -1043,7 +1043,8 @@ vio_rxeof(struct vio_softc *sc)
  m_freem(m0);
  }
 
- if_input(ifp, &ml);
+ if (ifiq_input(&ifp->if_rcv, &ml))
+ if_rxr_livelocked(&sc->sc_rx_ring);
  return r;
 }
 

Reply | Threaded
Open this post in threaded view
|

Re: move to interface rx queue backpressure for if_rxr livelock detection

Hrvoje Popovski
On 1.7.2019. 3:16, David Gwynne wrote:

> interface rx queue processing includes detection of when the stack
> becomes too busy to process packets.
>
> there's three stages to this mechanism. firstly, everything is fine
> and the packets are simply queued for processing. the second is the
> "pressure_return" stage where the interface has queued a few times,
> but the stack hasn't run to process them. ifiq_input returns 1 in
> this situation to notify the nic that it should start to slow down.
> the last stage is the "pressure_drop" stage where the nic has
> continued to queue packets and the stack still hasnt run. in this
> stage it drops the packets and returns 1.
>
> independently, the stack looks for lost clock ticks (because the stack
> traditionally blocked softclock ticks) as a livelock detection
> mechanism. this no longer works that well now we're in an MP worls.
> firstly, the stack could be running on a different cpu to the clock and
> therefore wont block it. secondly, the stack runs in a thread and doesnt
> raise the spl, so it shouldnt be blocking clock interrupts even if it is
> sharing a cpu now.
>
> therefore the traditional livelock detection mechanism doesnt work and
> should be moved away from. the replacement is getting nics that
> implement rx ring moderation to look at the return value of the rx queue
> input function and telling the rings to slow down. that is what this
> diff does.
>
> i've compiled it on amd64, which covers most of the drivers, but there's
> a few in fdt that i did blind and havent tested. ive tested a couple of
> the interfaces, but more testing would be appreciated.


Hi,

without this diff, when the box is under pressure, ifconfig output can
take from 10 to 20 minutes to finish...
   11m26.31s real     0m00.01s user     1m37.16s system


with this diff and
net.link.ifrxq.pressure_return=4
net.link.ifrxq.pressure_drop=8

it takes under a minute
    0m40.55s real     0m00.00s user     0m00.80s system


every time numbers will be different, but this diff makes my test box
smoother under pressure ...