nep(4) rx error interrupts

classic Classic list List threaded Threaded
4 messages Options
Reply | Threaded
Open this post in threaded view
|

nep(4) rx error interrupts

Jonathan Matthew-4
I'm trying to use aggr on top of nep(4), which has turned up a few bugs.
Firstly, the nic appears to report rx errors in the second interrupt
state vector, which the driver doesn't expect, so I get this:

nep3: nep_intr 0 8 0

and then, since the interrupt wasn't rearmed, the interface stops.

Then, the driver doesn't clear rx error interrupt sources, so rearming
results in an interrupt storm.  The illumos nxge driver masks off read-only
bits in the rx status register and writes it back, clearing all active
interrupt sources, so I've done that here too.

ok?


Index: if_nep.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_nep.c,v
retrieving revision 1.31
diff -u -p -r1.31 if_nep.c
--- if_nep.c 9 Nov 2018 14:14:31 -0000 1.31
+++ if_nep.c 10 Feb 2020 11:57:44 -0000
@@ -333,11 +333,24 @@
 #define RX_DMA_ENT_MSK(chan) (DMC + 0x00068 + (chan) * 0x00200)
 #define  RX_DMA_ENT_MSK_RBR_EMPTY (1ULL << 3)
 #define RX_DMA_CTL_STAT(chan) (DMC + 0x00070 + (chan) * 0x00200)
+#define  RX_DMA_CTL_STAT_DC_FIFO_ERR (1ULL << 48)
 #define  RX_DMA_CTL_STAT_MEX (1ULL << 47)
 #define  RX_DMA_CTL_STAT_RCRTHRES (1ULL << 46)
 #define  RX_DMA_CTL_STAT_RCRTO (1ULL << 45)
+#define  RX_DMA_CTL_STAT_PORT_DROP_PKT (1ULL << 42)
+#define  RX_DMA_CTL_STAT_WRED_DROP (1ULL << 41)
+#define  RX_DMA_CTL_STAT_RBR_PRE_EMTY (1ULL << 40)
+#define  RX_DMA_CTL_STAT_RCR_SHDW_FULL (1ULL << 39)
 #define  RX_DMA_CTL_STAT_RBR_EMPTY (1ULL << 35)
 #define  RX_DMA_CTL_STAT_PTRREAD_SHIFT 16
+#define  RX_DMA_CTL_STAT_WR1C_BITS (RX_DMA_CTL_STAT_RBR_EMPTY | \
+ RX_DMA_CTL_STAT_RCR_SHDW_FULL | \
+ RX_DMA_CTL_STAT_RBR_PRE_EMTY | \
+ RX_DMA_CTL_STAT_WRED_DROP | \
+ RX_DMA_CTL_STAT_PORT_DROP_PKT | \
+ RX_DMA_CTL_STAT_RCRTO | \
+ RX_DMA_CTL_STAT_RCRTHRES | \
+ RX_DMA_CTL_STAT_DC_FIFO_ERR)
 #define RX_DMA_CTL_STAT_DBG(chan) (DMC + 0x00098 + (chan) * 0x00200)
 
 #define TX_LOG_PAGE_VLD(chan) (FZC_DMC + 0x40000 + (chan) * 0x00200)
@@ -959,7 +972,7 @@ nep_intr(void *arg)
  rearm = 1;
  }
 
- if (sv0 & (1ULL << LDN_RXDMA(sc->sc_port))) {
+ if ((sv0 | sv1) & (1ULL << LDN_RXDMA(sc->sc_port))) {
  nep_rx_proc(sc);
  rearm = 1;
  }
@@ -990,8 +1003,8 @@ nep_rx_proc(struct nep_softc *sc)
  int idx, len, i;
 
  val = nep_read(sc, RX_DMA_CTL_STAT(sc->sc_port));
- nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port),
-    RX_DMA_CTL_STAT_RCRTHRES | RX_DMA_CTL_STAT_RCRTO);
+ val &= RX_DMA_CTL_STAT_WR1C_BITS;
+ nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port), val);
 
  bus_dmamap_sync(sc->sc_dmat, NEP_DMA_MAP(sc->sc_rcring), 0,
     NEP_DMA_LEN(sc->sc_rcring), BUS_DMASYNC_POSTREAD);

Reply | Threaded
Open this post in threaded view
|

Re: nep(4) rx error interrupts

Mark Kettenis
Jonathan Matthew schreef op 2020-02-10 13:19:

> I'm trying to use aggr on top of nep(4), which has turned up a few
> bugs.
> Firstly, the nic appears to report rx errors in the second interrupt
> state vector, which the driver doesn't expect, so I get this:
>
> nep3: nep_intr 0 8 0
>
> and then, since the interrupt wasn't rearmed, the interface stops.
>
> Then, the driver doesn't clear rx error interrupt sources, so rearming
> results in an interrupt storm.  The illumos nxge driver masks off
> read-only
> bits in the rx status register and writes it back, clearing all active
> interrupt sources, so I've done that here too.
>
> ok?
>
>
> Index: if_nep.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_nep.c,v
> retrieving revision 1.31
> diff -u -p -r1.31 if_nep.c
> --- if_nep.c 9 Nov 2018 14:14:31 -0000 1.31
> +++ if_nep.c 10 Feb 2020 11:57:44 -0000
> @@ -333,11 +333,24 @@
>  #define RX_DMA_ENT_MSK(chan) (DMC + 0x00068 + (chan) * 0x00200)
>  #define  RX_DMA_ENT_MSK_RBR_EMPTY (1ULL << 3)
>  #define RX_DMA_CTL_STAT(chan) (DMC + 0x00070 + (chan) * 0x00200)
> +#define  RX_DMA_CTL_STAT_DC_FIFO_ERR (1ULL << 48)
>  #define  RX_DMA_CTL_STAT_MEX (1ULL << 47)
>  #define  RX_DMA_CTL_STAT_RCRTHRES (1ULL << 46)
>  #define  RX_DMA_CTL_STAT_RCRTO (1ULL << 45)
> +#define  RX_DMA_CTL_STAT_PORT_DROP_PKT (1ULL << 42)
> +#define  RX_DMA_CTL_STAT_WRED_DROP (1ULL << 41)
> +#define  RX_DMA_CTL_STAT_RBR_PRE_EMTY (1ULL << 40)
> +#define  RX_DMA_CTL_STAT_RCR_SHDW_FULL (1ULL << 39)
>  #define  RX_DMA_CTL_STAT_RBR_EMPTY (1ULL << 35)
>  #define  RX_DMA_CTL_STAT_PTRREAD_SHIFT 16
> +#define  RX_DMA_CTL_STAT_WR1C_BITS (RX_DMA_CTL_STAT_RBR_EMPTY | \
> + RX_DMA_CTL_STAT_RCR_SHDW_FULL | \
> + RX_DMA_CTL_STAT_RBR_PRE_EMTY | \
> + RX_DMA_CTL_STAT_WRED_DROP | \
> + RX_DMA_CTL_STAT_PORT_DROP_PKT | \
> + RX_DMA_CTL_STAT_RCRTO | \
> + RX_DMA_CTL_STAT_RCRTHRES | \
> + RX_DMA_CTL_STAT_DC_FIFO_ERR)
>  #define RX_DMA_CTL_STAT_DBG(chan) (DMC + 0x00098 + (chan) * 0x00200)
>
>  #define TX_LOG_PAGE_VLD(chan) (FZC_DMC + 0x40000 + (chan) * 0x00200)
> @@ -959,7 +972,7 @@ nep_intr(void *arg)
>   rearm = 1;
>   }
>
> - if (sv0 & (1ULL << LDN_RXDMA(sc->sc_port))) {
> + if ((sv0 | sv1) & (1ULL << LDN_RXDMA(sc->sc_port))) {

This doesn't make sense to me.

I suppose you see a bit getting set in sv1.  Which bit is that?

>   nep_rx_proc(sc);
>   rearm = 1;
>   }
> @@ -990,8 +1003,8 @@ nep_rx_proc(struct nep_softc *sc)
>   int idx, len, i;
>
>   val = nep_read(sc, RX_DMA_CTL_STAT(sc->sc_port));
> - nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port),
> -    RX_DMA_CTL_STAT_RCRTHRES | RX_DMA_CTL_STAT_RCRTO);
> + val &= RX_DMA_CTL_STAT_WR1C_BITS;
> + nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port), val);
>
>   bus_dmamap_sync(sc->sc_dmat, NEP_DMA_MAP(sc->sc_rcring), 0,
>      NEP_DMA_LEN(sc->sc_rcring), BUS_DMASYNC_POSTREAD);

Reply | Threaded
Open this post in threaded view
|

Re: nep(4) rx error interrupts

Jonathan Matthew-4
On Mon, Feb 10, 2020 at 02:04:06PM +0100, Mark Kettenis wrote:

> Jonathan Matthew schreef op 2020-02-10 13:19:
> > I'm trying to use aggr on top of nep(4), which has turned up a few bugs.
> > Firstly, the nic appears to report rx errors in the second interrupt
> > state vector, which the driver doesn't expect, so I get this:
> >
> > nep3: nep_intr 0 8 0
> >
> > and then, since the interrupt wasn't rearmed, the interface stops.
> >
> > Then, the driver doesn't clear rx error interrupt sources, so rearming
> > results in an interrupt storm.  The illumos nxge driver masks off
> > read-only
> > bits in the rx status register and writes it back, clearing all active
> > interrupt sources, so I've done that here too.
> >
> > ok?
> >
> >
> > Index: if_nep.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/pci/if_nep.c,v
> > retrieving revision 1.31
> > diff -u -p -r1.31 if_nep.c
> > --- if_nep.c 9 Nov 2018 14:14:31 -0000 1.31
> > +++ if_nep.c 10 Feb 2020 11:57:44 -0000
> > @@ -333,11 +333,24 @@
> >  #define RX_DMA_ENT_MSK(chan) (DMC + 0x00068 + (chan) * 0x00200)
> >  #define  RX_DMA_ENT_MSK_RBR_EMPTY (1ULL << 3)
> >  #define RX_DMA_CTL_STAT(chan) (DMC + 0x00070 + (chan) * 0x00200)
> > +#define  RX_DMA_CTL_STAT_DC_FIFO_ERR (1ULL << 48)
> >  #define  RX_DMA_CTL_STAT_MEX (1ULL << 47)
> >  #define  RX_DMA_CTL_STAT_RCRTHRES (1ULL << 46)
> >  #define  RX_DMA_CTL_STAT_RCRTO (1ULL << 45)
> > +#define  RX_DMA_CTL_STAT_PORT_DROP_PKT (1ULL << 42)
> > +#define  RX_DMA_CTL_STAT_WRED_DROP (1ULL << 41)
> > +#define  RX_DMA_CTL_STAT_RBR_PRE_EMTY (1ULL << 40)
> > +#define  RX_DMA_CTL_STAT_RCR_SHDW_FULL (1ULL << 39)
> >  #define  RX_DMA_CTL_STAT_RBR_EMPTY (1ULL << 35)
> >  #define  RX_DMA_CTL_STAT_PTRREAD_SHIFT 16
> > +#define  RX_DMA_CTL_STAT_WR1C_BITS (RX_DMA_CTL_STAT_RBR_EMPTY | \
> > + RX_DMA_CTL_STAT_RCR_SHDW_FULL | \
> > + RX_DMA_CTL_STAT_RBR_PRE_EMTY | \
> > + RX_DMA_CTL_STAT_WRED_DROP | \
> > + RX_DMA_CTL_STAT_PORT_DROP_PKT | \
> > + RX_DMA_CTL_STAT_RCRTO | \
> > + RX_DMA_CTL_STAT_RCRTHRES | \
> > + RX_DMA_CTL_STAT_DC_FIFO_ERR)
> >  #define RX_DMA_CTL_STAT_DBG(chan) (DMC + 0x00098 + (chan) * 0x00200)
> >
> >  #define TX_LOG_PAGE_VLD(chan) (FZC_DMC + 0x40000 + (chan) * 0x00200)
> > @@ -959,7 +972,7 @@ nep_intr(void *arg)
> >   rearm = 1;
> >   }
> >
> > - if (sv0 & (1ULL << LDN_RXDMA(sc->sc_port))) {
> > + if ((sv0 | sv1) & (1ULL << LDN_RXDMA(sc->sc_port))) {
>
> This doesn't make sense to me.
>
> I suppose you see a bit getting set in sv1.  Which bit is that?

The same bit that gets set in sv0 under normal rx conditions.
For nep3, sv1 is 8, on nep1 it's 2.  You can see the illumos driver
doing the same thing here:

https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/io/nxge/nxge_hw.c#L274



>
> >   nep_rx_proc(sc);
> >   rearm = 1;
> >   }
> > @@ -990,8 +1003,8 @@ nep_rx_proc(struct nep_softc *sc)
> >   int idx, len, i;
> >
> >   val = nep_read(sc, RX_DMA_CTL_STAT(sc->sc_port));
> > - nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port),
> > -    RX_DMA_CTL_STAT_RCRTHRES | RX_DMA_CTL_STAT_RCRTO);
> > + val &= RX_DMA_CTL_STAT_WR1C_BITS;
> > + nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port), val);
> >
> >   bus_dmamap_sync(sc->sc_dmat, NEP_DMA_MAP(sc->sc_rcring), 0,
> >      NEP_DMA_LEN(sc->sc_rcring), BUS_DMASYNC_POSTREAD);

Reply | Threaded
Open this post in threaded view
|

Re: nep(4) rx error interrupts

Jonathan Matthew-4
On Tue, Feb 11, 2020 at 06:44:12AM +1000, Jonathan Matthew wrote:

> On Mon, Feb 10, 2020 at 02:04:06PM +0100, Mark Kettenis wrote:
> > Jonathan Matthew schreef op 2020-02-10 13:19:
> > > I'm trying to use aggr on top of nep(4), which has turned up a few bugs.
> > > Firstly, the nic appears to report rx errors in the second interrupt
> > > state vector, which the driver doesn't expect, so I get this:
> > >
> > > nep3: nep_intr 0 8 0
> > >
> > > and then, since the interrupt wasn't rearmed, the interface stops.
> > >
> > > Then, the driver doesn't clear rx error interrupt sources, so rearming
> > > results in an interrupt storm.  The illumos nxge driver masks off
> > > read-only
> > > bits in the rx status register and writes it back, clearing all active
> > > interrupt sources, so I've done that here too.
> > >
> > > ok?
> > >
> > >
> > > Index: if_nep.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/dev/pci/if_nep.c,v
> > > retrieving revision 1.31
> > > diff -u -p -r1.31 if_nep.c
> > > --- if_nep.c 9 Nov 2018 14:14:31 -0000 1.31
> > > +++ if_nep.c 10 Feb 2020 11:57:44 -0000
> > > @@ -333,11 +333,24 @@
> > >  #define RX_DMA_ENT_MSK(chan) (DMC + 0x00068 + (chan) * 0x00200)
> > >  #define  RX_DMA_ENT_MSK_RBR_EMPTY (1ULL << 3)
> > >  #define RX_DMA_CTL_STAT(chan) (DMC + 0x00070 + (chan) * 0x00200)
> > > +#define  RX_DMA_CTL_STAT_DC_FIFO_ERR (1ULL << 48)
> > >  #define  RX_DMA_CTL_STAT_MEX (1ULL << 47)
> > >  #define  RX_DMA_CTL_STAT_RCRTHRES (1ULL << 46)
> > >  #define  RX_DMA_CTL_STAT_RCRTO (1ULL << 45)
> > > +#define  RX_DMA_CTL_STAT_PORT_DROP_PKT (1ULL << 42)
> > > +#define  RX_DMA_CTL_STAT_WRED_DROP (1ULL << 41)
> > > +#define  RX_DMA_CTL_STAT_RBR_PRE_EMTY (1ULL << 40)
> > > +#define  RX_DMA_CTL_STAT_RCR_SHDW_FULL (1ULL << 39)
> > >  #define  RX_DMA_CTL_STAT_RBR_EMPTY (1ULL << 35)
> > >  #define  RX_DMA_CTL_STAT_PTRREAD_SHIFT 16
> > > +#define  RX_DMA_CTL_STAT_WR1C_BITS (RX_DMA_CTL_STAT_RBR_EMPTY | \
> > > + RX_DMA_CTL_STAT_RCR_SHDW_FULL | \
> > > + RX_DMA_CTL_STAT_RBR_PRE_EMTY | \
> > > + RX_DMA_CTL_STAT_WRED_DROP | \
> > > + RX_DMA_CTL_STAT_PORT_DROP_PKT | \
> > > + RX_DMA_CTL_STAT_RCRTO | \
> > > + RX_DMA_CTL_STAT_RCRTHRES | \
> > > + RX_DMA_CTL_STAT_DC_FIFO_ERR)
> > >  #define RX_DMA_CTL_STAT_DBG(chan) (DMC + 0x00098 + (chan) * 0x00200)
> > >
> > >  #define TX_LOG_PAGE_VLD(chan) (FZC_DMC + 0x40000 + (chan) * 0x00200)
> > > @@ -959,7 +972,7 @@ nep_intr(void *arg)
> > >   rearm = 1;
> > >   }
> > >
> > > - if (sv0 & (1ULL << LDN_RXDMA(sc->sc_port))) {
> > > + if ((sv0 | sv1) & (1ULL << LDN_RXDMA(sc->sc_port))) {
> >
> > This doesn't make sense to me.
> >
> > I suppose you see a bit getting set in sv1.  Which bit is that?
>
> The same bit that gets set in sv0 under normal rx conditions.
> For nep3, sv1 is 8, on nep1 it's 2.  You can see the illumos driver
> doing the same thing here:
>
> https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/io/nxge/nxge_hw.c#L274

Or, we could mask off the non-fatal rx error interrupts,
which also fixes the problem I'm seeing.

Does this look better to you?


Index: if_nep.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_nep.c,v
retrieving revision 1.31
diff -u -p -r1.31 if_nep.c
--- if_nep.c 9 Nov 2018 14:14:31 -0000 1.31
+++ if_nep.c 12 Feb 2020 22:10:56 -0000
@@ -332,12 +332,29 @@
 
 #define RX_DMA_ENT_MSK(chan) (DMC + 0x00068 + (chan) * 0x00200)
 #define  RX_DMA_ENT_MSK_RBR_EMPTY (1ULL << 3)
+#define  RX_DMA_ENT_MSK_WRED_DROP (1ULL << 9)
+#define  RX_DMA_ENT_MSK_PTDROP_PKT (1ULL << 10)
+#define  RX_DMA_ENT_MSK_RBR_PRE_PAR (1ULL << 11)
+#define  RX_DMA_ENT_MSK_RCR_SHA_PAR (1ULL << 12)
 #define RX_DMA_CTL_STAT(chan) (DMC + 0x00070 + (chan) * 0x00200)
+#define  RX_DMA_CTL_STAT_DC_FIFO_ERR (1ULL << 48)
 #define  RX_DMA_CTL_STAT_MEX (1ULL << 47)
 #define  RX_DMA_CTL_STAT_RCRTHRES (1ULL << 46)
 #define  RX_DMA_CTL_STAT_RCRTO (1ULL << 45)
+#define  RX_DMA_CTL_STAT_PORT_DROP_PKT (1ULL << 42)
+#define  RX_DMA_CTL_STAT_WRED_DROP (1ULL << 41)
+#define  RX_DMA_CTL_STAT_RBR_PRE_EMTY (1ULL << 40)
+#define  RX_DMA_CTL_STAT_RCR_SHDW_FULL (1ULL << 39)
 #define  RX_DMA_CTL_STAT_RBR_EMPTY (1ULL << 35)
 #define  RX_DMA_CTL_STAT_PTRREAD_SHIFT 16
+#define  RX_DMA_CTL_STAT_WR1C_BITS (RX_DMA_CTL_STAT_RBR_EMPTY | \
+ RX_DMA_CTL_STAT_RCR_SHDW_FULL | \
+ RX_DMA_CTL_STAT_RBR_PRE_EMTY | \
+ RX_DMA_CTL_STAT_WRED_DROP | \
+ RX_DMA_CTL_STAT_PORT_DROP_PKT | \
+ RX_DMA_CTL_STAT_RCRTO | \
+ RX_DMA_CTL_STAT_RCRTHRES | \
+ RX_DMA_CTL_STAT_DC_FIFO_ERR)
 #define RX_DMA_CTL_STAT_DBG(chan) (DMC + 0x00098 + (chan) * 0x00200)
 
 #define TX_LOG_PAGE_VLD(chan) (FZC_DMC + 0x40000 + (chan) * 0x00200)
@@ -990,8 +1007,8 @@ nep_rx_proc(struct nep_softc *sc)
  int idx, len, i;
 
  val = nep_read(sc, RX_DMA_CTL_STAT(sc->sc_port));
- nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port),
-    RX_DMA_CTL_STAT_RCRTHRES | RX_DMA_CTL_STAT_RCRTO);
+ val &= RX_DMA_CTL_STAT_WR1C_BITS;
+ nep_write(sc, RX_DMA_CTL_STAT(sc->sc_port), val);
 
  bus_dmamap_sync(sc->sc_dmat, NEP_DMA_MAP(sc->sc_rcring), 0,
     NEP_DMA_LEN(sc->sc_rcring), BUS_DMASYNC_POSTREAD);
@@ -1301,7 +1318,9 @@ nep_init_rx_channel(struct nep_softc *sc
     (sc->sc_port << RX_LOG_PAGE_VLD_FUNC_SHIFT) |
     RX_LOG_PAGE_VLD_PAGE0 | RX_LOG_PAGE_VLD_PAGE1);
 
- nep_write(sc, RX_DMA_ENT_MSK(chan), RX_DMA_ENT_MSK_RBR_EMPTY);
+ nep_write(sc, RX_DMA_ENT_MSK(chan), RX_DMA_ENT_MSK_RBR_EMPTY |
+    RX_DMA_ENT_MSK_WRED_DROP | RX_DMA_ENT_MSK_PTDROP_PKT |
+    RX_DMA_ENT_MSK_RBR_PRE_PAR | RX_DMA_ENT_MSK_RCR_SHA_PAR);
  nep_write(sc, RX_DMA_CTL_STAT(chan), RX_DMA_CTL_STAT_MEX);
 
  val = NEP_DMA_DVA(sc->sc_rxmbox) >> 32;