let's get reference to state key back

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

let's get reference to state key back

Alexandr Nedvedicky
Hello,

this yet another patch, which got backed out in Jan. This time I'd like to
commit change, which makes sure IP stack keeps reference to state key along
the packet (mbuf).

The change should not cause panics, but I'd like to ask testers to watch
for memory leaks on state key pool (pfstkey):

    while true ; do vmstat -m |egrep '^pf|^Name' ; sleep 5 ; done

I could spot no memory leaks, but my test scenario is currently limited
to local traffic and simple router, which forwards packets.

thanks a lot
regards
sasha

--------8<---------------8<---------------8<------------------8<--------
Index: src/sys/kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.220
diff -u -p -r1.220 uipc_mbuf.c
--- src/sys/kern/uipc_mbuf.c 22 Mar 2016 06:17:00 -0000 1.220
+++ src/sys/kern/uipc_mbuf.c 25 Mar 2016 12:49:38 -0000
@@ -72,6 +72,8 @@
  * Research Laboratory (NRL).
  */
 
+#include "pf.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
@@ -93,6 +95,10 @@
 #include <machine/db_machdep.h>
 #endif
 
+#if NPF > 0
+#include <net/pfvar.h>
+#endif /* NPF > 0 */
+
 struct mbstat mbstat; /* mbuf stats */
 struct mutex mbstatmtx = MUTEX_INITIALIZER(IPL_NET);
 struct pool mbpool; /* mbuf pool */
@@ -261,6 +267,10 @@ m_resethdr(struct mbuf *m)
  /* delete all mbuf tags to reset the state */
  m_tag_delete_chain(m);
 
+#if NPF > 0
+ pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+
  /* like m_inithdr(), but keep any associated data and mbufs */
  memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr));
  m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
@@ -350,8 +360,12 @@ m_free(struct mbuf *m)
  if (n)
  n->m_flags |= M_ZEROIZE;
  }
- if (m->m_flags & M_PKTHDR)
+ if (m->m_flags & M_PKTHDR) {
  m_tag_delete_chain(m);
+#if NPF > 0
+ pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+ }
  if (m->m_flags & M_EXT)
  m_extfree(m);
 
@@ -1201,6 +1215,10 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
  to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
  to->m_flags |= (from->m_flags & M_COPYFLAGS);
  to->m_pkthdr = from->m_pkthdr;
+
+#if NPF > 0
+ pf_pkt_state_key_ref(to);
+#endif /* NPF > 0 */
 
  SLIST_INIT(&to->m_pkthdr.ph_tags);
 
Index: src/sys/net/if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.227
diff -u -p -r1.227 if_pfsync.c
--- src/sys/net/if_pfsync.c 31 Jan 2016 00:18:07 -0000 1.227
+++ src/sys/net/if_pfsync.c 25 Mar 2016 12:49:41 -0000
@@ -523,6 +523,7 @@ pfsync_state_import(struct pfsync_state
  skw->port[0] = sp->key[PF_SK_WIRE].port[0];
  skw->port[1] = sp->key[PF_SK_WIRE].port[1];
  skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
+ PF_REF_INIT(skw->refcnt);
  skw->proto = sp->proto;
  if (!(skw->af = sp->key[PF_SK_WIRE].af))
  skw->af = sp->af;
@@ -532,6 +533,7 @@ pfsync_state_import(struct pfsync_state
  sks->port[0] = sp->key[PF_SK_STACK].port[0];
  sks->port[1] = sp->key[PF_SK_STACK].port[1];
  sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
+ PF_REF_INIT(sks->refcnt);
  if (!(sks->af = sp->key[PF_SK_STACK].af))
  sks->af = sp->af;
  if (sks->af != skw->af) {
Index: src/sys/net/pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.966
diff -u -p -r1.966 pf.c
--- src/sys/net/pf.c 4 Mar 2016 22:38:23 -0000 1.966
+++ src/sys/net/pf.c 25 Mar 2016 12:49:49 -0000
@@ -231,6 +231,11 @@ int pf_step_out_of_anchor(int *, stru
 void pf_counters_inc(int, struct pf_pdesc *,
     struct pf_state *, struct pf_rule *,
     struct pf_rule *);
+void pf_state_key_link(struct pf_state_key *,
+    struct pf_state_key *);
+void pf_inpcb_unlink_state_key(struct inpcb *);
+void pf_state_key_unlink_reverse(struct pf_state_key *);
+
 #if NPFLOG > 0
 void pf_log_matches(struct pf_pdesc *, struct pf_rule *,
     struct pf_rule *, struct pf_ruleset *,
@@ -732,6 +737,7 @@ void
 pf_state_key_detach(struct pf_state *s, int idx)
 {
  struct pf_state_item *si;
+ struct pf_state_key *sk;
 
  if (s->key[idx] == NULL)
  return;
@@ -745,15 +751,15 @@ pf_state_key_detach(struct pf_state *s,
  pool_put(&pf_state_item_pl, si);
  }
 
- if (TAILQ_EMPTY(&s->key[idx]->states)) {
- RB_REMOVE(pf_state_tree, &pf_statetbl, s->key[idx]);
- if (s->key[idx]->reverse)
- s->key[idx]->reverse->reverse = NULL;
- if (s->key[idx]->inp)
- s->key[idx]->inp->inp_pf_sk = NULL;
- pool_put(&pf_state_key_pl, s->key[idx]);
- }
+ sk = s->key[idx];
  s->key[idx] = NULL;
+ if (TAILQ_EMPTY(&sk->states)) {
+ RB_REMOVE(pf_state_tree, &pf_statetbl, sk);
+ sk->removed = 1;
+ pf_state_key_unlink_reverse(sk);
+ pf_inpcb_unlink_state_key(sk->inp);
+ pf_state_key_unref(sk);
+ }
 }
 
 struct pf_state_key *
@@ -840,6 +846,8 @@ pf_state_key_setup(struct pf_pdesc *pd,
  sk1->proto = pd->proto;
  sk1->af = pd->af;
  sk1->rdomain = pd->rdomain;
+ PF_REF_INIT(sk1->refcnt);
+ sk1->removed = 0;
  if (rtableid >= 0)
  wrdom = rtable_l2(rtableid);
 
@@ -871,6 +879,8 @@ pf_state_key_setup(struct pf_pdesc *pd,
  sk2->proto = pd->proto;
  sk2->af = pd->naf;
  sk2->rdomain = wrdom;
+ PF_REF_INIT(sk2->refcnt);
+ sk2->removed = 0;
  } else
  sk2 = sk1;
 
@@ -986,7 +996,7 @@ struct pf_state *
 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir,
     struct mbuf *m)
 {
- struct pf_state_key *sk;
+ struct pf_state_key *sk, *pkt_sk, *inp_sk;
  struct pf_state_item *si;
 
  pf_status.fcounters[FCNT_STATE_SEARCH]++;
@@ -996,31 +1006,47 @@ pf_find_state(struct pfi_kif *kif, struc
  addlog("\n");
  }
 
- if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
-    m->m_pkthdr.pf.statekey->reverse)
- sk = m->m_pkthdr.pf.statekey->reverse;
- else if (dir == PF_OUT && m->m_pkthdr.pf.inp &&
-    m->m_pkthdr.pf.inp->inp_pf_sk)
- sk = m->m_pkthdr.pf.inp->inp_pf_sk;
- else {
+ inp_sk = NULL;
+ pkt_sk = NULL;
+ sk = NULL;
+ if (dir == PF_OUT) {
+ /* first if block deals with outbound forwarded packet */
+ pkt_sk = m->m_pkthdr.pf.statekey;
+ if (pf_state_key_isvalid(pkt_sk) &&
+    pf_state_key_isvalid(pkt_sk->reverse)) {
+ sk = pkt_sk->reverse;
+ } else {
+ pf_pkt_unlink_state_key(m);
+ pkt_sk = NULL;
+ }
+
+ if (pkt_sk == NULL) {
+ /* here we deal with local outbound packet */
+ if (m->m_pkthdr.pf.inp != NULL) {
+ inp_sk = m->m_pkthdr.pf.inp->inp_pf_sk;
+ if (pf_state_key_isvalid(inp_sk))
+ sk = inp_sk;
+ else
+ pf_inpcb_unlink_state_key(
+    m->m_pkthdr.pf.inp);
+ }
+ }
+ }
+
+ if (sk == NULL) {
  if ((sk = RB_FIND(pf_state_tree, &pf_statetbl,
     (struct pf_state_key *)key)) == NULL)
  return (NULL);
- if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
-    pf_compare_state_keys(m->m_pkthdr.pf.statekey, sk,
-    kif, dir) == 0) {
- m->m_pkthdr.pf.statekey->reverse = sk;
- sk->reverse = m->m_pkthdr.pf.statekey;
- } else if (dir == PF_OUT && m->m_pkthdr.pf.inp && !sk->inp) {
- m->m_pkthdr.pf.inp->inp_pf_sk = sk;
- sk->inp = m->m_pkthdr.pf.inp;
- }
+ if (dir == PF_OUT && pkt_sk &&
+    pf_compare_state_keys(pkt_sk, sk, kif, dir) == 0)
+ pf_state_key_link(sk, pkt_sk);
+ else if (dir == PF_OUT)
+ pf_inp_link(m, m->m_pkthdr.pf.inp);
  }
 
- if (dir == PF_OUT) {
- m->m_pkthdr.pf.statekey = NULL;
- m->m_pkthdr.pf.inp = NULL;
- }
+ /* remove firewall data from outbound packet */
+ if (dir == PF_OUT)
+ pf_pkt_addr_changed(m);
 
  /* list is sorted, if-bound states before floating ones */
  TAILQ_FOREACH(si, &sk->states, entry)
@@ -6545,7 +6571,8 @@ done:
  if (pd.dir == PF_OUT &&
     pd.m->m_pkthdr.pf.inp && !pd.m->m_pkthdr.pf.inp->inp_pf_sk &&
     s && s->key[PF_SK_STACK] && !s->key[PF_SK_STACK]->inp) {
- pd.m->m_pkthdr.pf.inp->inp_pf_sk = s->key[PF_SK_STACK];
+ pd.m->m_pkthdr.pf.inp->inp_pf_sk =
+    pf_state_key_ref(s->key[PF_SK_STACK]);
  s->key[PF_SK_STACK]->inp = pd.m->m_pkthdr.pf.inp;
  }
 
@@ -6716,7 +6743,7 @@ pf_cksum(struct pf_pdesc *pd, struct mbu
 void
 pf_pkt_addr_changed(struct mbuf *m)
 {
- m->m_pkthdr.pf.statekey = NULL;
+ pf_pkt_unlink_state_key(m);
  m->m_pkthdr.pf.inp = NULL;
 }
 
@@ -6724,25 +6751,40 @@ struct inpcb *
 pf_inp_lookup(struct mbuf *m)
 {
  struct inpcb *inp = NULL;
+ struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
 
- if (m->m_pkthdr.pf.statekey) {
+ if (!pf_state_key_isvalid(sk))
+ pf_pkt_unlink_state_key(m);
+ else
  inp = m->m_pkthdr.pf.statekey->inp;
- if (inp && inp->inp_pf_sk)
- KASSERT(m->m_pkthdr.pf.statekey == inp->inp_pf_sk);
- }
+
+ if (inp && inp->inp_pf_sk)
+ KASSERT(m->m_pkthdr.pf.statekey == inp->inp_pf_sk);
+
  return (inp);
 }
 
 void
 pf_inp_link(struct mbuf *m, struct inpcb *inp)
 {
- if (m->m_pkthdr.pf.statekey && inp &&
-    !m->m_pkthdr.pf.statekey->inp && !inp->inp_pf_sk) {
- m->m_pkthdr.pf.statekey->inp = inp;
- inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
+ struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
+
+ if (!pf_state_key_isvalid(sk)) {
+ pf_pkt_unlink_state_key(m);
+ return;
+ }
+
+ /*
+ * we don't need to grab PF-lock here. At worst case we link inp to
+ * state, which might be just being marked as deleted by another
+ * thread.
+ */
+ if (inp && !sk->inp && !inp->inp_pf_sk) {
+ sk->inp = inp;
+ inp->inp_pf_sk = pf_state_key_ref(sk);
  }
  /* The statekey has finished finding the inp, it is no longer needed. */
- m->m_pkthdr.pf.statekey = NULL;
+ pf_pkt_unlink_state_key(m);
 }
 
 void
@@ -6750,10 +6792,21 @@ pf_inp_unlink(struct inpcb *inp)
 {
  if (inp->inp_pf_sk) {
  inp->inp_pf_sk->inp = NULL;
- inp->inp_pf_sk = NULL;
+ pf_inpcb_unlink_state_key(inp);
  }
 }
 
+void
+pf_state_key_link(struct pf_state_key *sk, struct pf_state_key *pkt_sk)
+{
+ /*
+ * Assert will not fire as long as we are called by pf_find_state()
+ */
+ KASSERT((pkt_sk->reverse == NULL) && (sk->reverse == NULL));
+ pkt_sk->reverse = pf_state_key_ref(sk);
+ sk->reverse = pf_state_key_ref(pkt_sk);
+}
+
 #if NPFLOG > 0
 void
 pf_log_matches(struct pf_pdesc *pd, struct pf_rule *rm, struct pf_rule *am,
@@ -6770,3 +6823,66 @@ pf_log_matches(struct pf_pdesc *pd, stru
  PFLOG_PACKET(pd, PFRES_MATCH, rm, am, ruleset, ri->r);
 }
 #endif /* NPFLOG > 0 */
+
+struct pf_state_key *
+pf_state_key_ref(struct pf_state_key *sk)
+{
+ if (sk != NULL)
+ PF_REF_TAKE(sk->refcnt);
+
+ return (sk);
+}
+
+void
+pf_state_key_unref(struct pf_state_key *sk)
+{
+ if ((sk != NULL) && PF_REF_RELE(sk->refcnt)) {
+ /* state key must be removed from tree */
+ KASSERT(!pf_state_key_isvalid(sk));
+ /* state key must be unlinked from reverse key */
+ KASSERT(sk->reverse == NULL);
+ /* state key must be unlinked from socket */
+ KASSERT((sk->inp == NULL) || (sk->inp->inp_pf_sk == NULL));
+ sk->inp = NULL;
+ pool_put(&pf_state_key_pl, sk);
+ }
+}
+
+int
+pf_state_key_isvalid(struct pf_state_key *sk)
+{
+ return ((sk != NULL) && (sk->removed == 0));
+}
+
+void
+pf_pkt_unlink_state_key(struct mbuf *m)
+{
+ pf_state_key_unref(m->m_pkthdr.pf.statekey);
+ m->m_pkthdr.pf.statekey = NULL;
+}
+
+void
+pf_pkt_state_key_ref(struct mbuf *m)
+{
+ pf_state_key_ref(m->m_pkthdr.pf.statekey);
+}
+
+void
+pf_inpcb_unlink_state_key(struct inpcb *inp)
+{
+ if (inp != NULL) {
+ pf_state_key_unref(inp->inp_pf_sk);
+ inp->inp_pf_sk = NULL;
+ }
+}
+
+void
+pf_state_key_unlink_reverse(struct pf_state_key *sk)
+{
+ if ((sk != NULL) && (sk->reverse != NULL)) {
+ pf_state_key_unref(sk->reverse->reverse);
+ sk->reverse->reverse = NULL;
+ pf_state_key_unref(sk->reverse);
+ sk->reverse = NULL;
+ }
+}
Index: src/sys/net/pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.430
diff -u -p -r1.430 pfvar.h
--- src/sys/net/pfvar.h 31 Jan 2016 00:18:07 -0000 1.430
+++ src/sys/net/pfvar.h 25 Mar 2016 12:49:54 -0000
@@ -38,6 +38,9 @@
 #include <sys/tree.h>
 #include <sys/rwlock.h>
 #include <sys/syslimits.h>
+#include <sys/refcnt.h>
+
+#include <netinet/in.h>
 
 #include <net/radix.h>
 #include <net/route.h>
@@ -55,6 +58,11 @@ struct ip6_hdr;
 #endif
 #endif
 
+typedef struct refcnt pf_refcnt_t;
+#define PF_REF_INIT(_x) refcnt_init(&(_x))
+#define PF_REF_TAKE(_x) refcnt_take(&(_x))
+#define PF_REF_RELE(_x) refcnt_rele(&(_x))
+
 enum { PF_INOUT, PF_IN, PF_OUT, PF_FWD };
 enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT,
   PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER,
@@ -696,6 +704,8 @@ struct pf_state_key {
  struct pf_statelisthead states;
  struct pf_state_key *reverse;
  struct inpcb *inp;
+ pf_refcnt_t refcnt;
+ u_int8_t removed;
 };
 #define PF_REVERSED_KEY(key, family) \
  ((key[PF_SK_WIRE]->af != key[PF_SK_STACK]->af) && \
@@ -1909,7 +1919,12 @@ int pf_postprocess_addr(struct pf_sta
 
 void pf_cksum(struct pf_pdesc *, struct mbuf *);
 
-#endif /* _KERNEL */
+struct pf_state_key *pf_state_key_ref(struct pf_state_key *);
+void pf_state_key_unref(struct pf_state_key *);
+int pf_state_key_isvalid(struct pf_state_key *);
+void pf_pkt_unlink_state_key(struct mbuf *);
+void pf_pkt_state_key_ref(struct mbuf *);
 
+#endif /* _KERNEL */
 
 #endif /* _NET_PFVAR_H_ */
Index: src/sys/netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.268
diff -u -p -r1.268 ip_input.c
--- src/sys/netinet/ip_input.c 31 Jan 2016 00:18:07 -0000 1.268
+++ src/sys/netinet/ip_input.c 25 Mar 2016 12:49:58 -0000
@@ -1458,6 +1458,9 @@ ip_forward(struct mbuf *m, struct ifnet
  len = min(ntohs(ip->ip_len), 68);
  m_copydata(m, 0, len, mfake.m_pktdat);
  mfake.m_pkthdr.len = mfake.m_len = len;
+#if NPF > 0
+ pf_pkt_unlink_state_key(&mfake);
+#endif /* NPF > 0 */
  fake = 1;
  }
 
Index: src/sys/sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.208
diff -u -p -r1.208 mbuf.h
--- src/sys/sys/mbuf.h 23 Feb 2016 01:39:14 -0000 1.208
+++ src/sys/sys/mbuf.h 25 Mar 2016 12:50:00 -0000
@@ -316,6 +316,7 @@ struct mbuf {
  (to)->m_pkthdr = (from)->m_pkthdr; \
  (from)->m_flags &= ~M_PKTHDR; \
  SLIST_INIT(&(from)->m_pkthdr.ph_tags); \
+ (from)->m_pkthdr.pf.statekey = NULL; \
 } while (/* CONSTCOND */ 0)
 
 /*

Reply | Threaded
Open this post in threaded view
|

Re: let's get reference to state key back

Alexandr Nedvedicky
Hello,

sorry I have not test patch, which I've sent earlier.
I boot unpatched kernel...

this is the fix one need. updated patch is further below.

--------8<---------------8<-----------------8<--------

diff -r 00f90fca186c src/sys/net/pf.c
--- a/src/sys/net/pf.c  Fri Mar 25 09:29:43 2016 +0100
+++ b/src/sys/net/pf.c  Fri Mar 25 21:06:11 2016 +0100
@@ -6566,7 +6566,8 @@
                 * IP tunnels.
                 */
                KASSERT(pd.m->m_pkthdr.pf.statekey == NULL);
-               pd.m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
+               pd.m->m_pkthdr.pf.statekey =
+                   pf_state_key_ref(s->key[PF_SK_STACK]);
        }
        if (pd.dir == PF_OUT &&
            pd.m->m_pkthdr.pf.inp && !pd.m->m_pkthdr.pf.inp->inp_pf_sk &&

--------8<---------------8<-----------------8<--------


sorry for inconveniences.

regards
sasha


--------8<---------------8<-----------------8<--------

Index: src/sys/kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.220
diff -u -p -r1.220 uipc_mbuf.c
--- src/sys/kern/uipc_mbuf.c 22 Mar 2016 06:17:00 -0000 1.220
+++ src/sys/kern/uipc_mbuf.c 26 Mar 2016 00:12:07 -0000
@@ -72,6 +72,8 @@
  * Research Laboratory (NRL).
  */
 
+#include "pf.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
@@ -93,6 +95,10 @@
 #include <machine/db_machdep.h>
 #endif
 
+#if NPF > 0
+#include <net/pfvar.h>
+#endif /* NPF > 0 */
+
 struct mbstat mbstat; /* mbuf stats */
 struct mutex mbstatmtx = MUTEX_INITIALIZER(IPL_NET);
 struct pool mbpool; /* mbuf pool */
@@ -261,6 +267,10 @@ m_resethdr(struct mbuf *m)
  /* delete all mbuf tags to reset the state */
  m_tag_delete_chain(m);
 
+#if NPF > 0
+ pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+
  /* like m_inithdr(), but keep any associated data and mbufs */
  memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr));
  m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
@@ -350,8 +360,12 @@ m_free(struct mbuf *m)
  if (n)
  n->m_flags |= M_ZEROIZE;
  }
- if (m->m_flags & M_PKTHDR)
+ if (m->m_flags & M_PKTHDR) {
  m_tag_delete_chain(m);
+#if NPF > 0
+ pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+ }
  if (m->m_flags & M_EXT)
  m_extfree(m);
 
@@ -1201,6 +1215,10 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
  to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
  to->m_flags |= (from->m_flags & M_COPYFLAGS);
  to->m_pkthdr = from->m_pkthdr;
+
+#if NPF > 0
+ pf_pkt_state_key_ref(to);
+#endif /* NPF > 0 */
 
  SLIST_INIT(&to->m_pkthdr.ph_tags);
 
Index: src/sys/net/if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.227
diff -u -p -r1.227 if_pfsync.c
--- src/sys/net/if_pfsync.c 31 Jan 2016 00:18:07 -0000 1.227
+++ src/sys/net/if_pfsync.c 26 Mar 2016 00:12:08 -0000
@@ -523,6 +523,7 @@ pfsync_state_import(struct pfsync_state
  skw->port[0] = sp->key[PF_SK_WIRE].port[0];
  skw->port[1] = sp->key[PF_SK_WIRE].port[1];
  skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
+ PF_REF_INIT(skw->refcnt);
  skw->proto = sp->proto;
  if (!(skw->af = sp->key[PF_SK_WIRE].af))
  skw->af = sp->af;
@@ -532,6 +533,7 @@ pfsync_state_import(struct pfsync_state
  sks->port[0] = sp->key[PF_SK_STACK].port[0];
  sks->port[1] = sp->key[PF_SK_STACK].port[1];
  sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
+ PF_REF_INIT(sks->refcnt);
  if (!(sks->af = sp->key[PF_SK_STACK].af))
  sks->af = sp->af;
  if (sks->af != skw->af) {
Index: src/sys/net/pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.966
diff -u -p -r1.966 pf.c
--- src/sys/net/pf.c 4 Mar 2016 22:38:23 -0000 1.966
+++ src/sys/net/pf.c 26 Mar 2016 00:12:09 -0000
@@ -231,6 +231,11 @@ int pf_step_out_of_anchor(int *, stru
 void pf_counters_inc(int, struct pf_pdesc *,
     struct pf_state *, struct pf_rule *,
     struct pf_rule *);
+void pf_state_key_link(struct pf_state_key *,
+    struct pf_state_key *);
+void pf_inpcb_unlink_state_key(struct inpcb *);
+void pf_state_key_unlink_reverse(struct pf_state_key *);
+
 #if NPFLOG > 0
 void pf_log_matches(struct pf_pdesc *, struct pf_rule *,
     struct pf_rule *, struct pf_ruleset *,
@@ -732,6 +737,7 @@ void
 pf_state_key_detach(struct pf_state *s, int idx)
 {
  struct pf_state_item *si;
+ struct pf_state_key *sk;
 
  if (s->key[idx] == NULL)
  return;
@@ -745,15 +751,15 @@ pf_state_key_detach(struct pf_state *s,
  pool_put(&pf_state_item_pl, si);
  }
 
- if (TAILQ_EMPTY(&s->key[idx]->states)) {
- RB_REMOVE(pf_state_tree, &pf_statetbl, s->key[idx]);
- if (s->key[idx]->reverse)
- s->key[idx]->reverse->reverse = NULL;
- if (s->key[idx]->inp)
- s->key[idx]->inp->inp_pf_sk = NULL;
- pool_put(&pf_state_key_pl, s->key[idx]);
- }
+ sk = s->key[idx];
  s->key[idx] = NULL;
+ if (TAILQ_EMPTY(&sk->states)) {
+ RB_REMOVE(pf_state_tree, &pf_statetbl, sk);
+ sk->removed = 1;
+ pf_state_key_unlink_reverse(sk);
+ pf_inpcb_unlink_state_key(sk->inp);
+ pf_state_key_unref(sk);
+ }
 }
 
 struct pf_state_key *
@@ -840,6 +846,8 @@ pf_state_key_setup(struct pf_pdesc *pd,
  sk1->proto = pd->proto;
  sk1->af = pd->af;
  sk1->rdomain = pd->rdomain;
+ PF_REF_INIT(sk1->refcnt);
+ sk1->removed = 0;
  if (rtableid >= 0)
  wrdom = rtable_l2(rtableid);
 
@@ -871,6 +879,8 @@ pf_state_key_setup(struct pf_pdesc *pd,
  sk2->proto = pd->proto;
  sk2->af = pd->naf;
  sk2->rdomain = wrdom;
+ PF_REF_INIT(sk2->refcnt);
+ sk2->removed = 0;
  } else
  sk2 = sk1;
 
@@ -986,7 +996,7 @@ struct pf_state *
 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir,
     struct mbuf *m)
 {
- struct pf_state_key *sk;
+ struct pf_state_key *sk, *pkt_sk, *inp_sk;
  struct pf_state_item *si;
 
  pf_status.fcounters[FCNT_STATE_SEARCH]++;
@@ -996,31 +1006,47 @@ pf_find_state(struct pfi_kif *kif, struc
  addlog("\n");
  }
 
- if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
-    m->m_pkthdr.pf.statekey->reverse)
- sk = m->m_pkthdr.pf.statekey->reverse;
- else if (dir == PF_OUT && m->m_pkthdr.pf.inp &&
-    m->m_pkthdr.pf.inp->inp_pf_sk)
- sk = m->m_pkthdr.pf.inp->inp_pf_sk;
- else {
+ inp_sk = NULL;
+ pkt_sk = NULL;
+ sk = NULL;
+ if (dir == PF_OUT) {
+ /* first if block deals with outbound forwarded packet */
+ pkt_sk = m->m_pkthdr.pf.statekey;
+ if (pf_state_key_isvalid(pkt_sk) &&
+    pf_state_key_isvalid(pkt_sk->reverse)) {
+ sk = pkt_sk->reverse;
+ } else {
+ pf_pkt_unlink_state_key(m);
+ pkt_sk = NULL;
+ }
+
+ if (pkt_sk == NULL) {
+ /* here we deal with local outbound packet */
+ if (m->m_pkthdr.pf.inp != NULL) {
+ inp_sk = m->m_pkthdr.pf.inp->inp_pf_sk;
+ if (pf_state_key_isvalid(inp_sk))
+ sk = inp_sk;
+ else
+ pf_inpcb_unlink_state_key(
+    m->m_pkthdr.pf.inp);
+ }
+ }
+ }
+
+ if (sk == NULL) {
  if ((sk = RB_FIND(pf_state_tree, &pf_statetbl,
     (struct pf_state_key *)key)) == NULL)
  return (NULL);
- if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
-    pf_compare_state_keys(m->m_pkthdr.pf.statekey, sk,
-    kif, dir) == 0) {
- m->m_pkthdr.pf.statekey->reverse = sk;
- sk->reverse = m->m_pkthdr.pf.statekey;
- } else if (dir == PF_OUT && m->m_pkthdr.pf.inp && !sk->inp) {
- m->m_pkthdr.pf.inp->inp_pf_sk = sk;
- sk->inp = m->m_pkthdr.pf.inp;
- }
+ if (dir == PF_OUT && pkt_sk &&
+    pf_compare_state_keys(pkt_sk, sk, kif, dir) == 0)
+ pf_state_key_link(sk, pkt_sk);
+ else if (dir == PF_OUT)
+ pf_inp_link(m, m->m_pkthdr.pf.inp);
  }
 
- if (dir == PF_OUT) {
- m->m_pkthdr.pf.statekey = NULL;
- m->m_pkthdr.pf.inp = NULL;
- }
+ /* remove firewall data from outbound packet */
+ if (dir == PF_OUT)
+ pf_pkt_addr_changed(m);
 
  /* list is sorted, if-bound states before floating ones */
  TAILQ_FOREACH(si, &sk->states, entry)
@@ -6540,12 +6566,14 @@ done:
  * IP tunnels.
  */
  KASSERT(pd.m->m_pkthdr.pf.statekey == NULL);
- pd.m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
+ pd.m->m_pkthdr.pf.statekey =
+    pf_state_key_ref(s->key[PF_SK_STACK]);
  }
  if (pd.dir == PF_OUT &&
     pd.m->m_pkthdr.pf.inp && !pd.m->m_pkthdr.pf.inp->inp_pf_sk &&
     s && s->key[PF_SK_STACK] && !s->key[PF_SK_STACK]->inp) {
- pd.m->m_pkthdr.pf.inp->inp_pf_sk = s->key[PF_SK_STACK];
+ pd.m->m_pkthdr.pf.inp->inp_pf_sk =
+    pf_state_key_ref(s->key[PF_SK_STACK]);
  s->key[PF_SK_STACK]->inp = pd.m->m_pkthdr.pf.inp;
  }
 
@@ -6716,7 +6744,7 @@ pf_cksum(struct pf_pdesc *pd, struct mbu
 void
 pf_pkt_addr_changed(struct mbuf *m)
 {
- m->m_pkthdr.pf.statekey = NULL;
+ pf_pkt_unlink_state_key(m);
  m->m_pkthdr.pf.inp = NULL;
 }
 
@@ -6724,25 +6752,40 @@ struct inpcb *
 pf_inp_lookup(struct mbuf *m)
 {
  struct inpcb *inp = NULL;
+ struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
 
- if (m->m_pkthdr.pf.statekey) {
+ if (!pf_state_key_isvalid(sk))
+ pf_pkt_unlink_state_key(m);
+ else
  inp = m->m_pkthdr.pf.statekey->inp;
- if (inp && inp->inp_pf_sk)
- KASSERT(m->m_pkthdr.pf.statekey == inp->inp_pf_sk);
- }
+
+ if (inp && inp->inp_pf_sk)
+ KASSERT(m->m_pkthdr.pf.statekey == inp->inp_pf_sk);
+
  return (inp);
 }
 
 void
 pf_inp_link(struct mbuf *m, struct inpcb *inp)
 {
- if (m->m_pkthdr.pf.statekey && inp &&
-    !m->m_pkthdr.pf.statekey->inp && !inp->inp_pf_sk) {
- m->m_pkthdr.pf.statekey->inp = inp;
- inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
+ struct pf_state_key *sk = m->m_pkthdr.pf.statekey;
+
+ if (!pf_state_key_isvalid(sk)) {
+ pf_pkt_unlink_state_key(m);
+ return;
+ }
+
+ /*
+ * we don't need to grab PF-lock here. At worst case we link inp to
+ * state, which might be just being marked as deleted by another
+ * thread.
+ */
+ if (inp && !sk->inp && !inp->inp_pf_sk) {
+ sk->inp = inp;
+ inp->inp_pf_sk = pf_state_key_ref(sk);
  }
  /* The statekey has finished finding the inp, it is no longer needed. */
- m->m_pkthdr.pf.statekey = NULL;
+ pf_pkt_unlink_state_key(m);
 }
 
 void
@@ -6750,10 +6793,21 @@ pf_inp_unlink(struct inpcb *inp)
 {
  if (inp->inp_pf_sk) {
  inp->inp_pf_sk->inp = NULL;
- inp->inp_pf_sk = NULL;
+ pf_inpcb_unlink_state_key(inp);
  }
 }
 
+void
+pf_state_key_link(struct pf_state_key *sk, struct pf_state_key *pkt_sk)
+{
+ /*
+ * Assert will not wire as long as we are called by pf_find_state()
+ */
+ KASSERT((pkt_sk->reverse == NULL) && (sk->reverse == NULL));
+ pkt_sk->reverse = pf_state_key_ref(sk);
+ sk->reverse = pf_state_key_ref(pkt_sk);
+}
+
 #if NPFLOG > 0
 void
 pf_log_matches(struct pf_pdesc *pd, struct pf_rule *rm, struct pf_rule *am,
@@ -6770,3 +6824,66 @@ pf_log_matches(struct pf_pdesc *pd, stru
  PFLOG_PACKET(pd, PFRES_MATCH, rm, am, ruleset, ri->r);
 }
 #endif /* NPFLOG > 0 */
+
+struct pf_state_key *
+pf_state_key_ref(struct pf_state_key *sk)
+{
+ if (sk != NULL)
+ PF_REF_TAKE(sk->refcnt);
+
+ return (sk);
+}
+
+void
+pf_state_key_unref(struct pf_state_key *sk)
+{
+ if ((sk != NULL) && PF_REF_RELE(sk->refcnt)) {
+ /* state key must be removed from tree */
+ KASSERT(!pf_state_key_isvalid(sk));
+ /* state key must be unlinked from reverse key */
+ KASSERT(sk->reverse == NULL);
+ /* state key must be unlinked from socket */
+ KASSERT((sk->inp == NULL) || (sk->inp->inp_pf_sk == NULL));
+ sk->inp = NULL;
+ pool_put(&pf_state_key_pl, sk);
+ }
+}
+
+int
+pf_state_key_isvalid(struct pf_state_key *sk)
+{
+ return ((sk != NULL) && (sk->removed == 0));
+}
+
+void
+pf_pkt_unlink_state_key(struct mbuf *m)
+{
+ pf_state_key_unref(m->m_pkthdr.pf.statekey);
+ m->m_pkthdr.pf.statekey = NULL;
+}
+
+void
+pf_pkt_state_key_ref(struct mbuf *m)
+{
+ pf_state_key_ref(m->m_pkthdr.pf.statekey);
+}
+
+void
+pf_inpcb_unlink_state_key(struct inpcb *inp)
+{
+ if (inp != NULL) {
+ pf_state_key_unref(inp->inp_pf_sk);
+ inp->inp_pf_sk = NULL;
+ }
+}
+
+void
+pf_state_key_unlink_reverse(struct pf_state_key *sk)
+{
+ if ((sk != NULL) && (sk->reverse != NULL)) {
+ pf_state_key_unref(sk->reverse->reverse);
+ sk->reverse->reverse = NULL;
+ pf_state_key_unref(sk->reverse);
+ sk->reverse = NULL;
+ }
+}
Index: src/sys/net/pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.430
diff -u -p -r1.430 pfvar.h
--- src/sys/net/pfvar.h 31 Jan 2016 00:18:07 -0000 1.430
+++ src/sys/net/pfvar.h 26 Mar 2016 00:12:10 -0000
@@ -38,6 +38,9 @@
 #include <sys/tree.h>
 #include <sys/rwlock.h>
 #include <sys/syslimits.h>
+#include <sys/refcnt.h>
+
+#include <netinet/in.h>
 
 #include <net/radix.h>
 #include <net/route.h>
@@ -55,6 +58,11 @@ struct ip6_hdr;
 #endif
 #endif
 
+typedef struct refcnt pf_refcnt_t;
+#define PF_REF_INIT(_x) refcnt_init(&(_x))
+#define PF_REF_TAKE(_x) refcnt_take(&(_x))
+#define PF_REF_RELE(_x) refcnt_rele(&(_x))
+
 enum { PF_INOUT, PF_IN, PF_OUT, PF_FWD };
 enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT,
   PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER,
@@ -696,6 +704,8 @@ struct pf_state_key {
  struct pf_statelisthead states;
  struct pf_state_key *reverse;
  struct inpcb *inp;
+ pf_refcnt_t refcnt;
+ u_int8_t removed;
 };
 #define PF_REVERSED_KEY(key, family) \
  ((key[PF_SK_WIRE]->af != key[PF_SK_STACK]->af) && \
@@ -1909,7 +1919,12 @@ int pf_postprocess_addr(struct pf_sta
 
 void pf_cksum(struct pf_pdesc *, struct mbuf *);
 
-#endif /* _KERNEL */
+struct pf_state_key *pf_state_key_ref(struct pf_state_key *);
+void pf_state_key_unref(struct pf_state_key *);
+int pf_state_key_isvalid(struct pf_state_key *);
+void pf_pkt_unlink_state_key(struct mbuf *);
+void pf_pkt_state_key_ref(struct mbuf *);
 
+#endif /* _KERNEL */
 
 #endif /* _NET_PFVAR_H_ */
Index: src/sys/netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.268
diff -u -p -r1.268 ip_input.c
--- src/sys/netinet/ip_input.c 31 Jan 2016 00:18:07 -0000 1.268
+++ src/sys/netinet/ip_input.c 26 Mar 2016 00:12:11 -0000
@@ -1458,6 +1458,9 @@ ip_forward(struct mbuf *m, struct ifnet
  len = min(ntohs(ip->ip_len), 68);
  m_copydata(m, 0, len, mfake.m_pktdat);
  mfake.m_pkthdr.len = mfake.m_len = len;
+#if NPF > 0
+ pf_pkt_unlink_state_key(&mfake);
+#endif /* NPF > 0 */
  fake = 1;
  }
 
Index: src/sys/sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.208
diff -u -p -r1.208 mbuf.h
--- src/sys/sys/mbuf.h 23 Feb 2016 01:39:14 -0000 1.208
+++ src/sys/sys/mbuf.h 26 Mar 2016 00:12:12 -0000
@@ -316,6 +316,7 @@ struct mbuf {
  (to)->m_pkthdr = (from)->m_pkthdr; \
  (from)->m_flags &= ~M_PKTHDR; \
  SLIST_INIT(&(from)->m_pkthdr.ph_tags); \
+ (from)->m_pkthdr.pf.statekey = NULL; \
 } while (/* CONSTCOND */ 0)
 
 /*

Reply | Threaded
Open this post in threaded view
|

Re: let's get reference to state key back

Alexandr Nedvedicky
Hello,

> i can't see any problem with this patch. i'm sending 14Mpps ip4 and ip6
> over ix intefaces and creating around 3M to 5M states and all this more
> than 24 hours. box is unusable but it's alive :)

thank you very much for testing. let's wait day or two to give other folks
chance to trip panics/leaks.

the patch you've kindly tried is tiny step, which prepares PF for
gentle/gradual unlock.

regards
sashan