bgpd adj-rib-out rewrite


bgpd adj-rib-out rewrite

Claudio Jeker
This diff is a bit of a monster. It changes the Adj-RIB-Out to be a
peer-specific set of RB trees instead of using a rib in the original
sense. The reason for this is that the more peers a system has, the
more elements end up being linked into the Adj-RIB-Out, and many
operations do linear searches, which does not scale.
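
The core of the change, as a minimal sketch (assuming OpenBSD's
<sys/tree.h> RB macros; adjout_lookup() here stands in for the diff's
prefix_lookup(), and the types are stripped down for illustration, so
this is not the actual bgpd code):

#include <sys/tree.h>
#include <stdint.h>
#include <string.h>

struct prefix {
	RB_ENTRY(prefix) entry;		/* RB tree linkage */
	uint8_t		addr[16];	/* prefix bytes, v4 or v6 */
	int		prefixlen;
};

RB_HEAD(prefix_index, prefix);

struct rde_peer {
	struct prefix_index adj_rib_out;	/* one tree per peer */
};

static int
prefix_index_cmp(struct prefix *a, struct prefix *b)
{
	int r;

	/* order by prefix bytes, then by prefix length */
	if ((r = memcmp(a->addr, b->addr, sizeof(a->addr))) != 0)
		return r;
	return a->prefixlen - b->prefixlen;
}

RB_GENERATE_STATIC(prefix_index, prefix, entry, prefix_index_cmp)

/*
 * O(log n) lookup in a single peer's Adj-RIB-Out, replacing the
 * old linear walk over prefix lists shared by all peers.
 */
static struct prefix *
adjout_lookup(struct rde_peer *peer, const uint8_t *addr, int prefixlen)
{
	struct prefix xp;

	memset(&xp, 0, sizeof(xp));
	memcpy(xp.addr, addr, sizeof(xp.addr));
	xp.prefixlen = prefixlen;
	return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
}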

I did some testing with 4000 peers sending 1 prefix each, which are
then sent back to all peers (resulting in 16 million updates being
put in the Adj-RIB-Out). Without this diff the system takes about 1h
to bring up all sessions. With the diff the system finishes in around
5min.
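
(For scale: with every prefix reflected back to every peer, each of
the 4000 peers ends up with about 4000 prefixes in its Adj-RIB-Out,
i.e. 4000 * 4000 = 16,000,000 entries, so the cost of each individual
insert and lookup dominates session bring-up time.)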

To avoid increasing the memory footprint, struct prefix now uses a
union for the list and RB tree entries. Additionally, the rib dump
runner was adjusted so that it also works with the Adj-RIB-Out. The
output of bgpctl show rib out changed a bit since it now dumps one
peer after the other; apart from that the behaviour should be the
same.
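
A prefix is linked either into a rib's per-entry lists (Adj-RIB-In,
Loc-RIB) or into a peer's Adj-RIB-Out trees, never both at once, so
the two sets of link fields can share storage. The rde.h hunk below
boils down to this:

struct prefix {
	union {
		struct {
			LIST_ENTRY(prefix) rib, nexthop;
		} list;			/* linked into a rib */
		struct {
			RB_ENTRY(prefix) index, update;
		} tree;			/* linked into an Adj-RIB-Out */
	} entry;
	/* ... */
};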

Please test
--
:wq Claudio


Index: mrt.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/mrt.c,v
retrieving revision 1.97
diff -u -p -r1.97 mrt.c
--- mrt.c 25 Jun 2019 21:33:55 -0000 1.97
+++ mrt.c 26 Jun 2019 09:44:12 -0000
@@ -512,15 +512,11 @@ mrt_dump_entry_v2(struct mrt *mrt, struc
  goto fail;
  }
  nump = 0;
- LIST_FOREACH(p, &re->prefix_h, rib_l) {
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib) {
  struct nexthop *nexthop;
  struct bgpd_addr *nh;
  struct ibuf *tbuf;
 
- /* skip pending withdraw in Adj-RIB-Out */
- if (prefix_aspath(p) == NULL)
- continue;
-
  nexthop = prefix_nexthop(p);
  if (nexthop == NULL) {
  bzero(&addr, sizeof(struct bgpd_addr));
@@ -683,10 +679,7 @@ mrt_dump_upcall(struct rib_entry *re, vo
  * dumps the table so we do the same. If only the active route should
  * be dumped p should be set to p = pt->active.
  */
- LIST_FOREACH(p, &re->prefix_h, rib_l) {
- /* skip pending withdraw in Adj-RIB-Out */
- if (prefix_aspath(p) == NULL)
- continue;
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib) {
  if (mrtbuf->type == MRT_TABLE_DUMP)
  mrt_dump_entry(mrtbuf, p, mrtbuf->seqnum++,
     prefix_peer(p));
Index: parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v
retrieving revision 1.392
diff -u -p -r1.392 parse.y
--- parse.y 22 Jun 2019 05:36:40 -0000 1.392
+++ parse.y 22 Jun 2019 05:44:57 -0000
@@ -3282,8 +3282,6 @@ parse_config(char *filename, struct peer
 
  add_rib("Adj-RIB-In", conf->default_tableid,
     F_RIB_NOFIB | F_RIB_NOEVALUATE);
- add_rib("Adj-RIB-Out", conf->default_tableid,
-    F_RIB_NOFIB | F_RIB_NOEVALUATE);
  add_rib("Loc-RIB", conf->default_tableid, F_RIB_LOCAL);
 
  if ((file = pushfile(filename, 1)) == NULL)
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.475
diff -u -p -r1.475 rde.c
--- rde.c 1 Jul 2019 07:07:08 -0000 1.475
+++ rde.c 1 Jul 2019 07:37:09 -0000
@@ -94,8 +94,8 @@ u_int8_t rde_roa_validity(struct rde_pr
 void peer_init(u_int32_t);
 void peer_shutdown(void);
 int peer_localaddrs(struct rde_peer *, struct bgpd_addr *);
+struct rde_peer *peer_match(struct ctl_neighbor *, u_int32_t);
 struct rde_peer *peer_add(u_int32_t, struct peer_config *);
-struct rde_peer *peer_get(u_int32_t);
 void peer_up(u_int32_t, struct session_up *);
 void peer_down(u_int32_t);
 void peer_flush(struct rde_peer *, u_int8_t, time_t);
@@ -133,7 +133,7 @@ int softreconfig;
 struct rde_dump_ctx {
  LIST_ENTRY(rde_dump_ctx) entry;
  struct ctl_show_rib_request req;
- u_int16_t rid;
+ u_int32_t peerid;
  u_int8_t throttled;
 };
 
@@ -220,7 +220,6 @@ rde_main(int debug, int verbose)
 
  /* make sure the default RIBs are setup */
  rib_new("Adj-RIB-In", 0, F_RIB_NOFIB | F_RIB_NOEVALUATE);
- rib_new("Adj-RIB-Out", 0, F_RIB_NOFIB | F_RIB_NOEVALUATE);
 
  out_rules = calloc(1, sizeof(struct filter_head));
  if (out_rules == NULL)
@@ -2242,7 +2241,7 @@ rde_dump_rib_as(struct prefix *p, struct
  rib.origin = asp->origin;
  rib.validation_state = p->validation_state;
  rib.flags = 0;
- if (p->re->active == p)
+ if (p->re != NULL && p->re->active == p)
  rib.flags |= F_PREF_ACTIVE;
  if (!prefix_peer(p)->conf.ebgp)
  rib.flags |= F_PREF_INTERNAL;
@@ -2305,7 +2304,7 @@ rde_dump_rib_as(struct prefix *p, struct
 }
 
 static int
-rde_dump_match_peer(struct rde_peer *p, struct ctl_neighbor *n)
+rde_match_peer(struct rde_peer *p, struct ctl_neighbor *n)
 {
  char *s;
 
@@ -2326,7 +2325,7 @@ rde_dump_filter(struct prefix *p, struct
 {
  struct rde_aspath *asp;
 
- if (!rde_dump_match_peer(prefix_peer(p), &req->neighbor))
+ if (!rde_match_peer(prefix_peer(p), &req->neighbor))
  return;
 
  asp = prefix_aspath(p);
@@ -2353,10 +2352,10 @@ rde_dump_filter(struct prefix *p, struct
 static void
 rde_dump_upcall(struct rib_entry *re, void *ptr)
 {
- struct prefix *p;
  struct rde_dump_ctx *ctx = ptr;
+ struct prefix *p;
 
- LIST_FOREACH(p, &re->prefix_h, rib_l)
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib)
  rde_dump_filter(p, &ctx->req);
 }
 
@@ -2375,10 +2374,38 @@ rde_dump_prefix_upcall(struct rib_entry
  if (ctx->req.prefixlen > pt->prefixlen)
  return;
  if (!prefix_compare(&ctx->req.prefix, &addr, ctx->req.prefixlen))
- LIST_FOREACH(p, &re->prefix_h, rib_l)
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib)
  rde_dump_filter(p, &ctx->req);
 }
 
+static void
+rde_dump_adjout_upcall(struct prefix *p, void *ptr)
+{
+ struct rde_dump_ctx *ctx = ptr;
+
+ if (p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD))
+ return;
+ rde_dump_filter(p, &ctx->req);
+}
+
+static void
+rde_dump_adjout_prefix_upcall(struct prefix *p, void *ptr)
+{
+ struct rde_dump_ctx *ctx = ptr;
+ struct bgpd_addr addr;
+
+ if (p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD))
+ return;
+
+ pt_getaddr(p->pt, &addr);
+ if (addr.aid != ctx->req.prefix.aid)
+ return;
+ if (ctx->req.prefixlen > p->pt->prefixlen)
+ return;
+ if (!prefix_compare(&ctx->req.prefix, &addr, ctx->req.prefixlen))
+ rde_dump_filter(p, &ctx->req);
+}
+
 static int
 rde_dump_throttled(void *arg)
 {
@@ -2391,11 +2418,45 @@ static void
 rde_dump_done(void *arg, u_int8_t aid)
 {
  struct rde_dump_ctx *ctx = arg;
+ struct rde_peer *peer;
+ u_int error;
 
- imsg_compose(ibuf_se_ctl, IMSG_CTL_END, 0, ctx->req.pid,
-    -1, NULL, 0);
+ if (ctx->req.flags & F_CTL_ADJ_OUT) {
+ peer = peer_match(&ctx->req.neighbor, ctx->peerid);
+ if (peer == NULL)
+ goto done;
+ ctx->peerid = peer->conf.id;
+ switch (ctx->req.type) {
+ case IMSG_CTL_SHOW_RIB:
+ if (prefix_dump_new(peer, ctx->req.aid,
+    CTL_MSG_HIGH_MARK, ctx, rde_dump_adjout_upcall,
+    rde_dump_done, rde_dump_throttled) == -1)
+ goto nomem;
+ break;
+ case IMSG_CTL_SHOW_RIB_PREFIX:
+ if (prefix_dump_new(peer, ctx->req.aid,
+    CTL_MSG_HIGH_MARK, ctx,
+    rde_dump_adjout_prefix_upcall,
+    rde_dump_done, rde_dump_throttled) == -1)
+ goto nomem;
+ break;
+ default:
+ fatalx("%s: unsupported imsg type", __func__);
+ }
+ return;
+ }
+done:
+ imsg_compose(ibuf_se_ctl, IMSG_CTL_END, 0, ctx->req.pid, -1, NULL, 0);
  LIST_REMOVE(ctx, entry);
  free(ctx);
+ return;
+
+nomem:
+ log_warn(__func__);
+ error = CTL_RES_NOMEM;
+ imsg_compose(ibuf_se_ctl, IMSG_CTL_RESULT, 0, ctx->req.pid, -1, &error,
+    sizeof(error));
+ return;
 }
 
 void
@@ -2404,24 +2465,92 @@ rde_dump_ctx_new(struct ctl_show_rib_req
 {
  struct rde_dump_ctx *ctx;
  struct rib_entry *re;
+ struct prefix *p;
  u_int error;
  u_int8_t hostplen;
  u_int16_t rid;
 
  if ((ctx = calloc(1, sizeof(*ctx))) == NULL) {
  nomem:
- log_warn("rde_dump_ctx_new");
+ log_warn(__func__);
  error = CTL_RES_NOMEM;
  imsg_compose(ibuf_se_ctl, IMSG_CTL_RESULT, 0, pid, -1, &error,
     sizeof(error));
  return;
  }
+
+ memcpy(&ctx->req, req, sizeof(struct ctl_show_rib_request));
+ ctx->req.pid = pid;
+ ctx->req.type = type;
+
  if (req->flags & (F_CTL_ADJ_IN | F_CTL_INVALID)) {
  rid = RIB_ADJ_IN;
  } else if (req->flags & F_CTL_ADJ_OUT) {
- rid = RIB_ADJ_OUT;
+ struct rde_peer *peer;
+
+ peer = peer_match(&req->neighbor, 0);
+ if (peer == NULL) {
+ log_warnx("%s: no peer found for adj-rib-out",
+    __func__);
+ error = CTL_RES_NOSUCHPEER;
+ imsg_compose(ibuf_se_ctl, IMSG_CTL_RESULT, 0, pid, -1,
+    &error, sizeof(error));
+ free(ctx);
+ return;
+ }
+ ctx->peerid = peer->conf.id;
+ switch (ctx->req.type) {
+ case IMSG_CTL_SHOW_RIB:
+ if (prefix_dump_new(peer, ctx->req.aid,
+    CTL_MSG_HIGH_MARK, ctx, rde_dump_adjout_upcall,
+    rde_dump_done, rde_dump_throttled) == -1)
+ goto nomem;
+ break;
+ case IMSG_CTL_SHOW_RIB_PREFIX:
+ if (req->flags & F_LONGER) {
+ if (prefix_dump_new(peer, ctx->req.aid,
+    CTL_MSG_HIGH_MARK, ctx,
+    rde_dump_adjout_prefix_upcall,
+    rde_dump_done, rde_dump_throttled) == -1)
+ goto nomem;
+ break;
+ }
+ switch (req->prefix.aid) {
+ case AID_INET:
+ case AID_VPN_IPv4:
+ hostplen = 32;
+ break;
+ case AID_INET6:
+ case AID_VPN_IPv6:
+ hostplen = 128;
+ break;
+ default:
+ fatalx("%s: unknown af", __func__);
+ }
+
+ do {
+ if (req->prefixlen == hostplen)
+ p = prefix_match(peer, &req->prefix);
+ else
+ p = prefix_lookup(peer, &req->prefix,
+    req->prefixlen);
+ if (p)
+ rde_dump_adjout_upcall(p, ctx);
+ } while ((peer = peer_match(&req->neighbor,
+    peer->conf.id)));
+
+ imsg_compose(ibuf_se_ctl, IMSG_CTL_END, 0, ctx->req.pid,
+    -1, NULL, 0);
+ free(ctx);
+ return;
+ default:
+ fatalx("%s: unsupported imsg type", __func__);
+ }
+
+ LIST_INSERT_HEAD(&rde_dump_h, ctx, entry);
+ return;
  } else if ((rid = rib_find(req->rib)) == RIB_NOTFOUND) {
- log_warnx("rde_dump_ctx_new: no such rib %s", req->rib);
+ log_warnx("%s: no such rib %s", __func__, req->rib);
  error = CTL_RES_NOSUCHRIB;
  imsg_compose(ibuf_se_ctl, IMSG_CTL_RESULT, 0, pid, -1, &error,
     sizeof(error));
@@ -2429,10 +2558,6 @@ rde_dump_ctx_new(struct ctl_show_rib_req
  return;
  }
 
- memcpy(&ctx->req, req, sizeof(struct ctl_show_rib_request));
- ctx->req.pid = pid;
- ctx->req.type = type;
- ctx->rid = rid;
  switch (ctx->req.type) {
  case IMSG_CTL_SHOW_NETWORK:
  if (rib_dump_new(rid, ctx->req.aid, CTL_MSG_HIGH_MARK, ctx,
@@ -2463,10 +2588,10 @@ rde_dump_ctx_new(struct ctl_show_rib_req
  hostplen = 128;
  break;
  default:
- fatalx("rde_dump_ctx_new: unknown af");
+ fatalx("%s: unknown af", __func__);
  }
  if (req->prefixlen == hostplen)
- re = rib_lookup(rib_byid(rid), &req->prefix);
+ re = rib_match(rib_byid(rid), &req->prefix);
  else
  re = rib_get(rib_byid(rid), &req->prefix,
     req->prefixlen);
@@ -2502,21 +2627,7 @@ rde_dump_ctx_terminate(pid_t pid)
 
  LIST_FOREACH(ctx, &rde_dump_h, entry) {
  if (ctx->req.pid == pid) {
- void (*upcall)(struct rib_entry *, void *);
- switch (ctx->req.type) {
- case IMSG_CTL_SHOW_NETWORK:
- upcall = network_dump_upcall;
- break;
- case IMSG_CTL_SHOW_RIB:
- upcall = rde_dump_upcall;
- break;
- case IMSG_CTL_SHOW_RIB_PREFIX:
- upcall = rde_dump_prefix_upcall;
- break;
- default:
- fatalx("%s: unsupported imsg type", __func__);
- }
- rib_dump_terminate(ctx->rid, ctx, upcall);
+ rib_dump_terminate(ctx);
  return;
  }
  }
@@ -2697,16 +2808,9 @@ rde_up_dump_upcall(struct rib_entry *re,
 }
 
 static void
-rde_up_flush_upcall(struct rib_entry *re, void *ptr)
+rde_up_flush_upcall(struct prefix *p, void *ptr)
 {
- struct rde_peer *peer = ptr;
- struct prefix *p, *np;
-
- LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) {
- if (peer != prefix_peer(p))
- continue;
- up_generate_updates(out_rules, peer, NULL, p);
- }
+ up_generate_updates(out_rules, prefix_peer(p), NULL, p);
 }
 
 static void
@@ -3011,18 +3115,16 @@ rde_reload_done(void)
  peer->reconf_out = 0;
  peer->reconf_rib = 0;
  if (peer->loc_rib_id != rib_find(peer->conf.rib)) {
- char *p = log_fmt_peer(&peer->conf);
- log_debug("rib change: reloading peer %s", p);
- free(p);
+ log_peer_info(&peer->conf, "rib change, reloading");
  peer->loc_rib_id = rib_find(peer->conf.rib);
  if (peer->loc_rib_id == RIB_NOTFOUND)
  fatalx("King Bula's peer met an unknown RIB");
  peer->reconf_rib = 1;
  softreconfig++;
- if (rib_dump_new(RIB_ADJ_OUT, AID_UNSPEC,
-    RDE_RUNNER_ROUNDS, peer, rde_up_flush_upcall,
+ if (prefix_dump_new(peer, AID_UNSPEC,
+    RDE_RUNNER_ROUNDS, NULL, rde_up_flush_upcall,
     rde_softreconfig_in_done, NULL) == -1)
- fatal("%s: rib_dump_new", __func__);
+ fatal("%s: prefix_dump_new", __func__);
  log_peer_info(&peer->conf, "flushing Adj-RIB-Out");
  continue;
  }
@@ -3197,7 +3299,7 @@ rde_softreconfig_in(struct rib_entry *re
 
  pt = re->prefix;
  pt_getaddr(pt, &prefix);
- LIST_FOREACH(p, &re->prefix_h, rib_l) {
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib) {
  asp = prefix_aspath(p);
  peer = prefix_peer(p);
  force_eval = 0;
@@ -3358,6 +3460,35 @@ peer_get(u_int32_t id)
 }
 
 struct rde_peer *
+peer_match(struct ctl_neighbor *n, u_int32_t peerid)
+{
+ struct rde_peer_head *head;
+ struct rde_peer *peer;
+ u_int32_t i = 0;
+
+ if (peerid != 0)
+ i = peerid & peertable.peer_hashmask;
+
+ while (i <= peertable.peer_hashmask) {
+ head = &peertable.peer_hashtbl[i];
+ LIST_FOREACH(peer, head, hash_l) {
+ /* skip peers until peerid is found */
+ if (peerid == peer->conf.id) {
+ peerid = 0;
+ continue;
+ }
+ if (peerid != 0)
+ continue;
+
+ if (rde_match_peer(peer, n))
+ return (peer);
+ }
+ i++;
+ }
+ return (NULL);
+}
+
+struct rde_peer *
 peer_add(u_int32_t id, struct peer_config *p_conf)
 {
  struct rde_peer_head *head;
@@ -3441,17 +3572,9 @@ peer_localaddrs(struct rde_peer *peer, s
 }
 
 static void
-peer_adjout_flush_upcall(struct rib_entry *re, void *arg)
+peer_adjout_clear_upcall(struct prefix *p, void *arg)
 {
- struct rde_peer *peer = arg;
- struct prefix *p, *np;
-
- LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) {
- if (peer != prefix_peer(p))
- continue;
- prefix_destroy(p);
- break; /* optimization, only one match per peer possible */
- }
+ prefix_adjout_destroy(p);
 }
 
 void
@@ -3472,9 +3595,9 @@ peer_up(u_int32_t id, struct session_up
  * There is a race condition when doing PEER_ERR -> PEER_DOWN.
  * So just do a full reset of the peer here.
  */
- if (rib_dump_new(RIB_ADJ_OUT, AID_UNSPEC, 0, peer,
-    peer_adjout_flush_upcall, NULL, NULL) == -1)
- fatal("%s: rib_dump_new", __func__);
+ if (prefix_dump_new(peer, AID_UNSPEC, 0, NULL,
+    peer_adjout_clear_upcall, NULL, NULL) == -1)
+ fatal("%s: prefix_dump_new", __func__);
  peer_flush(peer, AID_UNSPEC, 0);
  peer->prefix_cnt = 0;
  peer->state = PEER_DOWN;
@@ -3519,12 +3642,12 @@ peer_down(u_int32_t id)
  peer->remote_bgpid = 0;
  peer->state = PEER_DOWN;
  /* stop all pending dumps which may depend on this peer */
- rib_dump_terminate(peer->loc_rib_id, peer, rde_up_dump_upcall);
+ rib_dump_terminate(peer);
 
  /* flush Adj-RIB-Out for this peer */
- if (rib_dump_new(RIB_ADJ_OUT, AID_UNSPEC, 0, peer,
-    peer_adjout_flush_upcall, NULL, NULL) == -1)
- fatal("%s: rib_dump_new", __func__);
+ if (prefix_dump_new(peer, AID_UNSPEC, 0, NULL,
+    peer_adjout_clear_upcall, NULL, NULL) == -1)
+ fatal("%s: prefix_dump_new", __func__);
 
  peer_flush(peer, AID_UNSPEC, 0);
 
@@ -3553,7 +3676,7 @@ peer_flush_upcall(struct rib_entry *re,
 
  pt_getaddr(re->prefix, &addr);
  prefixlen = re->prefix->prefixlen;
- LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) {
+ LIST_FOREACH_SAFE(p, &re->prefix_h, entry.list.rib, np) {
  if (peer != prefix_peer(p))
  continue;
  if (staletime && p->lastchange > staletime)
@@ -3888,7 +4011,7 @@ network_dump_upcall(struct rib_entry *re
  struct bgpd_addr addr;
  struct rde_dump_ctx *ctx = ptr;
 
- LIST_FOREACH(p, &re->prefix_h, rib_l) {
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib) {
  asp = prefix_aspath(p);
  if (!(asp->flags & F_PREFIX_ANNOUNCED))
  continue;
@@ -3925,7 +4048,7 @@ network_flush_upcall(struct rib_entry *r
 
  pt_getaddr(re->prefix, &addr);
  prefixlen = re->prefix->prefixlen;
- LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) {
+ LIST_FOREACH_SAFE(p, &re->prefix_h, entry.list.rib, np) {
  if (prefix_peer(p) != peer)
  continue;
  asp = prefix_aspath(p);
@@ -3965,9 +4088,6 @@ rde_shutdown(void)
  for (i = 0; i <= peertable.peer_hashmask; i++)
  while ((p = LIST_FIRST(&peertable.peer_hashtbl[i])) != NULL)
  peer_down(p->conf.id);
-
- /* then since decision process is off, kill RIB_ADJ_OUT */
- rib_free(rib_byid(RIB_ADJ_OUT));
 
  /* free filters */
  filterlist_free(out_rules);
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.219
diff -u -p -r1.219 rde.h
--- rde.h 1 Jul 2019 07:07:08 -0000 1.219
+++ rde.h 10 Jul 2019 14:42:42 -0000
@@ -57,8 +57,7 @@ struct rib {
 };
 
 #define RIB_ADJ_IN 0
-#define RIB_ADJ_OUT 1
-#define RIB_LOC_START 2
+#define RIB_LOC_START 1
 #define RIB_NOTFOUND 0xffff
 
 struct rib_desc {
@@ -78,6 +77,7 @@ LIST_HEAD(aspath_list, aspath);
 LIST_HEAD(attr_list, attr);
 LIST_HEAD(aspath_head, rde_aspath);
 RB_HEAD(prefix_tree, prefix);
+RB_HEAD(prefix_index, prefix);
 
 struct rde_peer {
  LIST_ENTRY(rde_peer) hash_l; /* hash list over all peers */
@@ -87,6 +87,7 @@ struct rde_peer {
  struct bgpd_addr local_v4_addr;
  struct bgpd_addr local_v6_addr;
  struct capabilities capa;
+ struct prefix_index adj_rib_out;
  struct prefix_tree updates[AID_MAX];
  struct prefix_tree withdraws[AID_MAX];
  time_t staletime[AID_MAX];
@@ -306,8 +307,14 @@ struct pt_entry_vpn6 {
 };
 
 struct prefix {
- LIST_ENTRY(prefix) rib_l, nexthop_l;
- RB_ENTRY(prefix) entry;
+ union {
+ struct {
+ LIST_ENTRY(prefix) rib, nexthop;
+ } list;
+ struct {
+ RB_ENTRY(prefix) index, update;
+ } tree;
+ } entry;
  struct pt_entry *pt;
  struct rib_entry *re;
  struct rde_aspath *aspath;
@@ -317,12 +324,17 @@ struct prefix {
  time_t lastchange;
  u_int8_t validation_state;
  u_int8_t nhflags;
- u_int8_t flags;
  u_int8_t eor;
-#define PREFIX_FLAG_WITHDRAW 0x01
-#define PREFIX_FLAG_UPDATE 0x02
+ u_int8_t flags;
+#define PREFIX_FLAG_WITHDRAW 0x01 /* queued for withdraw */
+#define PREFIX_FLAG_UPDATE 0x02 /* queued for update */
+#define PREFIX_FLAG_DEAD 0x04 /* locked but removed */
+#define PREFIX_FLAG_MASK 0x07 /* mask for the three prefix types */
+#define PREFIX_NEXTHOP_LINKED 0x40 /* prefix is linked onto nexthop list */
+#define PREFIX_FLAG_LOCKED 0x80 /* locked by rib walker */
 };
 
+/* possible states for nhflags */
 #define NEXTHOP_SELF 0x01
 #define NEXTHOP_REJECT 0x02
 #define NEXTHOP_BLACKHOLE 0x04
@@ -356,6 +368,7 @@ u_int32_t rde_local_as(void);
 int rde_noevaluate(void);
 int rde_decisionflags(void);
 int rde_as4byte(struct rde_peer *);
+struct rde_peer *peer_get(u_int32_t);
 
 /* rde_attr.c */
 int attr_write(void *, u_int16_t, u_int8_t, u_int8_t, void *,
@@ -395,6 +408,7 @@ u_char *aspath_override(struct aspath *
     u_int16_t *);
 int aspath_lenmatch(struct aspath *, enum aslen_spec, u_int);
 
+/* rde_community.c */
 int community_match(struct rde_community *, struct community *,
     struct rde_peer *);
 int community_set(struct rde_community *, struct community *,
@@ -499,15 +513,14 @@ struct rib_desc *rib_desc(struct rib *);
 void rib_free(struct rib *);
 void rib_shutdown(void);
 struct rib_entry *rib_get(struct rib *, struct bgpd_addr *, int);
-struct rib_entry *rib_lookup(struct rib *, struct bgpd_addr *);
+struct rib_entry *rib_match(struct rib *, struct bgpd_addr *);
 int rib_dump_pending(void);
 void rib_dump_runner(void);
 int rib_dump_new(u_int16_t, u_int8_t, unsigned int, void *,
     void (*)(struct rib_entry *, void *),
     void (*)(void *, u_int8_t),
     int (*)(void *));
-void rib_dump_terminate(u_int16_t, void *,
-    void (*)(struct rib_entry *, void *));
+void rib_dump_terminate(void *);
 
 static inline struct rib *
 re_rib(struct rib_entry *re)
@@ -540,13 +553,20 @@ void path_put(struct rde_aspath *);
 #define PREFIX_SIZE(x) (((x) + 7) / 8 + 1)
 struct prefix *prefix_get(struct rib *, struct rde_peer *,
     struct bgpd_addr *, int);
+struct prefix *prefix_lookup(struct rde_peer *, struct bgpd_addr *, int);
+struct prefix *prefix_match(struct rde_peer *, struct bgpd_addr *);
 int prefix_remove(struct rib *, struct rde_peer *,
     struct bgpd_addr *, int);
 void prefix_add_eor(struct rde_peer *, u_int8_t);
-void prefix_update(struct rib *, struct rde_peer *,
-    struct bgpd_addr *, int);
-int prefix_withdraw(struct rib *, struct rde_peer *,
-    struct bgpd_addr *, int);
+int prefix_update(struct rde_peer *, struct filterstate *,
+    struct bgpd_addr *, int, u_int8_t);
+int prefix_withdraw(struct rde_peer *, struct bgpd_addr *, int);
+void prefix_adjout_destroy(struct prefix *p);
+void prefix_adjout_dump(struct rde_peer *, void *,
+    void (*)(struct prefix *, void *));
+int prefix_dump_new(struct rde_peer *, u_int8_t, unsigned int,
+    void *, void (*)(struct prefix *, void *),
+        void (*)(void *, u_int8_t), int (*)(void *));
 int prefix_write(u_char *, int, struct bgpd_addr *, u_int8_t, int);
 int prefix_writebuf(struct ibuf *, struct bgpd_addr *, u_int8_t);
 struct prefix *prefix_bypeer(struct rib_entry *, struct rde_peer *);
Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.74
diff -u -p -r1.74 rde_decide.c
--- rde_decide.c 21 Jan 2019 02:07:56 -0000 1.74
+++ rde_decide.c 20 Jun 2019 19:50:25 -0000
@@ -245,7 +245,7 @@ prefix_evaluate(struct prefix *p, struct
  if (re_rib(re)->flags & F_RIB_NOEVALUATE || rde_noevaluate()) {
  /* decision process is turned off */
  if (p != NULL)
- LIST_INSERT_HEAD(&re->prefix_h, p, rib_l);
+ LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
  if (re->active != NULL)
  re->active = NULL;
  return;
@@ -253,15 +253,18 @@ prefix_evaluate(struct prefix *p, struct
 
  if (p != NULL) {
  if (LIST_EMPTY(&re->prefix_h))
- LIST_INSERT_HEAD(&re->prefix_h, p, rib_l);
+ LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
  else {
- LIST_FOREACH(xp, &re->prefix_h, rib_l) {
+ LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) {
  if (prefix_cmp(p, xp) > 0) {
- LIST_INSERT_BEFORE(xp, p, rib_l);
+ LIST_INSERT_BEFORE(xp, p,
+    entry.list.rib);
  break;
- } else if (LIST_NEXT(xp, rib_l) == NULL) {
+ } else if (LIST_NEXT(xp, entry.list.rib) ==
+    NULL) {
  /* if xp last element ... */
- LIST_INSERT_AFTER(xp, p, rib_l);
+ LIST_INSERT_AFTER(xp, p,
+    entry.list.rib);
  break;
  }
  }
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
retrieving revision 1.198
diff -u -p -r1.198 rde_rib.c
--- rde_rib.c 1 Jul 2019 14:47:56 -0000 1.198
+++ rde_rib.c 10 Jul 2019 14:27:45 -0000
@@ -52,8 +52,10 @@ RB_GENERATE(rib_tree, rib_entry, rib_e,
 struct rib_context {
  LIST_ENTRY(rib_context) entry;
  struct rib_entry *ctx_re;
- u_int16_t ctx_rib_id;
- void (*ctx_upcall)(struct rib_entry *, void *);
+ struct prefix *ctx_p;
+ u_int32_t ctx_id;
+ void (*ctx_rib_call)(struct rib_entry *, void *);
+ void (*ctx_prefix_call)(struct prefix *, void *);
  void (*ctx_done)(void *, u_int8_t);
  int (*ctx_throttle)(void *);
  void *ctx_arg;
@@ -69,6 +71,7 @@ static int prefix_add(struct bgpd_addr *
 static int prefix_move(struct prefix *, struct rde_peer *,
     struct rde_aspath *, struct rde_community *,
     struct nexthop *, u_int8_t, u_int8_t);
+static void prefix_dump_r(struct rib_context *);
 
 static inline struct rib_entry *
 re_lock(struct rib_entry *re)
@@ -128,7 +131,7 @@ rib_new(char *name, u_int rtableid, u_in
  rib_size = id + 1;
  }
 
- bzero(&ribs[id], sizeof(struct rib_desc));
+ memset(&ribs[id], 0, sizeof(struct rib_desc));
  strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
  RB_INIT(rib_tree(&ribs[id].rib));
  ribs[id].state = RECONF_REINIT;
@@ -196,7 +199,7 @@ rib_free(struct rib *rib)
  */
  while ((p = LIST_FIRST(&re->prefix_h))) {
  struct rde_aspath *asp = prefix_aspath(p);
- np = LIST_NEXT(p, rib_l);
+ np = LIST_NEXT(p, entry.list.rib);
  if (asp && asp->pftableid) {
  struct bgpd_addr addr;
 
@@ -215,7 +218,7 @@ rib_free(struct rib *rib)
  rd = &ribs[rib->id];
  filterlist_free(rd->in_rules_tmp);
  filterlist_free(rd->in_rules);
- bzero(rd, sizeof(struct rib_desc));
+ memset(rd, 0, sizeof(struct rib_desc));
 }
 
 void
@@ -235,7 +238,7 @@ rib_shutdown(void)
  struct rib_desc *rd = &ribs[id];
  filterlist_free(rd->in_rules_tmp);
  filterlist_free(rd->in_rules);
- bzero(rd, sizeof(struct rib_desc));
+ memset(rd, 0, sizeof(struct rib_desc));
  }
  free(ribs);
 }
@@ -247,7 +250,7 @@ rib_get(struct rib *rib, struct bgpd_add
  struct pt_entry *pte;
 
  pte = pt_fill(prefix, prefixlen);
- bzero(&xre, sizeof(xre));
+ memset(&xre, 0, sizeof(xre));
  xre.prefix = pte;
 
  re = RB_FIND(rib_tree, rib_tree(rib), &xre);
@@ -258,7 +261,7 @@ rib_get(struct rib *rib, struct bgpd_add
 }
 
 struct rib_entry *
-rib_lookup(struct rib *rib, struct bgpd_addr *addr)
+rib_match(struct rib *rib, struct bgpd_addr *addr)
 {
  struct rib_entry *re;
  int i;
@@ -281,7 +284,7 @@ rib_lookup(struct rib *rib, struct bgpd_
  }
  break;
  default:
- fatalx("rib_lookup: unknown af");
+ fatalx("%s: unknown af", __func__);
  }
  return (NULL);
 }
@@ -367,9 +370,9 @@ rib_dump_r(struct rib_context *ctx)
  struct rib *rib;
  unsigned int i;
 
- rib = rib_byid(ctx->ctx_rib_id);
+ rib = rib_byid(ctx->ctx_id);
  if (rib == NULL)
- fatalx("%s: rib id %u gone", __func__, ctx->ctx_rib_id);
+ fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);
 
  if (ctx->ctx_re == NULL)
  re = RB_MIN(rib_tree, rib_tree(rib));
@@ -378,9 +381,9 @@ rib_dump_r(struct rib_context *ctx)
 
  for (i = 0; re != NULL; re = next) {
  next = RB_NEXT(rib_tree, unused, re);
- if (re->rib_id != ctx->ctx_rib_id)
+ if (re->rib_id != ctx->ctx_id)
  fatalx("%s: Unexpected RIB %u != %u.", __func__,
-    re->rib_id, ctx->ctx_rib_id);
+    re->rib_id, ctx->ctx_id);
  if (ctx->ctx_aid != AID_UNSPEC &&
     ctx->ctx_aid != re->prefix->aid)
  continue;
@@ -391,7 +394,7 @@ rib_dump_r(struct rib_context *ctx)
  re_lock(re);
  return;
  }
- ctx->ctx_upcall(re, ctx->ctx_arg);
+ ctx->ctx_rib_call(re, ctx->ctx_arg);
  }
 
  if (ctx->ctx_done)
@@ -422,7 +425,10 @@ rib_dump_runner(void)
  LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
  if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
  continue;
- rib_dump_r(ctx);
+ if (ctx->ctx_rib_call != NULL)
+ rib_dump_r(ctx);
+ else
+ prefix_dump_r(ctx);
  }
 }
 
@@ -432,7 +438,7 @@ rib_dump_abort(u_int16_t id)
  struct rib_context *ctx, *next;
 
  LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
- if (id != ctx->ctx_rib_id)
+ if (id != ctx->ctx_id)
  continue;
  if (ctx->ctx_done)
  ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
@@ -450,11 +456,11 @@ rib_dump_new(u_int16_t id, u_int8_t aid,
 
  if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
  return -1;
- ctx->ctx_rib_id = id;
+ ctx->ctx_id = id;
  ctx->ctx_aid = aid;
  ctx->ctx_count = count;
  ctx->ctx_arg = arg;
- ctx->ctx_upcall = upcall;
+ ctx->ctx_rib_call = upcall;
  ctx->ctx_done = done;
  ctx->ctx_throttle = throttle;
 
@@ -468,14 +474,12 @@ rib_dump_new(u_int16_t id, u_int8_t aid,
 }
 
 void
-rib_dump_terminate(u_int16_t id, void *arg,
-    void (*upcall)(struct rib_entry *, void *))
+rib_dump_terminate(void *arg)
 {
  struct rib_context *ctx, *next;
 
  LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
- if (id != ctx->ctx_rib_id || ctx->ctx_arg != arg ||
-    ctx->ctx_upcall != upcall)
+ if (ctx->ctx_arg != arg)
  continue;
  if (ctx->ctx_done)
  ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
@@ -610,15 +614,6 @@ path_update(struct rib *rib, struct rde_
  p->validation_state = vstate;
  return (2);
  }
- if (p->flags) {
- struct prefix_tree *prefix_head;
- /* prefix is a pending update */
- prefix_head = p->flags & PREFIX_FLAG_UPDATE ?
-    &peer->updates[prefix->aid] :
-    &peer->withdraws[prefix->aid];
- RB_REMOVE(prefix_tree, prefix_head, p);
- p->flags = 0;
- }
  }
 
  /*
@@ -881,7 +876,14 @@ prefix_cmp(struct prefix *a, struct pref
  return pt_prefix_cmp(a->pt, b->pt);
 }
 
-RB_GENERATE(prefix_tree, prefix, entry, prefix_cmp)
+static inline int
+prefix_index_cmp(struct prefix *a, struct prefix *b)
+{
+ return pt_prefix_cmp(a->pt, b->pt);
+}
+
+RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
+RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)
 
 /*
  * search for specified prefix of a peer. Returns NULL if not found.
@@ -899,6 +901,52 @@ prefix_get(struct rib *rib, struct rde_p
 }
 
 /*
+ * lookup prefix in the peer prefix_index. Returns NULL if not found.
+ */
+struct prefix *
+prefix_lookup(struct rde_peer *peer, struct bgpd_addr *prefix,
+    int prefixlen)
+{
+ struct prefix xp;
+ struct pt_entry *pte;
+
+ memset(&xp, 0, sizeof(xp));
+ pte = pt_fill(prefix, prefixlen);
+ xp.pt = pte;
+
+ return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
+}
+
+struct prefix *
+prefix_match(struct rde_peer *peer, struct bgpd_addr *addr)
+{
+ struct prefix *p;
+ int i;
+
+ switch (addr->aid) {
+ case AID_INET:
+ case AID_VPN_IPv4:
+ for (i = 32; i >= 0; i--) {
+ p = prefix_lookup(peer, addr, i);
+ if (p != NULL)
+ return p;
+ }
+ break;
+ case AID_INET6:
+ case AID_VPN_IPv6:
+ for (i = 128; i >= 0; i--) {
+ p = prefix_lookup(peer, addr, i);
+ if (p != NULL)
+ return p;
+ }
+ break;
+ default:
+ fatalx("%s: unknown af", __func__);
+ }
+ return NULL;
+}
+
+/*
  * Adds or updates a prefix.
  */
 static int
@@ -936,8 +984,8 @@ prefix_move(struct prefix *p, struct rde
  np->aspath = path_ref(asp);
  np->communities = communities_ref(comm);
  np->peer = peer;
- np->pt = p->pt; /* skip refcnt update since ref is moved */
  np->re = p->re;
+ np->pt = p->pt; /* skip refcnt update since ref is moved */
  np->validation_state = vstate;
  np->nhflags = nhflags;
  np->nexthop = nexthop_ref(nexthop);
@@ -957,7 +1005,7 @@ prefix_move(struct prefix *p, struct rde
  * This is safe because we create a new prefix and so the change
  * is noticed by prefix_evaluate().
  */
- LIST_REMOVE(p, rib_l);
+ LIST_REMOVE(p, entry.list.rib);
  prefix_evaluate(np, np->re);
 
  /* remove old prefix node */
@@ -1020,26 +1068,97 @@ prefix_add_eor(struct rde_peer *peer, u_
  if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
  /* no need to add if EoR marker already present */
  prefix_free(p);
+ /* EOR marker is not inserted into the adj_rib_out index */
 }
 
 /*
  * Put a prefix from the Adj-RIB-Out onto the update queue.
  */
-void
-prefix_update(struct rib *rib, struct rde_peer *peer,
-    struct bgpd_addr *prefix, int prefixlen)
+int
+prefix_update(struct rde_peer *peer, struct filterstate *state,
+    struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate)
 {
+ struct prefix_tree *prefix_head = NULL;
+ struct rde_aspath *asp;
+ struct rde_community *comm;
  struct prefix *p;
+ int created = 0;
 
- p = prefix_get(rib, peer, prefix, prefixlen);
- if (p == NULL) /* Got a dummy withdrawn request. */
- return;
+ if ((p = prefix_lookup(peer, prefix, prefixlen)) != NULL) {
+ /* prefix is already in the Adj-RIB-Out */
+ if (p->flags & PREFIX_FLAG_WITHDRAW) {
+ created = 1; /* consider this a new entry */
+ peer->up_wcnt--;
+ prefix_head = &peer->withdraws[prefix->aid];
+ RB_REMOVE(prefix_tree, prefix_head, p);
+ } else if (p->flags & PREFIX_FLAG_DEAD) {
+ created = 1; /* consider this a new entry */
+ } else {
+ if (prefix_nhflags(p) == state->nhflags &&
+    prefix_nexthop(p) == state->nexthop &&
+    communities_equal(&state->communities,
+    prefix_communities(p)) &&
+    path_compare(&state->aspath, prefix_aspath(p)) ==
+    0) {
+ /* nothing changed */
+ p->validation_state = vstate;
+ p->lastchange = time(NULL);
+ return 0;
+ }
+
+ if (p->flags & PREFIX_FLAG_UPDATE) {
+ /* created = 0 so up_nlricnt is not increased */
+ prefix_head = &peer->updates[prefix->aid];
+ RB_REMOVE(prefix_tree, prefix_head, p);
+ }
+ }
+ /* unlink from aspath and remove nexthop ref */
+ nexthop_unref(p->nexthop);
+ communities_unref(p->communities);
+ path_unref(p->aspath);
+ p->flags &= ~PREFIX_FLAG_MASK;
+
+ /* peer and pt remain */
+ } else {
+ p = prefix_alloc();
+ created = 1;
+
+ p->pt = pt_get(prefix, prefixlen);
+ if (p->pt == NULL)
+ fatalx("%s: update for non existing prefix", __func__);
+ pt_ref(p->pt);
+ p->peer = peer;
+
+ if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
+ fatalx("%s: RB index invariant violated", __func__);
+ }
+
+ if ((asp = path_lookup(&state->aspath)) == NULL) {
+ /* Path not available, create and link a new one. */
+ asp = path_copy(path_get(), &state->aspath);
+ path_link(asp);
+ }
+
+ if ((comm = communities_lookup(&state->communities)) == NULL) {
+ /* Communities not available, create and link a new one. */
+ comm = communities_link(&state->communities);
+ }
+
+ p->aspath = path_ref(asp);
+ p->communities = communities_ref(comm);
+ p->nexthop = nexthop_ref(state->nexthop);
+ p->nhflags = state->nhflags;
 
- if (p->flags != 0)
+ p->validation_state = vstate;
+ p->lastchange = time(NULL);
+
+ if (p->flags & PREFIX_FLAG_MASK)
  fatalx("%s: bad flags %x", __func__, p->flags);
- p->flags = PREFIX_FLAG_UPDATE;
+ p->flags |= PREFIX_FLAG_UPDATE;
  if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL)
  fatalx("%s: RB tree invariant violated", __func__);
+
+ return created;
 }
 
 /*
@@ -1047,15 +1166,19 @@ prefix_update(struct rib *rib, struct rd
  * the prefix in the RIB linked to the peer withdraw list.
  */
 int
-prefix_withdraw(struct rib *rib, struct rde_peer *peer,
-    struct bgpd_addr *prefix, int prefixlen)
+prefix_withdraw(struct rde_peer *peer, struct bgpd_addr *prefix, int prefixlen)
 {
  struct prefix *p;
 
- p = prefix_get(rib, peer, prefix, prefixlen);
+ p = prefix_lookup(peer, prefix, prefixlen);
  if (p == NULL) /* Got a dummy withdrawn request. */
  return (0);
 
+ /* remove nexthop ref ... */
+ nexthop_unref(p->nexthop);
+ p->nexthop = NULL;
+ p->nhflags = 0;
+
  /* unlink from aspath ...*/
  path_unref(p->aspath);
  p->aspath = NULL;
@@ -1063,29 +1186,181 @@ prefix_withdraw(struct rib *rib, struct
  /* ... communities ... */
  communities_unref(p->communities);
  p->communities = NULL;
+ /* and unlink from aspath */
+ path_unref(p->aspath);
+ p->aspath = NULL;
+ /* re already NULL */
 
- /* ... and nexthop but keep the re link */
- nexthop_unlink(p);
- nexthop_unref(p->nexthop);
- p->nexthop = NULL;
- p->nhflags = 0;
- /* re link still exists */
+ p->lastchange = time(NULL);
 
- if (p->flags) {
+ if (p->flags & PREFIX_FLAG_MASK) {
  struct prefix_tree *prefix_head;
  /* p is a pending update or withdraw, remove first */
  prefix_head = p->flags & PREFIX_FLAG_UPDATE ?
     &peer->updates[prefix->aid] :
     &peer->withdraws[prefix->aid];
  RB_REMOVE(prefix_tree, prefix_head, p);
- p->flags = 0;
+ p->flags &= ~PREFIX_FLAG_MASK;
  }
- p->flags = PREFIX_FLAG_WITHDRAW;
+ p->flags |= PREFIX_FLAG_WITHDRAW;
  if (RB_INSERT(prefix_tree, &peer->withdraws[prefix->aid], p) != NULL)
  fatalx("%s: RB tree invariant violated", __func__);
  return (1);
 }
 
+static inline void
+prefix_lock(struct prefix *p)
+{
+ if (p->flags & PREFIX_FLAG_LOCKED)
+ fatalx("%s: locking locked prefix", __func__);
+ p->flags |= PREFIX_FLAG_LOCKED;
+}
+
+static inline void
+prefix_unlock(struct prefix *p)
+{
+ if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
+ fatalx("%s: unlocking unlocked prefix", __func__);
+ p->flags &= ~PREFIX_FLAG_LOCKED;
+}
+
+static inline int
+prefix_is_locked(struct prefix *p)
+{
+ return (p->flags & PREFIX_FLAG_LOCKED) != 0;
+}
+
+static inline int
+prefix_is_dead(struct prefix *p)
+{
+ return (p->flags & PREFIX_FLAG_DEAD) != 0;
+}
+
+static struct prefix *
+prefix_restart(struct rib_context *ctx)
+{
+ struct prefix *p;
+
+ p = ctx->ctx_p;
+ prefix_unlock(p);
+
+ if (prefix_is_dead(p)) {
+ struct prefix *next;
+
+ next = RB_NEXT(prefix_index, unused, p);
+ prefix_adjout_destroy(p);
+ p = next;
+ }
+ return p;
+}
+
+void
+prefix_adjout_destroy(struct prefix *p)
+{
+ struct rde_peer *peer = prefix_peer(p);
+
+ if (p->eor) {
+ /* EOR marker is not linked in the index */
+ prefix_free(p);
+ return;
+ }
+
+ if (p->flags & PREFIX_FLAG_WITHDRAW)
+ RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
+ else if (p->flags & PREFIX_FLAG_UPDATE)
+ RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
+ /* nothing needs to be done for PREFIX_FLAG_DEAD */
+ p->flags &= ~PREFIX_FLAG_MASK;
+
+
+ if (prefix_is_locked(p)) {
+ /* remove nexthop ref ... */
+ nexthop_unref(p->nexthop);
+ p->nexthop = NULL;
+ /* ... communities ... */
+ communities_unref(p->communities);
+ p->communities = NULL;
+ /* and unlink from aspath */
+ path_unref(p->aspath);
+ p->aspath = NULL;
+ p->nhflags = 0;
+ /* re already NULL */
+
+ /* finally mark prefix dead */
+ p->flags |= PREFIX_FLAG_DEAD;
+ return;
+ }
+
+ RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
+
+ prefix_unlink(p);
+ prefix_free(p);
+}
+
+static void
+prefix_dump_r(struct rib_context *ctx)
+{
+ struct prefix *p, *next;
+ struct rde_peer *peer;
+ unsigned int i;
+
+ if ((peer = peer_get(ctx->ctx_id)) == NULL)
+ goto done;
+
+ if (ctx->ctx_p == NULL)
+ p = RB_MIN(prefix_index, &peer->adj_rib_out);
+ else
+ p = prefix_restart(ctx);
+
+ for (i = 0; p != NULL; p = next) {
+ next = RB_NEXT(prefix_index, unused, p);
+ if (prefix_is_dead(p))
+ continue;
+ if (ctx->ctx_aid != AID_UNSPEC &&
+    ctx->ctx_aid != p->pt->aid)
+ continue;
+ if (ctx->ctx_count && i++ >= ctx->ctx_count &&
+    !prefix_is_locked(p)) {
+ /* store and lock last element */
+ ctx->ctx_p = p;
+ prefix_lock(p);
+ return;
+ }
+ ctx->ctx_prefix_call(p, ctx->ctx_arg);
+ }
+
+done:
+ if (ctx->ctx_done)
+ ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
+ LIST_REMOVE(ctx, entry);
+ free(ctx);
+}
+
+int
+prefix_dump_new(struct rde_peer *peer, u_int8_t aid, unsigned int count,
+    void *arg, void (*upcall)(struct prefix *, void *),
+    void (*done)(void *, u_int8_t), int (*throttle)(void *))
+{
+ struct rib_context *ctx;
+
+ if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
+ return -1;
+ ctx->ctx_id = peer->conf.id;
+ ctx->ctx_aid = aid;
+ ctx->ctx_count = count;
+ ctx->ctx_arg = arg;
+ ctx->ctx_prefix_call = upcall;
+ ctx->ctx_done = done;
+ ctx->ctx_throttle = throttle;
+
+ LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
+
+ /* requested a sync traversal */
+ if (count == 0)
+ prefix_dump_r(ctx);
+
+ return 0;
+}
 
 /* dump a prefix into specified buffer */
 int
@@ -1205,7 +1480,7 @@ prefix_bypeer(struct rib_entry *re, stru
 {
  struct prefix *p;
 
- LIST_FOREACH(p, &re->prefix_h, rib_l)
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib)
  if (prefix_peer(p) == peer)
  return (p);
  return (NULL);
@@ -1237,7 +1512,7 @@ prefix_updateall(struct prefix *p, enum
  }
 
  /* redo the route decision */
- LIST_REMOVE(p, rib_l);
+ LIST_REMOVE(p, entry.list.rib);
  /*
  * If the prefix is the active one remove it first,
  * this has to be done because we can not detect when
@@ -1255,6 +1530,10 @@ prefix_updateall(struct prefix *p, enum
 void
 prefix_destroy(struct prefix *p)
 {
+ /* make route decision */
+ LIST_REMOVE(p, entry.list.rib);
+ prefix_evaluate(NULL, p->re);
+
  prefix_unlink(p);
  prefix_free(p);
 }
@@ -1290,13 +1569,6 @@ prefix_unlink(struct prefix *p)
 {
  struct rib_entry *re = p->re;
 
- if (p->eor) /* nothing to unlink for EoR markers */
- return;
-
- /* make route decision */
- LIST_REMOVE(p, rib_l);
- prefix_evaluate(NULL, re);
-
  /* destroy all references to other objects */
  nexthop_unlink(p);
  nexthop_unref(p->nexthop);
@@ -1310,7 +1582,7 @@ prefix_unlink(struct prefix *p)
  p->re = NULL;
  p->pt = NULL;
 
- if (rib_empty(re))
+ if (re && rib_empty(re))
  rib_remove(re);
 
  /*
@@ -1319,7 +1591,7 @@ prefix_unlink(struct prefix *p)
  */
 }
 
-/* alloc and bzero new entry. May not fail. */
+/* alloc and zero new entry. May not fail. */
 static struct prefix *
 prefix_alloc(void)
 {
@@ -1430,7 +1702,7 @@ nexthop_runner(void)
  p = nh->next_prefix;
  for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
  prefix_updateall(p, nh->state, nh->oldstate);
- p = LIST_NEXT(p, nexthop_l);
+ p = LIST_NEXT(p, entry.list.nexthop);
  }
 
  /* prep for next run, if not finished readd to tail of queue */
@@ -1540,22 +1812,21 @@ nexthop_link(struct prefix *p)
  if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
  return;
 
- LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, nexthop_l);
+ p->flags |= PREFIX_NEXTHOP_LINKED;
+ LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
 }
 
 void
 nexthop_unlink(struct prefix *p)
 {
- if (p->nexthop == NULL)
- return;
-
- if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
+ if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
  return;
 
  if (p == p->nexthop->next_prefix)
- p->nexthop->next_prefix = LIST_NEXT(p, nexthop_l);
+ p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
 
- LIST_REMOVE(p, nexthop_l);
+ p->flags &= ~PREFIX_NEXTHOP_LINKED;
+ LIST_REMOVE(p, entry.list.nexthop);
 }
 
 struct nexthop *
Index: rde_update.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v
retrieving revision 1.119
diff -u -p -r1.119 rde_update.c
--- rde_update.c 2 Jul 2019 12:07:00 -0000 1.119
+++ rde_update.c 4 Jul 2019 08:59:13 -0000
@@ -143,8 +143,7 @@ withdraw:
 
  /* withdraw prefix */
  pt_getaddr(old->pt, &addr);
- if (prefix_withdraw(&ribs[RIB_ADJ_OUT].rib, peer, &addr,
-    old->pt->prefixlen) == 1)
+ if (prefix_withdraw(peer, &addr, old->pt->prefixlen) == 1)
  peer->up_wcnt++;
  } else {
  switch (up_test_update(peer, new)) {
@@ -165,13 +164,11 @@ withdraw:
  }
 
  pt_getaddr(new->pt, &addr);
- if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr,
-    new->pt->prefixlen, prefix_vstate(new)) != 2) {
- /* only send update if path changed */
- prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr,
-    new->pt->prefixlen);
+
+ /* only send update if path changed */
+ if (prefix_update(peer, &state, &addr, new->pt->prefixlen,
+    prefix_vstate(new)) == 1)
  peer->up_nlricnt++;
- }
 
  rde_filterstate_clean(&state);
  }
@@ -229,11 +226,8 @@ up_generate_default(struct filter_head *
  return;
  }
 
- if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr, 0,
-    ROA_NOTFOUND) != 2) {
- prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr, 0);
+ if (prefix_update(peer, &state, &addr, 0, ROA_NOTFOUND) == 1)
  peer->up_nlricnt++;
- }
 
  /* no longer needed */
  rde_filterstate_clean(&state);
@@ -576,8 +570,13 @@ up_is_eor(struct rde_peer *peer, u_int8_
 
  p = RB_MIN(prefix_tree, &peer->updates[aid]);
  if (p != NULL && p->eor) {
+ /*
+ * Need to remove eor from update tree because
+ * prefix_adjout_destroy() can't handle that.
+ */
  RB_REMOVE(prefix_tree, &peer->updates[aid], p);
- prefix_destroy(p);
+ p->flags &= ~PREFIX_FLAG_MASK;
+ prefix_adjout_destroy(p);
  return 1;
  }
  return 0;
@@ -616,11 +615,11 @@ up_dump_prefix(u_char *buf, int len, str
 
  /* prefix sent, remove from list and clear flag */
  RB_REMOVE(prefix_tree, prefix_head, p);
- p->flags = 0;
+ p->flags &= ~PREFIX_FLAG_MASK;
 
  if (withdraw) {
  /* prefix no longer needed, remove it */
- prefix_destroy(p);
+ prefix_adjout_destroy(p);
  peer->up_wcnt--;
  peer->prefix_sent_withdraw++;
  } else {


Re: bgpd adj-rib-out rewrite

Job Snijders
On Wed, Jul 10, 2019 at 10:08:38PM +0200, Claudio Jeker wrote:

> This diff is a bit of a monster. It changes the Adj-RIB-Out to be a
> peer-specific set of RB trees instead of using a rib in the original
> sense. The reason for this is that the more peers a system has, the
> more elements end up being linked into the Adj-RIB-Out, and many
> operations do linear searches, which does not scale.
>
> I did some testing with 4000 peers sending 1 prefix each, which are
> then sent back to all peers (resulting in 16 million updates being
> put in the Adj-RIB-Out). Without this diff the system takes about 1h
> to bring up all sessions. With the diff the system finishes in
> around 5min.
>
> To avoid increasing the memory footprint, struct prefix now uses a
> union for the list and RB tree entries. Additionally, the rib dump
> runner was adjusted so that it also works with the Adj-RIB-Out. The
> output of bgpctl show rib out changed a bit since it now dumps one
> peer after the other; apart from that the behaviour should be the
> same.

Some testing indicates that things indeed seem faster.

More reports from the real world would be appreciated, to better
understand the specific performance characteristics this change
introduces in various deployment models, but that can happen in tree.

OK job@

Kind regards,

Job


Re: bgpd adj-rib-out rewrite

Sebastian Benoit
Claudio Jeker ([hidden email]) on 2019.07.10 22:08:38 +0200:

> This diff is a bit of a monster. It changes the Adj-RIB-Out to be a
> peer-specific set of RB trees instead of using a rib in the original
> sense. The reason for this is that the more peers a system has, the
> more elements end up being linked into the Adj-RIB-Out, and many
> operations do linear searches, which does not scale.
>
> I did some testing with 4000 peers sending 1 prefix each, which are
> then sent back to all peers (resulting in 16 million updates being
> put in the Adj-RIB-Out). Without this diff the system takes about 1h
> to bring up all sessions. With the diff the system finishes in
> around 5min.
>
> To avoid increasing the memory footprint, struct prefix now uses a
> union for the list and RB tree entries. Additionally, the rib dump
> runner was adjusted so that it also works with the Adj-RIB-Out. The
> output of bgpctl show rib out changed a bit since it now dumps one
> peer after the other; apart from that the behaviour should be the
> same.

ok benno@

You have to fix up the regress tests too.

>
> Please test
> --
> :wq Claudio
> -    peer_adjout_flush_upcall, NULL, NULL) == -1)
> - fatal("%s: rib_dump_new", __func__);
> + if (prefix_dump_new(peer, AID_UNSPEC, 0, NULL,
> +    peer_adjout_clear_upcall, NULL, NULL) == -1)
> + fatal("%s: prefix_dump_new", __func__);
>  
>   peer_flush(peer, AID_UNSPEC, 0);
>  
> @@ -3553,7 +3676,7 @@ peer_flush_upcall(struct rib_entry *re,
>  
>   pt_getaddr(re->prefix, &addr);
>   prefixlen = re->prefix->prefixlen;
> - LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) {
> + LIST_FOREACH_SAFE(p, &re->prefix_h, entry.list.rib, np) {
>   if (peer != prefix_peer(p))
>   continue;
>   if (staletime && p->lastchange > staletime)
> @@ -3888,7 +4011,7 @@ network_dump_upcall(struct rib_entry *re
>   struct bgpd_addr addr;
>   struct rde_dump_ctx *ctx = ptr;
>  
> - LIST_FOREACH(p, &re->prefix_h, rib_l) {
> + LIST_FOREACH(p, &re->prefix_h, entry.list.rib) {
>   asp = prefix_aspath(p);
>   if (!(asp->flags & F_PREFIX_ANNOUNCED))
>   continue;
> @@ -3925,7 +4048,7 @@ network_flush_upcall(struct rib_entry *r
>  
>   pt_getaddr(re->prefix, &addr);
>   prefixlen = re->prefix->prefixlen;
> - LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) {
> + LIST_FOREACH_SAFE(p, &re->prefix_h, entry.list.rib, np) {
>   if (prefix_peer(p) != peer)
>   continue;
>   asp = prefix_aspath(p);
> @@ -3965,9 +4088,6 @@ rde_shutdown(void)
>   for (i = 0; i <= peertable.peer_hashmask; i++)
>   while ((p = LIST_FIRST(&peertable.peer_hashtbl[i])) != NULL)
>   peer_down(p->conf.id);
> -
> - /* then since decision process is off, kill RIB_ADJ_OUT */
> - rib_free(rib_byid(RIB_ADJ_OUT));
>  
>   /* free filters */
>   filterlist_free(out_rules);
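
A note on peer_match() above: it is what makes the per-peer walks in
the new ctl code restartable. Passing the id of the last visited peer
resumes the hash table scan right behind that peer instead of starting
over at bucket zero. A minimal standalone sketch of the same
resume-by-id pattern (all names here are illustrative, not bgpd's):

#include <sys/queue.h>
#include <stddef.h>

struct node {
	LIST_ENTRY(node)	 entry;
	unsigned int		 id;
};
LIST_HEAD(bucket, node);

#define HASHSIZE	64		/* power of two */
#define HASHMASK	(HASHSIZE - 1)
static struct bucket table[HASHSIZE];

static struct node *
next_match(unsigned int lastid, int (*match)(struct node *))
{
	struct node	*n;
	unsigned int	 i = 0;

	if (lastid != 0)
		i = lastid & HASHMASK;	/* resume in last bucket */
	for (; i < HASHSIZE; i++) {
		LIST_FOREACH(n, &table[i], entry) {
			if (lastid == n->id) {
				/* resume point reached, stop skipping */
				lastid = 0;
				continue;
			}
			if (lastid != 0)
				continue;
			if (match(n))
				return (n);
		}
	}
	return (NULL);
}

Callers iterate with next_match(0, m) and then next_match(n->id, m),
which is exactly the do/while shape in the IMSG_CTL_SHOW_RIB_PREFIX
branch further up.
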
> Index: rde.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> retrieving revision 1.219
> diff -u -p -r1.219 rde.h
> --- rde.h 1 Jul 2019 07:07:08 -0000 1.219
> +++ rde.h 10 Jul 2019 14:42:42 -0000
> @@ -57,8 +57,7 @@ struct rib {
>  };
>  
>  #define RIB_ADJ_IN 0
> -#define RIB_ADJ_OUT 1
> -#define RIB_LOC_START 2
> +#define RIB_LOC_START 1
>  #define RIB_NOTFOUND 0xffff
>  
>  struct rib_desc {
> @@ -78,6 +77,7 @@ LIST_HEAD(aspath_list, aspath);
>  LIST_HEAD(attr_list, attr);
>  LIST_HEAD(aspath_head, rde_aspath);
>  RB_HEAD(prefix_tree, prefix);
> +RB_HEAD(prefix_index, prefix);
>  
>  struct rde_peer {
>   LIST_ENTRY(rde_peer) hash_l; /* hash list over all peers */
> @@ -87,6 +87,7 @@ struct rde_peer {
>   struct bgpd_addr local_v4_addr;
>   struct bgpd_addr local_v6_addr;
>   struct capabilities capa;
> + struct prefix_index adj_rib_out;
>   struct prefix_tree updates[AID_MAX];
>   struct prefix_tree withdraws[AID_MAX];
>   time_t staletime[AID_MAX];
> @@ -306,8 +307,14 @@ struct pt_entry_vpn6 {
>  };
>  
>  struct prefix {
> - LIST_ENTRY(prefix) rib_l, nexthop_l;
> - RB_ENTRY(prefix) entry;
> + union {
> + struct {
> + LIST_ENTRY(prefix) rib, nexthop;
> + } list;
> + struct {
> + RB_ENTRY(prefix) index, update;
> + } tree;
> + } entry;
>   struct pt_entry *pt;
>   struct rib_entry *re;
>   struct rde_aspath *aspath;
> @@ -317,12 +324,17 @@ struct prefix {
>   time_t lastchange;
>   u_int8_t validation_state;
>   u_int8_t nhflags;
> - u_int8_t flags;
>   u_int8_t eor;
> -#define PREFIX_FLAG_WITHDRAW 0x01
> -#define PREFIX_FLAG_UPDATE 0x02
> + u_int8_t flags;
> +#define PREFIX_FLAG_WITHDRAW 0x01 /* queued for withdraw */
> +#define PREFIX_FLAG_UPDATE 0x02 /* queued for update */
> +#define PREFIX_FLAG_DEAD 0x04 /* locked but removed */
> +#define PREFIX_FLAG_MASK 0x07 /* mask for the three prefix types */
> +#define PREFIX_NEXTHOP_LINKED 0x40 /* prefix is linked onto nexthop list */
> +#define PREFIX_FLAG_LOCKED 0x80 /* locked by rib walker */
>  };
>  
> +/* possible states for nhflags */
>  #define NEXTHOP_SELF 0x01
>  #define NEXTHOP_REJECT 0x02
>  #define NEXTHOP_BLACKHOLE 0x04
> @@ -356,6 +368,7 @@ u_int32_t rde_local_as(void);
>  int rde_noevaluate(void);
>  int rde_decisionflags(void);
>  int rde_as4byte(struct rde_peer *);
> +struct rde_peer *peer_get(u_int32_t);
>  
>  /* rde_attr.c */
>  int attr_write(void *, u_int16_t, u_int8_t, u_int8_t, void *,
> @@ -395,6 +408,7 @@ u_char *aspath_override(struct aspath *
>      u_int16_t *);
>  int aspath_lenmatch(struct aspath *, enum aslen_spec, u_int);
>  
> +/* rde_community.c */
>  int community_match(struct rde_community *, struct community *,
>      struct rde_peer *);
>  int community_set(struct rde_community *, struct community *,
> @@ -499,15 +513,14 @@ struct rib_desc *rib_desc(struct rib *);
>  void rib_free(struct rib *);
>  void rib_shutdown(void);
>  struct rib_entry *rib_get(struct rib *, struct bgpd_addr *, int);
> -struct rib_entry *rib_lookup(struct rib *, struct bgpd_addr *);
> +struct rib_entry *rib_match(struct rib *, struct bgpd_addr *);
>  int rib_dump_pending(void);
>  void rib_dump_runner(void);
>  int rib_dump_new(u_int16_t, u_int8_t, unsigned int, void *,
>      void (*)(struct rib_entry *, void *),
>      void (*)(void *, u_int8_t),
>      int (*)(void *));
> -void rib_dump_terminate(u_int16_t, void *,
> -    void (*)(struct rib_entry *, void *));
> +void rib_dump_terminate(void *);
>  
>  static inline struct rib *
>  re_rib(struct rib_entry *re)
> @@ -540,13 +553,20 @@ void path_put(struct rde_aspath *);
>  #define PREFIX_SIZE(x) (((x) + 7) / 8 + 1)
>  struct prefix *prefix_get(struct rib *, struct rde_peer *,
>      struct bgpd_addr *, int);
> +struct prefix *prefix_lookup(struct rde_peer *, struct bgpd_addr *, int);
> +struct prefix *prefix_match(struct rde_peer *, struct bgpd_addr *);
>  int prefix_remove(struct rib *, struct rde_peer *,
>      struct bgpd_addr *, int);
>  void prefix_add_eor(struct rde_peer *, u_int8_t);
> -void prefix_update(struct rib *, struct rde_peer *,
> -    struct bgpd_addr *, int);
> -int prefix_withdraw(struct rib *, struct rde_peer *,
> -    struct bgpd_addr *, int);
> +int prefix_update(struct rde_peer *, struct filterstate *,
> +    struct bgpd_addr *, int, u_int8_t);
> +int prefix_withdraw(struct rde_peer *, struct bgpd_addr *, int);
> +void prefix_adjout_destroy(struct prefix *p);
> +void prefix_adjout_dump(struct rde_peer *, void *,
> +    void (*)(struct prefix *, void *));
> +int prefix_dump_new(struct rde_peer *, u_int8_t, unsigned int,
> +    void *, void (*)(struct prefix *, void *),
> +        void (*)(void *, u_int8_t), int (*)(void *));
>  int prefix_write(u_char *, int, struct bgpd_addr *, u_int8_t, int);
>  int prefix_writebuf(struct ibuf *, struct bgpd_addr *, u_int8_t);
>  struct prefix *prefix_bypeer(struct rib_entry *, struct rde_peer *);
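
Worth spelling out for the rde.h hunk: nothing in struct prefix itself
records which arm of the new entry union is live. The discriminator is
implicit in ownership (a prefix hangs either off a rib_entry list or
off a peer's Adj-RIB-Out trees, never both), with flag bits such as
PREFIX_NEXTHOP_LINKED guarding the unlink paths. The pattern in
isolation, with made-up names:

#include <sys/queue.h>
#include <sys/tree.h>

/*
 * An object that lives either on two lists or in two trees, never
 * both at once, so the link fields can share storage in a union.
 */
struct obj {
	union {
		struct {
			LIST_ENTRY(obj)	a, b;
		} list;
		struct {
			RB_ENTRY(obj)	a, b;
		} tree;
	} entry;
	int	key;
	int	flags;		/* records which variant is live */
#define OBJ_IN_TREE	0x01
};

static int
obj_cmp(struct obj *x, struct obj *y)
{
	return (x->key < y->key ? -1 : x->key > y->key);
}

RB_HEAD(obj_tree, obj);
RB_PROTOTYPE_STATIC(obj_tree, obj, entry.tree.a, obj_cmp)
RB_GENERATE_STATIC(obj_tree, obj, entry.tree.a, obj_cmp)

The one hard rule with such a union is to fully unlink from one
representation before linking into the other; the new PREFIX_FLAG_DEAD
and PREFIX_FLAG_LOCKED bits exist to uphold that for prefixes a
suspended dump still points at.
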
> Index: rde_decide.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
> retrieving revision 1.74
> diff -u -p -r1.74 rde_decide.c
> --- rde_decide.c 21 Jan 2019 02:07:56 -0000 1.74
> +++ rde_decide.c 20 Jun 2019 19:50:25 -0000
> @@ -245,7 +245,7 @@ prefix_evaluate(struct prefix *p, struct
>   if (re_rib(re)->flags & F_RIB_NOEVALUATE || rde_noevaluate()) {
>   /* decision process is turned off */
>   if (p != NULL)
> - LIST_INSERT_HEAD(&re->prefix_h, p, rib_l);
> + LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
>   if (re->active != NULL)
>   re->active = NULL;
>   return;
> @@ -253,15 +253,18 @@ prefix_evaluate(struct prefix *p, struct
>  
>   if (p != NULL) {
>   if (LIST_EMPTY(&re->prefix_h))
> - LIST_INSERT_HEAD(&re->prefix_h, p, rib_l);
> + LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
>   else {
> - LIST_FOREACH(xp, &re->prefix_h, rib_l) {
> + LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) {
>   if (prefix_cmp(p, xp) > 0) {
> - LIST_INSERT_BEFORE(xp, p, rib_l);
> + LIST_INSERT_BEFORE(xp, p,
> +    entry.list.rib);
>   break;
> - } else if (LIST_NEXT(xp, rib_l) == NULL) {
> + } else if (LIST_NEXT(xp, entry.list.rib) ==
> +    NULL) {
>   /* if xp last element ... */
> - LIST_INSERT_AFTER(xp, p, rib_l);
> + LIST_INSERT_AFTER(xp, p,
> +    entry.list.rib);
>   break;
>   }
>   }
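
The rde_decide.c changes are mechanical (rib_l becomes
entry.list.rib), but since the insertion loop above is easy to
misread, here is the same ordered-insert shape reduced to a toy
(illustrative names only):

#include <sys/queue.h>

struct item {
	LIST_ENTRY(item)	 entry;
	int			 weight;
};
LIST_HEAD(item_list, item);

/* Insert 'p' keeping the list sorted by descending weight. */
static void
sorted_insert(struct item_list *lh, struct item *p)
{
	struct item	*xp;

	if (LIST_EMPTY(lh)) {
		LIST_INSERT_HEAD(lh, p, entry);
		return;
	}
	LIST_FOREACH(xp, lh, entry) {
		if (p->weight > xp->weight) {
			LIST_INSERT_BEFORE(xp, p, entry);
			return;
		}
		if (LIST_NEXT(xp, entry) == NULL) {
			/* xp is the last element */
			LIST_INSERT_AFTER(xp, p, entry);
			return;
		}
	}
}
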
> Index: rde_rib.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
> retrieving revision 1.198
> diff -u -p -r1.198 rde_rib.c
> --- rde_rib.c 1 Jul 2019 14:47:56 -0000 1.198
> +++ rde_rib.c 10 Jul 2019 14:27:45 -0000
> @@ -52,8 +52,10 @@ RB_GENERATE(rib_tree, rib_entry, rib_e,
>  struct rib_context {
>   LIST_ENTRY(rib_context) entry;
>   struct rib_entry *ctx_re;
> - u_int16_t ctx_rib_id;
> - void (*ctx_upcall)(struct rib_entry *, void *);
> + struct prefix *ctx_p;
> + u_int32_t ctx_id;
> + void (*ctx_rib_call)(struct rib_entry *, void *);
> + void (*ctx_prefix_call)(struct prefix *, void *);
>   void (*ctx_done)(void *, u_int8_t);
>   int (*ctx_throttle)(void *);
>   void *ctx_arg;
> @@ -69,6 +71,7 @@ static int prefix_add(struct bgpd_addr *
>  static int prefix_move(struct prefix *, struct rde_peer *,
>      struct rde_aspath *, struct rde_community *,
>      struct nexthop *, u_int8_t, u_int8_t);
> +static void prefix_dump_r(struct rib_context *);
>  
>  static inline struct rib_entry *
>  re_lock(struct rib_entry *re)
> @@ -128,7 +131,7 @@ rib_new(char *name, u_int rtableid, u_in
>   rib_size = id + 1;
>   }
>  
> - bzero(&ribs[id], sizeof(struct rib_desc));
> + memset(&ribs[id], 0, sizeof(struct rib_desc));
>   strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
>   RB_INIT(rib_tree(&ribs[id].rib));
>   ribs[id].state = RECONF_REINIT;
> @@ -196,7 +199,7 @@ rib_free(struct rib *rib)
>   */
>   while ((p = LIST_FIRST(&re->prefix_h))) {
>   struct rde_aspath *asp = prefix_aspath(p);
> - np = LIST_NEXT(p, rib_l);
> + np = LIST_NEXT(p, entry.list.rib);
>   if (asp && asp->pftableid) {
>   struct bgpd_addr addr;
>  
> @@ -215,7 +218,7 @@ rib_free(struct rib *rib)
>   rd = &ribs[rib->id];
>   filterlist_free(rd->in_rules_tmp);
>   filterlist_free(rd->in_rules);
> - bzero(rd, sizeof(struct rib_desc));
> + memset(rd, 0, sizeof(struct rib_desc));
>  }
>  
>  void
> @@ -235,7 +238,7 @@ rib_shutdown(void)
>   struct rib_desc *rd = &ribs[id];
>   filterlist_free(rd->in_rules_tmp);
>   filterlist_free(rd->in_rules);
> - bzero(rd, sizeof(struct rib_desc));
> + memset(rd, 0, sizeof(struct rib_desc));
>   }
>   free(ribs);
>  }
> @@ -247,7 +250,7 @@ rib_get(struct rib *rib, struct bgpd_add
>   struct pt_entry *pte;
>  
>   pte = pt_fill(prefix, prefixlen);
> - bzero(&xre, sizeof(xre));
> + memset(&xre, 0, sizeof(xre));
>   xre.prefix = pte;
>  
>   re = RB_FIND(rib_tree, rib_tree(rib), &xre);
> @@ -258,7 +261,7 @@ rib_get(struct rib *rib, struct bgpd_add
>  }
>  
>  struct rib_entry *
> -rib_lookup(struct rib *rib, struct bgpd_addr *addr)
> +rib_match(struct rib *rib, struct bgpd_addr *addr)
>  {
>   struct rib_entry *re;
>   int i;
> @@ -281,7 +284,7 @@ rib_lookup(struct rib *rib, struct bgpd_
>   }
>   break;
>   default:
> - fatalx("rib_lookup: unknown af");
> + fatalx("%s: unknown af", __func__);
>   }
>   return (NULL);
>  }
> @@ -367,9 +370,9 @@ rib_dump_r(struct rib_context *ctx)
>   struct rib *rib;
>   unsigned int i;
>  
> - rib = rib_byid(ctx->ctx_rib_id);
> + rib = rib_byid(ctx->ctx_id);
>   if (rib == NULL)
> - fatalx("%s: rib id %u gone", __func__, ctx->ctx_rib_id);
> + fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);
>  
>   if (ctx->ctx_re == NULL)
>   re = RB_MIN(rib_tree, rib_tree(rib));
> @@ -378,9 +381,9 @@ rib_dump_r(struct rib_context *ctx)
>  
>   for (i = 0; re != NULL; re = next) {
>   next = RB_NEXT(rib_tree, unused, re);
> - if (re->rib_id != ctx->ctx_rib_id)
> + if (re->rib_id != ctx->ctx_id)
>   fatalx("%s: Unexpected RIB %u != %u.", __func__,
> -    re->rib_id, ctx->ctx_rib_id);
> +    re->rib_id, ctx->ctx_id);
>   if (ctx->ctx_aid != AID_UNSPEC &&
>      ctx->ctx_aid != re->prefix->aid)
>   continue;
> @@ -391,7 +394,7 @@ rib_dump_r(struct rib_context *ctx)
>   re_lock(re);
>   return;
>   }
> - ctx->ctx_upcall(re, ctx->ctx_arg);
> + ctx->ctx_rib_call(re, ctx->ctx_arg);
>   }
>  
>   if (ctx->ctx_done)
> @@ -422,7 +425,10 @@ rib_dump_runner(void)
>   LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
>   if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
>   continue;
> - rib_dump_r(ctx);
> + if (ctx->ctx_rib_call != NULL)
> + rib_dump_r(ctx);
> + else
> + prefix_dump_r(ctx);
>   }
>  }
>  
> @@ -432,7 +438,7 @@ rib_dump_abort(u_int16_t id)
>   struct rib_context *ctx, *next;
>  
>   LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
> - if (id != ctx->ctx_rib_id)
> + if (id != ctx->ctx_id)
>   continue;
>   if (ctx->ctx_done)
>   ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
> @@ -450,11 +456,11 @@ rib_dump_new(u_int16_t id, u_int8_t aid,
>  
>   if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
>   return -1;
> - ctx->ctx_rib_id = id;
> + ctx->ctx_id = id;
>   ctx->ctx_aid = aid;
>   ctx->ctx_count = count;
>   ctx->ctx_arg = arg;
> - ctx->ctx_upcall = upcall;
> + ctx->ctx_rib_call = upcall;
>   ctx->ctx_done = done;
>   ctx->ctx_throttle = throttle;
>  
> @@ -468,14 +474,12 @@ rib_dump_new(u_int16_t id, u_int8_t aid,
>  }
>  
>  void
> -rib_dump_terminate(u_int16_t id, void *arg,
> -    void (*upcall)(struct rib_entry *, void *))
> +rib_dump_terminate(void *arg)
>  {
>   struct rib_context *ctx, *next;
>  
>   LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
> - if (id != ctx->ctx_rib_id || ctx->ctx_arg != arg ||
> -    ctx->ctx_upcall != upcall)
> + if (ctx->ctx_arg != arg)
>   continue;
>   if (ctx->ctx_done)
>   ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
> @@ -610,15 +614,6 @@ path_update(struct rib *rib, struct rde_
>   p->validation_state = vstate;
>   return (2);
>   }
> - if (p->flags) {
> - struct prefix_tree *prefix_head;
> - /* prefix is a pending update */
> - prefix_head = p->flags & PREFIX_FLAG_UPDATE ?
> -    &peer->updates[prefix->aid] :
> -    &peer->withdraws[prefix->aid];
> - RB_REMOVE(prefix_tree, prefix_head, p);
> - p->flags = 0;
> - }
>   }
>  
>   /*
> @@ -881,7 +876,14 @@ prefix_cmp(struct prefix *a, struct pref
>   return pt_prefix_cmp(a->pt, b->pt);
>  }
>  
> -RB_GENERATE(prefix_tree, prefix, entry, prefix_cmp)
> +static inline int
> +prefix_index_cmp(struct prefix *a, struct prefix *b)
> +{
> + return pt_prefix_cmp(a->pt, b->pt);
> +}
> +
> +RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
> +RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)
>  
>  /*
>   * search for specified prefix of a peer. Returns NULL if not found.
> @@ -899,6 +901,52 @@ prefix_get(struct rib *rib, struct rde_p
>  }
>  
>  /*
> + * lookup prefix in the peer prefix_index. Returns NULL if not found.
> + */
> +struct prefix *
> +prefix_lookup(struct rde_peer *peer, struct bgpd_addr *prefix,
> +    int prefixlen)
> +{
> + struct prefix xp;
> + struct pt_entry *pte;
> +
> + memset(&xp, 0, sizeof(xp));
> + pte = pt_fill(prefix, prefixlen);
> + xp.pt = pte;
> +
> + return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
> +}
> +
> +struct prefix *
> +prefix_match(struct rde_peer *peer, struct bgpd_addr *addr)
> +{
> + struct prefix *p;
> + int i;
> +
> + switch (addr->aid) {
> + case AID_INET:
> + case AID_VPN_IPv4:
> + for (i = 32; i >= 0; i--) {
> + p = prefix_lookup(peer, addr, i);
> + if (p != NULL)
> + return p;
> + }
> + break;
> + case AID_INET6:
> + case AID_VPN_IPv6:
> + for (i = 128; i >= 0; i--) {
> + p = prefix_lookup(peer, addr, i);
> + if (p != NULL)
> + return p;
> + }
> + break;
> + default:
> + fatalx("%s: unknown af", __func__);
> + }
> + return NULL;
> +}
> +
> +/*
>   * Adds or updates a prefix.
>   */
>  static int
> @@ -936,8 +984,8 @@ prefix_move(struct prefix *p, struct rde
>   np->aspath = path_ref(asp);
>   np->communities = communities_ref(comm);
>   np->peer = peer;
> - np->pt = p->pt; /* skip refcnt update since ref is moved */
>   np->re = p->re;
> + np->pt = p->pt; /* skip refcnt update since ref is moved */
>   np->validation_state = vstate;
>   np->nhflags = nhflags;
>   np->nexthop = nexthop_ref(nexthop);
> @@ -957,7 +1005,7 @@ prefix_move(struct prefix *p, struct rde
>   * This is safe because we create a new prefix and so the change
>   * is noticed by prefix_evaluate().
>   */
> - LIST_REMOVE(p, rib_l);
> + LIST_REMOVE(p, entry.list.rib);
>   prefix_evaluate(np, np->re);
>  
>   /* remove old prefix node */
> @@ -1020,26 +1068,97 @@ prefix_add_eor(struct rde_peer *peer, u_
>   if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
>   /* no need to add if EoR marker already present */
>   prefix_free(p);
> + /* EOR marker is not inserted into the adj_rib_out index */
>  }
>  
>  /*
>   * Put a prefix from the Adj-RIB-Out onto the update queue.
>   */
> -void
> -prefix_update(struct rib *rib, struct rde_peer *peer,
> -    struct bgpd_addr *prefix, int prefixlen)
> +int
> +prefix_update(struct rde_peer *peer, struct filterstate *state,
> +    struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate)
>  {
> + struct prefix_tree *prefix_head = NULL;
> + struct rde_aspath *asp;
> + struct rde_community *comm;
>   struct prefix *p;
> + int created = 0;
>  
> - p = prefix_get(rib, peer, prefix, prefixlen);
> - if (p == NULL) /* Got a dummy withdrawn request. */
> - return;
> + if ((p = prefix_lookup(peer, prefix, prefixlen)) != NULL) {
> + /* prefix is already in the Adj-RIB-Out */
> + if (p->flags & PREFIX_FLAG_WITHDRAW) {
> + created = 1; /* consider this a new entry */
> + peer->up_wcnt--;
> + prefix_head = &peer->withdraws[prefix->aid];
> + RB_REMOVE(prefix_tree, prefix_head, p);
> + } else if (p->flags & PREFIX_FLAG_DEAD) {
> + created = 1; /* consider this a new entry */
> + } else {
> + if (prefix_nhflags(p) == state->nhflags &&
> +    prefix_nexthop(p) == state->nexthop &&
> +    communities_equal(&state->communities,
> +    prefix_communities(p)) &&
> +    path_compare(&state->aspath, prefix_aspath(p)) ==
> +    0) {
> + /* nothing changed */
> + p->validation_state = vstate;
> + p->lastchange = time(NULL);
> + return 0;
> + }
> +
> + if (p->flags & PREFIX_FLAG_UPDATE) {
> + /* created = 0 so up_nlricnt is not increased */
> + prefix_head = &peer->updates[prefix->aid];
> + RB_REMOVE(prefix_tree, prefix_head, p);
> + }
> + }
> + /* drop the nexthop, communities and aspath references */
> + nexthop_unref(p->nexthop);
> + communities_unref(p->communities);
> + path_unref(p->aspath);
> + p->flags &= ~PREFIX_FLAG_MASK;
> +
> + /* peer and pt remain */
> + } else {
> + p = prefix_alloc();
> + created = 1;
> +
> + p->pt = pt_get(prefix, prefixlen);
> + if (p->pt == NULL)
> + fatalx("%s: update for non existing prefix", __func__);
> + pt_ref(p->pt);
> + p->peer = peer;
> +
> + if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
> + fatalx("%s: RB index invariant violated", __func__);
> + }
> +
> + if ((asp = path_lookup(&state->aspath)) == NULL) {
> + /* Path not available, create and link a new one. */
> + asp = path_copy(path_get(), &state->aspath);
> + path_link(asp);
> + }
> +
> + if ((comm = communities_lookup(&state->communities)) == NULL) {
> + /* Communities not available, create and link a new one. */
> + comm = communities_link(&state->communities);
> + }
> +
> + p->aspath = path_ref(asp);
> + p->communities = communities_ref(comm);
> + p->nexthop = nexthop_ref(state->nexthop);
> + p->nhflags = state->nhflags;
>  
> - if (p->flags != 0)
> + p->validation_state = vstate;
> + p->lastchange = time(NULL);
> +
> + if (p->flags & PREFIX_FLAG_MASK)
>   fatalx("%s: bad flags %x", __func__, p->flags);
> - p->flags = PREFIX_FLAG_UPDATE;
> + p->flags |= PREFIX_FLAG_UPDATE;
>   if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL)
>   fatalx("%s: RB tree invariant violated", __func__);
> +
> + return created;
>  }
>  
>  /*
> @@ -1047,15 +1166,19 @@ prefix_update(struct rib *rib, struct rd
>   * the prefix in the RIB linked to the peer withdraw list.
>   */
>  int
> -prefix_withdraw(struct rib *rib, struct rde_peer *peer,
> -    struct bgpd_addr *prefix, int prefixlen)
> +prefix_withdraw(struct rde_peer *peer, struct bgpd_addr *prefix, int prefixlen)
>  {
>   struct prefix *p;
>  
> - p = prefix_get(rib, peer, prefix, prefixlen);
> + p = prefix_lookup(peer, prefix, prefixlen);
>   if (p == NULL) /* Got a dummy withdrawn request. */
>   return (0);
>  
> + /* remove nexthop ref ... */
> + nexthop_unref(p->nexthop);
> + p->nexthop = NULL;
> + p->nhflags = 0;
> +
>   /* unlink from aspath ...*/
>   path_unref(p->aspath);
>   p->aspath = NULL;
> @@ -1063,29 +1186,177 @@ prefix_withdraw(struct rib *rib, struct
>   /* ... communities ... */
>   communities_unref(p->communities);
>   p->communities = NULL;
> + /* re already NULL */
>  
> - /* ... and nexthop but keep the re link */
> - nexthop_unlink(p);
> - nexthop_unref(p->nexthop);
> - p->nexthop = NULL;
> - p->nhflags = 0;
> - /* re link still exists */
> + p->lastchange = time(NULL);
>  
> - if (p->flags) {
> + if (p->flags & PREFIX_FLAG_MASK) {
>   struct prefix_tree *prefix_head;
>   /* p is a pending update or withdraw, remove first */
>   prefix_head = p->flags & PREFIX_FLAG_UPDATE ?
>      &peer->updates[prefix->aid] :
>      &peer->withdraws[prefix->aid];
>   RB_REMOVE(prefix_tree, prefix_head, p);
> - p->flags = 0;
> + p->flags &= ~PREFIX_FLAG_MASK;
>   }
> - p->flags = PREFIX_FLAG_WITHDRAW;
> + p->flags |= PREFIX_FLAG_WITHDRAW;
>   if (RB_INSERT(prefix_tree, &peer->withdraws[prefix->aid], p) != NULL)
>   fatalx("%s: RB tree invariant violated", __func__);
>   return (1);
>  }
>  
> +static inline void
> +prefix_lock(struct prefix *p)
> +{
> + if (p->flags & PREFIX_FLAG_LOCKED)
> + fatalx("%s: locking locked prefix", __func__);
> + p->flags |= PREFIX_FLAG_LOCKED;
> +}
> +
> +static inline void
> +prefix_unlock(struct prefix *p)
> +{
> + if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
> + fatalx("%s: unlocking unlocked prefix", __func__);
> + p->flags &= ~PREFIX_FLAG_LOCKED;
> +}
> +
> +static inline int
> +prefix_is_locked(struct prefix *p)
> +{
> + return (p->flags & PREFIX_FLAG_LOCKED) != 0;
> +}
> +
> +static inline int
> +prefix_is_dead(struct prefix *p)
> +{
> + return (p->flags & PREFIX_FLAG_DEAD) != 0;
> +}
> +
> +static struct prefix *
> +prefix_restart(struct rib_context *ctx)
> +{
> + struct prefix *p;
> +
> + p = ctx->ctx_p;
> + prefix_unlock(p);
> +
> + if (prefix_is_dead(p)) {
> + struct prefix *next;
> +
> + next = RB_NEXT(prefix_index, unused, p);
> + prefix_adjout_destroy(p);
> + p = next;
> + }
> + return p;
> +}
> +
> +void
> +prefix_adjout_destroy(struct prefix *p)
> +{
> + struct rde_peer *peer = prefix_peer(p);
> +
> + if (p->eor) {
> + /* EOR marker is not linked in the index */
> + prefix_free(p);
> + return;
> + }
> +
> + if (p->flags & PREFIX_FLAG_WITHDRAW)
> + RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
> + else if (p->flags & PREFIX_FLAG_UPDATE)
> + RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
> + /* nothing needs to be done for PREFIX_FLAG_DEAD */
> + p->flags &= ~PREFIX_FLAG_MASK;
> +
> + if (prefix_is_locked(p)) {
> + /* remove nexthop ref ... */
> + nexthop_unref(p->nexthop);
> + p->nexthop = NULL;
> + /* ... communities ... */
> + communities_unref(p->communities);
> + p->communities = NULL;
> + /* and unlink from aspath */
> + path_unref(p->aspath);
> + p->aspath = NULL;
> + p->nhflags = 0;
> + /* re already NULL */
> +
> + /* finally mark prefix dead */
> + p->flags |= PREFIX_FLAG_DEAD;
> + return;
> + }
> +
> + RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
> +
> + prefix_unlink(p);
> + prefix_free(p);
> +}
> +
> +static void
> +prefix_dump_r(struct rib_context *ctx)
> +{
> + struct prefix *p, *next;
> + struct rde_peer *peer;
> + unsigned int i;
> +
> + if ((peer = peer_get(ctx->ctx_id)) == NULL)
> + goto done;
> +
> + if (ctx->ctx_p == NULL)
> + p = RB_MIN(prefix_index, &peer->adj_rib_out);
> + else
> + p = prefix_restart(ctx);
> +
> + for (i = 0; p != NULL; p = next) {
> + next = RB_NEXT(prefix_index, unused, p);
> + if (prefix_is_dead(p))
> + continue;
> + if (ctx->ctx_aid != AID_UNSPEC &&
> +    ctx->ctx_aid != p->pt->aid)
> + continue;
> + if (ctx->ctx_count && i++ >= ctx->ctx_count &&
> +    !prefix_is_locked(p)) {
> + /* store and lock last element */
> + ctx->ctx_p = p;
> + prefix_lock(p);
> + return;
> + }
> + ctx->ctx_prefix_call(p, ctx->ctx_arg);
> + }
> +
> +done:
> + if (ctx->ctx_done)
> + ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
> + LIST_REMOVE(ctx, entry);
> + free(ctx);
> +}
> +
> +int
> +prefix_dump_new(struct rde_peer *peer, u_int8_t aid, unsigned int count,
> +    void *arg, void (*upcall)(struct prefix *, void *),
> +    void (*done)(void *, u_int8_t), int (*throttle)(void *))
> +{
> + struct rib_context *ctx;
> +
> + if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
> + return -1;
> + ctx->ctx_id = peer->conf.id;
> + ctx->ctx_aid = aid;
> + ctx->ctx_count = count;
> + ctx->ctx_arg = arg;
> + ctx->ctx_prefix_call = upcall;
> + ctx->ctx_done = done;
> + ctx->ctx_throttle = throttle;
> +
> + LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
> +
> + /* requested a sync traversal */
> + if (count == 0)
> + prefix_dump_r(ctx);
> +
> + return 0;
> +}
>  
>  /* dump a prefix into specified buffer */
>  int
> @@ -1205,7 +1480,7 @@ prefix_bypeer(struct rib_entry *re, stru
>  {
>   struct prefix *p;
>  
> - LIST_FOREACH(p, &re->prefix_h, rib_l)
> + LIST_FOREACH(p, &re->prefix_h, entry.list.rib)
>   if (prefix_peer(p) == peer)
>   return (p);
>   return (NULL);
> @@ -1237,7 +1512,7 @@ prefix_updateall(struct prefix *p, enum
>   }
>  
>   /* redo the route decision */
> - LIST_REMOVE(p, rib_l);
> + LIST_REMOVE(p, entry.list.rib);
>   /*
>   * If the prefix is the active one remove it first,
>   * this has to be done because we can not detect when
> @@ -1255,6 +1530,10 @@ prefix_updateall(struct prefix *p, enum
>  void
>  prefix_destroy(struct prefix *p)
>  {
> + /* make route decision */
> + LIST_REMOVE(p, entry.list.rib);
> + prefix_evaluate(NULL, p->re);
> +
>   prefix_unlink(p);
>   prefix_free(p);
>  }
> @@ -1290,13 +1569,6 @@ prefix_unlink(struct prefix *p)
>  {
>   struct rib_entry *re = p->re;
>  
> - if (p->eor) /* nothing to unlink for EoR markers */
> - return;
> -
> - /* make route decision */
> - LIST_REMOVE(p, rib_l);
> - prefix_evaluate(NULL, re);
> -
>   /* destroy all references to other objects */
>   nexthop_unlink(p);
>   nexthop_unref(p->nexthop);
> @@ -1310,7 +1582,7 @@ prefix_unlink(struct prefix *p)
>   p->re = NULL;
>   p->pt = NULL;
>  
> - if (rib_empty(re))
> + if (re && rib_empty(re))
>   rib_remove(re);
>  
>   /*
> @@ -1319,7 +1591,7 @@ prefix_unlink(struct prefix *p)
>   */
>  }
>  
> -/* alloc and bzero new entry. May not fail. */
> +/* alloc and zero new entry. May not fail. */
>  static struct prefix *
>  prefix_alloc(void)
>  {
> @@ -1430,7 +1702,7 @@ nexthop_runner(void)
>   p = nh->next_prefix;
>   for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
>   prefix_updateall(p, nh->state, nh->oldstate);
> - p = LIST_NEXT(p, nexthop_l);
> + p = LIST_NEXT(p, entry.list.nexthop);
>   }
>  
>   /* prep for next run, if not finished readd to tail of queue */
> @@ -1540,22 +1812,21 @@ nexthop_link(struct prefix *p)
>   if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
>   return;
>  
> - LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, nexthop_l);
> + p->flags |= PREFIX_NEXTHOP_LINKED;
> + LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
>  }
>  
>  void
>  nexthop_unlink(struct prefix *p)
>  {
> - if (p->nexthop == NULL)
> - return;
> -
> - if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
> + if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
>   return;
>  
>   if (p == p->nexthop->next_prefix)
> - p->nexthop->next_prefix = LIST_NEXT(p, nexthop_l);
> + p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
>  
> - LIST_REMOVE(p, nexthop_l);
> + p->flags &= ~PREFIX_NEXTHOP_LINKED;
> + LIST_REMOVE(p, entry.list.nexthop);
>  }
>  
>  struct nexthop *
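
The most subtle part of rde_rib.c is the suspend/resume protocol:
prefix_dump_r() parks the context on the next prefix and locks it, and
prefix_adjout_destroy() turns a locked prefix into a PREFIX_FLAG_DEAD
tombstone instead of unlinking it, so the stored RB pointer stays
valid until the walk resumes. A compressed sketch of that protocol,
assuming a single concurrent walker (names are illustrative):

#include <sys/tree.h>
#include <stdlib.h>

struct node {
	RB_ENTRY(node)	 entry;
	int		 key;
	int		 flags;
#define F_LOCKED	0x01	/* a suspended walk parks here */
#define F_DEAD		0x02	/* destroyed while locked */
};

static int
ncmp(struct node *a, struct node *b)
{
	return (a->key < b->key ? -1 : a->key > b->key);
}

RB_HEAD(ntree, node);
static struct ntree head = RB_INITIALIZER(&head);
RB_PROTOTYPE_STATIC(ntree, node, entry, ncmp)
RB_GENERATE_STATIC(ntree, node, entry, ncmp)

/* Destroy a node; if a suspended walk points at it, only mark it. */
static void
node_destroy(struct node *n)
{
	if (n->flags & F_LOCKED) {
		n->flags |= F_DEAD;	/* tombstone keeps RB links valid */
		return;
	}
	RB_REMOVE(ntree, &head, n);
	free(n);
}

/* Visit up to 'budget' nodes; return the resume point, NULL if done. */
static struct node *
walk(struct node *resume, unsigned int budget, void (*cb)(struct node *))
{
	struct node	*n, *next;
	unsigned int	 i = 0;

	if (resume == NULL)
		n = RB_MIN(ntree, &head);
	else {
		resume->flags &= ~F_LOCKED;
		n = resume;
		if (n->flags & F_DEAD) {
			/* finish the deferred destroy, then move on */
			next = RB_NEXT(ntree, &head, n);
			node_destroy(n);
			n = next;
		}
	}
	for (; n != NULL; n = next) {
		next = RB_NEXT(ntree, &head, n);
		if (n->flags & F_DEAD)
			continue;
		if (budget && i++ >= budget) {
			n->flags |= F_LOCKED;	/* park the walk here */
			return (n);
		}
		cb(n);
	}
	return (NULL);
}
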
> Index: rde_update.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v
> retrieving revision 1.119
> diff -u -p -r1.119 rde_update.c
> --- rde_update.c 2 Jul 2019 12:07:00 -0000 1.119
> +++ rde_update.c 4 Jul 2019 08:59:13 -0000
> @@ -143,8 +143,7 @@ withdraw:
>  
>   /* withdraw prefix */
>   pt_getaddr(old->pt, &addr);
> - if (prefix_withdraw(&ribs[RIB_ADJ_OUT].rib, peer, &addr,
> -    old->pt->prefixlen) == 1)
> + if (prefix_withdraw(peer, &addr, old->pt->prefixlen) == 1)
>   peer->up_wcnt++;
>   } else {
>   switch (up_test_update(peer, new)) {
> @@ -165,13 +164,11 @@ withdraw:
>   }
>  
>   pt_getaddr(new->pt, &addr);
> - if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr,
> -    new->pt->prefixlen, prefix_vstate(new)) != 2) {
> - /* only send update if path changed */
> - prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr,
> -    new->pt->prefixlen);
> +
> + /* only send update if path changed */
> + if (prefix_update(peer, &state, &addr, new->pt->prefixlen,
> +    prefix_vstate(new)) == 1)
>   peer->up_nlricnt++;
> - }
>  
>   rde_filterstate_clean(&state);
>   }
> @@ -229,11 +226,8 @@ up_generate_default(struct filter_head *
>   return;
>   }
>  
> - if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr, 0,
> -    ROA_NOTFOUND) != 2) {
> - prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr, 0);
> + if (prefix_update(peer, &state, &addr, 0, ROA_NOTFOUND) == 1)
>   peer->up_nlricnt++;
> - }
>  
>   /* no longer needed */
>   rde_filterstate_clean(&state);
> @@ -576,8 +570,13 @@ up_is_eor(struct rde_peer *peer, u_int8_
>  
>   p = RB_MIN(prefix_tree, &peer->updates[aid]);
>   if (p != NULL && p->eor) {
> + /*
> + * Need to remove eor from update tree because
> + * prefix_adjout_destroy() can't handle that.
> + */
>   RB_REMOVE(prefix_tree, &peer->updates[aid], p);
> - prefix_destroy(p);
> + p->flags &= ~PREFIX_FLAG_MASK;
> + prefix_adjout_destroy(p);
>   return 1;
>   }
>   return 0;
> @@ -616,11 +615,11 @@ up_dump_prefix(u_char *buf, int len, str
>  
>   /* prefix sent, remove from list and clear flag */
>   RB_REMOVE(prefix_tree, prefix_head, p);
> - p->flags = 0;
> + p->flags &= ~PREFIX_FLAG_MASK;
>  
>   if (withdraw) {
>   /* prefix no longer needed, remove it */
> - prefix_destroy(p);
> + prefix_adjout_destroy(p);
>   peer->up_wcnt--;
>   peer->prefix_sent_withdraw++;
>   } else {
>
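
With path_update() gone from the Adj-RIB-Out path, the counter
discipline in rde_update.c hinges entirely on prefix_update()'s return
value: 1 only when an entry came into existence (including revival of
a pending withdraw), 0 when an existing one was merely refreshed. The
contract in miniature (invented names, not the real signatures):

#include <sys/tree.h>
#include <stdlib.h>

struct ent {
	RB_ENTRY(ent)	 rb;
	int		 key;
	int		 val;
};

static int
ecmp(struct ent *a, struct ent *b)
{
	return (a->key < b->key ? -1 : a->key > b->key);
}

RB_HEAD(etree, ent);
static struct etree root = RB_INITIALIZER(&root);
RB_PROTOTYPE_STATIC(etree, ent, rb, ecmp)
RB_GENERATE_STATIC(etree, ent, rb, ecmp)

/* Return 1 when a new entry was created, 0 on refresh, -1 on error. */
static int
ent_update(int key, int val)
{
	struct ent	 xe, *e;

	xe.key = key;
	if ((e = RB_FIND(etree, &root, &xe)) != NULL) {
		e->val = val;		/* refreshed in place */
		return (0);
	}
	if ((e = calloc(1, sizeof(*e))) == NULL)
		return (-1);
	e->key = key;
	e->val = val;
	RB_INSERT(etree, &root, e);
	return (1);			/* genuinely new */
}

The caller side then reduces to "if (ent_update(k, v) == 1)
nlricnt++;", the same shape as the up_generate_updates() and
up_generate_default() hunks above.
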