Bigger Buffer Cache - Flip Buffers above/below the DMA'able region


Bigger Buffer Cache - Flip Buffers above/below the DMA'able region

beck-15
       Want more buffer cache? Please give this a try.

This diff splits the buffer cache between the DMA'able region of memory
and the region above it.  Buffers are always allocated in the DMA'able
region, and as they age they are moved above the DMA'able region if such
memory exists.  I/O operations on buffers in high memory flip the buffer
back into DMA'able memory first.

With this diff you can have huge tracts of buffer cache on amd64, but it
still needs testing on all archs.
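
Below is a small stand-alone sketch of the policy in plain user-space C,
for anyone who wants to see the idea without wading through the diff first.
It is only a toy model: the names (toy_buf, toy_get, toy_io, dma_max) are
invented for illustration, there is no locking, and the real work in the
diff is done by buf_daq_add(), buf_realloc_pages() and the B_DMA/B_DAQ
flags.  It just shows the two-list idea - buffers start on a "dma" list,
age onto a "high" list once the dma budget is exceeded, and get flipped
back before a (simulated) I/O.

/*
 * Toy user-space model of the policy described above.  All names here
 * (toy_buf, toy_get, toy_io, dma_max) are invented for illustration;
 * the diff itself uses buf_daq_add()/buf_realloc_pages() and the
 * B_DMA/B_DAQ flags, with splbio() around the queue manipulation.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_buf {
	int id;
	int in_dma;			/* 1 = "dma reachable", 0 = "high" memory */
	TAILQ_ENTRY(toy_buf) link;
};

TAILQ_HEAD(toy_q, toy_buf);
static struct toy_q dma_q = TAILQ_HEAD_INITIALIZER(dma_q);
static struct toy_q high_q = TAILQ_HEAD_INITIALIZER(high_q);
static int dma_count, dma_max = 2;	/* pretend only two bufs fit below the dma limit */

/* New buffers always start out "dma reachable", as in buf_get() in the diff. */
static struct toy_buf *
toy_get(int id)
{
	struct toy_buf *b = calloc(1, sizeof(*b));

	/* over budget: age the oldest dma buffer up into "high" memory */
	while (dma_count >= dma_max && !TAILQ_EMPTY(&dma_q)) {
		struct toy_buf *old = TAILQ_FIRST(&dma_q);

		TAILQ_REMOVE(&dma_q, old, link);
		old->in_dma = 0;
		TAILQ_INSERT_TAIL(&high_q, old, link);
		dma_count--;
	}
	b->id = id;
	b->in_dma = 1;
	TAILQ_INSERT_TAIL(&dma_q, b, link);
	dma_count++;
	return (b);
}

/* Before "I/O", flip a high buffer back onto the tail of the dma list. */
static void
toy_io(struct toy_buf *b)
{
	if (!b->in_dma) {
		TAILQ_REMOVE(&high_q, b, link);
		b->in_dma = 1;
		TAILQ_INSERT_TAIL(&dma_q, b, link);
		dma_count++;
	}
	printf("I/O on buf %d (in_dma=%d)\n", b->id, b->in_dma);
}

int
main(void)
{
	struct toy_buf *first = toy_get(1);

	toy_get(2);
	toy_get(3);		/* exceeds dma_max, so buf 1 ages into "high" memory */
	toy_io(first);		/* buf 1 is flipped back below the dma limit first */
	return (0);
}

Note the cache is still sized by bufcachepercent (sysctl kern.bufcachepercent);
the diff just lets that percentage come out of all of physical memory instead
of only the DMA'able part, with b_dmamaxpages capping how much of it may sit
below the dma limit.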


Index: kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.206
diff -u -p -r1.206 kern_sysctl.c
--- kern_sysctl.c 5 Jul 2011 04:48:02 -0000 1.206
+++ kern_sysctl.c 7 Jul 2011 21:09:33 -0000
@@ -112,6 +112,7 @@ extern struct disklist_head disklist;
 extern fixpt_t ccpu;
 extern  long numvnodes;
 extern u_int mcllivelocks;
+extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
 
 extern void nmbclust_update(void);
 
@@ -566,8 +567,8 @@ kern_sysctl(int *name, u_int namelen, vo
  return (sysctl_int(oldp, oldlenp, newp, newlen,
     &rthreads_enabled));
  case KERN_CACHEPCT: {
- u_int64_t dmapages;
- int opct, pgs;
+ psize_t pgs;
+ int opct;
  opct = bufcachepercent;
  error = sysctl_int(oldp, oldlenp, newp, newlen,
     &bufcachepercent);
@@ -577,11 +578,13 @@ kern_sysctl(int *name, u_int namelen, vo
  bufcachepercent = opct;
  return (EINVAL);
  }
- dmapages = uvm_pagecount(&dma_constraint);
  if (bufcachepercent != opct) {
- pgs = bufcachepercent * dmapages / 100;
+ pgs = (b_highpages_total + b_dmapages_total)
+    * bufcachepercent / 100;
+ b_dmamaxpages = b_dmapages_total * bufcachepercent
+    / 100;
  bufadjust(pgs); /* adjust bufpages */
- bufhighpages = bufpages; /* set high water mark */
+ bufhighpages = bufpages;
  }
  return(0);
  }
Index: spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/spec_vnops.c,v
retrieving revision 1.67
diff -u -p -r1.67 spec_vnops.c
--- spec_vnops.c 5 Jul 2011 05:37:07 -0000 1.67
+++ spec_vnops.c 6 Jul 2011 22:44:00 -0000
@@ -457,7 +457,9 @@ spec_strategy(void *v)
  struct vop_strategy_args *ap = v;
  struct buf *bp = ap->a_bp;
  int maj = major(bp->b_dev);
-
+
+ if (!ISSET(bp->b_flags, B_DAQ) && ISSET(bp->b_flags, B_BC))
+ panic("bogus buf passed to spec_strategy");
  if (LIST_FIRST(&bp->b_dep) != NULL)
  buf_start(bp);
 
Index: vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.133
diff -u -p -r1.133 vfs_bio.c
--- vfs_bio.c 6 Jul 2011 20:50:05 -0000 1.133
+++ vfs_bio.c 7 Jul 2011 21:34:52 -0000
@@ -68,9 +68,12 @@
 #define BQ_DIRTY 0 /* LRU queue with dirty buffers */
 #define BQ_CLEAN 1 /* LRU queue with clean buffers */
 
-TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
-int needbuffer;
+struct uvm_constraint_range high_constraint;
 struct bio_ops bioops;
+TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+TAILQ_HEAD(bqda, buf) bufqueue_da;
+psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
+int needbuffer, needda;
 
 /*
  * Buffer pool for I/O buffers.
@@ -87,12 +91,13 @@ void buf_put(struct buf *);
 
 struct buf *bio_doread(struct vnode *, daddr64_t, int, int);
 struct buf *buf_get(struct vnode *, daddr64_t, size_t);
+struct buf *buf_remove_from_freelist(struct buf *);
 void bread_cluster_callback(struct buf *);
 
 /*
  * We keep a few counters to monitor the utilization of the buffer cache
  *
- *  numbufpages   - number of pages totally allocated.
+ *  numbufpages  - number of pages totally allocated.
  *  numdirtypages - number of pages on BQ_DIRTY queue.
  *  lodirtypages  - low water mark for buffer cleaning daemon.
  *  hidirtypages  - high water mark for buffer cleaning daemon.
@@ -110,14 +115,112 @@ long hicleanpages;
 long maxcleanpages;
 long backoffpages; /* backoff counter for page allocations */
 long buflowpages; /* bufpages low water mark */
-long bufhighpages; /* bufpages high water mark */
-long bufbackpages; /* number of pages we back off when asked to shrink */
+long bufhighpages; /* bufpages high water mark */
+long bufbackpages; /* number of pages we back off when asked to shrink */
+
+/* XXX - should be defined here but we have md issues */
+extern int bufcachepercent;
 
 vsize_t bufkvm;
 
 struct proc *cleanerproc;
 int bd_req; /* Sleep point for cleaner daemon. */
 
+/* nuke a buf off its freelist - returns next buf. skips busy buffers */
+struct buf *
+buf_remove_from_freelist(struct buf * bp)
+{
+ struct buf * nbp;
+ nbp = TAILQ_NEXT(bp, b_freelist);
+ /* skip busy buffers */
+ if (!ISSET(bp->b_flags, B_BUSY)) {
+ bremfree(bp);
+ if (bp->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+    &bp->b_vp->v_bufs_tree, bp);
+ brelvp(bp);
+ }
+ buf_put(bp);
+ }
+ return(nbp);
+}
+/*
+ * Add buf to the head of the dma reachable queue
+ * and ensure that it is dma reachable.
+ */
+void
+buf_daq_add(struct buf *buf)
+{
+ struct buf *b;
+ int s;
+
+start:
+ KASSERT(ISSET(buf->b_flags, B_BC));
+ KASSERT(ISSET(buf->b_flags, B_BUSY));
+ KASSERT(buf->b_pobj != NULL);
+ s = splbio();
+ /*
+ * if we are adding to the queue, ensure we free down below the
+ * max
+ */
+ while (b_highpages_total &&
+    (!ISSET(buf->b_flags, B_DAQ)) && (!ISSET(buf->b_flags, B_DMA)) &&
+    (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) {
+ b = TAILQ_FIRST(&bufqueue_da);
+ /* find first non-busy buffer */
+ while (b && ISSET(b->b_flags, B_BUSY))
+ b = TAILQ_NEXT(b, b_qda);
+ if (b == NULL) {
+ /* no non-busy buffers. */
+ needda++;
+ tsleep(&needda, PRIBIO, "needda", 0);
+ needda--;
+ splx(s);
+ goto start;
+ } else {
+ if (b_highpages_total) {
+ buf_acquire_unmapped(b);
+ /* move buffer to above dma reachable memory */
+ TAILQ_REMOVE(&bufqueue_da, b, b_qda);
+ buf_realloc_pages(b, &high_constraint);
+ if (ISSET(b->b_flags, B_DMA))
+ panic("B_DMA after high flip %p", b);
+ CLR(b->b_flags, B_DAQ);
+ buf_release(b);
+ splx(s);
+ goto start;
+ } else {
+       /* no high pages to flip to. */
+       needda++;
+       tsleep(&needda, PRIBIO, "needda", 0);
+       needda--;
+       splx(s);
+       goto start;
+ }
+ }
+ }
+ /* don't copy it if it's already in dma reachable memory */
+ if (ISSET(buf->b_flags, B_DMA)) {
+ /* buf already there, just move it to the end */
+ if (ISSET(buf->b_flags, B_DAQ))
+ TAILQ_REMOVE(&bufqueue_da, buf, b_qda);
+ TAILQ_INSERT_TAIL(&bufqueue_da, buf, b_qda);
+ SET(buf->b_flags, B_DAQ);
+ } else {
+ if (ISSET(buf->b_flags, B_DAQ))
+ panic("non-dma buffer on dma queue %p\n", buf);
+ /* move buf to dma reachable memory */
+ buf_realloc_pages(buf, &dma_constraint);
+ if (!ISSET(buf->b_flags, B_DMA))
+ panic("non-dma buffer after dma move %p\n", buf);
+ TAILQ_INSERT_TAIL(&bufqueue_da, buf, b_qda);
+ SET(buf->b_flags, B_DAQ);
+ }
+ splx(s);
+ return;
+
+}
+
 void
 bremfree(struct buf *bp)
 {
@@ -139,11 +242,10 @@ bremfree(struct buf *bp)
  if (dp == &bufqueues[BQUEUES])
  panic("bremfree: lost tail");
  }
- if (!ISSET(bp->b_flags, B_DELWRI)) {
+ if (!ISSET(bp->b_flags, B_DELWRI))
  bcstats.numcleanpages -= atop(bp->b_bufsize);
- } else {
+ else
  bcstats.numdirtypages -= atop(bp->b_bufsize);
- }
  TAILQ_REMOVE(dp, bp, b_freelist);
  bcstats.freebufs--;
 }
@@ -175,7 +277,10 @@ buf_put(struct buf *bp)
  if (backoffpages < 0)
  backoffpages = 0;
  }
-
+ if (ISSET(bp->b_flags, B_DAQ)) {
+ TAILQ_REMOVE(&bufqueue_da, bp, b_qda);
+ CLR(bp->b_flags, B_DAQ);
+ }
  if (buf_dealloc_mem(bp) != 0)
  return;
  pool_put(&bufpool, bp);
@@ -187,10 +292,22 @@ buf_put(struct buf *bp)
 void
 bufinit(void)
 {
- u_int64_t dmapages;
  struct bqueues *dp;
 
- dmapages = uvm_pagecount(&dma_constraint);
+ bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0;
+ /*
+ * XXX note this really is "high" - i.e. *above* dma_constraint
+ */
+ high_constraint.ucr_low = dma_constraint.ucr_high;
+ high_constraint.ucr_high = no_constraint.ucr_high;
+
+ /* do we have memory above dma_constraint, or not? */
+ if (high_constraint.ucr_low != high_constraint.ucr_high) {
+ high_constraint.ucr_low++;
+ b_highpages_total = uvm_pagecount(&high_constraint);
+ } else
+ b_highpages_total = 0;
+ b_dmapages_total = uvm_pagecount(&dma_constraint);
 
  /*
  * If MD code doesn't say otherwise, use 10% of kvm for mappings and
@@ -199,25 +316,31 @@ bufinit(void)
  if (bufcachepercent == 0)
  bufcachepercent = 10;
  if (bufpages == 0)
- bufpages = dmapages * bufcachepercent / 100;
+ bufpages = (b_highpages_total + b_dmapages_total)
+    * bufcachepercent / 100;
 
  bufhighpages = bufpages;
+ b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;
+
+ printf("buffer cache from %d dma pages and %d high pages\n",
+    b_dmapages_total, b_highpages_total);
 
  /*
  * set the base backoff level for the buffer cache to bufpages.
  * we will not allow uvm to steal back more than this number of
  * pages
  */
- buflowpages = dmapages * 10 / 100;
+ buflowpages = b_dmapages_total * 10 / 100;
 
  /*
- * set bufbackpages to 100 pages, or 10 percent of the low water mark
- * if we don't have that many pages.
+ * set bufbackpages to 1 MB worth of pages, or 10 percent of
+ * the low water mark if we don't have that many pages.
  */
 
  bufbackpages = buflowpages * 10 / 100;
- if (bufbackpages > 100)
- bufbackpages = 100;
+
+ if (bufbackpages > (1048576 / PAGE_SIZE))
+ bufbackpages = (1048576 / PAGE_SIZE);
 
  if (bufkvm == 0)
  bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10;
@@ -238,15 +361,16 @@ bufinit(void)
  pool_setipl(&bufpool, IPL_BIO);
  for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
  TAILQ_INIT(dp);
+ TAILQ_INIT(&bufqueue_da);
 
  /*
  * hmm - bufkvm is an argument because it's static, while
  * bufpages is global because it can change while running.
- */
+ */
  buf_mem_init(bufkvm);
 
- hidirtypages = (bufpages / 4) * 3;
- lodirtypages = bufpages / 2;
+ hidirtypages = (b_dmamaxpages / 4) * 3;
+ lodirtypages = b_dmamaxpages / 2;
 
  /*
  * When we hit 95% of pages being clean, we bring them down to
@@ -259,6 +383,39 @@ bufinit(void)
 }
 
 /*
+ * Flip some dma reachable cache pages high
+ */
+void
+bufhigh(int delta)
+{
+ psize_t newdmapages;
+ struct buf *b;
+ int s;
+
+ if (!b_highpages_total)
+ return;
+ s = splbio();
+ newdmapages = bcstats.dmapages - delta;
+ while ((bcstats.dmapages > newdmapages) &&
+    (b = TAILQ_FIRST(&bufqueue_da))) {
+ while (ISSET(b->b_flags, B_BUSY))
+ b = TAILQ_NEXT(b, b_qda);
+ if (b != NULL) {
+ buf_acquire_unmapped(b);
+ /* move buffer to above dma reachable memory */
+ buf_realloc_pages(b, &high_constraint);
+ if (ISSET(b->b_flags, B_DMA))
+ panic("DMA flagged buffer after high flip %p", b);
+ TAILQ_REMOVE(&bufqueue_da, b, b_qda);
+ CLR(b->b_flags, B_DAQ);
+ buf_release(b);
+ }
+ }
+ wakeup(&needda);
+ splx(s);
+}
+
+/*
  * Change cachepct
  */
 void
@@ -272,10 +429,19 @@ bufadjust(int newbufpages)
  int s;
 
  s = splbio();
+ /* XXX for hibernate  - throw away everything we can.*/
+ if (newbufpages == 0) {
+ bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+ while (bp)
+ bp = buf_remove_from_freelist(bp);
+ splx(s);
+ return;
+ }
+
  bufpages = newbufpages;
 
- hidirtypages = (bufpages / 4) * 3;
- lodirtypages = bufpages / 2;
+ hidirtypages = (b_dmamaxpages / 4) * 3;
+ lodirtypages = b_dmamaxpages / 2;
 
  /*
  * When we hit 95% of pages being clean, we bring them down to
@@ -291,16 +457,9 @@ bufadjust(int newbufpages)
  * free them up to get back down. this may possibly consume
  * all our clean pages...
  */
- while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
-    (bcstats.numbufpages > bufpages)) {
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
-    &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
- }
+ bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+ while (bp && (bcstats.numbufpages > bufpages))
+ bp = buf_remove_from_freelist(bp);
 
  /*
  * Wake up cleaner if we're getting low on pages. We might
@@ -336,23 +495,39 @@ bufbackoff(struct uvm_constraint_range *
  * On success, it frees N pages from the buffer cache, and sets
  * a flag so that the next N allocations from buf_get will recycle
  * a buffer rather than allocate a new one. It then returns 0 to the
- * caller.
+ * caller.
  *
  * on failure, it could free no pages from the buffer cache, does
- * nothing and returns -1 to the caller.
+ * nothing and returns -1 to the caller.
+ */
+
+ psize_t d, s;
+
+ /*
+ * back off by at least bufbackpages, or bufbackpages + what
+ * the pagedaemon needs if it happens to know when it calls us
  */
- long d;
+ s = (size > 0) ? bufbackpages + size : bufbackpages;
 
- if (bufpages <= buflowpages)
+ if (bufpages <= buflowpages)
  return(-1);
 
- if (bufpages - bufbackpages >= buflowpages)
- d = bufbackpages;
+ if (bufpages - s >= buflowpages)
+ d = s;
  else
  d = bufpages - buflowpages;
- backoffpages = bufbackpages;
- bufadjust(bufpages - d);
- backoffpages = bufbackpages;
+
+ if (b_highpages_total
+    && (range->ucr_high <= dma_constraint.ucr_high)) {
+ if (bcstats.dmapages - s > b_dmamaxpages)
+ s += (bcstats.dmapages - b_dmamaxpages);
+ bufhigh(s);
+ }
+ else {
+ backoffpages = bufbackpages;
+ bufadjust(bufpages - d);
+ backoffpages = bufbackpages;
+ }
  return(0);
 }
 
@@ -534,12 +709,18 @@ bread_cluster(struct vnode *vp, daddr64_
  for (i = 1; i < howmany; i++) {
  bcstats.pendingreads++;
  bcstats.numreads++;
- SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
+ /*
+ * We set B_DMA here because bp above should be B_DMA,
+ * and we are playing buffer slice-n-dice games
+ * from the memory allocated in bp.
+ */
+ SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
  xbpp[i]->b_blkno = sblkno + (i * inc);
  xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
  xbpp[i]->b_data = NULL;
  xbpp[i]->b_pobj = bp->b_pobj;
  xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+ buf_daq_add(xbpp[i]);
  }
 
  KASSERT(bp->b_lblkno == blkno + 1);
@@ -618,7 +799,7 @@ bwrite(struct buf *bp)
  reassignbuf(bp);
  } else
  curproc->p_stats->p_ru.ru_oublock++;
-
+
 
  /* Initiate disk write.  Make sure the appropriate party is charged. */
  bp->b_vp->v_numoutput++;
@@ -793,6 +974,8 @@ brelse(struct buf *bp)
  CLR(bp->b_flags, B_WANTED);
  wakeup(bp);
  }
+ if (ISSET(bp->b_flags, B_DMA) && needda)
+ wakeup(&needda);
  if (bp->b_vp != NULL)
  RB_REMOVE(buf_rb_bufs,
     &bp->b_vp->v_bufs_tree, bp);
@@ -833,19 +1016,6 @@ brelse(struct buf *bp)
  bcstats.freebufs++;
  CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
  buf_release(bp);
-
- /* Wake up any processes waiting for any buffer to become free. */
- if (needbuffer) {
- needbuffer--;
- wakeup(&needbuffer);
- }
-
- /* Wake up any processes waiting for _this_ buffer to become free. */
- if (ISSET(bp->b_flags, B_WANTED)) {
- CLR(bp->b_flags, B_WANTED);
- wakeup(bp);
- }
-
  splx(s);
 }
 
@@ -981,16 +1151,9 @@ buf_get(struct vnode *vp, daddr64_t blkn
  * free down to the low water mark.
  */
  if (bcstats.numcleanpages > hicleanpages) {
- while (bcstats.numcleanpages > locleanpages) {
- bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
-    &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
- }
+ bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+ while (bp && (bcstats.numcleanpages > locleanpages))
+ bp = buf_remove_from_freelist(bp);
  }
 
  npages = atop(round_page(size));
@@ -1002,15 +1165,9 @@ buf_get(struct vnode *vp, daddr64_t blkn
     || backoffpages) {
  int freemax = 5;
  int i = freemax;
- while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) {
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
-    &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
- }
+ bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+ while (bp && i--)
+ bp = buf_remove_from_freelist(bp);
  if (freemax == i &&
     (bcstats.numbufpages + npages > bufpages)) {
  needbuffer++;
@@ -1027,6 +1184,8 @@ buf_get(struct vnode *vp, daddr64_t blkn
  splx(s);
  return (NULL);
  }
+ /* Mark buffer as the cache's */
+ SET(bp->b_flags, B_BC);
 
  bp->b_freelist.tqe_next = NOLIST;
  bp->b_synctime = time_uptime + 300;
@@ -1041,7 +1200,7 @@ buf_get(struct vnode *vp, daddr64_t blkn
  * We insert the buffer into the hash with B_BUSY set
  * while we allocate pages for it. This way any getblk
  * that happens while we allocate pages will wait for
- * this buffer instead of starting its own guf_get.
+ * this buffer instead of starting its own buf_get.
  *
  * But first, we check if someone beat us to it.
  */
@@ -1067,10 +1226,9 @@ buf_get(struct vnode *vp, daddr64_t blkn
  if (size) {
  buf_alloc_pages(bp, round_page(size));
  buf_map(bp);
+ buf_daq_add(bp);
  }
-
  splx(s);
-
  return (bp);
 }
 
@@ -1082,23 +1240,46 @@ buf_daemon(struct proc *p)
 {
  struct timeval starttime, timediff;
  struct buf *bp;
- int s;
+ int s, nb, error;
 
  cleanerproc = curproc;
 
  s = splbio();
  for (;;) {
+ struct buf *nbp;
  if (bcstats.numdirtypages < hidirtypages)
  tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
 
  getmicrouptime(&starttime);
-
+start:
+ nb = 0;
  while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
  struct timeval tv;
+ nbp = TAILQ_NEXT(bp, b_freelist);
 
  if (bcstats.numdirtypages < lodirtypages)
  break;
 
+ /*
+ * If we haven't found any other buffers to
+ * process and this last one is busy, wait for
+ * it and restart. Otherwise, continue and
+ * process the rest of them.
+ *
+ */
+ if ((nb == 0) && (nbp == NULL) &&
+    ISSET(bp->b_flags, B_BUSY)) {
+ SET(bp->b_flags, B_WANTED);
+ error = tsleep(bp, PRIBIO + 1, "getblk", 0);
+ splx(s);
+ if (error)
+ return;
+ s = splbio();
+ goto start;
+ } else {
+ continue;
+ }
+ nb++;
  bremfree(bp);
  buf_acquire(bp);
  splx(s);
@@ -1132,7 +1313,6 @@ buf_daemon(struct proc *p)
  s = splbio();
  if (timediff.tv_sec)
  break;
-
  }
  }
 }
Index: vfs_biomem.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_biomem.c,v
retrieving revision 1.17
diff -u -p -r1.17 vfs_biomem.c
--- vfs_biomem.c 7 Apr 2011 19:07:42 -0000 1.17
+++ vfs_biomem.c 7 Jul 2011 21:17:09 -0000
@@ -33,6 +33,8 @@ TAILQ_HEAD(,buf) buf_valist;
 int buf_nkvmsleep;
 
 extern struct bcachestats bcstats;
+extern int needbuffer;
+extern int needda;
 
 /*
  * Pages are allocated from a uvm object (we only use it for page storage,
@@ -99,6 +101,11 @@ buf_acquire_unmapped(struct buf *bp)
 
  s = splbio();
  SET(bp->b_flags, B_BUSY|B_NOTMAPPED);
+ /* XXX */
+ if (bp->b_data != NULL) {
+ TAILQ_REMOVE(&buf_valist, bp, b_valist);
+ bcstats.busymapped++;
+ }
  splx(s);
 }
 
@@ -170,6 +177,24 @@ buf_release(struct buf *bp)
  }
  }
  CLR(bp->b_flags, B_BUSY|B_NOTMAPPED);
+ if (ISSET(bp->b_flags, B_DMA) && needda) {
+ wakeup(&needda);
+ }
+ /* Wake up any processes waiting for any buffer to become free. */
+ if (needbuffer) {
+ needbuffer--;
+ wakeup(&needbuffer);
+ }
+
+ /*
+ * Wake up any processes waiting for _this_ buffer to become
+ * free.
+ */
+
+ if (ISSET(bp->b_flags, B_WANTED)) {
+ CLR(bp->b_flags, B_WANTED);
+ wakeup(bp);
+ }
  splx(s);
 }
 
@@ -286,6 +311,8 @@ buf_alloc_pages(struct buf *bp, vsize_t
 
  uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
  bcstats.numbufpages += atop(size);
+ bcstats.dmapages += atop(size);
+ SET(bp->b_flags, B_DMA);
  bp->b_pobj = buf_object;
  bp->b_poffs = offs;
  bp->b_bufsize = size;
@@ -302,6 +329,7 @@ buf_free_pages(struct buf *bp)
 
  KASSERT(bp->b_data == NULL);
  KASSERT(uobj != NULL);
+ KASSERT(!ISSET(bp->b_flags, B_DAQ));
 
  s = splbio();
 
@@ -316,11 +344,57 @@ buf_free_pages(struct buf *bp)
  pg->wire_count = 0;
  uvm_pagefree(pg);
  bcstats.numbufpages--;
+ if (ISSET(bp->b_flags, B_DMA))
+ bcstats.dmapages--;
  }
+ CLR(bp->b_flags, B_DMA);
  splx(s);
 }
 
-/*
- * XXX - it might make sense to make a buf_realloc_pages to avoid
- *       bouncing through the free list all the time.
- */
+/* Reallocate a buf into a particular location specified by "where" */
+void
+buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where)
+{
+ vaddr_t va;
+ int dma;
+   int s, i;
+
+ s = splbio();
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+ dma = ISSET(bp->b_flags, B_DMA);
+
+ /* if the original buf is mapped, unmap it */
+ if (bp->b_data != NULL) {
+ va = (vaddr_t)bp->b_data;
+ pmap_kremove(va, bp->b_bufsize);
+ pmap_update(pmap_kernel());
+ }
+ uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs, bp->b_bufsize,
+    UVM_PLA_WAITOK, where);
+ /*
+ * do this now, and put it back later when we know where we are
+ */
+ if (dma)
+ bcstats.dmapages -= atop(bp->b_bufsize);
+
+ dma = 1;
+ /* if the original buf was mapped, re-map it */
+ for (i = 0; i < atop(bp->b_bufsize); i++) {
+ struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+    bp->b_poffs + ptoa(i));
+ KASSERT(pg != NULL);
+ if  (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
+ dma = 0;
+ if (bp->b_data != NULL) {
+ pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+    VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ }
+ }
+ if (dma) {
+ SET(bp->b_flags, B_DMA);
+ bcstats.dmapages += atop(bp->b_bufsize);
+ } else
+ CLR(bp->b_flags, B_DMA);
+ splx(s);
+}
Index: vfs_vops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vops.c,v
retrieving revision 1.4
diff -u -p -r1.4 vfs_vops.c
--- vfs_vops.c 2 Jul 2011 15:52:25 -0000 1.4
+++ vfs_vops.c 6 Jul 2011 22:39:28 -0000
@@ -614,6 +614,17 @@ VOP_STRATEGY(struct buf *bp)
  if (bp->b_vp->v_op->vop_strategy == NULL)
  return (EOPNOTSUPP);
 
+ /*
+ * Flip buffer to dma reachable memory if
+ * necessary.
+ *
+ * XXX if you're making your own buffers and not
+ * having the buffer cache manage them then it's your
+ * problem to ensure they can be dma'ed to and from.
+ */
+ if (ISSET(bp->b_flags, B_BC))
+ buf_daq_add(bp);
+
  return ((bp->b_vp->v_op->vop_strategy)(&a));
 }


Re: Bigger Buffer Cache - Flip Buffers above/below the DMA'able region

Bob Beck-4
oops. ignore this - diff in wrong directory.

