[PATCH] Optimized rasops32 putchar

classic Classic list List threaded Threaded
6 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] Optimized rasops32 putchar

johnc
Optimized 32 bit character rendering with unrolled rows and pairwise
foreground / background pixel rendering.

If it weren't for the 5x8 font, I would have just assumed everything
was an even width and made the fallback path also pairwise.

In isolation, the 16x32 character case got 2x faster, but that wasn't
a huge real world speedup where the space rendering that was already
at memory bandwidth limits accounted for most of the character
rendering time.  However, in combination with the previous fast
conditional console scrolling that removes most of the space rendering,
it becomes significant.

I also found that at least the efi and intel framebuffers are not
currently mapped write combining, which makes this much slower than
it should be.


Index: rasops32.c
===================================================================
RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
retrieving revision 1.10
diff -u -p -r1.10 rasops32.c
--- rasops32.c 25 May 2020 09:55:49 -0000 1.10
+++ rasops32.c 26 Jun 2020 14:34:06 -0000
@@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
 int
 rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
attr)
 {
- int width, height, cnt, fs, fb, clr[2];
+ int width, height, step, cnt, fs, b, f;
+ uint32_t fb, clr[2];
  struct rasops_info *ri;
- int32_t *dp, *rp;
+ int64_t *rp, q;
+ union {
+ int64_t q[4];
+ int32_t d[4][2];
+ } u;
  u_char *fr;
 
  ri = (struct rasops_info *)cookie;
@@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row,
  return 0;
 #endif
 
- rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
+ rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
 
  height = ri->ri_font->fontheight;
  width = ri->ri_font->fontwidth;
+ step = ri->ri_stride >> 3;
 
- clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
- clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
+ b = ri->ri_devcmap[(attr >> 16) & 0xf];
+ f = ri->ri_devcmap[(attr >> 24) & 0xf];
+ u.d[0][0] = b; u.d[0][1] = b;
+ u.d[1][0] = b; u.d[1][1] = f;
+ u.d[2][0] = f; u.d[2][1] = b;
+ u.d[3][0] = f; u.d[3][1] = f;
 
  if (uc == ' ') {
+ q = u.q[0];
  while (height--) {
- dp = rp;
- DELTA(rp, ri->ri_stride, int32_t *);
-
- for (cnt = width; cnt; cnt--)
- *dp++ = clr[0];
+ /* the general, pixel-at-a-time case is fast enough */
+ for (cnt = 0; cnt < width; cnt++)
+ ((int *)rp)[cnt] = b;
+ rp += step;
  }
  } else {
  uc -= ri->ri_font->firstchar;
  fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
  fs = ri->ri_font->stride;
-
- while (height--) {
- dp = rp;
- fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
-    (fr[0] << 24);
- fr += fs;
- DELTA(rp, ri->ri_stride, int32_t *);
-
- for (cnt = width; cnt; cnt--) {
- *dp++ = clr[(fb >> 31) & 1];
- fb <<= 1;
- }
+ /* double-pixel special cases for the common widths */
+ switch (width) {
+ case 8:
+ while (height--) {
+ fb = fr[0];
+ rp[0] = u.q[fb >> 6];
+ rp[1] = u.q[(fb >> 4) & 3];
+ rp[2] = u.q[(fb >> 2) & 3];
+ rp[3] = u.q[fb & 3];
+ rp += step;
+ fr += 1;
+ }
+ break;
+
+ case 12:
+ while (height--) {
+ fb = fr[0];
+ rp[0] = u.q[fb >> 6];
+ rp[1] = u.q[(fb >> 4) & 3];
+ rp[2] = u.q[(fb >> 2) & 3];
+ rp[3] = u.q[fb & 3];
+ fb = fr[1];
+ rp[4] = u.q[fb >> 6];
+ rp[5] = u.q[(fb >> 4) & 3];
+ rp += step;
+ fr += 2;
+ }
+ break;
+
+ case 16:
+ while (height--) {
+ fb = fr[0];
+ rp[0] = u.q[fb >> 6];
+ rp[1] = u.q[(fb >> 4) & 3];
+ rp[2] = u.q[(fb >> 2) & 3];
+ rp[3] = u.q[fb & 3];
+ fb = fr[1];
+ rp[4] = u.q[fb >> 6];
+ rp[5] = u.q[(fb >> 4) & 3];
+ rp[6] = u.q[(fb >> 2) & 3];
+ rp[7] = u.q[fb & 3];
+ rp += step;
+ fr += 2;
+ }
+ break;
+ case 32:
+ while (height--) {
+ fb = fr[0];
+ rp[0] = u.q[fb >> 6];
+ rp[1] = u.q[(fb >> 4) & 3];
+ rp[2] = u.q[(fb >> 2) & 3];
+ rp[3] = u.q[fb & 3];
+ fb = fr[1];
+ rp[4] = u.q[fb >> 6];
+ rp[5] = u.q[(fb >> 4) & 3];
+ rp[6] = u.q[(fb >> 2) & 3];
+ rp[7] = u.q[fb & 3];
+ fb = fr[2];
+ rp[8] = u.q[fb >> 6];
+ rp[9] = u.q[(fb >> 4) & 3];
+ rp[10] = u.q[(fb >> 2) & 3];
+ rp[11] = u.q[fb & 3];
+ fb = fr[3];
+ rp[12] = u.q[fb >> 6];
+ rp[13] = u.q[(fb >> 4) & 3];
+ rp[14] = u.q[(fb >> 2) & 3];
+ rp[15] = u.q[fb & 3];
+ rp += step;
+ fr += 4;
+ }
+ break;
+
+
+ default: /* there is a 5x8 font, so fall back to per-pixel */
+ clr[0] = b;
+ clr[1] = f;
+ while (height--) {
+ fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
+    (fr[0] << 24);
+ fr += fs;
+ for (cnt = 0; cnt < width; cnt++) {
+ ((int *)rp)[cnt] = clr[fb >> 31];
+ fb <<= 1;
+ }
+ rp += step;
+ }
+ break;
  }
  }
 
- /* Do underline */
+ /* Do underline a pixel at a time */
  if ((attr & 1) != 0) {
- DELTA(rp, -(ri->ri_stride << 1), int32_t *);
-
- while (width--)
- *rp++ = clr[1];
+ rp -= step;
+ for (cnt = 0; cnt < width ; cnt++)
+ ((int *)rp)[cnt] = f;
  }
 
  return 0;
 }
+

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH} Optimized rasops32 putchar

Mark Kettenis
> From: <[hidden email]>
> Date: Fri, 26 Jun 2020 07:42:50 -0700
>
> Optimized 32 bit character rendering with unrolled rows and pairwise
> foreground / background pixel rendering.
>
> If it weren't for the 5x8 font, I would have just assumed everything
> was an even width and made the fallback path also pairwise.
>
> In isolation, the 16x32 character case got 2x faster, but that wasn't
> a huge real world speedup where the space rendering that was already
> at memory bandwidth limits accounted for most of the character
> rendering time.  However, in combination with the previous fast
> conditional console scrolling that removes most of the space rendering,
> it becomes significant.
>
> I also found that at least the efi and intel framebuffers are not
> currently mapped write combining, which makes this much slower than
> it should be.

Hi John,

The framebuffer should be mapped write-combining.  In OpenBSD this is
requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
bus_space_map(9) when mapping the framebuffer.

I'm fairly confident that it is, since until last January the initial
mapping of the framebuffer that we used wasn't write-combining, and
things were really, really slow.

Cheers,

Mark

> Index: rasops32.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 rasops32.c
> --- rasops32.c 25 May 2020 09:55:49 -0000 1.10
> +++ rasops32.c 26 Jun 2020 14:34:06 -0000
> @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
>  int
>  rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> attr)
>  {
> - int width, height, cnt, fs, fb, clr[2];
> + int width, height, step, cnt, fs, b, f;
> + uint32_t fb, clr[2];
>   struct rasops_info *ri;
> - int32_t *dp, *rp;
> + int64_t *rp, q;
> + union {
> + int64_t q[4];
> + int32_t d[4][2];
> + } u;
>   u_char *fr;
>  
>   ri = (struct rasops_info *)cookie;
> @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row,
>   return 0;
>  #endif
>  
> - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
>  
>   height = ri->ri_font->fontheight;
>   width = ri->ri_font->fontwidth;
> + step = ri->ri_stride >> 3;
>  
> - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> + b = ri->ri_devcmap[(attr >> 16) & 0xf];
> + f = ri->ri_devcmap[(attr >> 24) & 0xf];
> + u.d[0][0] = b; u.d[0][1] = b;
> + u.d[1][0] = b; u.d[1][1] = f;
> + u.d[2][0] = f; u.d[2][1] = b;
> + u.d[3][0] = f; u.d[3][1] = f;
>  
>   if (uc == ' ') {
> + q = u.q[0];
>   while (height--) {
> - dp = rp;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--)
> - *dp++ = clr[0];
> + /* the general, pixel-at-a-time case is fast enough */
> + for (cnt = 0; cnt < width; cnt++)
> + ((int *)rp)[cnt] = b;
> + rp += step;
>   }
>   } else {
>   uc -= ri->ri_font->firstchar;
>   fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
>   fs = ri->ri_font->stride;
> -
> - while (height--) {
> - dp = rp;
> - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> -    (fr[0] << 24);
> - fr += fs;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--) {
> - *dp++ = clr[(fb >> 31) & 1];
> - fb <<= 1;
> - }
> + /* double-pixel special cases for the common widths */
> + switch (width) {
> + case 8:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + rp += step;
> + fr += 1;
> + }
> + break;
> +
> + case 12:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> +
> + case 16:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> + case 32:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + fb = fr[2];
> + rp[8] = u.q[fb >> 6];
> + rp[9] = u.q[(fb >> 4) & 3];
> + rp[10] = u.q[(fb >> 2) & 3];
> + rp[11] = u.q[fb & 3];
> + fb = fr[3];
> + rp[12] = u.q[fb >> 6];
> + rp[13] = u.q[(fb >> 4) & 3];
> + rp[14] = u.q[(fb >> 2) & 3];
> + rp[15] = u.q[fb & 3];
> + rp += step;
> + fr += 4;
> + }
> + break;
> +
> +
> + default: /* there is a 5x8 font, so fall back to per-pixel */
> + clr[0] = b;
> + clr[1] = f;
> + while (height--) {
> + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> +    (fr[0] << 24);
> + fr += fs;
> + for (cnt = 0; cnt < width; cnt++) {
> + ((int *)rp)[cnt] = clr[fb >> 31];
> + fb <<= 1;
> + }
> + rp += step;
> + }
> + break;
>   }
>   }
>  
> - /* Do underline */
> + /* Do underline a pixel at a time */
>   if ((attr & 1) != 0) {
> - DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> -
> - while (width--)
> - *rp++ = clr[1];
> + rp -= step;
> + for (cnt = 0; cnt < width ; cnt++)
> + ((int *)rp)[cnt] = f;
>   }
>  
>   return 0;
>  }
> +
>
>

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH} Optimized rasops32 putchar

johnc
In reply to this post by johnc
I believe it is mapped as normally cached right now, rather than
uncached or write combining.

Reads aren't ultra-slow, and the timings of 48 byte writes appear to
involve a cacheline read.

128 byte writes are actually slower than 64 byte writes, which I
guessed might be because of automatic prefetching kicking in and
reading the following cacheline.


-------- Original Message --------
Subject: Re: [PATCH} Optimized rasops32 putchar
From: Mark Kettenis <[hidden email]>
Date: Sat, June 27, 2020 7:56 am
To: <[hidden email]>
Cc: [hidden email]

> From: <[hidden email]>
> Date: Fri, 26 Jun 2020 07:42:50 -0700
>
> Optimized 32 bit character rendering with unrolled rows and pairwise
> foreground / background pixel rendering.
>
> If it weren't for the 5x8 font, I would have just assumed everything
> was an even width and made the fallback path also pairwise.
>
> In isolation, the 16x32 character case got 2x faster, but that wasn't
> a huge real world speedup where the space rendering that was already
> at memory bandwidth limits accounted for most of the character
> rendering time. However, in combination with the previous fast
> conditional console scrolling that removes most of the space rendering,
> it becomes significant.
>
> I also found that at least the efi and intel framebuffers are not
> currently mapped write combining, which makes this much slower than
> it should be.

Hi John,

The framebuffer should be mapped write-combining. In OpenBSD this is
requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
bus_space_map(9) when mapping the framebuffer.

I'm fairly confident since until last January the initial mapping of
the framebuffer that we used wasn't write-combining. And things were
really, really slow.

Cheers,

Mark

> Index: rasops32.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 rasops32.c
> --- rasops32.c 25 May 2020 09:55:49 -0000 1.10
> +++ rasops32.c 26 Jun 2020 14:34:06 -0000
> @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
> int
> rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> attr)
> {
> - int width, height, cnt, fs, fb, clr[2];
> + int width, height, step, cnt, fs, b, f;
> + uint32_t fb, clr[2];
> struct rasops_info *ri;
> - int32_t *dp, *rp;
> + int64_t *rp, q;
> + union {
> + int64_t q[4];
> + int32_t d[4][2];
> + } u;
> u_char *fr;
>
> ri = (struct rasops_info *)cookie;
> @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row,
> return 0;
> #endif
>
> - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
>
> height = ri->ri_font->fontheight;
> width = ri->ri_font->fontwidth;
> + step = ri->ri_stride >> 3;
>
> - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> + b = ri->ri_devcmap[(attr >> 16) & 0xf];
> + f = ri->ri_devcmap[(attr >> 24) & 0xf];
> + u.d[0][0] = b; u.d[0][1] = b;
> + u.d[1][0] = b; u.d[1][1] = f;
> + u.d[2][0] = f; u.d[2][1] = b;
> + u.d[3][0] = f; u.d[3][1] = f;
>
> if (uc == ' ') {
> + q = u.q[0];
> while (height--) {
> - dp = rp;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--)
> - *dp++ = clr[0];
> + /* the general, pixel-at-a-time case is fast enough */
> + for (cnt = 0; cnt < width; cnt++)
> + ((int *)rp)[cnt] = b;
> + rp += step;
> }
> } else {
> uc -= ri->ri_font->firstchar;
> fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
> fs = ri->ri_font->stride;
> -
> - while (height--) {
> - dp = rp;
> - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> - (fr[0] << 24);
> - fr += fs;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--) {
> - *dp++ = clr[(fb >> 31) & 1];
> - fb <<= 1;
> - }
> + /* double-pixel special cases for the common widths */
> + switch (width) {
> + case 8:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + rp += step;
> + fr += 1;
> + }
> + break;
> +
> + case 12:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> +
> + case 16:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> + case 32:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + fb = fr[2];
> + rp[8] = u.q[fb >> 6];
> + rp[9] = u.q[(fb >> 4) & 3];
> + rp[10] = u.q[(fb >> 2) & 3];
> + rp[11] = u.q[fb & 3];
> + fb = fr[3];
> + rp[12] = u.q[fb >> 6];
> + rp[13] = u.q[(fb >> 4) & 3];
> + rp[14] = u.q[(fb >> 2) & 3];
> + rp[15] = u.q[fb & 3];
> + rp += step;
> + fr += 4;
> + }
> + break;
> +
> +
> + default: /* there is a 5x8 font, so fall back to per-pixel */
> + clr[0] = b;
> + clr[1] = f;
> + while (height--) {
> + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> + (fr[0] << 24);
> + fr += fs;
> + for (cnt = 0; cnt < width; cnt++) {
> + ((int *)rp)[cnt] = clr[fb >> 31];
> + fb <<= 1;
> + }
> + rp += step;
> + }
> + break;
> }
> }
>
> - /* Do underline */
> + /* Do underline a pixel at a time */
> if ((attr & 1) != 0) {
> - DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> -
> - while (width--)
> - *rp++ = clr[1];
> + rp -= step;
> + for (cnt = 0; cnt < width ; cnt++)
> + ((int *)rp)[cnt] = f;
> }
>
> return 0;
> }
> +
>
>

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH} Optimized rasops32 putchar

johnc
In reply to this post by johnc
I was doing my timings with a user mode program after mmapping the
efifb display, so the mapping might be different in the kernel.

Related to that, I was going to add mmap / WSDISPLAYIO_LINEBYTES /
WSDISPLAYIO_SMODE to the drm drivers by consolidating code into
rasops. While the point of the DRM drivers is to get fully hardware
accelerated drawing in X, there isn't any reason why they can't
support dumb framebuffer mappings as well.


-------- Original Message --------
Subject: RE: [PATCH} Optimized rasops32 putchar
From: <[hidden email]>
Date: Sat, June 27, 2020 11:13 am
To: "Mark Kettenis" <[hidden email]>
Cc: "[hidden email]" <[hidden email]>

I believe it is mapped as normally cached right now, rather than
uncached or write combining.

Reads aren't ultra-slow, and the timings of 48 byte writes appear to
involve a cacheline read.

128 byte writes are actually slower than 64 byte writes, which I
guessed might be because of automatic prefetching kicking in and
reading the following cacheline.


-------- Original Message --------
Subject: Re: [PATCH} Optimized rasops32 putchar
From: Mark Kettenis <[hidden email]>
Date: Sat, June 27, 2020 7:56 am
To: <[hidden email]>
Cc: [hidden email]

> From: <[hidden email]>
> Date: Fri, 26 Jun 2020 07:42:50 -0700
>
> Optimized 32 bit character rendering with unrolled rows and pairwise
> foreground / background pixel rendering.
>
> If it weren't for the 5x8 font, I would have just assumed everything
> was an even width and made the fallback path also pairwise.
>
> In isolation, the 16x32 character case got 2x faster, but that wasn't
> a huge real world speedup where the space rendering that was already
> at memory bandwidth limits accounted for most of the character
> rendering time. However, in combination with the previous fast
> conditional console scrolling that removes most of the space rendering,
> it becomes significant.
>
> I also found that at least the efi and intel framebuffers are not
> currently mapped write combining, which makes this much slower than
> it should be.

Hi John,

The framebuffer should be mapped write-combining. In OpenBSD this is
requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
bus_space_map(9) when mapping the framebuffer.

I'm fairly confident since until last January the initial mapping of
the framebuffer that we used wasn't write-combining. And things were
really, really slow.

Cheers,

Mark

> Index: rasops32.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 rasops32.c
> --- rasops32.c 25 May 2020 09:55:49 -0000 1.10
> +++ rasops32.c 26 Jun 2020 14:34:06 -0000
> @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
> int
> rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> attr)
> {
> - int width, height, cnt, fs, fb, clr[2];
> + int width, height, step, cnt, fs, b, f;
> + uint32_t fb, clr[2];
> struct rasops_info *ri;
> - int32_t *dp, *rp;
> + int64_t *rp, q;
> + union {
> + int64_t q[4];
> + int32_t d[4][2];
> + } u;
> u_char *fr;
>
> ri = (struct rasops_info *)cookie;
> @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row,
> return 0;
> #endif
>
> - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
>
> height = ri->ri_font->fontheight;
> width = ri->ri_font->fontwidth;
> + step = ri->ri_stride >> 3;
>
> - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> + b = ri->ri_devcmap[(attr >> 16) & 0xf];
> + f = ri->ri_devcmap[(attr >> 24) & 0xf];
> + u.d[0][0] = b; u.d[0][1] = b;
> + u.d[1][0] = b; u.d[1][1] = f;
> + u.d[2][0] = f; u.d[2][1] = b;
> + u.d[3][0] = f; u.d[3][1] = f;
>
> if (uc == ' ') {
> + q = u.q[0];
> while (height--) {
> - dp = rp;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--)
> - *dp++ = clr[0];
> + /* the general, pixel-at-a-time case is fast enough */
> + for (cnt = 0; cnt < width; cnt++)
> + ((int *)rp)[cnt] = b;
> + rp += step;
> }
> } else {
> uc -= ri->ri_font->firstchar;
> fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
> fs = ri->ri_font->stride;
> -
> - while (height--) {
> - dp = rp;
> - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> - (fr[0] << 24);
> - fr += fs;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--) {
> - *dp++ = clr[(fb >> 31) & 1];
> - fb <<= 1;
> - }
> + /* double-pixel special cases for the common widths */
> + switch (width) {
> + case 8:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + rp += step;
> + fr += 1;
> + }
> + break;
> +
> + case 12:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> +
> + case 16:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> + case 32:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + fb = fr[2];
> + rp[8] = u.q[fb >> 6];
> + rp[9] = u.q[(fb >> 4) & 3];
> + rp[10] = u.q[(fb >> 2) & 3];
> + rp[11] = u.q[fb & 3];
> + fb = fr[3];
> + rp[12] = u.q[fb >> 6];
> + rp[13] = u.q[(fb >> 4) & 3];
> + rp[14] = u.q[(fb >> 2) & 3];
> + rp[15] = u.q[fb & 3];
> + rp += step;
> + fr += 4;
> + }
> + break;
> +
> +
> + default: /* there is a 5x8 font, so fall back to per-pixel */
> + clr[0] = b;
> + clr[1] = f;
> + while (height--) {
> + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> + (fr[0] << 24);
> + fr += fs;
> + for (cnt = 0; cnt < width; cnt++) {
> + ((int *)rp)[cnt] = clr[fb >> 31];
> + fb <<= 1;
> + }
> + rp += step;
> + }
> + break;
> }
> }
>
> - /* Do underline */
> + /* Do underline a pixel at a time */
> if ((attr & 1) != 0) {
> - DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> -
> - while (width--)
> - *rp++ = clr[1];
> + rp -= step;
> + for (cnt = 0; cnt < width ; cnt++)
> + ((int *)rp)[cnt] = f;
> }
>
> return 0;
> }
> +
>
>

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH} Optimized rasops32 putchar

Mark Kettenis
> Content-Type: text/plain; charset="utf-8"
> From: <[hidden email]>
>
> I was doing my timings with a user mode program after mmapping the
> efifb display, so the mapping might be different in the kernel.

That should still give you a write-combining mapping as efifb_mmap()
adds the PMAP_WC flag to the physical address.

Cachable on x86 means write-back cachable.  And using a write-back
cachable mapping for a framebuffer often leads to interesting "damage"
where pixels in certain cache lines show up "late" on the display.
Not sure if you'd see that on recent Intel graphics hardware as the
current hardware designs are much more coherent than what was produced
in the past.

> Related to that, I was going to add mmap / WSDISPLAYIO_LINEBYTES /
> WSDISPLAYIO_SMODE to the drm drivers by consolidating code into
> rasops. While the point of the DRM drivers is to get fully hardware
> accelerated drawing in X, there isn't any reason why they can't
> support dumb framebuffer mappings as well.

True.  Although there are DRM interfaces that give you a dumb
framebuffer as well.  Using those interfaces is a bit more complicated
though.

Centralising the code would be good.  That code probably should use
bus_space_mmap(4) as the PMAP_WC flag is amd64-specific.
Unfortunately the amd64 implementation of bus_space_mmap(4) is
incomplete and doesn't actually implement write-combining for mappings
with the BUS_SPACE_MAP_PREFETCHABLE flag set.  So that has to be fixed
as well.

> -------- Original Message --------
> Subject: RE: [PATCH} Optimized rasops32 putchar
> From: <[hidden email]>
> Date: Sat, June 27, 2020 11:13 am
> To: "Mark Kettenis" <[hidden email]>
> Cc: "[hidden email]" <[hidden email]>
>
> I believe it is mapped as normally cached right now, rather than
> uncached or write combining.
>
> Reads aren't ultra-slow, and the timings of 48 byte writes appear to
> involve a cacheline read.
>
> 128 byte writes are actually slower than 64 byte writes, which I
> guessed might be because of automatic prefetching kicking in and
> reading the following cacheline.
>
>
> -------- Original Message --------
> Subject: Re: [PATCH} Optimized rasops32 putchar
> From: Mark Kettenis <[hidden email]>
> Date: Sat, June 27, 2020 7:56 am
> To: <[hidden email]>
> Cc: [hidden email]
>
> > From: <[hidden email]>
> > Date: Fri, 26 Jun 2020 07:42:50 -0700
> >
> > Optimized 32 bit character rendering with unrolled rows and pairwise
> > foreground / background pixel rendering.
> >
> > If it weren't for the 5x8 font, I would have just assumed everything
> > was an even width and made the fallback path also pairwise.
> >
> > In isolation, the 16x32 character case got 2x faster, but that wasn't
> > a huge real world speedup where the space rendering that was already
> > at memory bandwidth limits accounted for most of the character
> > rendering time. However, in combination with the previous fast
> > conditional console scrolling that removes most of the space rendering,
> > it becomes significant.
> >
> > I also found that at least the efi and intel framebuffers are not
> > currently mapped write combining, which makes this much slower than
> > it should be.
>
> Hi John,
>
> The framebuffer should be mapped write-combining. In OpenBSD this is
> requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
> bus_space_map(9) when mapping the framebuffer.
>
> I'm fairly confident since until last January the initial mapping of
> the framebuffer that we used wasn't write-combining. And things were
> really, really slow.
>
> Cheers,
>
> Mark
>
> > Index: rasops32.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> > retrieving revision 1.10
> > diff -u -p -r1.10 rasops32.c
> > --- rasops32.c 25 May 2020 09:55:49 -0000 1.10
> > +++ rasops32.c 26 Jun 2020 14:34:06 -0000
> > @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
> > int
> > rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> > attr)
> > {
> > - int width, height, cnt, fs, fb, clr[2];
> > + int width, height, step, cnt, fs, b, f;
> > + uint32_t fb, clr[2];
> > struct rasops_info *ri;
> > - int32_t *dp, *rp;
> > + int64_t *rp, q;
> > + union {
> > + int64_t q[4];
> > + int32_t d[4][2];
> > + } u;
> > u_char *fr;
> >
> > ri = (struct rasops_info *)cookie;
> > @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row,
> > return 0;
> > #endif
> >
> > - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> > + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> >
> > height = ri->ri_font->fontheight;
> > width = ri->ri_font->fontwidth;
> > + step = ri->ri_stride >> 3;
> >
> > - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> > - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> > + b = ri->ri_devcmap[(attr >> 16) & 0xf];
> > + f = ri->ri_devcmap[(attr >> 24) & 0xf];
> > + u.d[0][0] = b; u.d[0][1] = b;
> > + u.d[1][0] = b; u.d[1][1] = f;
> > + u.d[2][0] = f; u.d[2][1] = b;
> > + u.d[3][0] = f; u.d[3][1] = f;
> >
> > if (uc == ' ') {
> > + q = u.q[0];
> > while (height--) {
> > - dp = rp;
> > - DELTA(rp, ri->ri_stride, int32_t *);
> > -
> > - for (cnt = width; cnt; cnt--)
> > - *dp++ = clr[0];
> > + /* the general, pixel-at-a-time case is fast enough */
> > + for (cnt = 0; cnt < width; cnt++)
> > + ((int *)rp)[cnt] = b;
> > + rp += step;
> > }
> > } else {
> > uc -= ri->ri_font->firstchar;
> > fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
> > fs = ri->ri_font->stride;
> > -
> > - while (height--) {
> > - dp = rp;
> > - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> > - (fr[0] << 24);
> > - fr += fs;
> > - DELTA(rp, ri->ri_stride, int32_t *);
> > -
> > - for (cnt = width; cnt; cnt--) {
> > - *dp++ = clr[(fb >> 31) & 1];
> > - fb <<= 1;
> > - }
> > + /* double-pixel special cases for the common widths */
> > + switch (width) {
> > + case 8:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + rp += step;
> > + fr += 1;
> > + }
> > + break;
> > +
> > + case 12:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + fb = fr[1];
> > + rp[4] = u.q[fb >> 6];
> > + rp[5] = u.q[(fb >> 4) & 3];
> > + rp += step;
> > + fr += 2;
> > + }
> > + break;
> > +
> > + case 16:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + fb = fr[1];
> > + rp[4] = u.q[fb >> 6];
> > + rp[5] = u.q[(fb >> 4) & 3];
> > + rp[6] = u.q[(fb >> 2) & 3];
> > + rp[7] = u.q[fb & 3];
> > + rp += step;
> > + fr += 2;
> > + }
> > + break;
> > + case 32:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + fb = fr[1];
> > + rp[4] = u.q[fb >> 6];
> > + rp[5] = u.q[(fb >> 4) & 3];
> > + rp[6] = u.q[(fb >> 2) & 3];
> > + rp[7] = u.q[fb & 3];
> > + fb = fr[2];
> > + rp[8] = u.q[fb >> 6];
> > + rp[9] = u.q[(fb >> 4) & 3];
> > + rp[10] = u.q[(fb >> 2) & 3];
> > + rp[11] = u.q[fb & 3];
> > + fb = fr[3];
> > + rp[12] = u.q[fb >> 6];
> > + rp[13] = u.q[(fb >> 4) & 3];
> > + rp[14] = u.q[(fb >> 2) & 3];
> > + rp[15] = u.q[fb & 3];
> > + rp += step;
> > + fr += 4;
> > + }
> > + break;
> > +
> > +
> > + default: /* there is a 5x8 font, so fall back to per-pixel */
> > + clr[0] = b;
> > + clr[1] = f;
> > + while (height--) {
> > + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> > + (fr[0] << 24);
> > + fr += fs;
> > + for (cnt = 0; cnt < width; cnt++) {
> > + ((int *)rp)[cnt] = clr[fb >> 31];
> > + fb <<= 1;
> > + }
> > + rp += step;
> > + }
> > + break;
> > }
> > }
> >
> > - /* Do underline */
> > + /* Do underline a pixel at a time */
> > if ((attr & 1) != 0) {
> > - DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> > -
> > - while (width--)
> > - *rp++ = clr[1];
> > + rp -= step;
> > + for (cnt = 0; cnt < width ; cnt++)
> > + ((int *)rp)[cnt] = f;
> > }
> >
> > return 0;
> > }
> > +
> >
> >
>

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH} Optimized rasops32 putchar

johnc
In reply to this post by johnc
I did some more tests, and I think the odd performance I am seeing
may be due to TLB thrash on the 32x64 characters with 4k pages,
since writing each character will require 64 data TLB entries.

Are huge page mappings supported in OpenBSD?

-------- Original Message --------
Subject: Re: [PATCH} Optimized rasops32 putchar
From: Mark Kettenis <[hidden email]>
Date: Sat, June 27, 2020 1:30 pm
To: <[hidden email]>
Cc: [hidden email]

> Content-Type: text/plain; charset="utf-8"
> From: <[hidden email]>
>
> I was doing my timings with a user mode program after mmapping the
> efifb display, so the mapping might be different in the kernel.

That should still give you a write-combining mapping as efifb_mmap()
adds the PMAP_WC flag to the physical address.

Cachable on x86 means write-back cachable. And using a write-back
cachable mapping for a framebuffer often leads to interesting "damage"
where pixels in certain cache lines show up "late" on the display.
Not sure if you'd see that on recent Intel graphics hardware as the
current hardware designs are much more coherent than what was produced
in the past.

> Related to that, I was going to add mmap / WSDISPLAYIO_LINEBYTES /
> WSDISPLAYIO_SMODE to the drm drivers by consolidating code into
> rasops. While the point of the DRM drivers is to get fully hardware
> accelerated drawing in X, there isn't any reason why they can't
> support dumb framebuffer mappings as well.

True. Although there are DRM interfaces that give you a dumb
framebuffer as well. Using those interfaces is a bit more complicated
though.

Centralising the code would be good. That code probably should use
bus_space_mmap(4) as the PMAP_WC flag is amd64-specific.
Unfortunately the amd64 implementation of bus_space_mmap(4) is
incomplete and doesn't actually implement write-combining for mappings
with the BUS_SPACE_MAP_PREFETCHABLE flag set. So that has to be fixed
as well.

> -------- Original Message --------
> Subject: RE: [PATCH} Optimized rasops32 putchar
> From: <[hidden email]>
> Date: Sat, June 27, 2020 11:13 am
> To: "Mark Kettenis" <[hidden email]>
> Cc: "[hidden email]" <[hidden email]>
>
> I believe it is mapped as normally cached right now, rather than
> uncached or write combining.
>
> Reads aren't ultra-slow, and the timings of 48 byte writes appear to
> involve a cacheline read.
>
> 128 byte writes are actually slower than 64 byte writes, which I
> guessed might be because of automatic prefetching kicking in and
> reading the following cacheline.
>
>
> -------- Original Message --------
> Subject: Re: [PATCH} Optimized rasops32 putchar
> From: Mark Kettenis <[hidden email]>
> Date: Sat, June 27, 2020 7:56 am
> To: <[hidden email]>
> Cc: [hidden email]
>
> > From: <[hidden email]>
> > Date: Fri, 26 Jun 2020 07:42:50 -0700
> >
> > Optimized 32 bit character rendering with unrolled rows and pairwise
> > foreground / background pixel rendering.
> >
> > If it weren't for the 5x8 font, I would have just assumed everything
> > was an even width and made the fallback path also pairwise.
> >
> > In isolation, the 16x32 character case got 2x faster, but that wasn't
> > a huge real world speedup where the space rendering that was already
> > at memory bandwidth limits accounted for most of the character
> > rendering time. However, in combination with the previous fast
> > conditional console scrolling that removes most of the space rendering,
> > it becomes significant.
> >
> > I also found that at least the efi and intel framebuffers are not
> > currently mapped write combining, which makes this much slower than
> > it should be.
>
> Hi John,
>
> The framebuffer should be mapped write-combining. In OpenBSD this is
> requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
> bus_space_map(9) when mapping the framebuffer.
>
> I'm fairly confident since until last January the initial mapping of
> the framebuffer that we used wasn't write-combining. And things were
> really, really slow.
>
> Cheers,
>
> Mark
>
> > Index: rasops32.c
> > ===================================================================
> > RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> > retrieving revision 1.10
> > diff -u -p -r1.10 rasops32.c
> > --- rasops32.c 25 May 2020 09:55:49 -0000 1.10
> > +++ rasops32.c 26 Jun 2020 14:34:06 -0000
> > @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
> > int
> > rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> > attr)
> > {
> > - int width, height, cnt, fs, fb, clr[2];
> > + int width, height, step, cnt, fs, b, f;
> > + uint32_t fb, clr[2];
> > struct rasops_info *ri;
> > - int32_t *dp, *rp;
> > + int64_t *rp, q;
> > + union {
> > + int64_t q[4];
> > + int32_t d[4][2];
> > + } u;
> > u_char *fr;
> >
> > ri = (struct rasops_info *)cookie;
> > @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row,
> > return 0;
> > #endif
> >
> > - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> > + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> >
> > height = ri->ri_font->fontheight;
> > width = ri->ri_font->fontwidth;
> > + step = ri->ri_stride >> 3;
> >
> > - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> > - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> > + b = ri->ri_devcmap[(attr >> 16) & 0xf];
> > + f = ri->ri_devcmap[(attr >> 24) & 0xf];
> > + u.d[0][0] = b; u.d[0][1] = b;
> > + u.d[1][0] = b; u.d[1][1] = f;
> > + u.d[2][0] = f; u.d[2][1] = b;
> > + u.d[3][0] = f; u.d[3][1] = f;
> >
> > if (uc == ' ') {
> > + q = u.q[0];
> > while (height--) {
> > - dp = rp;
> > - DELTA(rp, ri->ri_stride, int32_t *);
> > -
> > - for (cnt = width; cnt; cnt--)
> > - *dp++ = clr[0];
> > + /* the general, pixel-at-a-time case is fast enough */
> > + for (cnt = 0; cnt < width; cnt++)
> > + ((int *)rp)[cnt] = b;
> > + rp += step;
> > }
> > } else {
> > uc -= ri->ri_font->firstchar;
> > fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
> > fs = ri->ri_font->stride;
> > -
> > - while (height--) {
> > - dp = rp;
> > - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> > - (fr[0] << 24);
> > - fr += fs;
> > - DELTA(rp, ri->ri_stride, int32_t *);
> > -
> > - for (cnt = width; cnt; cnt--) {
> > - *dp++ = clr[(fb >> 31) & 1];
> > - fb <<= 1;
> > - }
> > + /* double-pixel special cases for the common widths */
> > + switch (width) {
> > + case 8:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + rp += step;
> > + fr += 1;
> > + }
> > + break;
> > +
> > + case 12:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + fb = fr[1];
> > + rp[4] = u.q[fb >> 6];
> > + rp[5] = u.q[(fb >> 4) & 3];
> > + rp += step;
> > + fr += 2;
> > + }
> > + break;
> > +
> > + case 16:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + fb = fr[1];
> > + rp[4] = u.q[fb >> 6];
> > + rp[5] = u.q[(fb >> 4) & 3];
> > + rp[6] = u.q[(fb >> 2) & 3];
> > + rp[7] = u.q[fb & 3];
> > + rp += step;
> > + fr += 2;
> > + }
> > + break;
> > + case 32:
> > + while (height--) {
> > + fb = fr[0];
> > + rp[0] = u.q[fb >> 6];
> > + rp[1] = u.q[(fb >> 4) & 3];
> > + rp[2] = u.q[(fb >> 2) & 3];
> > + rp[3] = u.q[fb & 3];
> > + fb = fr[1];
> > + rp[4] = u.q[fb >> 6];
> > + rp[5] = u.q[(fb >> 4) & 3];
> > + rp[6] = u.q[(fb >> 2) & 3];
> > + rp[7] = u.q[fb & 3];
> > + fb = fr[2];
> > + rp[8] = u.q[fb >> 6];
> > + rp[9] = u.q[(fb >> 4) & 3];
> > + rp[10] = u.q[(fb >> 2) & 3];
> > + rp[11] = u.q[fb & 3];
> > + fb = fr[3];
> > + rp[12] = u.q[fb >> 6];
> > + rp[13] = u.q[(fb >> 4) & 3];
> > + rp[14] = u.q[(fb >> 2) & 3];
> > + rp[15] = u.q[fb & 3];
> > + rp += step;
> > + fr += 4;
> > + }
> > + break;
> > +
> > +
> > + default: /* there is a 5x8 font, so fall back to per-pixel */
> > + clr[0] = b;
> > + clr[1] = f;
> > + while (height--) {
> > + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> > + (fr[0] << 24);
> > + fr += fs;
> > + for (cnt = 0; cnt < width; cnt++) {
> > + ((int *)rp)[cnt] = clr[fb >> 31];
> > + fb <<= 1;
> > + }
> > + rp += step;
> > + }
> > + break;
> > }
> > }
> >
> > - /* Do underline */
> > + /* Do underline a pixel at a time */
> > if ((attr & 1) != 0) {
> > - DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> > -
> > - while (width--)
> > - *rp++ = clr[1];
> > + rp -= step;
> > + for (cnt = 0; cnt < width ; cnt++)
> > + ((int *)rp)[cnt] = f;
> > }
> >
> > return 0;
> > }
> > +
> >
> >
>