Quantcast

improving qsort worst case behavior

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
4 messages Options
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

improving qsort worst case behavior

Todd C. Miller
On Wed, 17 May 2017 19:15:45 +0200, Ingo Schwarze wrote:

> For the record, this commit changes worst-case stack space requirements
> from O(n) to O(log n).  The following are unchanged:
>
>  - average stack space:  O(log n)
>  - average run time:     O(n log n)
>  - worst case run time:  O(n^2)
>
> The latter is "ouch", but unavoidable for Quicksort-based algos.

There are two main approaches one can take to avoid quadratic run
time in quicksort:

1. Choose a random pivot.  This is just a matter of replacing the
   "median of three" pivot selection with a uniform random index.
   However, even with our arc4random() rarely calling into the
   kernel the overhead added makes this unattractive.  You might
   as well just use a different sort function like mergesort or
   headsort.

2. Switch to a different sort function if quicksort starts to
   go quadratic.  The difficulty with this is knowing when to
   switch.  Since quicksort is recursive we can't easily track
   the number of compares or swaps.  However, we can limit the
   recursion depth.  This corresponds to David Musser's "Introsort"
   (http://www.cs.rpi.edu/~musser/gp/introsort.ps).

   In essence, if the recursion depth exceeds 2*lg(n), switch to
   heapsort().  This results in a worst case run time that is still
   O(n log n) without appreciably affecting the average case.  There
   will be inputs where heapsort() gets used that would not have
   resulted in quadratic behavior from quicksort but this doesn't
   cause performance problems in my testing.

Another interesting approach to investigate is to use more pivots.
Dual-Pivot Quicksort by Vladimir Yaroslavskiy was published in 2009
(http://codeblab.com/wp-content/uploads/2009/09/DualPivotQuicksort.pdf).
and since then other multi-pivot quicksort algorithms have been
proposed.  A recent paper with analysis on this is:
https://cs.uwaterloo.ca/~skushagr/papers/multipivotQuicksort.pdf
While using 2 or 3 pivots improves the average run time on modern
hardware, it is not clear that it improves the worst case.

I believe the best approach is to switch qsort.c to "introsort".
The changes are minimal and the elimination of the O(n^2) worst
case is compelling.

It is possible for heapsort(3) to fail due to lack of memory as it
needs to allocate a single element for swapping.  If this happens
(unlikely) we can just continue with quicksort.  This will result
in maxdepth wrapping but that is defined behavior since it is
unsigned.

 - todd

Index: lib/libc/stdlib/qsort.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/qsort.c,v
retrieving revision 1.15
diff -u -p -u -r1.15 qsort.c
--- lib/libc/stdlib/qsort.c 17 May 2017 16:58:20 -0000 1.15
+++ lib/libc/stdlib/qsort.c 18 May 2017 15:51:48 -0000
@@ -38,6 +38,18 @@ static __inline void swapfunc(char *, c
 
 /*
  * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ *
+ * This version differs from Bentley & McIlroy in the following ways:
+ *   1. The partition value is swapped into a[0] instead of being
+ * stored out of line for simplicity's sake.
+ *
+ *   2. It uses David Musser's introsort algorithm to fall back to
+ * heapsort(3) when the recursion depth reaches 2*lg(n + 1).
+ * This avoids qsort's quadratic behavior for pathological
+ * input without appreciably changing the average run time.
+ *
+ *   3. Tail recursion is eliminated when sorting the larger of two
+ * subpartitions to save stack space.
  */
 #define swapcode(TYPE, parmi, parmj, n) { \
  size_t i = (n) / sizeof (TYPE); \
@@ -80,15 +92,20 @@ med3(char *a, char *b, char *c, int (*cm
               :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
 }
 
-void
-qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *))
+static void
+introsort(char *a, size_t n, size_t es, size_t maxdepth,
+    int (*cmp)(const void *, const void *))
 {
  char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
  int cmp_result, swaptype;
  size_t d, r, s;
- char *a = aa;
 
-loop: SWAPINIT(a, es);
+loop: if (maxdepth == 0) {
+ if (heapsort(a, n, es, cmp) == 0)
+ return;
+ }
+ maxdepth--;
+ SWAPINIT(a, es);
  if (n < 7) {
  for (pm = a + es; pm < a + n * es; pm += es)
  for (pl = pm; pl > a && cmp(pl - es, pl) > 0;
@@ -110,7 +127,6 @@ loop: SWAPINIT(a, es);
  }
  swap(a, pm);
  pa = pb = a + es;
-
  pc = pd = a + (n - 1) * es;
  for (;;) {
  while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) {
@@ -149,7 +165,7 @@ loop: SWAPINIT(a, es);
  /* Recurse for 1st side, iterate for 2nd side. */
  if (s > es) {
  if (r > es)
- qsort(a, r / es, es, cmp);
+ introsort(a, r / es, es, maxdepth, cmp);
  a = pn - s;
  n = s / es;
  goto loop;
@@ -158,10 +174,24 @@ loop: SWAPINIT(a, es);
  /* Recurse for 2nd side, iterate for 1st side. */
  if (r > es) {
  if (s > es)
- qsort(pn - s, s / es, es, cmp);
+ introsort(pn - s, s / es, es, maxdepth, cmp);
  n = r / es;
  goto loop;
  }
  }
 }
+
+void
+qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *))
+{
+ size_t i, maxdepth = 0;
+
+ /* Approximate 2*ceil(lg(n + 1)) */
+ for (i = n; i > 0; i >>= 1)
+ maxdepth++;
+ maxdepth *= 2;
+
+ introsort(a, n, es, maxdepth, cmp);
+}
+
 DEF_STRONG(qsort);

Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: improving qsort worst case behavior

Todd C. Miller
On Thu, 18 May 2017 09:58:14 -0600, "Todd C. Miller" wrote:

> I believe the best approach is to switch qsort.c to "introsort".
> The changes are minimal and the elimination of the O(n^2) worst
> case is compelling.

I've added input arrays to the qsort regress test that exhibit
quadratic behavior in our quicksort.  The introsort diff prevents
qsort from going quadratic.

McIlroy's "A Killer Adversary for Quicksort" was used to generate
the test vectors.  http://www.cs.dartmouth.edu/~doug/mdmspe.pdf

 - todd

Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: improving qsort worst case behavior

Otto Moerbeek
On Fri, May 19, 2017 at 09:04:30AM -0600, Todd C. Miller wrote:

> On Thu, 18 May 2017 09:58:14 -0600, "Todd C. Miller" wrote:
>
> > I believe the best approach is to switch qsort.c to "introsort".
> > The changes are minimal and the elimination of the O(n^2) worst
> > case is compelling.
>
> I've added input arrays to the qsort regress test that exhibit
> quadratic behavior in our quicksort.  The introsort diff prevents
> qsort from going quadratic.
>
> McIlroy's "A Killer Adversary for Quicksort" was used to generate
> the test vectors.  http://www.cs.dartmouth.edu/~doug/mdmspe.pdf
>
>  - todd

Hmm, on arm I get:

cc -O2 -pipe -g -Wimplicit -I/usr/src/lib/libc/include
-I/usr/src/lib/libc/hidden -D__LIBC__
-Werror-implicit-function-declaration -include namespace.h
-Werror=deprecated-declarations -DAPIWARN -DYP -I/usr/src/lib/libc/yp
-DSOFTFLOAT_FOR_GCC -I/usr/src/lib/libc/softfloat -I/usr/src/lib/libc
-I/usr/src/lib/libc/gdtoa -I/usr/src/lib/libc/arch/arm/gdtoa
-DINFNAN_CHECK -DMULTIPLE_THREADS -DNO_FENV_H -DUSE_LOCALE
-I/usr/src/lib/libc -I/usr/src/lib/libc/citrus -DRESOLVSORT
-DFLOATING_POINT -DPRINTF_WIDE_CHAR -DSCANF_WIDE_CHAR   -DSOFTFLOAT -c
/usr/src/lib/libc/stdlib/qsort.c -o qsort.o
/usr/src/lib/libc/stdlib/qsort.c: In function 'introsort':
/usr/src/lib/libc/stdlib/qsort.c:104: error: 'heapsort' is deprecated
(declared at /usr/src/lib/libc/hidden/stdlib.h:95)
*** Error 1 in /usr/src/lib/libc (<bsd.lib.mk>:41 'qsort.o': @cc -O2
-pipe -g -Wimplicit -I/usr/src/lib/libc/include -I/usr/src/lib/libc/hid...)

        -Otto

Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: improving qsort worst case behavior

Otto Moerbeek
On Sat, May 20, 2017 at 10:20:42AM +0200, Otto Moerbeek wrote:

> On Fri, May 19, 2017 at 09:04:30AM -0600, Todd C. Miller wrote:
>
> > On Thu, 18 May 2017 09:58:14 -0600, "Todd C. Miller" wrote:
> >
> > > I believe the best approach is to switch qsort.c to "introsort".
> > > The changes are minimal and the elimination of the O(n^2) worst
> > > case is compelling.
> >
> > I've added input arrays to the qsort regress test that exhibit
> > quadratic behavior in our quicksort.  The introsort diff prevents
> > qsort from going quadratic.
> >
> > McIlroy's "A Killer Adversary for Quicksort" was used to generate
> > the test vectors.  http://www.cs.dartmouth.edu/~doug/mdmspe.pdf
> >
> >  - todd
>
> Hmm, on arm I get:
>
> cc -O2 -pipe -g -Wimplicit -I/usr/src/lib/libc/include
> -I/usr/src/lib/libc/hidden -D__LIBC__
> -Werror-implicit-function-declaration -include namespace.h
> -Werror=deprecated-declarations -DAPIWARN -DYP -I/usr/src/lib/libc/yp
> -DSOFTFLOAT_FOR_GCC -I/usr/src/lib/libc/softfloat -I/usr/src/lib/libc
> -I/usr/src/lib/libc/gdtoa -I/usr/src/lib/libc/arch/arm/gdtoa
> -DINFNAN_CHECK -DMULTIPLE_THREADS -DNO_FENV_H -DUSE_LOCALE
> -I/usr/src/lib/libc -I/usr/src/lib/libc/citrus -DRESOLVSORT
> -DFLOATING_POINT -DPRINTF_WIDE_CHAR -DSCANF_WIDE_CHAR   -DSOFTFLOAT -c
> /usr/src/lib/libc/stdlib/qsort.c -o qsort.o
> /usr/src/lib/libc/stdlib/qsort.c: In function 'introsort':
> /usr/src/lib/libc/stdlib/qsort.c:104: error: 'heapsort' is deprecated
> (declared at /usr/src/lib/libc/hidden/stdlib.h:95)
> *** Error 1 in /usr/src/lib/libc (<bsd.lib.mk>:41 'qsort.o': @cc -O2
> -pipe -g -Wimplicit -I/usr/src/lib/libc/include -I/usr/src/lib/libc/hid...)
>
> -Otto

Since heapsort is now called from within libc, we need to modify it's
attributes (see lib/libc/include/README)

With this, it's ok otto@
       
        -Otto

Index: hidden/stdlib.h
===================================================================
RCS file: /cvs/src/lib/libc/hidden/stdlib.h,v
retrieving revision 1.10
diff -u -p -r1.10 stdlib.h
--- hidden/stdlib.h 10 Apr 2017 05:45:02 -0000 1.10
+++ hidden/stdlib.h 20 May 2017 11:54:19 -0000
@@ -92,7 +92,7 @@ PROTO_DEPRECATED(getloadavg);
 PROTO_DEPRECATED(getprogname);
 PROTO_DEPRECATED(getsubopt);
 PROTO_DEPRECATED(grantpt);
-PROTO_DEPRECATED(heapsort);
+PROTO_NORMAL(heapsort);
 PROTO_DEPRECATED(initstate);
 PROTO_DEPRECATED(jrand48);
 PROTO_DEPRECATED(l64a);
Index: stdlib/heapsort.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/heapsort.c,v
retrieving revision 1.10
diff -u -p -r1.10 heapsort.c
--- stdlib/heapsort.c 22 Oct 2016 19:19:34 -0000 1.10
+++ stdlib/heapsort.c 20 May 2017 11:54:19 -0000
@@ -172,3 +172,4 @@ heapsort(void *vbase, size_t nmemb, size
  free(k);
  return (0);
 }
+DEF_WEAK(heapsort);
Index: stdlib/qsort.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/qsort.c,v
retrieving revision 1.15
diff -u -p -r1.15 qsort.c
--- stdlib/qsort.c 17 May 2017 16:58:20 -0000 1.15
+++ stdlib/qsort.c 20 May 2017 11:54:19 -0000
@@ -38,6 +38,18 @@ static __inline void swapfunc(char *, c
 
 /*
  * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ *
+ * This version differs from Bentley & McIlroy in the following ways:
+ *   1. The partition value is swapped into a[0] instead of being
+ * stored out of line for simplicity's sake.
+ *
+ *   2. It uses David Musser's introsort algorithm to fall back to
+ * heapsort(3) when the recursion depth reaches 2*lg(n + 1).
+ * This avoids qsort's quadratic behavior for pathological
+ * input without appreciably changing the average run time.
+ *
+ *   3. Tail recursion is eliminated when sorting the larger of two
+ * subpartitions to save stack space.
  */
 #define swapcode(TYPE, parmi, parmj, n) { \
  size_t i = (n) / sizeof (TYPE); \
@@ -80,15 +92,20 @@ med3(char *a, char *b, char *c, int (*cm
               :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
 }
 
-void
-qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *))
+static void
+introsort(char *a, size_t n, size_t es, size_t maxdepth,
+    int (*cmp)(const void *, const void *))
 {
  char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
  int cmp_result, swaptype;
  size_t d, r, s;
- char *a = aa;
 
-loop: SWAPINIT(a, es);
+loop: if (maxdepth == 0) {
+ if (heapsort(a, n, es, cmp) == 0)
+ return;
+ }
+ maxdepth--;
+ SWAPINIT(a, es);
  if (n < 7) {
  for (pm = a + es; pm < a + n * es; pm += es)
  for (pl = pm; pl > a && cmp(pl - es, pl) > 0;
@@ -110,7 +127,6 @@ loop: SWAPINIT(a, es);
  }
  swap(a, pm);
  pa = pb = a + es;
-
  pc = pd = a + (n - 1) * es;
  for (;;) {
  while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) {
@@ -149,7 +165,7 @@ loop: SWAPINIT(a, es);
  /* Recurse for 1st side, iterate for 2nd side. */
  if (s > es) {
  if (r > es)
- qsort(a, r / es, es, cmp);
+ introsort(a, r / es, es, maxdepth, cmp);
  a = pn - s;
  n = s / es;
  goto loop;
@@ -158,10 +174,24 @@ loop: SWAPINIT(a, es);
  /* Recurse for 2nd side, iterate for 1st side. */
  if (r > es) {
  if (s > es)
- qsort(pn - s, s / es, es, cmp);
+ introsort(pn - s, s / es, es, maxdepth, cmp);
  n = r / es;
  goto loop;
  }
  }
 }
+
+void
+qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *))
+{
+ size_t i, maxdepth = 0;
+
+ /* Approximate 2*ceil(lg(n + 1)) */
+ for (i = n; i > 0; i >>= 1)
+ maxdepth++;
+ maxdepth *= 2;
+
+ introsort(a, n, es, maxdepth, cmp);
+}
+
 DEF_STRONG(qsort);

Loading...