Quicksort - why is my flagship flag implementation slower than my Hoare-2 sections implementation?

As a training exercise, I implement the Quicksort algorithm in C. Pivot is a 3-value median, and for sections with 4 or less elements, I switch to Sorting Sort.

Now I tested two options: one uses the Hoare partition scheme , the other uses the Dutch flag .

UPDATE Included the whole file for both options.

Hoara:

#include <stdlib.h> #include "quicksort.h" #define THRESHOLD 4 #define SWAP(a, b) \ { \ char *a_swap = (a); \ char *b_swap = (b); \ int size_swap = size_q; \ char tmp; \ while(size_swap-- > 0) {\ tmp = *a_swap; \ *a_swap++ = *b_swap;\ *b_swap++ = tmp; \ } \ } #define MEDIAN_OF_3(left, mid, right) \ { \ char *l = (left); \ char *m = (mid); \ char *r = (right); \ if((*cmp_q)((void *)m, (void *)l) < 0) {\ SWAP(m, l); \ } \ if((*cmp_q)((void *)r, (void *)m) < 0) {\ SWAP(r, m); \ } else { \ goto jump; \ } \ if((*cmp_q)((void *)m, (void *)l) < 0) {\ SWAP(m, l); \ } \ jump:; \ } #define COPY(dest, src) \ { \ char *src_copy = (src); \ char *dest_copy = (dest); \ size_t size_copy = size_q; \ while(size_copy-- > 0) { \ *dest_copy++ = *src_copy++; \ } \ } static size_t size_q = 0; static char *e = NULL; static int (*cmp_q)(const void *, const void *) = NULL; void sort(char *left, char *right) { int elements = (right+size_q-left)/size_q; //========== QUICKSORT ========== if(elements > THRESHOLD) { //========== PIVOT = MEDIAN OF THREE ========== char *mid = left+size_q*((right-left)/size_q>>1); MEDIAN_OF_3(left, mid, right); char *pivot = mid; //========== PARTITIONING ========== char *left_part = left+size_q; char *right_part = right-size_q; while(left_part < right_part) { while((*cmp_q)((void *)left_part, (void *)pivot) < 0) { left_part += size_q; } while((*cmp_q)((void *)right_part, (void *)pivot) > 0) { right_part -= size_q; } if(left_part < right_part) { SWAP(left_part, right_part); if(pivot == left_part) { pivot = right_part; } else if(pivot == right_part) { pivot = left_part; } left_part += size_q; right_part -= size_q; } } //========== RECURSIVE CALLS ========== sort(left, right_part); sort(left_part, right); } else if(elements > 1) { //========== INSERTION SORT ========== char *i, *j; for(i = left+size_q; i <= right; i += size_q) { if((*cmp_q)((void *)i, (void *)(i-size_q)) < 0) { COPY(e, i); for(j = i-size_q; j >= left && (*cmp_q)((void *)e, (void *)j) < 0; j -= size_q) { COPY(j+size_q, j); } COPY(j+size_q, e); } } } } void quicksort(void *array, size_t num, size_t size, int (*cmp)(const void *a, const void *b)) { char *array_q = (char *)array; size_q = size; cmp_q = cmp; e = malloc(size_q); sort(array_q, array_q+size_q*(num-1)); free(e); } 

Dutch flag:

 #include <stdlib.h> #include "quicksort.h" #define THRESHOLD 4 #define SWAP(a, b) \ { \ char *a_q = (a); \ char *b_q = (b); \ int size_swap = size_q; \ char tmp; \ while(size_swap-- > 0) {\ tmp = *a_q; \ *a_q++ = *b_q; \ *b_q++ = tmp; \ } \ \ } #define MEDIAN_OF_3(left, mid, right) \ { \ char *l = (left); \ char *m = (mid); \ char *r = (right); \ if((*cmp_q)((void *)m, (void *)l) < 0) {\ SWAP(m, l); \ } \ if((*cmp_q)((void *)r, (void *)m) < 0) {\ SWAP(r, m); \ } else { \ goto jump; \ } \ if((*cmp_q)((void *)m, (void *)l) < 0) {\ SWAP(m, l); \ } \ jump:; \ } #define COPY(dest, src) \ { \ char *src_copy = (src); \ char *dest_copy = (dest); \ size_t size_copy = size_q; \ while(size_copy-- > 0) { \ *dest_copy++ = *src_copy++; \ } \ } static size_t size_q = 0; static char *pivot = NULL; static char *e = NULL; static int (*cmp_q)(const void *, const void *) = NULL; void sort(char *left, char *right) { int elements = (right+size_q-left)/size_q; //========== QUICKSORT ========== if(elements > THRESHOLD) { //========== PIVOT = MEDIAN OF THREE ========== char *mid = left+size_q*((right-left)/size_q>>1); MEDIAN_OF_3(left, mid, right); COPY(pivot, mid); //========== 3-WAY PARTITIONING (DUTCH FLAG PROBLEM) ========== char *less = left; char *equal = left; char *greater = right; int value; while(equal <= greater) { value = (*cmp_q)((void *)equal, (void *)pivot); if(value < 0) { SWAP(less, equal); less += size_q; equal += size_q; } else if(value > 0) { SWAP(equal, greater); greater -= size_q; } else { equal += size_q; } } //========== RECURSIVE CALLS ========== sort(left, less-size_q); sort(greater+size_q, right); } else if(elements > 1) { //========== INSERTION SORT ========== char *i, *j; for(i = left+size_q; i <= right; i += size_q) { if((*cmp_q)((void *)i, (void *)(i-size_q)) < 0) { COPY(e, i); for(j = i-size_q; j >= left && (*cmp_q)((void *)e, (void *)j) < 0; j -= size_q) { COPY(j+size_q, j); } COPY(j+size_q, e); } } } } void quicksort(void *array, size_t num, size_t size, int (*cmp)(const void *a, const void *b)) { char *array_q = (char *)array; size_q = size; cmp_q = cmp; pivot = malloc(size_q); e = malloc(size_q); sort(array_q, array_q+size_q*(num-1)); free(pivot); free(e); } 

Both will receive the same input, a series of files, each of which contains 10^n random integer values ​​with the range [0:(10^n)+1] . n varies from 1 to 7 (from 10 to 10 million elements). I expected that the implementation of the Dutch flag would be at least as fast as Hoar, but it was not.

Flags: -O3

 Implementation Size Runs Time Hoare 10^7 10 avg=2.148s Dutch Flag 10^7 10 avg=3.312s 

Then I changed the input: the same size, 10^n , but with the values [0:10^(n-1)] , which guaranteed a lot of duplicate values.

Result:

 Implementation Size Runs Time Hoare 10^7 10 avg=0.170s Dutch Flag 10^7 10 avg=0.260s 

Even for repeated values, the Dutch flag is slower than that of Hoar. What for? It does not look like the selected rod is unique.

My environment, if that matters:

 CPU=Intel(R) Core(TM) i7-6820HK @ 2.70GHz VM OS=Linux version 4.4.0-36-generic, Ubuntu 16.04.2, gcc version 5.4.0 Host=Microsoft Windows 10 Home IDE=Eclipse CDT Neon 
+5
source share
1 answer
  • Do not use malloc and free . They are used in every recursive call (total N times), and this takes a lot of time.

  • Comparison will be more useful if you enable optimization ( -O3 ).

  • Is SWAP macro or a function? If this is a function, try making it inline .

+3
source