-rw-r--r--  Makefile        |  18
-rw-r--r--  sorters.c       |   1
-rw-r--r--  sorters.h       |   5
-rw-r--r--  wikisort_ref.cc | 969
4 files changed, 985 insertions(+), 8 deletions(-)
diff --git a/Makefile b/Makefile
index a2aada1..a454294 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,11 @@
-CFLAGS = -Os -pipe -std=c99 -D_XOPEN_SOURCE=500
-LDFLAGS = -lm
+COMMONFLAGS = -Os -pipe -D_XOPEN_SOURCE=500
+CFLAGS = -std=c99
+CXXFLAGS = -std=c++11
+LIBS = -lm
-SRCS = $(sort $(wildcard *.c))
-OBJS = $(SRCS:.c=.o)
+SRCS_C = $(sort $(wildcard *.c))
+SRCS_CC = $(sort $(wildcard *.cc))
+OBJS = $(SRCS_C:.c=.o) $(SRCS_CC:.cc=.o)
BINS = bench
.PHONY: all clean
@@ -14,7 +17,10 @@ clean:
rm -f $(OBJS)
bench: $(OBJS)
- $(CC) $(LDFLAGS) $^ -o $@
+ $(CXX) $^ -o $@ $(LIBS)
+
+%.o: %.cc
+ $(CXX) $(COMMONFLAGS) $(CXXFLAGS) -c $< -o $@
%.o: %.c
- $(CC) $(CFLAGS) -c $< -o $@
+ $(CC) $(COMMONFLAGS) $(CFLAGS) -c $< -o $@
diff --git a/sorters.c b/sorters.c
index 8db94e0..c2d04e5 100644
--- a/sorters.c
+++ b/sorters.c
@@ -9,6 +9,7 @@ const struct sorter sorters[] = {
{ .name = "glibc mergesort", .func = glibc_mergesort },
{ .name = "musl", .func = musl_qsort },
{ .name = "wikisort", .func = wikisort },
+ { .name = "wikisort (ref)", .func = wikisort_ref },
{ 0 }
};
diff --git a/sorters.h b/sorters.h
index d451813..669f5c4 100644
--- a/sorters.h
+++ b/sorters.h
@@ -3,15 +3,16 @@
void assert_sorted(int *buffer, size_t size);
typedef int (*cmpfun)(const void *, const void *);
-typedef void (*sorter)(void *, size_t, size_t, cmpfun);
+typedef void (*sorterfn)(void *, size_t, size_t, cmpfun);
void musl_qsort(void *, size_t, size_t, cmpfun);
void glibc_quicksort(void *, size_t, size_t, cmpfun);
void glibc_mergesort(void *, size_t, size_t, cmpfun);
void freebsd_qsort(void *, size_t, size_t, cmpfun);
void wikisort(void *, size_t, size_t, cmpfun);
+void wikisort_ref(void *, size_t, size_t, cmpfun);
extern const struct sorter {
const char *name;
- sorter func;
+ sorterfn func;
} sorters[];
diff --git a/wikisort_ref.cc b/wikisort_ref.cc
new file mode 100644
index 0000000..f6b970b
--- /dev/null
+++ b/wikisort_ref.cc
@@ -0,0 +1,969 @@
+/***********************************************************
+ WikiSort: a public domain implementation of "Block Sort"
+ https://github.com/BonzaiThePenguin/WikiSort
+
+ to run:
+ clang++ -o WikiSort.x WikiSort.cpp -O3
+ (or replace 'clang++' with 'g++')
+ ./WikiSort.x
+***********************************************************/
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <ctime>
+#include <cstdint>   // uint8_t (sorting-network order tags)
+#include <iterator>  // std::iterator_traits
+
+extern "C" {
+#include "sorters.h"
+}
+
+// record the number of comparisons and assignments
+// note that this reduces WikiSort's performance when enabled
+#define PROFILE false
+
+// verify that WikiSort is actually correct
+// (this also reduces performance slightly)
+#define VERIFY false
+
+// simulate comparisons that have a bit more overhead than just an inlined (int < int)
+// (so we can tell whether reducing the number of comparisons was worth the added complexity)
+#define SLOW_COMPARISONS false
+
+// if true, test against std::__inplace_stable_sort() rather than std::stable_sort()
+#define TEST_INPLACE false
+
+// whether to give WikiSort a full-size cache, to see how it performs when given more memory
+#define DYNAMIC_CACHE false
+
+
+#if PROFILE
+	// globals for counting how many comparisons and assignments each sorting algorithm performs
+ long comparisons, assignments;
+#endif
+
+// structure to represent ranges within the array
+class Range {
+public:
+ size_t start;
+ size_t end;
+
+ Range() {}
+ Range(size_t start, size_t end) : start(start), end(end) {}
+ size_t length() const { return end - start; }
+};
+
+// toolbox functions used by the sorter
+
+// 63 -> 32, 64 -> 64, etc.
+// this comes from Hacker's Delight
+size_t FloorPowerOfTwo (const size_t value) {
+ size_t x = value;
+ x = x | (x >> 1);
+ x = x | (x >> 2);
+ x = x | (x >> 4);
+ x = x | (x >> 8);
+ x = x | (x >> 16);
+#if __LP64__
+ x = x | (x >> 32);
+#endif
+ return x - (x >> 1);
+}
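+
+// a hypothetical alternative sketch (not part of WikiSort), assuming a GCC/Clang
+// toolchain: the same floor power of two can be derived from the leading-zero count
+#if defined(__GNUC__)
+static size_t FloorPowerOfTwoBuiltin (const size_t value) {
+	// __builtin_clzll counts leading zero bits of a 64-bit value (undefined for 0);
+	// the caller in this file passes value >= 4 (Sort returns early for smaller arrays)
+	return (size_t)1 << (63 - __builtin_clzll((unsigned long long)value));
+}
+#endif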
+
+// find the index of the first value within the range that is equal to 'value'
+template <typename T, typename Comparison>
+size_t BinaryFirst(const T array[], const T & value, const Range & range, const Comparison compare) {
+ return std::lower_bound(&array[range.start], &array[range.end], value, compare) - &array[0];
+}
+
+// find the index of the last value within the range that is equal to 'value', plus 1
+template <typename T, typename Comparison>
+size_t BinaryLast(const T array[], const T & value, const Range & range, const Comparison compare) {
+ return std::upper_bound(&array[range.start], &array[range.end], value, compare) - &array[0];
+}
+
+// combine a linear search with a binary search to reduce the number of comparisons in situations
+// where we have some idea as to how many unique values there are and where the next value might be
+template <typename T, typename Comparison>
+size_t FindFirstForward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) {
+ if (range.length() == 0) return range.start;
+ size_t index, skip = std::max(range.length()/unique, (size_t)1);
+
+ for (index = range.start + skip; compare(array[index - 1], value); index += skip)
+ if (index >= range.end - skip)
+ return BinaryFirst(array, value, Range(index, range.end), compare);
+
+ return BinaryFirst(array, value, Range(index - skip, index), compare);
+}
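+
+// a worked sketch of the combined search above (illustrative only): searching for
+// value 3 in { 1, 2, 2, 3, 3, 5 } with unique = 3 gives skip = 6/3 = 2, so the loop
+// checks array[index - 1] at index = 2, then index = 4, where array[3] == 3 stops
+// the galloping; BinaryFirst then searches the window [2, 4) and returns index 3,
+// the first occurrence of the value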
+
+template <typename T, typename Comparison>
+size_t FindLastForward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) {
+ if (range.length() == 0) return range.start;
+ size_t index, skip = std::max(range.length()/unique, (size_t)1);
+
+ for (index = range.start + skip; !compare(value, array[index - 1]); index += skip)
+ if (index >= range.end - skip)
+ return BinaryLast(array, value, Range(index, range.end), compare);
+
+ return BinaryLast(array, value, Range(index - skip, index), compare);
+}
+
+template <typename T, typename Comparison>
+size_t FindFirstBackward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) {
+ if (range.length() == 0) return range.start;
+ size_t index, skip = std::max(range.length()/unique, (size_t)1);
+
+ for (index = range.end - skip; index > range.start && !compare(array[index - 1], value); index -= skip)
+ if (index < range.start + skip)
+ return BinaryFirst(array, value, Range(range.start, index), compare);
+
+ return BinaryFirst(array, value, Range(index, index + skip), compare);
+}
+
+template <typename T, typename Comparison>
+size_t FindLastBackward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) {
+ if (range.length() == 0) return range.start;
+ size_t index, skip = std::max(range.length()/unique, (size_t)1);
+
+ for (index = range.end - skip; index > range.start && compare(value, array[index - 1]); index -= skip)
+ if (index < range.start + skip)
+ return BinaryLast(array, value, Range(range.start, index), compare);
+
+ return BinaryLast(array, value, Range(index, index + skip), compare);
+}
+
+// n^2 sorting algorithm used to sort tiny chunks of the full array
+template <typename T, typename Comparison>
+void InsertionSort(T array[], const Range & range, const Comparison compare) {
+ for (size_t j, i = range.start + 1; i < range.end; i++) {
+ const T temp = array[i];
+ for (j = i; j > range.start && compare(temp, array[j - 1]); j--)
+ array[j] = array[j - 1];
+ array[j] = temp;
+ }
+}
+
+// reverse a range of values within the array
+template <typename T>
+void Reverse(T array[], const Range & range) {
+ std::reverse(&array[range.start], &array[range.end]);
+}
+
+// swap a series of values in the array
+template <typename T>
+void BlockSwap(T array[], const size_t start1, const size_t start2, const size_t block_size) {
+ std::swap_ranges(&array[start1], &array[start1 + block_size], &array[start2]);
+}
+
+// rotate the values in an array ([0 1 2 3] becomes [1 2 3 0] if we rotate by 1)
+// this assumes that 0 <= amount <= range.length()
+template <typename T>
+void Rotate(T array[], size_t amount, Range range) {
+ std::rotate(&array[range.start], &array[range.start + amount], &array[range.end]);
+}
+
+namespace Wiki {
+ // merge two ranges from one array and save the results into a different array
+ template <typename T, typename Comparison>
+ void MergeInto(T from[], const Range & A, const Range & B, const Comparison compare, T into[]) {
+ T *A_index = &from[A.start], *B_index = &from[B.start];
+ T *A_last = &from[A.end], *B_last = &from[B.end];
+ T *insert_index = &into[0];
+
+ while (true) {
+ if (!compare(*B_index, *A_index)) {
+ *insert_index = *A_index;
+ A_index++;
+ insert_index++;
+ if (A_index == A_last) {
+ // copy the remainder of B into the final array
+ std::copy(B_index, B_last, insert_index);
+ break;
+ }
+ } else {
+ *insert_index = *B_index;
+ B_index++;
+ insert_index++;
+ if (B_index == B_last) {
+ // copy the remainder of A into the final array
+ std::copy(A_index, A_last, insert_index);
+ break;
+ }
+ }
+ }
+ }
+
+ // merge operation using an external buffer
+ template <typename T, typename Comparison>
+ void MergeExternal(T array[], const Range & A, const Range & B, const Comparison compare, T cache[]) {
+ // A fits into the cache, so use that instead of the internal buffer
+ T *A_index = &cache[0], *B_index = &array[B.start];
+ T *A_last = &cache[A.length()], *B_last = &array[B.end];
+ T *insert_index = &array[A.start];
+
+ if (B.length() > 0 && A.length() > 0) {
+ while (true) {
+ if (!compare(*B_index, *A_index)) {
+ *insert_index = *A_index;
+ A_index++;
+ insert_index++;
+ if (A_index == A_last) break;
+ } else {
+ *insert_index = *B_index;
+ B_index++;
+ insert_index++;
+ if (B_index == B_last) break;
+ }
+ }
+ }
+
+ // copy the remainder of A into the final array
+ std::copy(A_index, A_last, insert_index);
+ }
+
+ // merge operation using an internal buffer
+ template <typename T, typename Comparison>
+ void MergeInternal(T array[], const Range & A, const Range & B, const Comparison compare, const Range & buffer) {
+ // whenever we find a value to add to the final array, swap it with the value that's already in that spot
+ // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order
+ T *A_index = &array[buffer.start], *B_index = &array[B.start];
+ T *A_last = &array[buffer.start + A.length()], *B_last = &array[B.end];
+ T *insert_index = &array[A.start];
+
+ if (B.length() > 0 && A.length() > 0) {
+ while (true) {
+ if (!compare(*B_index, *A_index)) {
+ std::swap(*insert_index, *A_index);
+ A_index++;
+ insert_index++;
+ if (A_index == A_last) break;
+ } else {
+ std::swap(*insert_index, *B_index);
+ B_index++;
+ insert_index++;
+ if (B_index == B_last) break;
+ }
+ }
+ }
+
+ // BlockSwap
+ std::swap_ranges(A_index, A_last, insert_index);
+ }
+
+ // merge operation without a buffer
+ template <typename T, typename Comparison>
+ void MergeInPlace(T array[], Range A, Range B, const Comparison compare) {
+ if (A.length() == 0 || B.length() == 0) return;
+
+ /*
+ this just repeatedly binary searches into B and rotates A into position.
+ the paper suggests using the 'rotation-based Hwang and Lin algorithm' here,
+ but I decided to stick with this because it had better situational performance
+
+ (Hwang and Lin is designed for merging subarrays of very different sizes,
+ but WikiSort almost always uses subarrays that are roughly the same size)
+
+ normally this is incredibly suboptimal, but this function is only called
+ when none of the A or B blocks in any subarray contained 2√A unique values,
+ which places a hard limit on the number of times this will ACTUALLY need
+ to binary search and rotate.
+
+ according to my analysis the worst case is √A rotations performed on √A items
+ once the constant factors are removed, which ends up being O(n)
+
+ again, this is NOT a general-purpose solution – it only works well in this case!
+ kind of like how the O(n^2) insertion sort is used in some places
+ */
+
+ while (true) {
+ // find the first place in B where the first item in A needs to be inserted
+ size_t mid = BinaryFirst(array, array[A.start], B, compare);
+
+ // rotate A into place
+ size_t amount = mid - A.end;
+ Rotate(array, A.length(), Range(A.start, mid));
+ if (B.end == mid) break;
+
+ // calculate the new A and B ranges
+ B.start = mid;
+ A = Range(A.start + amount, B.start);
+ A.start = BinaryLast(array, array[A.start], A, compare);
+ if (A.length() == 0) break;
+ }
+ }
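+
+	// a short worked sketch of the rotation-based merge above (illustrative only),
+	// for adjacent ranges A = { 4, 7 } and B = { 1, 2, 5, 9 }:
+	//   1. BinaryFirst places A's first value (4) just before the 5, so rotating
+	//      [4 7 1 2] by |A| = 2 yields [1 2 4 7 5 9]
+	//   2. the new A is what remains after the values equal to 4, i.e. { 7 };
+	//      rotating [7 5] gives [1 2 4 5 7 9]
+	//   3. the remaining A range is now empty, so the merge terminates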
+
+ // calculate how to scale the index value to the range within the array
+ // the bottom-up merge sort only operates on values that are powers of two,
+ // so scale down to that power of two, then use a fraction to scale back again
+ class Iterator {
+ size_t size, power_of_two;
+ size_t decimal, numerator, denominator;
+ size_t decimal_step, numerator_step;
+
+ public:
+ Iterator(size_t size2, size_t min_level) {
+ size = size2;
+ power_of_two = FloorPowerOfTwo(size);
+ denominator = power_of_two/min_level;
+ numerator_step = size % denominator;
+ decimal_step = size/denominator;
+ begin();
+ }
+
+ void begin() {
+ numerator = decimal = 0;
+ }
+
+ Range nextRange() {
+ size_t start = decimal;
+
+ decimal += decimal_step;
+ numerator += numerator_step;
+ if (numerator >= denominator) {
+ numerator -= denominator;
+ decimal++;
+ }
+
+ return Range(start, decimal);
+ }
+
+ bool finished() {
+ return (decimal >= size);
+ }
+
+ bool nextLevel() {
+ decimal_step += decimal_step;
+ numerator_step += numerator_step;
+ if (numerator_step >= denominator) {
+ numerator_step -= denominator;
+ decimal_step++;
+ }
+
+ return (decimal_step < size);
+ }
+
+ size_t length() {
+ return decimal_step;
+ }
+ };
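+
+	// a worked sketch of the fractional scaling above (illustrative only): for
+	// Iterator(100, 4), power_of_two = 64, denominator = 16, decimal_step = 6 and
+	// numerator_step = 4, so the first level yields 16 ranges of length 6 or 7 that
+	// exactly cover the 100 items; each nextLevel() call doubles the step, giving
+	// ranges of roughly 12, 25, and 50 on the following merge levels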
+
+#if DYNAMIC_CACHE
+ // use a class so the memory for the cache is freed when the object goes out of scope,
+ // regardless of whether exceptions were thrown (only needed in the C++ version)
+ template <typename T>
+ class Cache {
+ public:
+ T *cache;
+ size_t cache_size;
+
+ ~Cache() {
+ if (cache) delete[] cache;
+ }
+
+ Cache(size_t size) {
+ // good choices for the cache size are:
+ // (size + 1)/2 – turns into a full-speed standard merge sort since everything fits into the cache
+ cache_size = (size + 1)/2;
+ cache = new (std::nothrow) T[cache_size];
+ if (cache) return;
+
+ // sqrt((size + 1)/2) + 1 – this will be the size of the A blocks at the largest level of merges,
+ // so a buffer of this size would allow it to skip using internal or in-place merges for anything
+ cache_size = sqrt(cache_size) + 1;
+ cache = new (std::nothrow) T[cache_size];
+ if (cache) return;
+
+ // 512 – chosen from careful testing as a good balance between fixed-size memory use and run time
+ if (cache_size > 512) {
+ cache_size = 512;
+ cache = new (std::nothrow) T[cache_size];
+ if (cache) return;
+ }
+
+ // 0 – if the system simply cannot allocate any extra memory whatsoever, no memory works just fine
+ cache_size = 0;
+ }
+ };
+#endif
+
+ // bottom-up merge sort combined with an in-place merge algorithm for O(1) memory use
+ template <typename Iterator, typename Comparison>
+ void Sort(Iterator first, Iterator last, const Comparison compare) {
+ // map first and last to a C-style array, so we don't have to change the rest of the code
+ // (bit of a nasty hack, but it's good enough for now...)
+ typedef typename std::iterator_traits<Iterator>::value_type T;
+ const size_t size = last - first;
+ T *array = &first[0];
+
+ // if the array is of size 0, 1, 2, or 3, just sort them like so:
+ if (size < 4) {
+ if (size == 3) {
+ // hard-coded insertion sort
+ if (compare(array[1], array[0])) std::swap(array[0], array[1]);
+ if (compare(array[2], array[1])) {
+ std::swap(array[1], array[2]);
+ if (compare(array[1], array[0])) std::swap(array[0], array[1]);
+ }
+ } else if (size == 2) {
+ // swap the items if they're out of order
+ if (compare(array[1], array[0])) std::swap(array[0], array[1]);
+ }
+
+ return;
+ }
+
+ // sort groups of 4-8 items at a time using an unstable sorting network,
+ // but keep track of the original item orders to force it to be stable
+ // http://pages.ripco.net/~jgamble/nw.html
+ Wiki::Iterator iterator (size, 4);
+ while (!iterator.finished()) {
+ uint8_t order[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+ Range range = iterator.nextRange();
+
+ #define SWAP(x, y) \
+ if (compare(array[range.start + y], array[range.start + x]) || \
+ (order[x] > order[y] && !compare(array[range.start + x], array[range.start + y]))) { \
+ std::swap(array[range.start + x], array[range.start + y]); std::swap(order[x], order[y]); }
+
+ if (range.length() == 8) {
+ SWAP(0, 1); SWAP(2, 3); SWAP(4, 5); SWAP(6, 7);
+ SWAP(0, 2); SWAP(1, 3); SWAP(4, 6); SWAP(5, 7);
+ SWAP(1, 2); SWAP(5, 6); SWAP(0, 4); SWAP(3, 7);
+ SWAP(1, 5); SWAP(2, 6);
+ SWAP(1, 4); SWAP(3, 6);
+ SWAP(2, 4); SWAP(3, 5);
+ SWAP(3, 4);
+
+ } else if (range.length() == 7) {
+ SWAP(1, 2); SWAP(3, 4); SWAP(5, 6);
+ SWAP(0, 2); SWAP(3, 5); SWAP(4, 6);
+ SWAP(0, 1); SWAP(4, 5); SWAP(2, 6);
+ SWAP(0, 4); SWAP(1, 5);
+ SWAP(0, 3); SWAP(2, 5);
+ SWAP(1, 3); SWAP(2, 4);
+ SWAP(2, 3);
+
+ } else if (range.length() == 6) {
+ SWAP(1, 2); SWAP(4, 5);
+ SWAP(0, 2); SWAP(3, 5);
+ SWAP(0, 1); SWAP(3, 4); SWAP(2, 5);
+ SWAP(0, 3); SWAP(1, 4);
+ SWAP(2, 4); SWAP(1, 3);
+ SWAP(2, 3);
+
+ } else if (range.length() == 5) {
+ SWAP(0, 1); SWAP(3, 4);
+ SWAP(2, 4);
+ SWAP(2, 3); SWAP(1, 4);
+ SWAP(0, 3);
+ SWAP(0, 2); SWAP(1, 3);
+ SWAP(1, 2);
+
+ } else if (range.length() == 4) {
+ SWAP(0, 1); SWAP(2, 3);
+ SWAP(0, 2); SWAP(1, 3);
+ SWAP(1, 2);
+ }
+ }
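+
+		// note on the stability trick above (illustrative only): when two items compare
+		// equal, SWAP only exchanges them if order[x] > order[y], i.e. if an earlier swap
+		// in the network moved the later-input item in front of the earlier one, so equal
+		// items always end up in their original input order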
+ if (size < 8) return;
+
+ // use a small cache to speed up some of the operations
+ #if DYNAMIC_CACHE
+ Cache<T> cache_obj (size);
+ T *cache = cache_obj.cache;
+ const size_t cache_size = cache_obj.cache_size;
+ #else
+ // since the cache size is fixed, it's still O(1) memory!
+ // just keep in mind that making it too small ruins the point (nothing will fit into it),
+ // and making it too large also ruins the point (so much for "low memory"!)
+ // removing the cache entirely still gives 75% of the performance of a standard merge
+ const size_t cache_size = 512;
+ T cache[cache_size];
+ #endif
+
+ // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc.
+ while (true) {
+ // if every A and B block will fit into the cache, use a special branch specifically for merging with the cache
+ // (we use < rather than <= since the block size might be one more than iterator.length())
+ if (iterator.length() < cache_size) {
+
+ // if four subarrays fit into the cache, it's faster to merge both pairs of subarrays into the cache,
+ // then merge the two merged subarrays from the cache back into the original array
+ if ((iterator.length() + 1) * 4 <= cache_size && iterator.length() * 4 <= size) {
+ iterator.begin();
+ while (!iterator.finished()) {
+ // merge A1 and B1 into the cache
+ Range A1 = iterator.nextRange();
+ Range B1 = iterator.nextRange();
+ Range A2 = iterator.nextRange();
+ Range B2 = iterator.nextRange();
+
+ if (compare(array[B1.end - 1], array[A1.start])) {
+ // the two ranges are in reverse order, so copy them in reverse order into the cache
+ std::copy(&array[A1.start], &array[A1.end], &cache[B1.length()]);
+ std::copy(&array[B1.start], &array[B1.end], &cache[0]);
+ } else if (compare(array[B1.start], array[A1.end - 1])) {
+ // these two ranges weren't already in order, so merge them into the cache
+ MergeInto(array, A1, B1, compare, &cache[0]);
+ } else {
+ // if A1, B1, A2, and B2 are all in order, skip doing anything else
+ if (!compare(array[B2.start], array[A2.end - 1]) && !compare(array[A2.start], array[B1.end - 1])) continue;
+
+ // copy A1 and B1 into the cache in the same order
+ std::copy(&array[A1.start], &array[A1.end], &cache[0]);
+ std::copy(&array[B1.start], &array[B1.end], &cache[A1.length()]);
+ }
+ A1 = Range(A1.start, B1.end);
+
+ // merge A2 and B2 into the cache
+ if (compare(array[B2.end - 1], array[A2.start])) {
+ // the two ranges are in reverse order, so copy them in reverse order into the cache
+ std::copy(&array[A2.start], &array[A2.end], &cache[A1.length() + B2.length()]);
+ std::copy(&array[B2.start], &array[B2.end], &cache[A1.length()]);
+ } else if (compare(array[B2.start], array[A2.end - 1])) {
+ // these two ranges weren't already in order, so merge them into the cache
+ MergeInto(array, A2, B2, compare, &cache[A1.length()]);
+ } else {
+ // copy A2 and B2 into the cache in the same order
+ std::copy(&array[A2.start], &array[A2.end], &cache[A1.length()]);
+ std::copy(&array[B2.start], &array[B2.end], &cache[A1.length() + A2.length()]);
+ }
+ A2 = Range(A2.start, B2.end);
+
+ // merge A1 and A2 from the cache into the array
+ Range A3 = Range(0, A1.length());
+ Range B3 = Range(A1.length(), A1.length() + A2.length());
+
+ if (compare(cache[B3.end - 1], cache[A3.start])) {
+ // the two ranges are in reverse order, so copy them in reverse order into the array
+ std::copy(&cache[A3.start], &cache[A3.end], &array[A1.start + A2.length()]);
+ std::copy(&cache[B3.start], &cache[B3.end], &array[A1.start]);
+ } else if (compare(cache[B3.start], cache[A3.end - 1])) {
+ // these two ranges weren't already in order, so merge them back into the array
+ MergeInto(cache, A3, B3, compare, &array[A1.start]);
+ } else {
+ // copy A3 and B3 into the array in the same order
+ std::copy(&cache[A3.start], &cache[A3.end], &array[A1.start]);
+ std::copy(&cache[B3.start], &cache[B3.end], &array[A1.start + A1.length()]);
+ }
+ }
+
+ // we merged two levels at the same time, so we're done with this level already
+ // (iterator.nextLevel() is called again at the bottom of this outer merge loop)
+ iterator.nextLevel();
+
+ } else {
+ iterator.begin();
+ while (!iterator.finished()) {
+ Range A = iterator.nextRange();
+ Range B = iterator.nextRange();
+
+ if (compare(array[B.end - 1], array[A.start])) {
+ // the two ranges are in reverse order, so a simple rotation should fix it
+ Rotate(array, A.length(), Range(A.start, B.end));
+ } else if (compare(array[B.start], array[A.end - 1])) {
+ // these two ranges weren't already in order, so we'll need to merge them!
+ std::copy(&array[A.start], &array[A.end], &cache[0]);
+ MergeExternal(array, A, B, compare, cache);
+ }
+ }
+ }
+ } else {
+ // this is where the in-place merge logic starts!
+ // 1. pull out two internal buffers each containing √A unique values
+ // 1a. adjust block_size and buffer_size if we couldn't find enough unique values
+ // 2. loop over the A and B subarrays within this level of the merge sort
+ // 3. break A and B into blocks of size 'block_size'
+ // 4. "tag" each of the A blocks with values from the first internal buffer
+ // 5. roll the A blocks through the B blocks and drop/rotate them where they belong
+ // 6. merge each A block with any B values that follow, using the cache or the second internal buffer
+ // 7. sort the second internal buffer if it exists
+ // 8. redistribute the two internal buffers back into the array
+
+ size_t block_size = sqrt(iterator.length());
+ size_t buffer_size = iterator.length()/block_size + 1;
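+
+				// e.g. (illustrative only) a 64-item level gives block_size = 8 and
+				// buffer_size = 9: roughly √A-sized blocks, with one spare buffer slot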
+
+ // as an optimization, we really only need to pull out the internal buffers once for each level of merges
+				// after that we can reuse the same buffers over and over, then redistribute them when we're finished with this level
+ Range buffer1 = Range(0, 0), buffer2 = Range(0, 0);
+ size_t index, last, count, pull_index = 0;
+ struct { size_t from, to, count; Range range; } pull[2];
+ pull[0].from = pull[0].to = pull[0].count = 0; pull[0].range = Range(0, 0);
+ pull[1].from = pull[1].to = pull[1].count = 0; pull[1].range = Range(0, 0);
+
+ // find two internal buffers of size 'buffer_size' each
+ // let's try finding both buffers at the same time from a single A or B subarray
+ size_t find = buffer_size + buffer_size;
+ bool find_separately = false;
+
+ if (block_size <= cache_size) {
+ // if every A block fits into the cache then we won't need the second internal buffer,
+ // so we really only need to find 'buffer_size' unique values
+ find = buffer_size;
+ } else if (find > iterator.length()) {
+ // we can't fit both buffers into the same A or B subarray, so find two buffers separately
+ find = buffer_size;
+ find_separately = true;
+ }
+
+ // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each),
+ // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values,
+ // OR if we couldn't find that many unique values, we need the largest possible buffer we can get
+
+ // in the case where it couldn't find a single buffer of at least √A unique values,
+ // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace)
+
+ iterator.begin();
+ while (!iterator.finished()) {
+ Range A = iterator.nextRange();
+ Range B = iterator.nextRange();
+
+ // just store information about where the values will be pulled from and to,
+ // as well as how many values there are, to create the two internal buffers
+ #define PULL(_to) \
+ pull[pull_index].range = Range(A.start, B.end); \
+ pull[pull_index].count = count; \
+ pull[pull_index].from = index; \
+ pull[pull_index].to = _to
+
+ // check A for the number of unique values we need to fill an internal buffer
+ // these values will be pulled out to the start of A
+ for (last = A.start, count = 1; count < find; last = index, count++) {
+ index = FindLastForward(array, array[last], Range(last + 1, A.end), compare, find - count);
+ if (index == A.end) break;
+ assert(index < A.end);
+ }
+ index = last;
+
+ if (count >= buffer_size) {
+ // keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer
+ PULL(A.start);
+ pull_index = 1;
+
+ if (count == buffer_size + buffer_size) {
+ // we were able to find a single contiguous section containing 2√A unique values,
+ // so this section can be used to contain both of the internal buffers we'll need
+ buffer1 = Range(A.start, A.start + buffer_size);
+ buffer2 = Range(A.start + buffer_size, A.start + count);
+ break;
+ } else if (find == buffer_size + buffer_size) {
+ // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
+ // so we still need to find a second separate buffer of at least √A unique values
+ buffer1 = Range(A.start, A.start + count);
+ find = buffer_size;
+ } else if (block_size <= cache_size) {
+ // we found the first and only internal buffer that we need, so we're done!
+ buffer1 = Range(A.start, A.start + count);
+ break;
+ } else if (find_separately) {
+ // found one buffer, but now find the other one
+ buffer1 = Range(A.start, A.start + count);
+ find_separately = false;
+ } else {
+ // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
+ buffer2 = Range(A.start, A.start + count);
+ break;
+ }
+ } else if (pull_index == 0 && count > buffer1.length()) {
+ // keep track of the largest buffer we were able to find
+ buffer1 = Range(A.start, A.start + count);
+ PULL(A.start);
+ }
+
+ // check B for the number of unique values we need to fill an internal buffer
+ // these values will be pulled out to the end of B
+ for (last = B.end - 1, count = 1; count < find; last = index - 1, count++) {
+ index = FindFirstBackward(array, array[last], Range(B.start, last), compare, find - count);
+ if (index == B.start) break;
+ assert(index > B.start);
+ }
+ index = last;
+
+ if (count >= buffer_size) {
+ // keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer
+ PULL(B.end);
+ pull_index = 1;
+
+ if (count == buffer_size + buffer_size) {
+ // we were able to find a single contiguous section containing 2√A unique values,
+ // so this section can be used to contain both of the internal buffers we'll need
+ buffer1 = Range(B.end - count, B.end - buffer_size);
+ buffer2 = Range(B.end - buffer_size, B.end);
+ break;
+ } else if (find == buffer_size + buffer_size) {
+ // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
+ // so we still need to find a second separate buffer of at least √A unique values
+ buffer1 = Range(B.end - count, B.end);
+ find = buffer_size;
+ } else if (block_size <= cache_size) {
+ // we found the first and only internal buffer that we need, so we're done!
+ buffer1 = Range(B.end - count, B.end);
+ break;
+ } else if (find_separately) {
+ // found one buffer, but now find the other one
+ buffer1 = Range(B.end - count, B.end);
+ find_separately = false;
+ } else {
+ // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
+ // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
+ if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;
+
+ // we found a second buffer in a 'B' subarray containing √A unique values, so we're done!
+ buffer2 = Range(B.end - count, B.end);
+ break;
+ }
+ } else if (pull_index == 0 && count > buffer1.length()) {
+ // keep track of the largest buffer we were able to find
+ buffer1 = Range(B.end - count, B.end);
+ PULL(B.end);
+ }
+ }
+
+ // pull out the two ranges so we can use them as internal buffers
+ for (pull_index = 0; pull_index < 2; pull_index++) {
+ size_t length = pull[pull_index].count;
+
+ if (pull[pull_index].to < pull[pull_index].from) {
+ // we're pulling the values out to the left, which means the start of an A subarray
+ index = pull[pull_index].from;
+ for (count = 1; count < length; count++) {
+ index = FindFirstBackward(array, array[index - 1], Range(pull[pull_index].to, pull[pull_index].from - (count - 1)), compare, length - count);
+ Range range = Range(index + 1, pull[pull_index].from + 1);
+ Rotate(array, range.length() - count, range);
+ pull[pull_index].from = index + count;
+ }
+ } else if (pull[pull_index].to > pull[pull_index].from) {
+ // we're pulling values out to the right, which means the end of a B subarray
+ index = pull[pull_index].from + 1;
+ for (count = 1; count < length; count++) {
+ index = FindLastForward(array, array[index], Range(index, pull[pull_index].to), compare, length - count);
+ Range range = Range(pull[pull_index].from, index - 1);
+ Rotate(array, count, range);
+ pull[pull_index].from = index - 1 - count;
+ }
+ }
+ }
+
+ // adjust block_size and buffer_size based on the values we were able to pull out
+ buffer_size = buffer1.length();
+ block_size = iterator.length()/buffer_size + 1;
+
+ // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks,
+ // so this was originally here to test the math for adjusting block_size above
+ //assert((iterator.length() + 1)/block_size <= buffer_size);
+
+ // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort!
+ iterator.begin();
+ while (!iterator.finished()) {
+ Range A = iterator.nextRange();
+ Range B = iterator.nextRange();
+
+ // remove any parts of A or B that are being used by the internal buffers
+ size_t start = A.start;
+ if (start == pull[0].range.start) {
+ if (pull[0].from > pull[0].to) {
+ A.start += pull[0].count;
+
+ // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge
+ // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4,
+ // which also only happens when cache_size is small or 0 since it'd otherwise use MergeExternal
+ if (A.length() == 0) continue;
+ } else if (pull[0].from < pull[0].to) {
+ B.end -= pull[0].count;
+ if (B.length() == 0) continue;
+ }
+ }
+ if (start == pull[1].range.start) {
+ if (pull[1].from > pull[1].to) {
+ A.start += pull[1].count;
+ if (A.length() == 0) continue;
+ } else if (pull[1].from < pull[1].to) {
+ B.end -= pull[1].count;
+ if (B.length() == 0) continue;
+ }
+ }
+
+ if (compare(array[B.end - 1], array[A.start])) {
+ // the two ranges are in reverse order, so a simple rotation should fix it
+ Rotate(array, A.length(), Range(A.start, B.end));
+ } else if (compare(array[A.end], array[A.end - 1])) {
+ // these two ranges weren't already in order, so we'll need to merge them!
+
+ // break the remainder of A into blocks. firstA is the uneven-sized first A block
+ Range blockA = Range(A.start, A.end);
+ Range firstA = Range(A.start, A.start + blockA.length() % block_size);
+
+ // swap the first value of each A block with the values in buffer1
+ for (size_t indexA = buffer1.start, index = firstA.end; index < blockA.end; indexA++, index += block_size)
+ std::swap(array[indexA], array[index]);
+
+ // start rolling the A blocks through the B blocks!
+ // when we leave an A block behind we'll need to merge the previous A block with any B blocks that follow it, so track that information as well
+ Range lastA = firstA;
+ Range lastB = Range(0, 0);
+ Range blockB = Range(B.start, B.start + std::min(block_size, B.length()));
+ blockA.start += firstA.length();
+ size_t indexA = buffer1.start;
+
+ // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it
+ // otherwise, if the second buffer is available, block swap the contents into that
+ if (lastA.length() <= cache_size)
+ std::copy(&array[lastA.start], &array[lastA.end], &cache[0]);
+ else if (buffer2.length() > 0)
+ BlockSwap(array, lastA.start, buffer2.start, lastA.length());
+
+ if (blockA.length() > 0) {
+ while (true) {
+ // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block,
+ // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks.
+ if ((lastB.length() > 0 && !compare(array[lastB.end - 1], array[indexA])) || blockB.length() == 0) {
+ // figure out where to split the previous B block, and rotate it at the split
+ size_t B_split = BinaryFirst(array, array[indexA], lastB, compare);
+ size_t B_remaining = lastB.end - B_split;
+
+ // swap the minimum A block to the beginning of the rolling A blocks
+ size_t minA = blockA.start;
+ for (size_t findA = minA + block_size; findA < blockA.end; findA += block_size)
+ if (compare(array[findA], array[minA]))
+ minA = findA;
+ BlockSwap(array, blockA.start, minA, block_size);
+
+ // swap the first item of the previous A block back with its original value, which is stored in buffer1
+ std::swap(array[blockA.start], array[indexA]);
+ indexA++;
+
+ // locally merge the previous A block with the B values that follow it
+ // if lastA fits into the external cache we'll use that (with MergeExternal),
+ // or if the second internal buffer exists we'll use that (with MergeInternal),
+ // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)
+ if (lastA.length() <= cache_size)
+ MergeExternal(array, lastA, Range(lastA.end, B_split), compare, cache);
+ else if (buffer2.length() > 0)
+ MergeInternal(array, lastA, Range(lastA.end, B_split), compare, buffer2);
+ else
+ MergeInPlace(array, lastA, Range(lastA.end, B_split), compare);
+
+ if (buffer2.length() > 0 || block_size <= cache_size) {
+ // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway
+ if (block_size <= cache_size)
+ std::copy(&array[blockA.start], &array[blockA.start + block_size], cache);
+ else
+ BlockSwap(array, blockA.start, buffer2.start, block_size);
+
+ // this is equivalent to rotating, but faster
+ // the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it
+ // either way we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs
+ BlockSwap(array, B_split, blockA.start + block_size - B_remaining, B_remaining);
+ } else {
+ // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation
+ Rotate(array, blockA.start - B_split, Range(B_split, blockA.start + block_size));
+ }
+
+ // update the range for the remaining A blocks, and the range remaining from the B block after it was split
+ lastA = Range(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
+ lastB = Range(lastA.end, lastA.end + B_remaining);
+
+ // if there are no more A blocks remaining, this step is finished!
+ blockA.start += block_size;
+ if (blockA.length() == 0)
+ break;
+
+ } else if (blockB.length() < block_size) {
+ // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation
+ Rotate(array, blockB.start - blockA.start, Range(blockA.start, blockB.end));
+
+ lastB = Range(blockA.start, blockA.start + blockB.length());
+ blockA.start += blockB.length();
+ blockA.end += blockB.length();
+ blockB.end = blockB.start;
+ } else {
+ // roll the leftmost A block to the end by swapping it with the next B block
+ BlockSwap(array, blockA.start, blockB.start, block_size);
+ lastB = Range(blockA.start, blockA.start + block_size);
+
+ blockA.start += block_size;
+ blockA.end += block_size;
+ blockB.start += block_size;
+
+ if (blockB.end > B.end - block_size) blockB.end = B.end;
+ else blockB.end += block_size;
+ }
+ }
+ }
+
+ // merge the last A block with the remaining B values
+ if (lastA.length() <= cache_size)
+ MergeExternal(array, lastA, Range(lastA.end, B.end), compare, cache);
+ else if (buffer2.length() > 0)
+ MergeInternal(array, lastA, Range(lastA.end, B.end), compare, buffer2);
+ else
+ MergeInPlace(array, lastA, Range(lastA.end, B.end), compare);
+ }
+ }
+
+ // when we're finished with this merge step we should have the one or two internal buffers left over, where the second buffer is all jumbled up
+ // insertion sort the second buffer, then redistribute the buffers back into the array using the opposite process used for creating the buffer
+
+ // while an unstable sort like std::sort could be applied here, in benchmarks it was consistently slightly slower than a simple insertion sort,
+ // even for tens of millions of items. this may be because insertion sort is quite fast when the data is already somewhat sorted, like it is here
+ InsertionSort(array, buffer2, compare);
+
+ for (pull_index = 0; pull_index < 2; pull_index++) {
+ size_t unique = pull[pull_index].count * 2;
+ if (pull[pull_index].from > pull[pull_index].to) {
+ // the values were pulled out to the left, so redistribute them back to the right
+ Range buffer = Range(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
+ while (buffer.length() > 0) {
+ index = FindFirstForward(array, array[buffer.start], Range(buffer.end, pull[pull_index].range.end), compare, unique);
+ size_t amount = index - buffer.end;
+ Rotate(array, buffer.length(), Range(buffer.start, index));
+ buffer.start += (amount + 1);
+ buffer.end += amount;
+ unique -= 2;
+ }
+ } else if (pull[pull_index].from < pull[pull_index].to) {
+ // the values were pulled out to the right, so redistribute them back to the left
+ Range buffer = Range(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
+ while (buffer.length() > 0) {
+ index = FindLastBackward(array, array[buffer.end - 1], Range(pull[pull_index].range.start, buffer.start), compare, unique);
+ size_t amount = buffer.start - index;
+ Rotate(array, amount, Range(index, buffer.end));
+ buffer.start -= amount;
+ buffer.end -= (amount + 1);
+ unique -= 2;
+ }
+ }
+ }
+ }
+
+ // double the size of each A and B subarray that will be merged in the next level
+ if (!iterator.nextLevel()) break;
+ }
+ }
+}
+
+/* wrap the reference implementation above in a qsort-compatible interface.
+ * it's not thread safe, but it's good enough for testing performance. */
+
+extern "C" {
+
+static cmpfun innercmp;
+
+static bool cmpwrapper(int a, int b)
+{
+ return innercmp(&a, &b) < 0;
+}
+
+/* we assume the input is an int[] */
+void wikisort_ref(void *base, size_t nmel, size_t width, cmpfun cmp)
+{
+ int *begin = (int*) base;
+ int *end = begin + nmel;
+ innercmp = cmp;
+ Wiki::Sort(begin, end, cmpwrapper);
+}
+
+}
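+
+/* illustrative usage sketch (hypothetical; in this repo the function is normally
+ * reached through the sorters[] table in sorters.c):
+ *
+ *   static int intcmp(const void *a, const void *b) {
+ *       int x = *(const int *)a, y = *(const int *)b;
+ *       return (x > y) - (x < y);
+ *   }
+ *
+ *   int data[] = { 5, 3, 8, 1 };
+ *   wikisort_ref(data, 4, sizeof(int), intcmp);
+ *   assert_sorted(data, 4);
+ */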
+