From dbe88638530aacb4cffccfb0b58fa934fc78dd6b Mon Sep 17 00:00:00 2001 From: Bobby Bingham Date: Fri, 4 Jul 2014 10:45:37 -0500 Subject: Add the C++ reference wikisort implementation --- wikisort_ref.cc | 969 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 969 insertions(+) create mode 100644 wikisort_ref.cc (limited to 'wikisort_ref.cc') diff --git a/wikisort_ref.cc b/wikisort_ref.cc new file mode 100644 index 0000000..f6b970b --- /dev/null +++ b/wikisort_ref.cc @@ -0,0 +1,969 @@ +/*********************************************************** + WikiSort: a public domain implementation of "Block Sort" + https://github.com/BonzaiThePenguin/WikiSort + + to run: + clang++ -o WikiSort.x WikiSort.cpp -O3 + (or replace 'clang++' with 'g++') + ./WikiSort.x +***********************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "sorters.h" +} + +// record the number of comparisons and assignments +// note that this reduces WikiSort's performance when enabled +#define PROFILE false + +// verify that WikiSort is actually correct +// (this also reduces performance slightly) +#define VERIFY false + +// simulate comparisons that have a bit more overhead than just an inlined (int < int) +// (so we can tell whether reducing the number of comparisons was worth the added complexity) +#define SLOW_COMPARISONS false + +// if true, test against std::__inplace_stable_sort() rather than std::stable_sort() +#define TEST_INPLACE false + +// whether to give WikiSort a full-size cache, to see how it performs when given more memory +#define DYNAMIC_CACHE false + + +#if PROFILE + // global for testing how many comparisons are performed for each sorting algorithm + long comparisons, assignments; +#endif + +// structure to represent ranges within the array +class Range { +public: + size_t start; + size_t end; + + Range() {} + Range(size_t start, size_t end) : start(start), end(end) {} 
+ size_t length() const { return end - start; } +}; + +// toolbox functions used by the sorter + +// 63 -> 32, 64 -> 64, etc. +// this comes from Hacker's Delight +size_t FloorPowerOfTwo (const size_t value) { + size_t x = value; + x = x | (x >> 1); + x = x | (x >> 2); + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >> 16); +#if __LP64__ + x = x | (x >> 32); +#endif + return x - (x >> 1); +} + +// find the index of the first value within the range that is equal to array[index] +template +size_t BinaryFirst(const T array[], const T & value, const Range & range, const Comparison compare) { + return std::lower_bound(&array[range.start], &array[range.end], value, compare) - &array[0]; +} + +// find the index of the last value within the range that is equal to array[index], plus 1 +template +size_t BinaryLast(const T array[], const T & value, const Range & range, const Comparison compare) { + return std::upper_bound(&array[range.start], &array[range.end], value, compare) - &array[0]; +} + +// combine a linear search with a binary search to reduce the number of comparisons in situations +// where have some idea as to how many unique values there are and where the next value might be +template +size_t FindFirstForward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) { + if (range.length() == 0) return range.start; + size_t index, skip = std::max(range.length()/unique, (size_t)1); + + for (index = range.start + skip; compare(array[index - 1], value); index += skip) + if (index >= range.end - skip) + return BinaryFirst(array, value, Range(index, range.end), compare); + + return BinaryFirst(array, value, Range(index - skip, index), compare); +} + +template +size_t FindLastForward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) { + if (range.length() == 0) return range.start; + size_t index, skip = std::max(range.length()/unique, (size_t)1); + + for (index = 
range.start + skip; !compare(value, array[index - 1]); index += skip) + if (index >= range.end - skip) + return BinaryLast(array, value, Range(index, range.end), compare); + + return BinaryLast(array, value, Range(index - skip, index), compare); +} + +template +size_t FindFirstBackward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) { + if (range.length() == 0) return range.start; + size_t index, skip = std::max(range.length()/unique, (size_t)1); + + for (index = range.end - skip; index > range.start && !compare(array[index - 1], value); index -= skip) + if (index < range.start + skip) + return BinaryFirst(array, value, Range(range.start, index), compare); + + return BinaryFirst(array, value, Range(index, index + skip), compare); +} + +template +size_t FindLastBackward(const T array[], const T & value, const Range & range, const Comparison compare, const size_t unique) { + if (range.length() == 0) return range.start; + size_t index, skip = std::max(range.length()/unique, (size_t)1); + + for (index = range.end - skip; index > range.start && compare(value, array[index - 1]); index -= skip) + if (index < range.start + skip) + return BinaryLast(array, value, Range(range.start, index), compare); + + return BinaryLast(array, value, Range(index, index + skip), compare); +} + +// n^2 sorting algorithm used to sort tiny chunks of the full array +template +void InsertionSort(T array[], const Range & range, const Comparison compare) { + for (size_t j, i = range.start + 1; i < range.end; i++) { + const T temp = array[i]; + for (j = i; j > range.start && compare(temp, array[j - 1]); j--) + array[j] = array[j - 1]; + array[j] = temp; + } +} + +// reverse a range of values within the array +template +void Reverse(T array[], const Range & range) { + std::reverse(&array[range.start], &array[range.end]); +} + +// swap a series of values in the array +template +void BlockSwap(T array[], const size_t start1, const size_t start2, 
const size_t block_size) { + std::swap_ranges(&array[start1], &array[start1 + block_size], &array[start2]); +} + +// rotate the values in an array ([0 1 2 3] becomes [1 2 3 0] if we rotate by 1) +// this assumes that 0 <= amount <= range.length() +template +void Rotate(T array[], size_t amount, Range range) { + std::rotate(&array[range.start], &array[range.start + amount], &array[range.end]); +} + +namespace Wiki { + // merge two ranges from one array and save the results into a different array + template + void MergeInto(T from[], const Range & A, const Range & B, const Comparison compare, T into[]) { + T *A_index = &from[A.start], *B_index = &from[B.start]; + T *A_last = &from[A.end], *B_last = &from[B.end]; + T *insert_index = &into[0]; + + while (true) { + if (!compare(*B_index, *A_index)) { + *insert_index = *A_index; + A_index++; + insert_index++; + if (A_index == A_last) { + // copy the remainder of B into the final array + std::copy(B_index, B_last, insert_index); + break; + } + } else { + *insert_index = *B_index; + B_index++; + insert_index++; + if (B_index == B_last) { + // copy the remainder of A into the final array + std::copy(A_index, A_last, insert_index); + break; + } + } + } + } + + // merge operation using an external buffer + template + void MergeExternal(T array[], const Range & A, const Range & B, const Comparison compare, T cache[]) { + // A fits into the cache, so use that instead of the internal buffer + T *A_index = &cache[0], *B_index = &array[B.start]; + T *A_last = &cache[A.length()], *B_last = &array[B.end]; + T *insert_index = &array[A.start]; + + if (B.length() > 0 && A.length() > 0) { + while (true) { + if (!compare(*B_index, *A_index)) { + *insert_index = *A_index; + A_index++; + insert_index++; + if (A_index == A_last) break; + } else { + *insert_index = *B_index; + B_index++; + insert_index++; + if (B_index == B_last) break; + } + } + } + + // copy the remainder of A into the final array + std::copy(A_index, A_last, 
insert_index); + } + + // merge operation using an internal buffer + template + void MergeInternal(T array[], const Range & A, const Range & B, const Comparison compare, const Range & buffer) { + // whenever we find a value to add to the final array, swap it with the value that's already in that spot + // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order + T *A_index = &array[buffer.start], *B_index = &array[B.start]; + T *A_last = &array[buffer.start + A.length()], *B_last = &array[B.end]; + T *insert_index = &array[A.start]; + + if (B.length() > 0 && A.length() > 0) { + while (true) { + if (!compare(*B_index, *A_index)) { + std::swap(*insert_index, *A_index); + A_index++; + insert_index++; + if (A_index == A_last) break; + } else { + std::swap(*insert_index, *B_index); + B_index++; + insert_index++; + if (B_index == B_last) break; + } + } + } + + // BlockSwap + std::swap_ranges(A_index, A_last, insert_index); + } + + // merge operation without a buffer + template + void MergeInPlace(T array[], Range A, Range B, const Comparison compare) { + if (A.length() == 0 || B.length() == 0) return; + + /* + this just repeatedly binary searches into B and rotates A into position. + the paper suggests using the 'rotation-based Hwang and Lin algorithm' here, + but I decided to stick with this because it had better situational performance + + (Hwang and Lin is designed for merging subarrays of very different sizes, + but WikiSort almost always uses subarrays that are roughly the same size) + + normally this is incredibly suboptimal, but this function is only called + when none of the A or B blocks in any subarray contained 2√A unique values, + which places a hard limit on the number of times this will ACTUALLY need + to binary search and rotate. 
+ + according to my analysis the worst case is √A rotations performed on √A items + once the constant factors are removed, which ends up being O(n) + + again, this is NOT a general-purpose solution – it only works well in this case! + kind of like how the O(n^2) insertion sort is used in some places + */ + + while (true) { + // find the first place in B where the first item in A needs to be inserted + size_t mid = BinaryFirst(array, array[A.start], B, compare); + + // rotate A into place + size_t amount = mid - A.end; + Rotate(array, A.length(), Range(A.start, mid)); + if (B.end == mid) break; + + // calculate the new A and B ranges + B.start = mid; + A = Range(A.start + amount, B.start); + A.start = BinaryLast(array, array[A.start], A, compare); + if (A.length() == 0) break; + } + } + + // calculate how to scale the index value to the range within the array + // the bottom-up merge sort only operates on values that are powers of two, + // so scale down to that power of two, then use a fraction to scale back again + class Iterator { + size_t size, power_of_two; + size_t decimal, numerator, denominator; + size_t decimal_step, numerator_step; + + public: + Iterator(size_t size2, size_t min_level) { + size = size2; + power_of_two = FloorPowerOfTwo(size); + denominator = power_of_two/min_level; + numerator_step = size % denominator; + decimal_step = size/denominator; + begin(); + } + + void begin() { + numerator = decimal = 0; + } + + Range nextRange() { + size_t start = decimal; + + decimal += decimal_step; + numerator += numerator_step; + if (numerator >= denominator) { + numerator -= denominator; + decimal++; + } + + return Range(start, decimal); + } + + bool finished() { + return (decimal >= size); + } + + bool nextLevel() { + decimal_step += decimal_step; + numerator_step += numerator_step; + if (numerator_step >= denominator) { + numerator_step -= denominator; + decimal_step++; + } + + return (decimal_step < size); + } + + size_t length() { + return decimal_step; 
+ } + }; + +#if DYNAMIC_CACHE + // use a class so the memory for the cache is freed when the object goes out of scope, + // regardless of whether exceptions were thrown (only needed in the C++ version) + template + class Cache { + public: + T *cache; + size_t cache_size; + + ~Cache() { + if (cache) delete[] cache; + } + + Cache(size_t size) { + // good choices for the cache size are: + // (size + 1)/2 – turns into a full-speed standard merge sort since everything fits into the cache + cache_size = (size + 1)/2; + cache = new (std::nothrow) T[cache_size]; + if (cache) return; + + // sqrt((size + 1)/2) + 1 – this will be the size of the A blocks at the largest level of merges, + // so a buffer of this size would allow it to skip using internal or in-place merges for anything + cache_size = sqrt(cache_size) + 1; + cache = new (std::nothrow) T[cache_size]; + if (cache) return; + + // 512 – chosen from careful testing as a good balance between fixed-size memory use and run time + if (cache_size > 512) { + cache_size = 512; + cache = new (std::nothrow) T[cache_size]; + if (cache) return; + } + + // 0 – if the system simply cannot allocate any extra memory whatsoever, no memory works just fine + cache_size = 0; + } + }; +#endif + + // bottom-up merge sort combined with an in-place merge algorithm for O(1) memory use + template + void Sort(Iterator first, Iterator last, const Comparison compare) { + // map first and last to a C-style array, so we don't have to change the rest of the code + // (bit of a nasty hack, but it's good enough for now...) 
+ typedef typename std::iterator_traits::value_type T; + const size_t size = last - first; + T *array = &first[0]; + + // if the array is of size 0, 1, 2, or 3, just sort them like so: + if (size < 4) { + if (size == 3) { + // hard-coded insertion sort + if (compare(array[1], array[0])) std::swap(array[0], array[1]); + if (compare(array[2], array[1])) { + std::swap(array[1], array[2]); + if (compare(array[1], array[0])) std::swap(array[0], array[1]); + } + } else if (size == 2) { + // swap the items if they're out of order + if (compare(array[1], array[0])) std::swap(array[0], array[1]); + } + + return; + } + + // sort groups of 4-8 items at a time using an unstable sorting network, + // but keep track of the original item orders to force it to be stable + // http://pages.ripco.net/~jgamble/nw.html + Wiki::Iterator iterator (size, 4); + while (!iterator.finished()) { + uint8_t order[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + Range range = iterator.nextRange(); + + #define SWAP(x, y) \ + if (compare(array[range.start + y], array[range.start + x]) || \ + (order[x] > order[y] && !compare(array[range.start + x], array[range.start + y]))) { \ + std::swap(array[range.start + x], array[range.start + y]); std::swap(order[x], order[y]); } + + if (range.length() == 8) { + SWAP(0, 1); SWAP(2, 3); SWAP(4, 5); SWAP(6, 7); + SWAP(0, 2); SWAP(1, 3); SWAP(4, 6); SWAP(5, 7); + SWAP(1, 2); SWAP(5, 6); SWAP(0, 4); SWAP(3, 7); + SWAP(1, 5); SWAP(2, 6); + SWAP(1, 4); SWAP(3, 6); + SWAP(2, 4); SWAP(3, 5); + SWAP(3, 4); + + } else if (range.length() == 7) { + SWAP(1, 2); SWAP(3, 4); SWAP(5, 6); + SWAP(0, 2); SWAP(3, 5); SWAP(4, 6); + SWAP(0, 1); SWAP(4, 5); SWAP(2, 6); + SWAP(0, 4); SWAP(1, 5); + SWAP(0, 3); SWAP(2, 5); + SWAP(1, 3); SWAP(2, 4); + SWAP(2, 3); + + } else if (range.length() == 6) { + SWAP(1, 2); SWAP(4, 5); + SWAP(0, 2); SWAP(3, 5); + SWAP(0, 1); SWAP(3, 4); SWAP(2, 5); + SWAP(0, 3); SWAP(1, 4); + SWAP(2, 4); SWAP(1, 3); + SWAP(2, 3); + + } else if (range.length() == 5) { + SWAP(0, 
1); SWAP(3, 4); + SWAP(2, 4); + SWAP(2, 3); SWAP(1, 4); + SWAP(0, 3); + SWAP(0, 2); SWAP(1, 3); + SWAP(1, 2); + + } else if (range.length() == 4) { + SWAP(0, 1); SWAP(2, 3); + SWAP(0, 2); SWAP(1, 3); + SWAP(1, 2); + } + } + if (size < 8) return; + + // use a small cache to speed up some of the operations + #if DYNAMIC_CACHE + Cache cache_obj (size); + T *cache = cache_obj.cache; + const size_t cache_size = cache_obj.cache_size; + #else + // since the cache size is fixed, it's still O(1) memory! + // just keep in mind that making it too small ruins the point (nothing will fit into it), + // and making it too large also ruins the point (so much for "low memory"!) + // removing the cache entirely still gives 75% of the performance of a standard merge + const size_t cache_size = 512; + T cache[cache_size]; + #endif + + // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc. + while (true) { + // if every A and B block will fit into the cache, use a special branch specifically for merging with the cache + // (we use < rather than <= since the block size might be one more than iterator.length()) + if (iterator.length() < cache_size) { + + // if four subarrays fit into the cache, it's faster to merge both pairs of subarrays into the cache, + // then merge the two merged subarrays from the cache back into the original array + if ((iterator.length() + 1) * 4 <= cache_size && iterator.length() * 4 <= size) { + iterator.begin(); + while (!iterator.finished()) { + // merge A1 and B1 into the cache + Range A1 = iterator.nextRange(); + Range B1 = iterator.nextRange(); + Range A2 = iterator.nextRange(); + Range B2 = iterator.nextRange(); + + if (compare(array[B1.end - 1], array[A1.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the cache + std::copy(&array[A1.start], &array[A1.end], &cache[B1.length()]); + std::copy(&array[B1.start], &array[B1.end], &cache[0]); + } else if (compare(array[B1.start], array[A1.end - 
1])) { + // these two ranges weren't already in order, so merge them into the cache + MergeInto(array, A1, B1, compare, &cache[0]); + } else { + // if A1, B1, A2, and B2 are all in order, skip doing anything else + if (!compare(array[B2.start], array[A2.end - 1]) && !compare(array[A2.start], array[B1.end - 1])) continue; + + // copy A1 and B1 into the cache in the same order + std::copy(&array[A1.start], &array[A1.end], &cache[0]); + std::copy(&array[B1.start], &array[B1.end], &cache[A1.length()]); + } + A1 = Range(A1.start, B1.end); + + // merge A2 and B2 into the cache + if (compare(array[B2.end - 1], array[A2.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the cache + std::copy(&array[A2.start], &array[A2.end], &cache[A1.length() + B2.length()]); + std::copy(&array[B2.start], &array[B2.end], &cache[A1.length()]); + } else if (compare(array[B2.start], array[A2.end - 1])) { + // these two ranges weren't already in order, so merge them into the cache + MergeInto(array, A2, B2, compare, &cache[A1.length()]); + } else { + // copy A2 and B2 into the cache in the same order + std::copy(&array[A2.start], &array[A2.end], &cache[A1.length()]); + std::copy(&array[B2.start], &array[B2.end], &cache[A1.length() + A2.length()]); + } + A2 = Range(A2.start, B2.end); + + // merge A1 and A2 from the cache into the array + Range A3 = Range(0, A1.length()); + Range B3 = Range(A1.length(), A1.length() + A2.length()); + + if (compare(cache[B3.end - 1], cache[A3.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the array + std::copy(&cache[A3.start], &cache[A3.end], &array[A1.start + A2.length()]); + std::copy(&cache[B3.start], &cache[B3.end], &array[A1.start]); + } else if (compare(cache[B3.start], cache[A3.end - 1])) { + // these two ranges weren't already in order, so merge them back into the array + MergeInto(cache, A3, B3, compare, &array[A1.start]); + } else { + // copy A3 and B3 into the array in the 
same order + std::copy(&cache[A3.start], &cache[A3.end], &array[A1.start]); + std::copy(&cache[B3.start], &cache[B3.end], &array[A1.start + A1.length()]); + } + } + + // we merged two levels at the same time, so we're done with this level already + // (iterator.nextLevel() is called again at the bottom of this outer merge loop) + iterator.nextLevel(); + + } else { + iterator.begin(); + while (!iterator.finished()) { + Range A = iterator.nextRange(); + Range B = iterator.nextRange(); + + if (compare(array[B.end - 1], array[A.start])) { + // the two ranges are in reverse order, so a simple rotation should fix it + Rotate(array, A.length(), Range(A.start, B.end)); + } else if (compare(array[B.start], array[A.end - 1])) { + // these two ranges weren't already in order, so we'll need to merge them! + std::copy(&array[A.start], &array[A.end], &cache[0]); + MergeExternal(array, A, B, compare, cache); + } + } + } + } else { + // this is where the in-place merge logic starts! + // 1. pull out two internal buffers each containing √A unique values + // 1a. adjust block_size and buffer_size if we couldn't find enough unique values + // 2. loop over the A and B subarrays within this level of the merge sort + // 3. break A and B into blocks of size 'block_size' + // 4. "tag" each of the A blocks with values from the first internal buffer + // 5. roll the A blocks through the B blocks and drop/rotate them where they belong + // 6. merge each A block with any B values that follow, using the cache or the second internal buffer + // 7. sort the second internal buffer if it exists + // 8. 
redistribute the two internal buffers back into the array + + size_t block_size = sqrt(iterator.length()); + size_t buffer_size = iterator.length()/block_size + 1; + + // as an optimization, we really only need to pull out the internal buffers once for each level of merges + // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level + Range buffer1 = Range(0, 0), buffer2 = Range(0, 0); + size_t index, last, count, pull_index = 0; + struct { size_t from, to, count; Range range; } pull[2]; + pull[0].from = pull[0].to = pull[0].count = 0; pull[0].range = Range(0, 0); + pull[1].from = pull[1].to = pull[1].count = 0; pull[1].range = Range(0, 0); + + // find two internal buffers of size 'buffer_size' each + // let's try finding both buffers at the same time from a single A or B subarray + size_t find = buffer_size + buffer_size; + bool find_separately = false; + + if (block_size <= cache_size) { + // if every A block fits into the cache then we won't need the second internal buffer, + // so we really only need to find 'buffer_size' unique values + find = buffer_size; + } else if (find > iterator.length()) { + // we can't fit both buffers into the same A or B subarray, so find two buffers separately + find = buffer_size; + find_separately = true; + } + + // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each), + // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values, + // OR if we couldn't find that many unique values, we need the largest possible buffer we can get + + // in the case where it couldn't find a single buffer of at least √A unique values, + // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace) + + iterator.begin(); + while (!iterator.finished()) { + Range A = iterator.nextRange(); + Range B = iterator.nextRange(); + + // just store information 
about where the values will be pulled from and to, + // as well as how many values there are, to create the two internal buffers + #define PULL(_to) \ + pull[pull_index].range = Range(A.start, B.end); \ + pull[pull_index].count = count; \ + pull[pull_index].from = index; \ + pull[pull_index].to = _to + + // check A for the number of unique values we need to fill an internal buffer + // these values will be pulled out to the start of A + for (last = A.start, count = 1; count < find; last = index, count++) { + index = FindLastForward(array, array[last], Range(last + 1, A.end), compare, find - count); + if (index == A.end) break; + assert(index < A.end); + } + index = last; + + if (count >= buffer_size) { + // keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer + PULL(A.start); + pull_index = 1; + + if (count == buffer_size + buffer_size) { + // we were able to find a single contiguous section containing 2√A unique values, + // so this section can be used to contain both of the internal buffers we'll need + buffer1 = Range(A.start, A.start + buffer_size); + buffer2 = Range(A.start + buffer_size, A.start + count); + break; + } else if (find == buffer_size + buffer_size) { + // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, + // so we still need to find a second separate buffer of at least √A unique values + buffer1 = Range(A.start, A.start + count); + find = buffer_size; + } else if (block_size <= cache_size) { + // we found the first and only internal buffer that we need, so we're done! + buffer1 = Range(A.start, A.start + count); + break; + } else if (find_separately) { + // found one buffer, but now find the other one + buffer1 = Range(A.start, A.start + count); + find_separately = false; + } else { + // we found a second buffer in an 'A' subarray containing √A unique values, so we're done! 
+ buffer2 = Range(A.start, A.start + count); + break; + } + } else if (pull_index == 0 && count > buffer1.length()) { + // keep track of the largest buffer we were able to find + buffer1 = Range(A.start, A.start + count); + PULL(A.start); + } + + // check B for the number of unique values we need to fill an internal buffer + // these values will be pulled out to the end of B + for (last = B.end - 1, count = 1; count < find; last = index - 1, count++) { + index = FindFirstBackward(array, array[last], Range(B.start, last), compare, find - count); + if (index == B.start) break; + assert(index > B.start); + } + index = last; + + if (count >= buffer_size) { + // keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer + PULL(B.end); + pull_index = 1; + + if (count == buffer_size + buffer_size) { + // we were able to find a single contiguous section containing 2√A unique values, + // so this section can be used to contain both of the internal buffers we'll need + buffer1 = Range(B.end - count, B.end - buffer_size); + buffer2 = Range(B.end - buffer_size, B.end); + break; + } else if (find == buffer_size + buffer_size) { + // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, + // so we still need to find a second separate buffer of at least √A unique values + buffer1 = Range(B.end - count, B.end); + find = buffer_size; + } else if (block_size <= cache_size) { + // we found the first and only internal buffer that we need, so we're done! 
+ buffer1 = Range(B.end - count, B.end); + break; + } else if (find_separately) { + // found one buffer, but now find the other one + buffer1 = Range(B.end - count, B.end); + find_separately = false; + } else { + // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray, + // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2 + if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count; + + // we found a second buffer in a 'B' subarray containing √A unique values, so we're done! + buffer2 = Range(B.end - count, B.end); + break; + } + } else if (pull_index == 0 && count > buffer1.length()) { + // keep track of the largest buffer we were able to find + buffer1 = Range(B.end - count, B.end); + PULL(B.end); + } + } + + // pull out the two ranges so we can use them as internal buffers + for (pull_index = 0; pull_index < 2; pull_index++) { + size_t length = pull[pull_index].count; + + if (pull[pull_index].to < pull[pull_index].from) { + // we're pulling the values out to the left, which means the start of an A subarray + index = pull[pull_index].from; + for (count = 1; count < length; count++) { + index = FindFirstBackward(array, array[index - 1], Range(pull[pull_index].to, pull[pull_index].from - (count - 1)), compare, length - count); + Range range = Range(index + 1, pull[pull_index].from + 1); + Rotate(array, range.length() - count, range); + pull[pull_index].from = index + count; + } + } else if (pull[pull_index].to > pull[pull_index].from) { + // we're pulling values out to the right, which means the end of a B subarray + index = pull[pull_index].from + 1; + for (count = 1; count < length; count++) { + index = FindLastForward(array, array[index], Range(index, pull[pull_index].to), compare, length - count); + Range range = Range(pull[pull_index].from, index - 1); + Rotate(array, count, range); + pull[pull_index].from = 
index - 1 - count; + } + } + } + + // adjust block_size and buffer_size based on the values we were able to pull out + buffer_size = buffer1.length(); + block_size = iterator.length()/buffer_size + 1; + + // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks, + // so this was originally here to test the math for adjusting block_size above + //assert((iterator.length() + 1)/block_size <= buffer_size); + + // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort! + iterator.begin(); + while (!iterator.finished()) { + Range A = iterator.nextRange(); + Range B = iterator.nextRange(); + + // remove any parts of A or B that are being used by the internal buffers + size_t start = A.start; + if (start == pull[0].range.start) { + if (pull[0].from > pull[0].to) { + A.start += pull[0].count; + + // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge + // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4, + // which also only happens when cache_size is small or 0 since it'd otherwise use MergeExternal + if (A.length() == 0) continue; + } else if (pull[0].from < pull[0].to) { + B.end -= pull[0].count; + if (B.length() == 0) continue; + } + } + if (start == pull[1].range.start) { + if (pull[1].from > pull[1].to) { + A.start += pull[1].count; + if (A.length() == 0) continue; + } else if (pull[1].from < pull[1].to) { + B.end -= pull[1].count; + if (B.length() == 0) continue; + } + } + + if (compare(array[B.end - 1], array[A.start])) { + // the two ranges are in reverse order, so a simple rotation should fix it + Rotate(array, A.length(), Range(A.start, B.end)); + } else if (compare(array[A.end], array[A.end - 1])) { + // these two ranges weren't already in order, so we'll need to merge them! + + // break the remainder of A into blocks. 
firstA is the uneven-sized first A block + Range blockA = Range(A.start, A.end); + Range firstA = Range(A.start, A.start + blockA.length() % block_size); + + // swap the first value of each A block with the values in buffer1 + for (size_t indexA = buffer1.start, index = firstA.end; index < blockA.end; indexA++, index += block_size) + std::swap(array[indexA], array[index]); + + // start rolling the A blocks through the B blocks! + // when we leave an A block behind we'll need to merge the previous A block with any B blocks that follow it, so track that information as well + Range lastA = firstA; + Range lastB = Range(0, 0); + Range blockB = Range(B.start, B.start + std::min(block_size, B.length())); + blockA.start += firstA.length(); + size_t indexA = buffer1.start; + + // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it + // otherwise, if the second buffer is available, block swap the contents into that + if (lastA.length() <= cache_size) + std::copy(&array[lastA.start], &array[lastA.end], &cache[0]); + else if (buffer2.length() > 0) + BlockSwap(array, lastA.start, buffer2.start, lastA.length()); + + if (blockA.length() > 0) { + while (true) { + // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block, + // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks. 
+ if ((lastB.length() > 0 && !compare(array[lastB.end - 1], array[indexA])) || blockB.length() == 0) { + // figure out where to split the previous B block, and rotate it at the split + size_t B_split = BinaryFirst(array, array[indexA], lastB, compare); + size_t B_remaining = lastB.end - B_split; + + // swap the minimum A block to the beginning of the rolling A blocks + size_t minA = blockA.start; + for (size_t findA = minA + block_size; findA < blockA.end; findA += block_size) + if (compare(array[findA], array[minA])) + minA = findA; + BlockSwap(array, blockA.start, minA, block_size); + + // swap the first item of the previous A block back with its original value, which is stored in buffer1 + std::swap(array[blockA.start], array[indexA]); + indexA++; + + // locally merge the previous A block with the B values that follow it + // if lastA fits into the external cache we'll use that (with MergeExternal), + // or if the second internal buffer exists we'll use that (with MergeInternal), + // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace) + if (lastA.length() <= cache_size) + MergeExternal(array, lastA, Range(lastA.end, B_split), compare, cache); + else if (buffer2.length() > 0) + MergeInternal(array, lastA, Range(lastA.end, B_split), compare, buffer2); + else + MergeInPlace(array, lastA, Range(lastA.end, B_split), compare); + + if (buffer2.length() > 0 || block_size <= cache_size) { + // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway + if (block_size <= cache_size) + std::copy(&array[blockA.start], &array[blockA.start + block_size], cache); + else + BlockSwap(array, blockA.start, buffer2.start, block_size); + + // this is equivalent to rotating, but faster + // the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it + // either way we don't need to retain the order of those items, so instead of 
rotating we can just block swap B to where it belongs + BlockSwap(array, B_split, blockA.start + block_size - B_remaining, B_remaining); + } else { + // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation + Rotate(array, blockA.start - B_split, Range(B_split, blockA.start + block_size)); + } + + // update the range for the remaining A blocks, and the range remaining from the B block after it was split + lastA = Range(blockA.start - B_remaining, blockA.start - B_remaining + block_size); + lastB = Range(lastA.end, lastA.end + B_remaining); + + // if there are no more A blocks remaining, this step is finished! + blockA.start += block_size; + if (blockA.length() == 0) + break; + + } else if (blockB.length() < block_size) { + // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation + Rotate(array, blockB.start - blockA.start, Range(blockA.start, blockB.end)); + + lastB = Range(blockA.start, blockA.start + blockB.length()); + blockA.start += blockB.length(); + blockA.end += blockB.length(); + blockB.end = blockB.start; + } else { + // roll the leftmost A block to the end by swapping it with the next B block + BlockSwap(array, blockA.start, blockB.start, block_size); + lastB = Range(blockA.start, blockA.start + block_size); + + blockA.start += block_size; + blockA.end += block_size; + blockB.start += block_size; + + if (blockB.end > B.end - block_size) blockB.end = B.end; + else blockB.end += block_size; + } + } + } + + // merge the last A block with the remaining B values + if (lastA.length() <= cache_size) + MergeExternal(array, lastA, Range(lastA.end, B.end), compare, cache); + else if (buffer2.length() > 0) + MergeInternal(array, lastA, Range(lastA.end, B.end), compare, buffer2); + else + MergeInPlace(array, lastA, Range(lastA.end, B.end), compare); + } + } + + // when we're finished with this merge step we should have the one or two 
internal buffers left over, where the second buffer is all jumbled up + // insertion sort the second buffer, then redistribute the buffers back into the array using the opposite process used for creating the buffer + + // while an unstable sort like std::sort could be applied here, in benchmarks it was consistently slightly slower than a simple insertion sort, + // even for tens of millions of items. this may be because insertion sort is quite fast when the data is already somewhat sorted, like it is here + InsertionSort(array, buffer2, compare); + + for (pull_index = 0; pull_index < 2; pull_index++) { + size_t unique = pull[pull_index].count * 2; + if (pull[pull_index].from > pull[pull_index].to) { + // the values were pulled out to the left, so redistribute them back to the right + Range buffer = Range(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count); + while (buffer.length() > 0) { + index = FindFirstForward(array, array[buffer.start], Range(buffer.end, pull[pull_index].range.end), compare, unique); + size_t amount = index - buffer.end; + Rotate(array, buffer.length(), Range(buffer.start, index)); + buffer.start += (amount + 1); + buffer.end += amount; + unique -= 2; + } + } else if (pull[pull_index].from < pull[pull_index].to) { + // the values were pulled out to the right, so redistribute them back to the left + Range buffer = Range(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end); + while (buffer.length() > 0) { + index = FindLastBackward(array, array[buffer.end - 1], Range(pull[pull_index].range.start, buffer.start), compare, unique); + size_t amount = buffer.start - index; + Rotate(array, amount, Range(index, buffer.end)); + buffer.start -= amount; + buffer.end -= (amount + 1); + unique -= 2; + } + } + } + } + + // double the size of each A and B subarray that will be merged in the next level + if (!iterator.nextLevel()) break; + } + } +} + +/* wrap the reference implementation above in a 
qsort-compatible interface.
 * it's not thread safe, but it's good enough for testing performance. */

extern "C" {

/* the comparator handed to wikisort_ref(), stashed in a file-scope global so
 * the plain-function wrapper below can reach it — this global is why the
 * adapter is not thread safe */
static cmpfun innercmp;

/* adapt the qsort-style three-way (const void*, const void*) comparator to
 * the bool "a sorts before b" predicate that Wiki::Sort expects */
static bool cmpwrapper(int a, int b)
{
	int order = innercmp(&a, &b);
	return order < 0;
}

/* we assume the input is an int[]; the width argument is accepted only to
 * match the qsort-compatible prototype and is otherwise ignored */
void wikisort_ref(void *base, size_t nmel, size_t width, cmpfun cmp)
{
	int *first = (int *) base;
	int *last = first + nmel;

	innercmp = cmp;
	Wiki::Sort(first, last, cmpwrapper);
}

}