(Updated 22:37 UTC 2011-08-20)
I suggest a fixed-size binary mini-heap holding the M largest elements (but still in mini-heap order!). This will probably not be so fast in practice, since I think that sorting OPs attachments probably has decent performance in the real world (at least when the recommendations of other posters in this thread are taken into account).
The look in case of failure should be constant: if the current element is less than the minimum element of the heap (containing the maximum elements of M), we can completely reject it.
If it turns out that we have an element larger than the current minimum of the heap (Mth is the largest element), we extract (discard) the previous min and insert a new element.
If items are needed in sorted order, the heap can be sorted afterwards.
First attempt at a minimal C ++ implementation:
template<unsigned size, typename T> class m_heap { private: T nodes[size]; static const unsigned last = size - 1; static unsigned parent(unsigned i) { return (i - 1) / 2; } static unsigned left(unsigned i) { return i * 2; } static unsigned right(unsigned i) { return i * 2 + 1; } void bubble_down(unsigned int i) { for (;;) { unsigned j = i; if (left(i) < size && nodes[left(i)] < nodes[i]) j = left(i); if (right(i) < size && nodes[right(i)] < nodes[j]) j = right(i); if (i != j) { swap(nodes[i], nodes[j]); i = j; } else { break; } } } void bubble_up(unsigned i) { while (i > 0 && nodes[i] < nodes[parent(i)]) { swap(nodes[parent(i)], nodes[i]); i = parent(i); } } public: m_heap() { for (unsigned i = 0; i < size; i++) { nodes[i] = numeric_limits<T>::min(); } } void add(const T& x) { if (x < nodes[0]) {
Small test / usage example:
#include <iostream> #include <limits> #include <algorithm> #include <vector> #include <stdlib.h> #include <assert.h> #include <math.h> using namespace std; // INCLUDE TEMPLATED CLASS FROM ABOVE typedef vector<float> vf; bool compare(float a, float b) { return a > b; } int main() { int N = 2000; vf v; for (int i = 0; i < N; i++) v.push_back( rand()*1e6 / RAND_MAX); static const int M = 50; m_heap<M, float> h; for (int i = 0; i < N; i++) h.add( v[i] ); sort(v.begin(), v.end(), compare); vf heap(h.get(), h.get() + M); // assume public in m_heap: T* get() { return nodes; } sort(heap.begin(), heap.end(), compare); cout << "Real\tFake" << endl; for (int i = 0; i < M; i++) { cout << v[i] << "\t" << heap[i] << endl; if (fabs(v[i] - heap[i]) > 1e-5) abort(); } }
user786653
source share