Many comments (and even answers) have taken issue with the risks in your implementation. But the question asked is about the performance difference itself.
As shown below, eliminating the flaws of the presented code does not change anything significant about the performance picture.
Here is the OP's code, modified so that it is (A) safe, (B) supports the same operations as std::stack, and (C) also reserves buffer space for std::stack — in order to clarify the situation for those who mistakenly believe that such issues are relevant to the measurements:
#define _SECURE_SCL 0 #define _SCL_SECURE_NO_WARNINGS #include <algorithm> // std::swap #include <iostream> #include <vector> #include <stack> #include <stddef.h> // ptrdiff_t #include <type_traits> // std::is_pod using namespace std; #undef UNICODE #define UNICODE #include <Windows.h> typedef ptrdiff_t Size; typedef Size Index; template< class Type, class Container > void reserve( Size const newBufSize, std::stack< Type, Container >& st ) { struct Access: std::stack< Type, Container > { static Container& container( std::stack< Type, Container >& st ) { return st.*&Access::c; } }; Access::container( st ).reserve( newBufSize ); } class HighResolutionTimer { public: HighResolutionTimer(); double GetFrequency() const; void Start() ; double Stop(); double GetTime() const; private: LARGE_INTEGER start; LARGE_INTEGER stop; double frequency; }; HighResolutionTimer::HighResolutionTimer() { frequency = GetFrequency(); } double HighResolutionTimer::GetFrequency() const { LARGE_INTEGER proc_freq; if (!::QueryPerformanceFrequency(&proc_freq)) return -1; return static_cast< double >( proc_freq.QuadPart ); } void HighResolutionTimer::Start() { DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0); ::QueryPerformanceCounter(&start); ::SetThreadAffinityMask(::GetCurrentThread(), oldmask); } double HighResolutionTimer::Stop() { DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0); ::QueryPerformanceCounter(&stop); ::SetThreadAffinityMask(::GetCurrentThread(), oldmask); return ((stop.QuadPart - start.QuadPart) / frequency); } double HighResolutionTimer::GetTime() const { LARGE_INTEGER time; ::QueryPerformanceCounter(&time); return time.QuadPart / frequency; } template< class Type, bool elemTypeIsPOD = !!std::is_pod< Type >::value > class FastStack; template< class Type > class FastStack< Type, true > { private: Type* st_; Index lastIndex_; Size capacity_; public: Size const size() const { return lastIndex_ + 1; } Size const capacity() const { return 
capacity_; } void reserve( Size const newCapacity ) { if( newCapacity > capacity_ ) { FastStack< Type >( *this, newCapacity ).swapWith( *this ); } } void push( Type const& x ) { if( size() == capacity() ) { reserve( 2*capacity() ); } st_[++lastIndex_] = x; } void pop() { --lastIndex_; } Type top() const { return st_[lastIndex_]; } void swapWith( FastStack& other ) throw() { using std::swap; swap( st_, other.st_ ); swap( lastIndex_, other.lastIndex_ ); swap( capacity_, other.capacity_ ); } void operator=( FastStack other ) { other.swapWith( *this ); } ~FastStack() { delete[] st_; } FastStack( Size const aCapacity = 0 ) : st_( new Type[aCapacity] ) , capacity_( aCapacity ) { lastIndex_ = -1; } FastStack( FastStack const& other, int const newBufSize = -1 ) { capacity_ = (newBufSize < other.size()? other.size(): newBufSize); st_ = new Type[capacity_]; lastIndex_ = other.lastIndex_; copy( other.st_, other.st_ + other.size(), st_ ); // Can't throw for POD. } }; template< class Type > void reserve( Size const newCapacity, FastStack< Type >& st ) { st.reserve( newCapacity ); } template< class StackType > void test( char const* const description ) { for( int it = 0; it < 4; ++it ) { StackType st; reserve( 200, st ); // after this two loops, st capacity will be 141 so there will be no more reallocating for( int i = 0; i < 100; ++i ) { st.push( i ); } for( int i = 0; i < 100; ++i ) { st.pop(); } // when you uncomment this line, std::stack performance will magically rise about 18% // std::vector<int> magicVector(10); HighResolutionTimer timer; timer.Start(); for( Index i = 0; i < 1000000000; ++i ) { st.push( i ); (void) st.top(); if( i % 100 == 0 && i != 0 ) { for( int j = 0; j < 100; ++j ) { st.pop(); } } } double const totalTime = timer.Stop(); wcout << description << ": " << totalTime << endl; } } int main() { typedef stack< Index, vector< Index > > SStack; typedef FastStack< Index > FStack; test< SStack >( "std::stack" ); test< FStack >( "FastStack" ); cout << "Done"; }
Results on this slow-as-molasses Samsung RC530:
[D:\dev\test\so\12704314]
> a
std::stack: 3.21319
std::stack: 3.16456
std::stack: 3.23298
std::stack: 3.20854
FastStack: 1.97636
FastStack: 1.97958
FastStack: 2.12977
FastStack: 2.13507
Done

[D:\dev\test\so\12704314]
> _
And similarly for Visual C++.
Now let's look at a typical implementation of std::vector::push_back, which is called by std::stack<T, std::vector<T>>::push (incidentally, I know of only 3 programmers who have ever used this indentation style — namely PJP, Petzold and me — and I have considered it terrible since about 1998!):
// MSVC standard-library implementation, quoted verbatim from <vector>.
// The first branch handles the "self-push" case: _Val resides inside the
// vector's own buffer (its index is saved before _Reserve may reallocate,
// then the element is re-read via _Myfirst[_Idx]).  The second branch is
// the ordinary case.  _Orphan_range is presumably the iterator-debugging
// bookkeeping machinery — it is an MSVC internal; confirm against the
// library sources.
void push_back(const value_type& _Val) { // insert element at end if (_Inside(_STD addressof(_Val))) { // push back an element size_type _Idx = _STD addressof(_Val) - this->_Myfirst; if (this->_Mylast == this->_Myend) _Reserve(1); _Orphan_range(this->_Mylast, this->_Mylast); this->_Getal().construct(this->_Mylast, this->_Myfirst[_Idx]); ++this->_Mylast; } else { // push back a non-element if (this->_Mylast == this->_Myend) _Reserve(1); _Orphan_range(this->_Mylast, this->_Mylast); this->_Getal().construct(this->_Mylast, _Val); ++this->_Mylast; } }
I suspect that the measured inefficiency lies at least partially in all that goes on there, and perhaps also in automatically generated safety checks.
For a debug build, std::stack performance was so bad that I gave up waiting for the result.
EDIT: following Xeo’s comment below, I updated push to check for the "self-push" case on buffer reallocation, factoring that out into a separate function:
// NOTE(review): this version is defective — the call to reserveAndPush(x)
// is missing a `return`, so when the buffer is full the element is stored
// twice: once by reserveAndPush and once by the fall-through assignment.
// (The document's EDIT 2 below identifies and corrects exactly this.)
void push( Type const& x ) { if( size() == capacity() ) { reserveAndPush( x ); } st_[++lastIndex_] = x; }
Mysteriously, although reserveAndPush is never actually called in this test, it does affect the performance — perhaps an instruction-cache effect from the larger code?
[D:\dev\test\so\12704314]
> a
std::stack: 3.21623
std::stack: 3.30501
std::stack: 3.24337
std::stack: 3.27711
FastStack: 2.52791
FastStack: 2.44621
FastStack: 2.44759
FastStack: 2.47287
Done

[D:\dev\test\so\12704314]
> _
EDIT 2: DeadMG showed that the code above is faulty. The problem was the missing return, plus the expression computing the new size (two times zero is still zero). He also noted that I forgot to show reserveAndPush. It should be:
// Corrected slow path: the element is copied first because `x` may refer
// into the current buffer (which reserve() would invalidate), and the
// explicit 1 handles growth from zero capacity, since 2*0 == 0.
// NOTE(review): the quoted fixed push() is truncated here — the text as
// captured ends mid-function, before the closing braces and the
// st_[++lastIndex_] = x; fast path.
void reserveAndPush( Type const& x ) { Type const xVal = x; reserve( capacity_ == 0? 1 : 2*capacity_ ); push( xVal ); } void push( Type const& x ) { if( size() == capacity() ) { return reserveAndPush( x );