The PIMPL idiom is often used for public API objects, which sometimes also contain virtual functions. There, heap allocation is often used to create a polymorphic object, which is then stored in a `unique_ptr` or similar. A well-known example of this is the Qt API, in which most objects (especially QWidgets, etc.) are allocated on the heap and tracked through the QObject parent/child relationship. Thus, we pay for two allocations: once for the object itself, which takes `2*sizeof(void*)` because it must hold the PIMPL pointer and the vtable pointer, and once for the private data itself.
Now, to get to my question: I wonder if the two allocations can be combined, similar to the optimization used by `make_shared`. Then I wonder whether this optimization is worth it, since `malloc` implementations are potentially not bad at handling word-sized requests. On the other hand, the positive cache effects could be quite noticeable, i.e. from having the private data allocated directly next to the public object.
I played with the following code:
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <new>
#include <vector>
using namespace std;
// debug(x): expands to its argument in regular builds and to nothing when
// NDEBUG is defined, so tracing statements vanish from benchmark builds.
#ifndef NDEBUG
#define debug(x) x
#else
#define debug(x)
#endif
/// Abstract interface implemented by both PIMPL variants in this file.
class MyInterface
{
public:
    /// Returns the integer value held by the concrete implementation.
    virtual int i() const = 0;

    /// Virtual so implementations can be destroyed through a MyInterface pointer.
    virtual ~MyInterface() = default;
};
class MyObjOpt : public MyInterface
{
public:
MyObjOpt(int i);
virtual ~MyObjOpt();
int i() const override;
static void *operator new(size_t size);
static void operator delete(void *ptr);
private:
struct Private;
Private* d;
};
/// Private data of MyObjOpt. Construction and destruction are traced through
/// the debug() macro.
struct MyObjOpt::Private
{
    int i;  ///< The payload value.

    /// Stores @p i and traces the construction.
    Private(int i)
        : i{i}
    {
        debug(std::cout << " Private " << this->i << '\n';)
    }

    /// Traces the destruction.
    ~Private()
    {
        debug(std::cout << " ~Private " << this->i << '\n';)
    }
};
MyObjOpt::MyObjOpt(int i)
{
debug(cout << " MyObjOpt " << i << "\n";)
if (reinterpret_cast<void*>(d) == reinterpret_cast<void*>(this + 1)) {
new (d) Private(i);
} else {
d = new Private(i);
}
};
MyObjOpt::~MyObjOpt()
{
debug(cout << " ~MyObjOpt " << d->i << '\n';)
if (reinterpret_cast<void*>(d) != reinterpret_cast<void*>(this + 1)) {
delete d;
}
}
int MyObjOpt::i() const
{
return d->i;
}
void* MyObjOpt::operator new(size_t )
{
void *ret = malloc(sizeof(MyObjOpt) + sizeof(MyObjOpt::Private));
auto obj = reinterpret_cast<MyObjOpt*>(ret);
obj->d = reinterpret_cast<Private*>(obj + 1);
return ret;
}
void MyObjOpt::operator delete(void *ptr)
{
auto obj = reinterpret_cast<MyObjOpt*>(ptr);
obj->d->~Private();
free(ptr);
}
/// Conventional PIMPL object: the private data lives in its own heap
/// allocation owned by a unique_ptr.
class MyObj : public MyInterface
{
    struct Private;
    std::unique_ptr<Private> d;  ///< Owning pointer to the private data.

public:
    /// Creates the object together with its private data holding @p i.
    MyObj(int i);
    /// Declared here and defined out of line, since Private is incomplete at this point.
    ~MyObj();
    /// Returns the value stored in the private data.
    int i() const override;
};
/// Private data of MyObj. Construction and destruction are traced through
/// the debug() macro.
struct MyObj::Private
{
    int i;  ///< The payload value.

    /// Stores @p i and traces the construction.
    Private(int i)
        : i{i}
    {
        debug(std::cout << " Private " << this->i << '\n';)
    }

    /// Traces the destruction.
    ~Private()
    {
        debug(std::cout << " ~Private " << this->i << '\n';)
    }
};
MyObj::MyObj(int i)
: d(new Private(i))
{
debug(cout << " MyObj " << i << "\n";)
};
/// Traces the destruction; the unique_ptr member releases the private data
/// after this body has run.
MyObj::~MyObj()
{
    debug(std::cout << " ~MyObj " << (*d).i << "\n";)
}
int MyObj::i() const
{
return d->i;
}
int main(int argc, char** argv)
{
if (argc == 1) {
{
cout << "Heap usage:\n";
auto heap1 = unique_ptr<MyObjOpt>(new MyObjOpt(1));
auto heap2 = unique_ptr<MyObjOpt>(new MyObjOpt(2));
}
{
cout << "Stack usage:\n";
MyObjOpt stack1(-1);
MyObjOpt stack2(-2);
}
} else {
const int NUM_ITEMS = 100000;
vector<unique_ptr<MyInterface>> items;
items.reserve(NUM_ITEMS);
if (!strcmp(argv[1], "fast")) {
for (int i = 0; i < NUM_ITEMS; ++i) {
items.emplace_back(new MyObjOpt(i));
}
} else {
for (int i = 0; i < NUM_ITEMS; ++i) {
items.emplace_back(new MyObj(i));
}
}
int sum = 0;
for (const auto& item : items) {
sum += item->i();
}
return sum > 0;
}
return 0;
}
Compiled with `gcc -std=c++11 -g`, the output is as you would expect:
Heap usage:
MyObjOpt 1
Private 1
MyObjOpt 2
Private 2
~MyObjOpt 2
~Private 2
~MyObjOpt 1
~Private 1
Stack usage:
MyObjOpt -1
Private -1
MyObjOpt -2
Private -2
~MyObjOpt -2
~Private -2
~MyObjOpt -1
~Private -1
But when you run it in valgrind, you will see the following:
Stack usage:
MyObjOpt -1
==21217== Conditional jump or move depends on uninitialised value(s)
==21217== at 0x400DC0: MyObjOpt::MyObjOpt(int) (pimpl.cpp:54)
==21217== by 0x401200: main (pimpl.cpp:142)
==21217==
Private -1
MyObjOpt -2
==21217== Conditional jump or move depends on uninitialised value(s)
==21217== at 0x400DC0: MyObjOpt::MyObjOpt(int) (pimpl.cpp:54)
==21217== by 0x401211: main (pimpl.cpp:143)
==21217==
Private -2
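I understand why this happens: for stack objects, the constructor reads the uninitialized dptr. But how can this be avoided? Is there a way to do it other than going through a factory?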
, (-) , con/destructor. new ...
, :
gcc -std=c++11 -O2 -g -DNDEBUG :
$ perf stat -r 10 ./pimpl fast
Performance counter stats for './pimpl fast' (10 runs):
9.004201 task-clock (msec)
1 context-switches
0 cpu-migrations
1,071 page-faults
19,455,553 cycles
31,478,797 instructions
8,121,492 branches
8,059 branch-misses
0.009422989 seconds time elapsed ( +- 3.46% )
$ perf stat -r 10 ./pimpl slow
Performance counter stats for './pimpl slow' (10 runs):
17.674142 task-clock (msec)
2 context-switches
1 cpu-migrations
1,850 page-faults
43,142,007 cycles
68,780,331 instructions
16,369,560 branches
19,774 branch-misses
0.018142227 seconds time elapsed ( +- 2.26% )
, , 2. , , dptr - .
, :
$ perf stat -r 10 -e cache-misses ./pimpl slow
Performance counter stats for './pimpl slow' (10 runs):
37,947 cache-misses ( +- 2.38% )
0.018457998 seconds time elapsed ( +- 2.30% )
$ perf stat -r 10 -e cache-misses ./pimpl fast
Performance counter stats for './pimpl fast' (10 runs):
9,698 cache-misses ( +- 4.46% )
0.009171249 seconds time elapsed ( +- 2.91% )
? ?