Overwriting operator new for combining PIMPL distributions

The PIMPL idiom is often used for public API objects, which sometimes also contain virtual functions. There, heap distribution is often used to isolate a polymorphic object, which is then stored in unique_ptror similar. A well-known example of this is the Qt API, in which most objects (especially QWidgets, etc.) are allocated on the heap and tracked by the parent / child QObject. Thus, we pay for two distributions as soon as the object c itself 2*sizeof(void*)must hold the PIMPL pointer and v_table, and as soon as the personal data itself.

Now, to answer my question: I wonder if the two distributions can be combined, similar to the optimization used make_shared. Then I wonder if this optimization is worth it, since the implementations from are mallocpotentially not bad for handling word size requests. On the other hand, the positive effects of the cache can be quite noticeable, that is, have private data highlighted directly next to the public object.

I played with the following code:


#include <memory>
#include <cstring>
#include <vector>
#include <iostream>

using namespace std;

#ifdef NDEBUG
#define debug(x)
#else
#define debug(x) x
#endif

class MyInterface
{
public:
  virtual ~MyInterface() = default;

  virtual int i() const = 0;
};

class MyObjOpt : public MyInterface
{
public:
  MyObjOpt(int i);
  virtual ~MyObjOpt();

  int i() const override;

  static void *operator new(size_t size);
  static void operator delete(void *ptr);
private:
  struct Private;
  Private* d;
};

struct MyObjOpt::Private
{
  Private(int i)
    : i(i)
  {
    debug(cout << "    Private " << i << '\n';)
  }
  ~Private()
  {
    debug(cout << "    ~Private " << i << '\n';)
  }
  int i;
};

MyObjOpt::MyObjOpt(int i)
{
  debug(cout << "  MyObjOpt " << i << "\n";)
  if (reinterpret_cast<void*>(d) == reinterpret_cast<void*>(this + 1)) {
    new (d) Private(i);
  } else {
    d = new Private(i);
  }
};

MyObjOpt::~MyObjOpt()
{
  debug(cout << "  ~MyObjOpt " << d->i << '\n';)
  if (reinterpret_cast<void*>(d) != reinterpret_cast<void*>(this + 1)) {
    delete d;
  }
}

int MyObjOpt::i() const
{
  return d->i;
}

void* MyObjOpt::operator new(size_t /*size*/)
{
  void *ret = malloc(sizeof(MyObjOpt) + sizeof(MyObjOpt::Private));
  auto obj = reinterpret_cast<MyObjOpt*>(ret);
  obj->d = reinterpret_cast<Private*>(obj + 1);
  return ret;
}

void MyObjOpt::operator delete(void *ptr)
{
  auto obj = reinterpret_cast<MyObjOpt*>(ptr);
  obj->d->~Private();
  free(ptr);
}

class MyObj : public MyInterface
{
public:
  MyObj(int i);
  ~MyObj();

  int i() const override;

private:
  struct Private;
  unique_ptr<Private> d;
};

struct MyObj::Private
{
  Private(int i)
    : i(i)
  {
    debug(cout << "    Private " << i << '\n';)
  }
  ~Private()
  {
    debug(cout << "    ~Private " << i << '\n';)
  }
  int i;
};

MyObj::MyObj(int i)
  : d(new Private(i))
{
  debug(cout << "  MyObj " << i << "\n";)
};

MyObj::~MyObj()
{
  debug(cout << "  ~MyObj " << d->i << "\n";)
}

int MyObj::i() const
{
  return d->i;
}

int main(int argc, char** argv)
{
  if (argc == 1) {
    {
      cout << "Heap usage:\n";
      auto heap1 = unique_ptr<MyObjOpt>(new MyObjOpt(1));
      auto heap2 = unique_ptr<MyObjOpt>(new MyObjOpt(2));
    }
    {
      cout << "Stack usage:\n";
      MyObjOpt stack1(-1);
      MyObjOpt stack2(-2);
    }
  } else {
    const int NUM_ITEMS = 100000;
    vector<unique_ptr<MyInterface>> items;
    items.reserve(NUM_ITEMS);
    if (!strcmp(argv[1], "fast")) {
      for (int i = 0; i < NUM_ITEMS; ++i) {
        items.emplace_back(new MyObjOpt(i));
      }
    } else {
      for (int i = 0; i < NUM_ITEMS; ++i) {
        items.emplace_back(new MyObj(i));
      }
    }
    int sum = 0;
    for (const auto& item : items) {
      sum += item->i();
    }
    return sum > 0;
  }
  return 0;
}

Compiled with gcc -std=c++11 -goutput will be as you would expect:

Heap usage:
  MyObjOpt 1
    Private 1
  MyObjOpt 2
    Private 2
  ~MyObjOpt 2
    ~Private 2
  ~MyObjOpt 1
    ~Private 1
Stack usage:
  MyObjOpt -1
    Private -1
  MyObjOpt -2
    Private -2
  ~MyObjOpt -2
    ~Private -2
  ~MyObjOpt -1
    ~Private -1

But when you run it in valgrind, you will see the following:

Stack usage:
  MyObjOpt -1
==21217== Conditional jump or move depends on uninitialised value(s)
==21217==    at 0x400DC0: MyObjOpt::MyObjOpt(int) (pimpl.cpp:54)
==21217==    by 0x401200: main (pimpl.cpp:142)
==21217== 
    Private -1
  MyObjOpt -2
==21217== Conditional jump or move depends on uninitialised value(s)
==21217==    at 0x400DC0: MyObjOpt::MyObjOpt(int) (pimpl.cpp:54)
==21217==    by 0x401211: main (pimpl.cpp:143)
==21217== 
    Private -2

, , , , dptr. , ? , , - factory.

, (-) , con/destructor. new ...


, :

gcc -std=c++11 -O2 -g -DNDEBUG :

$ perf stat -r 10 ./pimpl fast

 Performance counter stats for './pimpl fast' (10 runs):

      9.004201      task-clock (msec)         #    0.956 CPUs utilized            ( +-  3.61% )
             1      context-switches          #    0.111 K/sec                    ( +- 14.91% )
             0      cpu-migrations            #    0.022 K/sec                    ( +- 66.67% )
         1,071      page-faults               #    0.119 M/sec                    ( +-  0.05% )
    19,455,553      cycles                    #    2.161 GHz                      ( +-  5.81% ) [45.21%]
    31,478,797      instructions              #    1.62  insns per cycle          ( +-  5.41% ) [84.34%]
     8,121,492      branches                  #  901.967 M/sec                    ( +-  2.38% )
         8,059      branch-misses             #    0.10% of all branches          ( +-  2.35% ) [66.75%]

   0.009422989 seconds time elapsed                                          ( +-  3.46% )

$ perf stat -r 10 ./pimpl slow

 Performance counter stats for './pimpl slow' (10 runs):

     17.674142      task-clock (msec)         #    0.974 CPUs utilized            ( +-  2.32% )
             2      context-switches          #    0.113 K/sec                    ( +- 10.54% )
             1      cpu-migrations            #    0.028 K/sec                    ( +- 53.75% )
         1,850      page-faults               #    0.105 M/sec                    ( +-  0.02% )
    43,142,007      cycles                    #    2.441 GHz                      ( +-  1.13% ) [54.62%]
    68,780,331      instructions              #    1.59  insns per cycle          ( +-  0.50% ) [82.62%]
    16,369,560      branches                  #  926.187 M/sec                    ( +-  1.65% ) [83.06%]
        19,774      branch-misses             #    0.12% of all branches          ( +-  5.66% ) [66.07%]

   0.018142227 seconds time elapsed                                          ( +-  2.26% )

, , 2. , , dptr - .

, :

$ perf stat -r 10 -e cache-misses ./pimpl slow

 Performance counter stats for './pimpl slow' (10 runs):

        37,947      cache-misses                                                  ( +-  2.38% )

   0.018457998 seconds time elapsed                                          ( +-  2.30% )

$ perf stat -r 10 -e cache-misses ./pimpl fast

 Performance counter stats for './pimpl fast' (10 runs):

         9,698      cache-misses                                                  ( +-  4.46% )

   0.009171249 seconds time elapsed                                          ( +-  2.91% )

? ?

+4
1

pimpls, Windows Thread Information, , , - alloca dtor- .

, , , . , pimpl 400 13 , . - 90- : .

.

, , , , , , . , .

, , . :

// --------------------------------------------------------
// In some public header:
// --------------------------------------------------------
class Interface
{
public:
    virtual ~Interface() {}
    virtual void foo() = 0;
};
std::unique_ptr<Interface> create_concrete();

// --------------------------------------------------------
// In some private source file:
// --------------------------------------------------------
// Include all the extra headers you need here 
// to implement the interface.

class Concrete: public Interface
{
public:
    // Store all the hidden stuff you want here. 
    virtual void foo() override {...}
};

unique_ptr<Interface> create_concrete()
{
    // Can use a fast, fixed allocator here.
    return unique_ptr<Interface>(new Concrete);
}

pimpl , . , . , /.

, , , C- , / , - , . - ++- .

, . , , / O (1) (: , - node, , ). , , , ( , alloc ).

, alloc , ( , ). , ( ) . , .

, , , - std::aligned_storage. , , , , . - , , ABI , pimpl.

+1

All Articles