Recently, I was tasked with improving some code in which a disproportionate amount of time was being spent in malloc.
The client wanted an easy fix, something that could be done with fairly simple search-and-replace commands (i.e., without massive or complex changes to the source code), while still providing a significant benefit.
In the analysis, it turned out that the vast majority (roughly 80%) of allocations were for 128 bytes or less.
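(As an aside, a figure like that is easy to gather by instrumenting the allocation calls; something along the lines of the counting wrapper below would do. The bucket boundaries and the counting_malloc name are purely illustrative, not part of the client's code.)

#include <stdio.h>
#include <stdlib.h>

// Illustrative only: count allocation requests in a few size buckets.
static size_t allocBucket[4];           // <=128, <=256, <=1024, larger

static void *counting_malloc (size_t sz) {
    if      (sz <= 128)  allocBucket[0]++;
    else if (sz <= 256)  allocBucket[1]++;
    else if (sz <= 1024) allocBucket[2]++;
    else                 allocBucket[3]++;
    return malloc (sz);
}

static void dump_alloc_histogram (void) {
    printf ("<=128: %zu, <=256: %zu, <=1024: %zu, >1024: %zu\n",
        allocBucket[0], allocBucket[1], allocBucket[2], allocBucket[3]);
}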
To this end, I put together some proof-of-concept code using a quick pool. The idea is that, at startup, a single allocation is made from malloc (enough to hold, for example, 1024 blocks of 128 bytes each), and allocations small enough to fit in a block are satisfied from this fast pool. I was hoping that avoiding the more complex bookkeeping inside malloc (more complex because it has to handle blocks of arbitrary size) would improve the speed.
The entire source base is then modified to use the quick pool instead of the usual malloc. If the quick pool is exhausted, or an allocation larger than the block size is requested, the request simply falls through to the normal malloc.
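The substitution itself is meant to be nothing cleverer than replacing the call names. One way it could be done (purely illustrative: the g_pool variable and the QP_MALLOC/QP_FREE names are not part of the code below) is with a global pool and a pair of wrapper macros:

// Illustrative integration sketch only: a single global pool plus wrappers,
// so the client code needs nothing more than a global search-and-replace.
#include "quickpool.h"

extern qp_pool *g_pool;                 // created once at program startup

#define QP_MALLOC(sz)   qp_malloc (g_pool, (sz))
#define QP_FREE(ptr)    qp_free (g_pool, (ptr))

after which every malloc (n) in the client source becomes QP_MALLOC (n) and every free (p) becomes QP_FREE (p).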
The results were not bad, giving me a reduction in run time of roughly 50%, but I wonder whether I can do better.
The quick-pool functions are shown below. First, the header file:
#ifndef QUICKPOOL_H_INCLUDED
#define QUICKPOOL_H_INCLUDED

#include <stdlib.h>

typedef struct qp_pool_s qp_pool;

qp_pool *qp_create (size_t, size_t);
qp_pool *qp_destroy (qp_pool *);
void *qp_malloc (qp_pool *, size_t);
void *qp_free (qp_pool *, void *);

#endif
The create and destroy functions allow you to set up distinct pools, each with its own block size and number of blocks:
#include <string.h>

#include "quickpool.h"

// Various global values.

#define DEFLT_BLKS 1024
#define DEFLT_SZ 128

struct qp_pool_s {
    char *data;             // Actual blocks.
    char *enddata;          // First byte beyond data.
    char *isused;           // Used map for blocks.
    size_t total_blks;      // Number of blocks in pool.
    size_t free_blks;       // Free blocks in pool.
    size_t blk_sz;          // Size of each block.
    size_t low_free;        // Lowest free block.
};
qp_pool *qp_create (size_t quant, size_t sz) {
    // Zero means a default size.

    if (quant == 0) quant = DEFLT_BLKS;
    if (sz == 0) sz = DEFLT_SZ;

    // Allocate memory blocks for pool.

    qp_pool *pool = malloc (sizeof (*pool));
    if (pool == NULL)
        return NULL;

    if ((pool->data = malloc (quant * sz)) == NULL) {
        free (pool);
        return NULL;
    }

    if ((pool->isused = malloc (quant)) == NULL) {
        free (pool->data);
        free (pool);
        return NULL;
    }

    // Set information on pool and return it.

    pool->enddata = &(pool->data[quant * sz]);
    memset (pool->isused, 0, quant);
    pool->total_blks = quant;
    pool->free_blks = quant;
    pool->blk_sz = sz;
    pool->low_free = 0;

    return pool;
}
qp_pool *qp_destroy (qp_pool *pool) {
    // Just free all the memory for pool.

    if (pool != NULL) {
        free (pool->data);
        free (pool->isused);
        free (pool);
    }

    return NULL;
}
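As a quick illustration (not part of the library itself; the smallPool name is just an example), a pool with non-default dimensions is created and torn down like this:

#include "quickpool.h"

int main (void) {
    // Example only: a pool of 4096 blocks of 64 bytes each.
    qp_pool *smallPool = qp_create (4096, 64);
    if (smallPool == NULL)
        return 1;                       // real code would just keep using malloc

    // ... hand smallPool to whatever code makes the small allocations ...

    smallPool = qp_destroy (smallPool); // returns NULL, handy for re-assignment
    return 0;
}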
Then come the quick-pool equivalents of malloc and free:
void *qp_malloc (qp_pool *pool, size_t sz) {
    int index;

    // If no pool, request bigger than block size, or pool empty, use standard malloc.

    if (pool == NULL)
        return malloc (sz);
    if ((sz > pool->blk_sz) || (pool->free_blks == 0))
        return malloc (sz);

    // Otherwise, get from quickpool. First we find a free block.

    for (index = pool->low_free; pool->isused[index]; index++)
        ;

    // Then we mark it used.

    pool->isused[index] = 1;
    pool->free_blks--;

    // Set lowest possible free block for speeding up next search.

    pool->low_free = index + 1;

    return &(pool->data[index * pool->blk_sz]);
}
void *qp_free (qp_pool *pool, void *ptr) {
    int index;

    // No pool created, use default.

    if (pool == NULL) {
        free (ptr);
        return NULL;
    }

    // Not in quick pool, use default.

    if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) {
        free (ptr);
        return NULL;
    }

    // This is a quickpool address, free it.

    index = ((char*)ptr - pool->data) / pool->blk_sz;
    pool->isused[index] = 0;
    pool->free_blks++;

    // Optimise next search.

    if (index < pool->low_free)
        pool->low_free = index;

    return NULL;
}
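Used together, the calls behave like this (a throw-away fragment for clarity only; the variable names are just for illustration):

#include "quickpool.h"

int main (void) {
    qp_pool *pool = qp_create (0, 0);   // defaults: 1024 blocks of 128 bytes

    void *a = qp_malloc (pool, 100);    // fits in a block: comes from the pool
    void *b = qp_malloc (pool, 500);    // bigger than a block: passed to malloc

    qp_free (pool, a);                  // address inside the pool: block marked free
    qp_free (pool, b);                  // address outside the pool: handed to free

    pool = qp_destroy (pool);
    return 0;
}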
Rounding things out is the main test program:
#include <stdio.h>
#include <string.h>
#include <time.h>

#include "quickpool.h"

#define FREE 0
#define ALLOC 1
#define NUMPTRS 512

static void *pointer[NUMPTRS];
static size_t numPointers = 0;

int main (int argCount, char *argVal[]) {
    int count, val, index, memsz, stMode, seed;
    qp_pool *quickPool;

    seed = atoi (argVal[1]);
    stMode = (strcmp (argVal[2], "standard") == 0);
    srand (seed);

    int baseline = clock();
    quickPool = qp_create (0, 0);

    for (count = 0; count < 1000000; count++) {
        if (numPointers == 0)
            val = ALLOC;
        else if (numPointers == NUMPTRS)
            val = FREE;
        else if (numPointers > NUMPTRS/2)
            val = ((rand() % 100) < 50) ? FREE : ALLOC;
        else
            val = ((rand() % 100) < 33) ? FREE : ALLOC;

        if (val == FREE) {
            index = rand() % numPointers;
            if (stMode)
                free (pointer[index]);
            else
                qp_free (quickPool, pointer[index]);
            pointer[index] = pointer[--numPointers];
        } else {
            memsz = rand() % 160;
            if (stMode)
                pointer[numPointers++] = malloc (memsz);
            else
                pointer[numPointers++] = qp_malloc (quickPool, memsz);
        }
    }

    quickPool = qp_destroy (quickPool);
    baseline = clock() - baseline;

    printf ("%d\n", baseline * 1000 / CLOCKS_PER_SEC);

    return 0;
}
along with a shell script to run the analysis:
#!/usr/bin/bash

normal=0
quick=0

printf " %10s %10s\n" Normal Quick
printf " ========== ==========\n"

for iter1 in 0 1 ; do
    for iter2 in 0 1 2 3 4 5 6 7 8 9 ; do
        seed=$RANDOM

        val=$(./qptest.exe $seed standard)
        printf "${iter1}${iter2} %10d " $val
        ((normal = normal + val))

        val=$(./qptest.exe $seed quickpool)
        printf "%10d\n" $val
        ((quick = quick + val))
    done
done

printf " ========== ==========\n"
((pct = quick * 100 / normal))
printf "sum %10d %10d (%d%%)\n" $normal $quick $pct
which outputs:
Normal Quick
Now my question is this: is there any other way to optimise the quick-pool code that doesn't rely on side-stepping the requirements:
- the fix must be easy to integrate, requiring only a simple global search-and-replace on the source code;
- it must be possible to have pools of different sizes (number of blocks) and with different block sizes;
- it must fall back to malloc if the quick pool cannot satisfy the request.