Are there more ways to optimize this code?

Recently, I was tasked with improving inline code in which a disproportionate amount of time was spent in malloc .

The client wants an easy fix that can be done with very simple search and replace commands (i.e., without massive, complex changes to the source code), while at the same time providing significant benefits.

In the analysis, it seems that the vast majority (~ 80%) of distributions were for 128 bytes or less.

To this end, I gathered some proof of the concept code using a quick pool, the idea is that at startup, one allocation from malloc is performed (enough to store, for example, 1024 blocks of 128 bytes each) and allocations that can be satisfied from this fast the pool. I was hoping that removing more complex calculations in malloc (more complicated as it should handle different size blocks) would increase the speed.

Then, the entire source code is modified to use a quick pool instead of the usual malloc . In case of exhaustion of Quickpool or distribution, more than the block size is required, the request will be transmitted before malloc .

The results were not bad, giving me a (approximately) 50 percent reduction in time, but I wonder if I can do better.

Shortcut functions are shown below. First, the header file:

 #ifndef QUICKPOOL_H_INCLUDED #define QUICKPOOL_H_INCLUDED #include <stdlib.h> typedef struct qp_pool_s qp_pool; qp_pool *qp_create (size_t, size_t); qp_pool *qp_destroy (qp_pool *); void *qp_malloc (qp_pool *, size_t); void *qp_free (qp_pool *, void *); #endif 

The create and destroy functions allow you to create different pools with different block sizes and number of blocks:

 #include <string.h> #include "quickpool.h" // Various global values. #define DEFLT_BLKS 1024 #define DEFLT_SZ 128 struct qp_pool_s { char *data; // Actual blocks. char *enddata; // First byte beyond data. char *isused; // Used map for blocks. size_t total_blks; // Number of blocks in pool. size_t free_blks; // Free blocks in pool. size_t blk_sz; // Size of each block. size_t low_free; // Lowest free block. }; 

 qp_pool *qp_create (size_t quant, size_t sz) { // Zero means a default size. if (quant == 0) quant = DEFLT_BLKS; if (sz == 0) sz = DEFLT_SZ; // Allocate memory blocks for pool. qp_pool *pool = malloc (sizeof (*pool)); if (pool == NULL) return NULL; if ((pool->data = malloc (quant * sz)) == NULL) { free (pool); return NULL; } if ((pool->isused = malloc (quant)) == NULL) { free (pool->data); free (pool); return NULL; } // Set information on pool and return it. pool->enddata = &(pool->data[quant * sz]); memset (pool->isused, 0, quant); pool->total_blks = quant; pool->free_blks = quant; pool->blk_sz = sz; pool->low_free = 0; return pool; } 

 qp_pool *qp_destroy (qp_pool *pool) { // Just free all the memory for pool. if (pool != NULL) { free (pool->data); free (pool->isused); free (pool); } return NULL; } 

And then there are copies of malloc and free :

 void *qp_malloc (qp_pool *pool, size_t sz) { int index; // If no pool, need more than BUFSZ bytes or pool empty, use default. if (pool == NULL) return malloc (sz); if ((sz > pool->blk_sz) || (pool->free_blks == 0)) return malloc (sz); // Otherwise, get from quickpool. First we find a free block. for (index = pool->low_free; pool->isused[index]; index++) ; // Then we mark it used. pool->isused[index] = 1; pool->free_blks--; // Set lowest possible free block for speeding up next search. pool->low_free = index + 1; return &(pool->data[index * pool->blk_sz]); } 

 void *qp_free (qp_pool *pool, void *ptr) { int index; // No pool created, use default. if (pool == NULL) { free (ptr); return NULL; } // Not in quick pool, use default. if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) { free (ptr); return NULL; } // This is a quickpool address, free it. index = ((char*)ptr - pool->data) / pool->blk_sz; pool->isused[index] = 0; pool->free_blks++; // Optimise next search. if (index < pool->low_free) pool->low_free = index; return NULL; } 

To complete the main testing program below:

 #include <stdio.h> #include <string.h> #include <time.h> #include "quickpool.h" #define FREE 0 #define ALLOC 1 #define NUMPTRS 512 static void *pointer[NUMPTRS]; static size_t numPointers = 0; int main (int argCount, char *argVal[]) { int count, val, index, memsz, stMode, seed; qp_pool *quickPool; seed = atoi (argVal[1]); stMode = (strcmp (argVal[2], "standard") == 0); srand (seed); int baseline = clock(); quickPool = qp_create (0, 0); for (count = 0; count < 1000000; count++) { if (numPointers == 0) val = ALLOC; else if (numPointers == NUMPTRS) val = FREE; else if (numPointers > NUMPTRS/2) val = ((rand() % 100) < 50) ? FREE : ALLOC; else val = ((rand() % 100) < 33) ? FREE : ALLOC; if (val == FREE) { index = rand() % numPointers; if (stMode) free (pointer[index]); else qp_free (quickPool, pointer[index]); pointer[index] = pointer[--numPointers]; } else { memsz = rand() % 160; if (stMode) pointer[numPointers++] = malloc (memsz); else pointer[numPointers++] = qp_malloc (quickPool, memsz); } } quickPool = qp_destroy (quickPool); baseline = clock() - baseline; printf ("%d\n", baseline * 1000 / CLOCKS_PER_SEC); return 0; } 

along with a shell script for analysis:

 #!/usr/bin/bash normal=0 quick=0 printf " %10s %10s\n" Normal Quick printf " ========== ==========\n" for iter1 in 0 1 ; do for iter2 in 0 1 2 3 4 5 6 7 8 9 ; do seed=$RANDOM val=$(./qptest.exe $seed standard) printf "${iter1}${iter2} %10d " $val ((normal = normal + val)) val=$(./qptest.exe $seed quickpool) printf "%10d\n" $val ((quick = quick + val)) done done printf " ========== ==========\n" ((pct = quick * 100 / normal)) printf "sum %10d %10d (%d%%)\n" $normal $quick $pct 

which outputs:

  Normal Quick ========== ========== 00 469 219 01 453 219 02 453 235 03 453 219 04 453 235 05 453 219 06 469 219 07 453 234 08 453 219 09 453 219 10 453 219 11 453 235 12 453 235 13 453 219 14 453 219 15 453 235 16 453 235 17 453 235 18 469 219 19 469 234 ========== ========== sum 9124 4522 (49%) 

Now my question is this: is there any other way to optimize the quick access code that does not depend on circumvention of requirements:

  • easy to integrate the fix, requiring only a simple global search and replacement of the sub code.
  • the ability to have pools of different sizes (# blocks) with blocks of different sizes.
  • transition to malloc if the fast pool cannot satisfy the request.
+4
source share
4 answers

I got a good acceleration compared to your version (about 25% on my hardware), preserving the existing interfaces, using a free list instead of a free map.

As a bonus, the code is even simplified:

 #include <string.h> #include "quickpool.h" // Various global values. #define DEFLT_BLKS 1024 #define DEFLT_SZ 128 struct qp_pool_s { char *data; // Actual blocks. char *enddata; // First byte beyond data. size_t blk_sz; // Size of each block. void *next_free; // Next free block }; qp_pool *qp_create (size_t quant, size_t sz) { char *blk; // Zero means a default size. sizeof(void *) is minimum block size. if (quant == 0) quant = DEFLT_BLKS; if (sz == 0) sz = DEFLT_SZ; /* Round up size to a multiple of sizeof(void *) */ sz = (sz + sizeof(void *) - 1) & ~(sizeof(void *) - 1); // Allocate memory blocks for pool. qp_pool *pool = malloc (sizeof (*pool)); if (pool == NULL) return NULL; if ((pool->data = malloc (quant * sz)) == NULL) { free (pool); return NULL; } /* Set up free chain */ for (blk = pool->data; blk < &pool->data[(quant - 1) * sz]; blk += sz) *(void **)blk = blk + sz; *(void **)blk = NULL; pool->next_free = pool->data; // Set information on pool and return it. pool->enddata = &(pool->data[quant * sz]); pool->blk_sz = sz; return pool; } qp_pool *qp_destroy (qp_pool *pool) { // Just free all the memory for pool. if (pool != NULL) { free (pool->data); free (pool); } return NULL; } void *qp_malloc (qp_pool *pool, size_t sz) { void *blk; // If no pool, need more than BUFSZ bytes or pool empty, use default. if (pool == NULL) return malloc (sz); if ((sz > pool->blk_sz) || (pool->next_free == NULL)) return malloc (sz); // Otherwise, get from quickpool. First we find a free block. blk = pool->next_free; // Then we mark it used. pool->next_free = *(void **)blk; return blk; } void *qp_free (qp_pool *pool, void *ptr) { // No pool created, use default. if (pool == NULL) { free (ptr); return NULL; } // Not in quick pool, use default. if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) { free (ptr); return NULL; } // This is a quickpool address, free it. *(void **)ptr = pool->next_free; pool->next_free = ptr; return NULL; } 

Results (my malloc system is obviously pretty fast):

  Normal Quick Caf ========== ========== ========== 00 210 140 100 01 130 140 100 02 130 130 100 03 130 140 100 04 130 130 100 05 130 130 90 06 130 140 100 07 130 130 100 08 130 140 100 09 130 140 100 10 120 140 100 11 120 140 100 12 130 130 100 13 130 140 100 14 120 130 110 15 130 140 100 16 130 130 100 17 130 140 100 18 130 130 100 19 130 140 100 ========== ========== ========== sum 2650 2720 2000 ( 102%) ( 75%) 
+4
source

If you can afford to spend some memory (for example, a pointer to a block), you can save free blocks in the LIFO stack and exclude searches in qp_malloc () and qp_free (). This makes the code a little more complicated, but provides O (1) time for all distributions.

+4
source

You can save the used / unused block flag as single bits in a byte array - provided that the 32-bit system you can save 1024 bits in 32 intervals.

Then finding a spare block is just a matter of going through 32 ints that is not looking for one that is not 0xffffffff

+2
source

At this point, I think you should use a code profiling tool like gprof to figure out which parts of your new code really cost you the most time. It may also be useful to run a profile on the entire program to find out where to spend your time on optimization.

+1
source

All Articles