Any way to make this relatively simple (nested for memory copy) C ++ code more efficient?

I understand that this is a rather stupid question due to the lack of a better term. I'm just looking for any external idea to increase the efficiency of this code, as it clogs the system up a lot (it has to perform this function a lot), and I have few ideas.

What it does is loading two image containers (imgRGB for full color img and imgBW for b & w image) individually pixelated image pixel stored in "unsigned char * pImage".

Both imgRGB and imgBW are containers for accessing individual pixels as needed.

// input is in the form of an unsigned char
// unsigned char *pImage

for (int y=0; y < 640; y++) {
    for (int x=0; x < 480; x++) {
        imgRGB[y][x].blue = *pImage;
        pImage++;

        imgRGB[y][x].green = *pImage;
        imgBW[y][x]        = *pImage;
        pImage++;

        imgRGB[y][x].red = *pImage;
        pImage++;
    }
}

, / , . , ... . - / , .

+5
13

, ( []?) . .

, - :

for (int y=0; y < height; y++) {
    unsigned char *destBgr = imgRgb.GetScanline(y); // inline methods are better
    unsigned char *destBW = imgBW.GetScanline(y);
    for (int x=0; x < width; x++) {
        *destBgr++ = *pImage++;
        *destBW++ = *destBgr++ = *pImage++; // do this in one shot - don't double deref
        *destBgr++ = *pImage++;
    }
}

. 4 PIXEL.

+4

: ? R, G B ?

, , - , /.

, , .

, , , , , 4 ( int, ) , , ints , , , , :

// First, we need to treat the input image as an array of ints. This is a bit nasty and technically unportable, but you get the idea)
unsigned int* img = reinterpret_cast<unsigned int*>(pImage);

for (int y = 0; y < 640; ++y)
{
  for (int x = 0; x < 480; x += 4)
  {
    // At the start of each iteration, read 3 ints. That 12 bytes, enough to write exactly 4 pixels.
    unsigned int i0 = *img;
    unsigned int i1 = *(img+1);
    unsigned int i2 = *(img+2);
    img += 3;

    // This probably won't make a difference, but keeping a reference to the found pixel saves some typing, and it may assist the compiler in avoiding aliasing.
    ImgRGB& pix0 = imgRGB[y][x];
    pix0.blue = i0 & 0xff;
    pix0.green = (i0 >> 8) & 0xff;
    pix0.red = (i0 >> 16) & 0xff;
    imgBW[y][x] = (i0 >> 8) & 0xff;

    ImgRGB& pix1 = imgRGB[y][x+1];
    pix1.blue = (i0 >> 24) & 0xff;
    pix1.green = i1 & 0xff;
    pix1.red = (i0 >> 8) & 0xff;
    imgBW[y][x+1] = i1 & 0xff;

    ImgRGB& pix2 = imgRGB[y][x+2];
    pix2.blue = (i1 >> 16) & 0xff;
    pix2.green = (i1 >> 24) & 0xff;
    pix2.red = i2 & 0xff;
    imgBW[y][x+2] = (i1 >> 24) & 0xff;

    ImgRGB& pix3 = imgRGB[y][x+3];
    pix3.blue = (i2 >> 8) & 0xff;
    pix3.green = (i2 >> 16) & 0xff;
    pix3.red = (i2 >> 24) & 0xff;
    imgBW[y][x+3] = (i2 >> 16) & 0xff;
  }
}

, ImgRGB, , , : ( , )

ImgRGB& pix0 = imgRGB[y][x];
ImgRGB tmpPix0;
tmpPix0.blue = i0 & 0xff;
tmpPix0.green = (i0 >> 8) & 0xff;
tmpPix0.red = (i0 >> 16) & 0xff;
imgBW[y][x] = (i0 >> 8) & 0xff;
pix0 = tmpPix0;

, , . , (, , , ), 3 4 ( RGB- RGB + BW) 3/4 2 . ( RGB BW)

4 BW- int, , :

bw |= (i0 >> 8) & 0xff;
bw |=  (i1 & 0xff) << 8;
bw |=  ((i1 >> 24) & 0xff) << 16;
bw |=  ((i2 >> 16) & 0xff) << 24;

*(imgBW + y*480+x/4) = bw; // Assuming you can treat imgBW as an array of integers

1,25 (1 RGB 1 4 BW)

, , , ( ), .

, SSE, 4 . (, x86)

, , . Reinterpret_cast, , (, , , , , 32- , ) , -twiddling .

x86. , . ( , . ;))

, , , , . , , , , ints, ( ints RGB), .

, , , , (, , ). , , , , , . , ( ), ( ), . , , .

, 4- RGB (, 32 ), , ( 32- , , 24-)

, ( ), , . , , , , , , , , ( 3 BW. , x86 ( 8 , ). , , , , - .

, . , , , .

, , , . , , . . , ( , ), , , .

+7

, , , , ( ). , . , !

, , . .

+4

, [] [], ( , ).

+3

- . . , OP , , . VS2005 . :

#include <windows.h>
#include <iostream>
using namespace std;

const int
c_width = 640,
c_height = 480;

typedef struct _RGBData
{
  unsigned char
    r,
    g,
    b;
    // I'm assuming there no padding byte here
} RGBData;

//  similar to the code given
void SimpleTest
(
  unsigned char *src,
  RGBData *rgb,
  unsigned char *bw
)
{
  for (int y = 0 ; y < c_height ; ++y)
  {
    for (int x = 0 ; x < c_width ; ++x)
    {
      rgb [x + y * c_width].b = *src;
      src++;

      rgb [x + y * c_width].g = *src;
      bw [x + y * c_width] = *src;
      src++;

      rgb [x + y * c_width].r = *src;
      src++;
    }
  }
}

//  the assembler version
void ASM
(
  unsigned char *src,
  RGBData *rgb,
  unsigned char *bw
)
{
  const int
    count = 3 * c_width * c_height / 12;

  _asm
  {
    push ebp
    mov esi,src
    mov edi,bw
    mov ecx,count
    mov ebp,rgb
l1:
    mov eax,[esi]
    mov ebx,[esi+4]
    mov edx,[esi+8]
    mov [ebp],eax
    shl eax,16
    mov [ebp+4],ebx
    rol ebx,16
    mov [ebp+8],edx
    shr edx,24
    and eax,0xff000000
    and ebx,0x00ffff00
    and edx,0x000000ff
    or eax,ebx
    or eax,edx
    add esi,12
    bswap eax
    add ebp,12
    stosd
    loop l1
    pop ebp
  }
}

//  timing framework
LONGLONG TimeFunction
(
  void (*function) (unsigned char *src, RGBData *rgb, unsigned char *bw),
  char *description,
  unsigned char *src, 
  RGBData *rgb,
  unsigned char *bw
)
{
  LARGE_INTEGER
    start,
    end;

  cout << "Testing '" << description << "'...";
  memset (rgb, 0, sizeof *rgb * c_width * c_height);
  memset (bw, 0, c_width * c_height);

  QueryPerformanceCounter (&start);

  function (src, rgb, bw);

  QueryPerformanceCounter (&end);

  bool
    ok = true;

  unsigned char
    *bw_check = bw,
    i = 0;

  RGBData
    *rgb_check = rgb;

  for (int count = 0 ; count < c_width * c_height ; ++count)
  {
    if (bw_check [count] != i || rgb_check [count].r != i || rgb_check [count].g != i || rgb_check [count].b != i)
    {
      ok = false;
      break;
    }

    ++i;
  }

  cout << (end.QuadPart - start.QuadPart) << (ok ? " OK" : " Failed") << endl;
  return end.QuadPart - start.QuadPart;
}

int main
(
  int argc,
  char *argv []
)
{
  unsigned char
    *source_data = new unsigned char [c_width * c_height * 3];

  RGBData
    *rgb = new RGBData [c_width * c_height];

  unsigned char
    *bw = new unsigned char [c_width * c_height];

  int
    v = 0;

  for (unsigned char *dest = source_data ; dest < &source_data [c_width * c_height * 3] ; ++dest)
  {
    *dest = v++ / 3;
  }

  LONGLONG
    totals [2] = {0, 0};

  for (int i = 0 ; i < 10 ; ++i)
  {
    cout << "Iteration: " << i << endl;
    totals [0] += TimeFunction (SimpleTest, "Initial Copy", source_data, rgb, bw);
    totals [1] += TimeFunction (       ASM, "    ASM Copy", source_data, rgb, bw);
  }

  LARGE_INTEGER
    freq;

  QueryPerformanceFrequency (&freq);

  freq.QuadPart /= 100000;

  cout << totals [0] / freq.QuadPart << "ns" << endl;
  cout << totals [1] / freq.QuadPart << "ns" << endl;


  delete [] bw;
  delete [] rgb;
  delete [] source_data;

  return 0;
}

C 2,5: 1, C 2,5 .

, BGR. B R, . C .

, , . , MMU.

+3

, RGB, :

#pragma pack(1)
typedef unsigned char bw_t;
typedef struct {
    unsigned char blue;
    unsigned char green;
    unsigned char red;
} rgb_t;
#pragma pack(pop)

rgb_t *imageRGB = (rgb_t*)pImage;
bw_t *imageBW = (bw_t*)calloc(640*480, sizeof(bw_t));
// RGB(X,Y) = imageRGB[Y*480 + X]
// BW(X,Y) = imageBW[Y*480 + X]

for (int y = 0; y < 640; ++y)
{
   // try and pull some larger number of bytes from pImage (24 is arbitrary)
   // 24 / sizeof(rgb_t) = 8
   for (int x = 0; x < 480; x += 24)
   {
       imageBW[y*480 + x    ] = GRAYSCALE(imageRGB[y*480 + x    ]);
       imageBW[y*480 + x + 1] = GRAYSCALE(imageRGB[y*480 + x + 1]);
       imageBW[y*480 + x + 2] = GRAYSCALE(imageRGB[y*480 + x + 2]);
       imageBW[y*480 + x + 3] = GRAYSCALE(imageRGB[y*480 + x + 3]);
       imageBW[y*480 + x + 4] = GRAYSCALE(imageRGB[y*480 + x + 4]);
       imageBW[y*480 + x + 5] = GRAYSCALE(imageRGB[y*480 + x + 5]);
       imageBW[y*480 + x + 6] = GRAYSCALE(imageRGB[y*480 + x + 6]);
       imageBW[y*480 + x + 7] = GRAYSCALE(imageRGB[y*480 + x + 7]);
   }
}
+2

, . .

-, .

const unsigned char *pImage;

RGB *rgbOut = imgRGB;
unsigned char *bwOut = imgBW;

for (int y=0; y < 640; ++y) {
    for (int x=0; x < 480; ++x) {
        rgbOut->blue = *pImage;
        ++pImage;

        unsigned char tmp = *pImage;  // Save to reduce amount of reads.
        rgbOut->green = tmp;
        *bwOut = tmp;
        ++pImage;

        rgbOut->red = *pImage;
        ++pImage;

        ++rgbOut;
        ++bwOut;
    }
}

imgRGB imgBW :

unsigned char imgBW[480][640];
RGB imgRGB[480][640];

:

const unsigned char *pImage;

RGB *rgbOut = imgRGB;
unsigned char *bwOut = imgBW;

for (int i=0; i < 640 * 480; ++i) {
    rgbOut->blue = *pImage;
    ++pImage;

    unsigned char tmp = *pImage;  // Save to reduce amount of reads.
    rgbOut->green = tmp;
    *bwOut = tmp;
    ++pImage;

    rgbOut->red = *pImage;
    ++pImage;

    ++rgbOut;
    ++bwOut;
}

, , char . . , .

const unsigned char *pImage;

RGB *rgbOut = imgRGB;
unsigned char *bwOut = imgBW;

const uint32_t *curPixelGroup = pImage;

for (int i=0; i < 640 * 480; ++i) {
    uint64_t pixels = 0;

#define WRITE_PIXEL         \
    rgbOut->blue = pixels;  \
    pixels >>= 8;           \
                            \
    rgbOut->green = pixels; \
    *bwOut = pixels;        \
    pixels >>= 8;           \
                            \
    rgbOut->red = pixels;   \
    pixels >>= 8;           \
                            \
    ++rgbOut;               \
    ++bwOut;

#define READ_PIXEL(shift) \
    pixels |= (*curPixelGroup++) << (shift * 8);

    READ_PIXEL(0);  WRITE_PIXEL;
    READ_PIXEL(1);  WRITE_PIXEL;
    READ_PIXEL(2);  WRITE_PIXEL;
    READ_PIXEL(3);  WRITE_PIXEL;
    /* Remaining */ WRITE_PIXEL;

#undef COPY_PIXELS
}

( , , or READ_PIXEL. , << 0).


RGB :

struct RGB {
     unsigned char blue, green, red;
};

, , (red, green, blue). ( , , , ). ( , ..):

union RGB {
    struct {
        unsigned char blue, green, red;
    };

    uint32_t rgb:24;  // Make sure it a bitfield, otherwise the union will strech and ruin the ++ operator.
};

const unsigned char *pImage;

RGB *rgbOut = imgRGB;
unsigned char *bwOut = imgBW;

const uint32_t *curPixelGroup = pImage;

for (int i=0; i < 640 * 480; ++i) {
    uint64_t pixels = 0;

#define WRITE_PIXEL         \
    rgbOut->rgb = pixels;   \
    pixels >>= 8;           \
                            \
    *bwOut = pixels;        \
    pixels >>= 16;          \
                            \
    ++rgbOut;               \
    ++bwOut;

#define READ_PIXEL(shift) \
    pixels |= (*curPixelGroup++) << (shift * 8);

    READ_PIXEL(0);  WRITE_PIXEL;
    READ_PIXEL(1);  WRITE_PIXEL;
    READ_PIXEL(2);  WRITE_PIXEL;
    READ_PIXEL(3);  WRITE_PIXEL;
    /* Remaining */ WRITE_PIXEL;

#undef COPY_PIXELS
}

, ( , 24 ). , . , . =]


, . , .

+2

, , , , :

a) imgRGB -


    struct ImgRGB
    {
      unsigned char blue;
      unsigned char green;
      unsigned char red;
    };

, , - .

b) imgBW :


    struct ImgBW
    {
       unsigned char BW;
    };

c)

, :

  • BW . , , L1 , , . - , . , . , , imgBW , .
  • , , . , , , . , .

    for (int y=0; y blue = *pImage;
            ...
        }
    }
  • , .
  • (, 25%), char ImgRGB , int. , , , int , , , . , - . char unsigned int , . , pImage unsigned char, .

, , , , , , .

+1

, pImage, imgRGB imgBW __restrict. SSE .

, , , , memcpy() pImage imgRGB ( imgRGB , , , pImage). imgBW, SSE swizzle ops, , , (3 * 16 =) 48 .

, pImage dcache ? , 128 , , .

x86, "SSE" SIMD , . ( VMX, Altivec, SPU, VLIW, HLSL ..)

+1

, , !

  • B & W , ( ). , imgBW .

  • , imgRGB, . , ( ).

, , :

  • : (, - Duff, ...), , ...
0

, . (, int) . , , , . , , [] [].

3 , int. , 3 , . , , . , int.

3 int 3 . .

0

, :

imageRGB [y] [x], , , .

, :

Pixel* apixel;

for (int y=0; y < 640; y++) {
    for (int x=0; x < 480; x++) {
        apixel = &imgRGB[y][x];

        apixel->blue = *pImage;
        pImage++;

        apixel->green = *pImage;
        imgBW[y][x]   = *pImage;
        pImage++;

        apixel->red = *pImage;
        pImage++;
    }
}
0

pImage , ? , -RGB-, /, , ?

If reordering pixel data is important for subsequent operations, consider block operations and / or cache line optimization.

0
source

All Articles