Assuming your data types look something like this:
// Element type made of four float fields, f1 through f4, in that order.
struct float_data
{
    float f1, f2, f3, f4;
};
// Element type made of four uint8_t fields, f1 through f4, in that order.
struct uint8_t_data
{
    uint8_t f1, f2, f3, f4;
};
You can try the SSE intrinsics. For uint8_t_data, this gives a good speedup:
typedef uint8_t_data type_copy;
for (int x = 0; x<w / 2; x += 4)
{
int il = x;
int ir = w - 1 - x - 3;
__m128i dl = _mm_loadu_si128((const __m128i*)&data[il]);
__m128i dr = _mm_loadu_si128((const __m128i*)&data[ir]);
_mm_storeu_si128((__m128i*)&data[ir], _mm_shuffle_epi32(dl, _MM_SHUFFLE(0, 1, 2, 3)));
_mm_storeu_si128((__m128i*)&data[il], _mm_shuffle_epi32(dr, _MM_SHUFFLE(0, 1, 2, 3)));
}
Conclusion:
g++ -O3 non vectorized: 16ms
g++ -O3 vectorized: 5ms
However, there is not much of a speed improvement for float_data:
typedef float_data type_copy;
for (int x = 0; x<w / 2; x+=2) {
int il = x;
int ir = w - 1 - x - 1;
__m256 dl = _mm256_loadu_ps((const float*)&data[il]);
__m256 dr = _mm256_loadu_ps((const float*)&data[ir]);
_mm256_storeu_ps((float*)&data[ir], _mm256_permute2f128_ps(dl, dl, 1));
_mm256_storeu_ps((float*)&data[il], _mm256_permute2f128_ps(dr, dr, 1));
}
Conclusion:
g++ -O3 -mavx non vectorized: 27ms
g++ -O3 -msse4.2 non vectorized: 25ms
g++ -O3 -mavx vectorized: 24ms
source
share