In the code below, I changed "dataLen" and got different performance.
dataLen = 400 SSE time: 758000 us AVX time: 483000 us SSE > AVX
dataLen = 2400 SSE time: 4212000 us AVX time: 2636000 us SSE> AVX
dataLen = 2864 SSE time: 6115000 us AVX time: 6146000 us SSE ≈ AVX
dataLen = 3200 SSE time: 8049000 us AVX time: 9297000 us SSE <AVX
dataLen = 4000 SSE time: 10170000us AVX time: 11690000us SSE <AVX
Both the SSE and the AVX loops are equivalent to the scalar statement: buf3[i] += buf1[i] * buf2[i];
#include "testfun.h"
#include <iostream>
#include <chrono>
#include <malloc.h>
#include "immintrin.h"

using namespace std::chrono;

/// Micro-benchmark: runs N passes of buf3[i] += buf1[i] * buf2[i] over
/// dataLen floats, first with 128-bit SSE intrinsics and then with 256-bit
/// AVX intrinsics, and prints the elapsed time of each variant in
/// microseconds to std::cout.
void testfun()
{
    constexpr int dataLen = 4000;
    constexpr int N = 10000000;

    // The SSE loop steps by 4 floats and the AVX loop by 8; a length that is
    // not a multiple of 8 would make the last vector access run past the end.
    static_assert(dataLen % 8 == 0, "dataLen must be a multiple of 8");

    // 32-byte alignment is what the aligned 256-bit load/store intrinsics need.
    float *buf1 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * dataLen, 32));
    float *buf2 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * dataLen, 32));
    float *buf3 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * dataLen, 32));
    if (buf1 == nullptr || buf2 == nullptr || buf3 == nullptr)
    {
        std::cerr << "testfun: aligned allocation failed" << std::endl;
        _aligned_free(buf1);
        _aligned_free(buf2);
        _aligned_free(buf3);
        return;
    }

    for (int i = 0; i < dataLen; i++)
    {
        buf1[i] = 1;
        buf2[i] = 1;
        buf3[i] = 0;
    }

    //=========================SSE CODE=====================================
    // steady_clock is monotonic, so the interval cannot be skewed by
    // wall-clock adjustments during the run.
    const steady_clock::time_point SSEStart = steady_clock::now();
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < dataLen; i += 4)
        {
            const __m128 p1 = _mm_load_ps(&buf1[i]);
            const __m128 p2 = _mm_load_ps(&buf2[i]);
            const __m128 p3 = _mm_load_ps(&buf3[i]);
            _mm_store_ps(&buf3[i], _mm_add_ps(_mm_mul_ps(p1, p2), p3));
        }
    }
    // Cast straight to microseconds: going through milliseconds first would
    // truncate every reading to a multiple of 1000 us.
    const microseconds SSEtimeUsed = duration_cast<microseconds>(steady_clock::now() - SSEStart);
    std::cout << "SSE time used: " << SSEtimeUsed.count() << " us, " << std::endl;

    //=========================AVX CODE=====================================
    // Reset the accumulator so both variants start from the same state.
    for (int i = 0; i < dataLen; i++)
        buf3[i] = 0;

    const steady_clock::time_point AVXstart = steady_clock::now();
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < dataLen; i += 8)
        {
            const __m256 pp1 = _mm256_load_ps(&buf1[i]);
            const __m256 pp2 = _mm256_load_ps(&buf2[i]);
            const __m256 pp3 = _mm256_load_ps(&buf3[i]);
            _mm256_store_ps(&buf3[i], _mm256_add_ps(_mm256_mul_ps(pp1, pp2), pp3));
        }
    }
    const microseconds AVXtimeUsed = duration_cast<microseconds>(steady_clock::now() - AVXstart);
    std::cout << "AVX time used: " << AVXtimeUsed.count() << " us, " << std::endl;

    _aligned_free(buf1);
    _aligned_free(buf2);
    _aligned_free(buf3);  // previously leaked
}
My CPU is an Intel Xeon E3-1225 v2, which has 4 cores with a 32 KB L1 cache each. This code runs on only one core, so the L1 cache available to it is 32 KB.
buf1, buf2 and buf3 are small enough to reside in the L1 and L2 caches (1 MB L2 cache). Both the SSE and the AVX versions are bandwidth-limited, so why does AVX become slower than SSE as dataLen increases?
performance caching sse avx
myej
source share