log_2(SIMD_width). , x_vec .
x_vec , x_vec , x_vec.
4, 8 16 . printf -s , , x_vec x_trail.
#include <stdio.h>
#include <x86intrin.h>
int print_vec_short(__m256i x);
__m256i hor_tr_max(__m256i x_vec){
__m256i zero = _mm256_setzero_si256();
__m256i pshufb_cnst = _mm256_set_epi64x(0x8080808080808080,0x8080808080808080,0x0F0E0F0E0F0E0F0E,0x0F0E0F0E0F0E0F0E);
__m256i mask1 = _mm256_cmpeq_epi16(x_vec,zero);
__m256i t1 = _mm256_slli_si256(x_vec,2);
__m256i t2 = _mm256_blendv_epi8(x_vec,t1,mask1);
__m256i mask3 = _mm256_cmpeq_epi16(t2,zero);
__m256i t3 = _mm256_slli_si256(t2,4);
__m256i t4 = _mm256_blendv_epi8(t2,t3,mask3);
__m256i mask5 = _mm256_cmpeq_epi16(t4,zero);
__m256i t5 = _mm256_slli_si256(t4,8);
__m256i t6 = _mm256_blendv_epi8(t4,t5,mask5);
__m256i mask7 = _mm256_cmpeq_epi16(t6,zero);
__m256i t7_0 = _mm256_shuffle_epi8(t6,pshufb_cnst);
__m256i t7_1 = _mm256_permute2x128_si256(t7_0,t7_0,0x01);
__m256i x_trail = _mm256_blendv_epi8(t6,t7_1,mask7);
return x_trail;
}
int hor_tr_max_n(short int * x_in, short int * x_out, int n){
__m256i minus_1 = _mm256_set1_epi8(-1);
__m256i zero = _mm256_setzero_si256();
__m256i pshufb_cnst = _mm256_set_epi64x(0x8080808080808080,0x8080808080808080,0x0F0E0F0E0F0E0F0E,0x0F0E0F0E0F0E0F0E);
int indx_last_nz = 0;
for (int i=0;i<n;i=i+16){
__m256i x_vec = _mm256_load_si256((__m256i*)&x_in[i]);
__m256i mask1 = _mm256_cmpeq_epi16(x_vec,zero);
__m256i t1 = _mm256_slli_si256(x_vec,2);
__m256i t2 = _mm256_blendv_epi8(x_vec,t1,mask1);
__m256i mask3 = _mm256_cmpeq_epi16(t2,zero);
__m256i t3 = _mm256_slli_si256(t2,4);
__m256i t4 = _mm256_blendv_epi8(t2,t3,mask3);
__m256i mask5 = _mm256_cmpeq_epi16(t4,zero);
__m256i t5 = _mm256_slli_si256(t4,8);
__m256i t6 = _mm256_blendv_epi8(t4,t5,mask5);
__m256i mask7 = _mm256_cmpeq_epi16(t6,zero);
__m256i t7_0 = _mm256_shuffle_epi8(t6,pshufb_cnst);
__m256i t7_1 = _mm256_permute2x128_si256(t7_0,t7_0,0x01);
__m256i x_trail = _mm256_blendv_epi8(t6,t7_1,mask7);
__m256i isnonzero = _mm256_xor_si256(mask1,minus_1);
int mvmsk_nonz = _mm256_movemask_epi8(isnonzero);
int lz_x_vec = _lzcnt_u32( mvmsk_nonz ) >>1;
__m256i x_last_nz = _mm256_broadcastw_epi16(_mm_load_si128((__m128i*)&x_in[indx_last_nz]));
indx_last_nz = mvmsk_nonz ? (i+15-lz_x_vec) : indx_last_nz;
__m256i x_tr_is_zero = _mm256_cmpeq_epi16(x_trail,zero);
__m256i x_trail_upd = _mm256_blendv_epi8(x_trail,x_last_nz,x_tr_is_zero);
_mm256_store_si256((__m256i*)&x_out[i],x_trail_upd);
}
return 0;
}
int main() {
#define test 0
#if test == 0
printf("Test 0: test functionality\n");
short x[16] = {1, 0, 0, 3, 0, 0, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2};
printf("\n15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0\n");
__m256i x_vec = _mm256_loadu_si256((__m256i*)x);
printf("x_vec = ");print_vec_short(x_vec );
__m256i x_trail = hor_tr_max(x_vec);
printf("x_trail = ");print_vec_short(x_trail );
#elif test == 1 || test == 2
int i, i_o, k;
int n = 8000;
int d = 50;
short int *x_in;
short int *x_out;
x_in = _mm_malloc(n*sizeof(short int),32);
x_out = _mm_malloc(n*sizeof(short int),32);
int j = 73659343;
for (i = 0;i < n;i++){
j = j*653+1;
k = (j & 0x3FF00)>>8;
if (k < d){
x_in[i] = (j&0xFFE)+1-2048;
}else{
x_in[i] = 0;
}
}
#endif
#if test == 1
printf("Test 1: test performance for short int arrays of size n. Use: perf stat -d ./a.out \n");
for (i_o=0;i_o<400000;i_o++){
hor_tr_max_n(x_in,x_out,n);
}
#elif test == 2
printf("Test 2: test performance of the unrolled scalar loop for short int arrays of size n. Use: perf stat -d ./a.out\n");
short int prev_x = 0;
for (i_o=0;i_o<400000;i_o++){
for (i=0;i<n;i=i+4){
short int x_in_i0 = x_in[i];
short int x_in_i1 = x_in[i+1];
short int x_in_i2 = x_in[i+2];
short int x_in_i3 = x_in[i+3];
prev_x = (x_in_i0)?(x_in_i0):(prev_x); x_out[i] = prev_x;
prev_x = (x_in_i1)?(x_in_i1):(prev_x); x_out[i+1] = prev_x;
prev_x = (x_in_i2)?(x_in_i2):(prev_x); x_out[i+2] = prev_x;
prev_x = (x_in_i3)?(x_in_i3):(prev_x); x_out[i+3] = prev_x;
}
}
#elif test == 3
printf("Test 3: Estimate approximately the latency and throughput of hor_tr_max with: perf stat -d ./a.out \n");
int i;
short x0[16] = {1, 0, 0, 3, 0, 0, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2};
short x1[16] = {0, 0, 0, 3, 0, 12, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2};
short x2[16] = {1, 0, 0, 3, 0, 0, 4, 5, 0, 0, 10, 0, 4, 3, 0, 2};
short x3[16] = {110, 0, 0, 1113, 0, 0, 4, 5, 0, 0, 0, 0, 4000, 3, 0, 2};
short x4[16] = {110, 4, 0, 1113, 0, 0, 4, 5, 0, 7, 0, 0, 4000, 3, 0, 2};
__m256i x_vec0 = _mm256_loadu_si256((__m256i*)x0); printf("x_vec0 = ");print_vec_short(x_vec0); __m256i x_trail0 = hor_tr_max(x_vec0);
__m256i x_vec1 = _mm256_loadu_si256((__m256i*)x1); printf("x_vec1 = ");print_vec_short(x_vec1); __m256i x_trail1 = hor_tr_max(x_vec1);
__m256i x_vec2 = _mm256_loadu_si256((__m256i*)x2); printf("x_vec2 = ");print_vec_short(x_vec2); __m256i x_trail2 = hor_tr_max(x_vec2);
__m256i x_vec3 = _mm256_loadu_si256((__m256i*)x3); printf("x_vec3 = ");print_vec_short(x_vec3); __m256i x_trail3 = hor_tr_max(x_vec3);
__m256i x_vec4 = _mm256_loadu_si256((__m256i*)x4); printf("x_vec4 = ");print_vec_short(x_vec4); __m256i x_trail4 = hor_tr_max(x_vec4);
for(i=0;i<100000000;i++){
x_trail0 = hor_tr_max(x_trail0);
}
printf("x_trail0 = ");print_vec_short(x_trail0 );
printf("x_trail1 = ");print_vec_short(x_trail1 );
printf("x_trail2 = ");print_vec_short(x_trail2 );
printf("x_trail3 = ");print_vec_short(x_trail3 );
printf("x_trail4 = ");print_vec_short(x_trail4 );
#endif
#if test == 1 || test == 2
for (i=0;i<400;i++){
printf("%6i %6hi %6hi\n",i,x_in[i],x_out[i]);
}
#endif
return 0;
}
int print_vec_short(__m256i x){
short int v[16];
_mm256_storeu_si256((__m256i *)v,x);
printf("%4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi\n",
v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
return 0;
}
:
15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
x_vec = 2 0 3 4 | 0 0 0 0 | 5 4 0 0 | 3 0 0 1
x_trail = 2 3 3 4 | 5 5 5 5 | 5 4 3 3 | 3 1 1 1
hor_tr_max 14,2 6,4 (Intel Skylake Core i5-6500).
, , :
short int prev_x = 0;
for (i=0;i<n;i=i+4){
short int x_in_i0 = x_in[i];
short int x_in_i1 = x_in[i+1];
short int x_in_i2 = x_in[i+2];
short int x_in_i3 = x_in[i+3];
prev_x = (x_in_i0)?(x_in_i0):(prev_x); x_out[i] = prev_x;
prev_x = (x_in_i1)?(x_in_i1):(prev_x); x_out[i+1] = prev_x;
prev_x = (x_in_i2)?(x_in_i2):(prev_x); x_out[i+2] = prev_x;
prev_x = (x_in_i3)?(x_in_i3):(prev_x); x_out[i+3] = prev_x;
}
1.26 short int, 20,2 16 short int -s. ,
.
n
hor_tr_max n, n 16.
i. , , .
hor_tr_max_n , , -
.
hor_tr_max_n 12,2 16 short int , 40 ,
.
, Skylake-SP " "
- .