SSE RMS

I want to calculate the RMS value using Intel SSE intrinsics. The scalar version looks like this:

#include <math.h> /* isfinite */

/*
 * Mean of the squared differences a[i] - b[i] over the first l elements,
 * skipping every pair in which either value is NaN or infinite.
 *
 * Note: no square root is taken, so the value returned is the mean square;
 * apply sqrtf() to it if the root-mean-square itself is needed.
 *
 * a, b  input arrays, each with at least l readable elements
 * l     number of elements to examine
 *
 * Returns 0.0f when no pair contributed (including l <= 0).
 */
float rms(const float *a, const float *b, int l)
{
    int n = 0;      /* number of pairs that contributed to the sum */
    float r = 0.0f; /* running sum of squared differences */

    for (int i = 0; i < l; i++) {
        /* isfinite() is the standard C99 replacement for finitef(). */
        if (isfinite(a[i]) && isfinite(b[i])) {
            const float tmp = a[i] - b[i];
            r += tmp * tmp;
            n++;
        }
    }

    /* Guard against 0/0, which would otherwise yield NaN. */
    return n > 0 ? r / (float)n : 0.0f;
}

But how do I check which elements are NaN? And how do I count n?

+4
source share
1 answer

You can check a value for NaN by comparing it with itself: x == x evaluates to false if x is NaN. So, for an SSE vector vx holding 4 float values:

  vmask = _mm_cmpeq_ps(vx, vx); 

will give you a mask vector with all 0s for the NaN elements in vx and all 1s for the non-NaN elements. You can use the mask to zero out the NaN elements. You can also use the mask to count the number of valid data points, by treating it as a vector of 32-bit ints (each valid lane is then -1) and accumulating it, e.g. by subtracting it from a running count vector.

Here's a working example that has been tested. Note that it assumes that n is a multiple of 4, it does not assume that a and b are 16-byte aligned (it uses unaligned loads), and it requires SSE4.

 #include <smmintrin.h> /* SSE4.1; also pulls in the SSE .. SSSE3 intrinsics */

 /*
  * Mean of the squared differences a[i] - b[i] over the first n elements,
  * skipping every pair in which either value is NaN.
  *
  * Only NaN is filtered (a value is NaN exactly when it compares unequal to
  * itself); infinite inputs are not skipped and propagate into the result.
  * No square root is taken, so the value returned is the mean square.
  *
  * a, b  input arrays, each with at least n readable elements; no alignment
  *       is required because unaligned loads are used
  * n     number of elements; it may be any value - elements are processed four
  *       at a time and the remaining 0-3 are handled by a scalar loop
  *
  * Returns 0.0f when no pair contributed (including n <= 0).
  *
  * Uses _mm_hadd_ps (SSE3), _mm_hadd_epi32 (SSSE3) and _mm_extract_epi32
  * (SSE4.1). The attribute below lets GCC/Clang build this one function for
  * SSE4.1 even when the rest of the file is compiled without -msse4.1.
  */
 #if defined(__GNUC__)
 __attribute__((target("sse4.1")))
 #endif
 float rms(const float *a, const float *b, int n)
 {
     if (n <= 0) {
         return 0.0f;
     }

     const int vec_end = n - (n % 4); /* end of the last complete group of 4 */
     __m128i vcount = _mm_setzero_si128();
     __m128 vsum = _mm_setzero_ps();

     for (int i = 0; i < vec_end; i += 4) {
         const __m128 va = _mm_loadu_ps(&a[i]);
         const __m128 vb = _mm_loadu_ps(&b[i]);

         /* All-ones in every lane where both inputs are non-NaN. */
         const __m128 vmask = _mm_and_ps(_mm_cmpeq_ps(va, va),
                                         _mm_cmpeq_ps(vb, vb));

         /* Zero the difference in rejected lanes so it adds nothing. */
         const __m128 vdiff = _mm_and_ps(_mm_sub_ps(va, vb), vmask);
         vsum = _mm_add_ps(vsum, _mm_mul_ps(vdiff, vdiff));

         /*
          * Viewed as 32-bit ints an accepted lane is -1, so subtracting the
          * mask increments that lane's counter. _mm_castps_si128 is the
          * portable reinterpretation; a C-style vector cast is a GCC/Clang
          * extension.
          */
         vcount = _mm_sub_epi32(vcount, _mm_castps_si128(vmask));
     }

     /* Horizontal reduction: two pairwise adds leave the total in lane 0. */
     vsum = _mm_hadd_ps(vsum, vsum);
     vsum = _mm_hadd_ps(vsum, vsum);
     float sum = _mm_cvtss_f32(vsum);

     vcount = _mm_hadd_epi32(vcount, vcount);
     vcount = _mm_hadd_epi32(vcount, vcount);
     int count = _mm_extract_epi32(vcount, 0);

     /* Scalar tail for the elements that do not fill a whole vector. */
     for (int i = vec_end; i < n; i++) {
         if (a[i] == a[i] && b[i] == b[i]) {
             const float diff = a[i] - b[i];
             sum += diff * diff;
             count++;
         }
     }

     return count > 0 ? sum / (float)count : 0.0f;
 }
+5
source

All Articles