Is it possible to optimize the count of byte matches between two strings using SIMD?

Profiling says that this function is a real bottle for my application:

static inline int countEqualChars(const char* string1, const char* string2, int size) { int r = 0; for (int j = 0; j < size; ++j) { if (string1[j] == string2[j]) { ++r; } } return r; } 

Even with -O3 and -march=native , G ++ 4.7.2 does not vectorize this function (I checked the assembler output). I am not an expert on SSE and friends, but I believe that comparing more than one character at a time should be faster. Any ideas how to speed up the process? The target architecture is x86-64.

+8
source share
3 answers

Vector compiler flags:

-ftree-vectorize

-ftree-vectorize -march=<your_architecture>

Using SSEx built-in functions:

  • Fill the buffer and align it to 16 bytes (according to the size of the vector that you are actually going to use)

  • Create a countU8 drive using _mm_set1_epi8 (0)

  • For all n / 16 input (sub) vectors, do:

The code:

 #include <iostream> #include <vector> #include <cassert> #include <cstdint> #include <climits> #include <cstring> #include <emmintrin.h> #ifdef __SSE2__ #if !defined(UINTPTR_MAX) || !defined(UINT64_MAX) || !defined(UINT32_MAX) # error "Limit macros are not defined" #endif #if UINTPTR_MAX == UINT64_MAX #define PTR_64 #elif UINTPTR_MAX == UINT32_MAX #define PTR_32 #else # error "Current UINTPTR_MAX is not supported" #endif template<typename T> void print_vector(std::ostream& out,const __m128i& vec) { static_assert(sizeof(vec) % sizeof(T) == 0,"Invalid element size"); std::cout << '{'; const T* const end = reinterpret_cast<const T*>(&vec)-1; const T* const upper = end+(sizeof(vec)/sizeof(T)); for(const T* elem = upper; elem != end; --elem ) { if(elem != upper) std::cout << ','; std::cout << +(*elem); } std::cout << '}' << std::endl; } #define PRINT_VECTOR(_TYPE,_VEC) do{ std::cout << #_VEC << " : "; print_vector<_TYPE>(std::cout,_VEC); } while(0) ///@note SSE2 required (macro: __SSE2__) ///@warning Not tested! size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count) { assert(a_in != nullptr && (uintptr_t(a_in) % 16) == 0); assert(b_in != nullptr && (uintptr_t(b_in) % 16) == 0); //assert(count > 0); /* //maybe not so good with all that branching and additional loop variables __m128i accumulatorU8 = _mm_set1_epi8(0); __m128i sum2xU64 = _mm_set1_epi8(0); for(size_t i = 0;i < count;++i) { //this operation could also be unrolled, where multiple result registers would be accumulated accumulatorU8 = _mm_sub_epi8(accumulatorU8,_mm_cmpeq_epi8(*a_in++,*b_in++)); if(i % 255 == 0) { //before overflow of uint8, the counter will be extracted __m128i sum2xU16 = _mm_sad_epu8(accumulatorU8,_mm_set1_epi8(0)); sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16); //reset accumulatorU8 accumulatorU8 = _mm_set1_epi8(0); } } //blindly accumulate remaining values __m128i sum2xU16 = _mm_sad_epu8(accumulatorU8,_mm_set1_epi8(0)); sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16); //do a horizontal addition of the two counter values sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8)); #if defined PTR_64 return _mm_cvtsi128_si64(sum2xU64); #elif defined PTR_32 return _mm_cvtsi128_si32(sum2xU64); #else # error "macro PTR_(32|64) is not set" #endif */ __m128i sum2xU64 = _mm_set1_epi32(0); while(count--) { __m128i matches = _mm_sub_epi8(_mm_set1_epi32(0),_mm_cmpeq_epi8(*a_in++,*b_in++)); __m128i sum2xU16 = _mm_sad_epu8(matches,_mm_set1_epi32(0)); sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16); #ifndef NDEBUG PRINT_VECTOR(uint16_t,sum2xU64); #endif } //do a horizontal addition of the two counter values sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8)); #ifndef NDEBUG std::cout << "----------------------------------------" << std::endl; PRINT_VECTOR(uint16_t,sum2xU64); #endif #if !defined(UINTPTR_MAX) || !defined(UINT64_MAX) || !defined(UINT32_MAX) # error "Limit macros are not defined" #endif #if defined PTR_64 return _mm_cvtsi128_si64(sum2xU64); #elif defined PTR_32 return _mm_cvtsi128_si32(sum2xU64); #else # error "macro PTR_(32|64) is not set" #endif } #endif int main(int argc, char* argv[]) { std::vector<__m128i> a(64); // * 16 bytes std::vector<__m128i> b(a.size()); const size_t nBytes = a.size() * sizeof(std::vector<__m128i>::value_type); char* const a_out = reinterpret_cast<char*>(a.data()); char* const b_out = reinterpret_cast<char*>(b.data()); memset(a_out,0,nBytes); memset(b_out,0,nBytes); a_out[1023] = 1; b_out[1023] = 1; size_t equalBytes = counteq_epi8(a.data(),b.data(),a.size()); std::cout << "equalBytes = " << equalBytes << std::endl; return 0; } 

The fastest SSE implementation I got for large and small arrays:

 size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count) { assert((count > 0 ? a_in != nullptr : true) && (uintptr_t(a_in) % sizeof(__m128i)) == 0); assert((count > 0 ? b_in != nullptr : true) && (uintptr_t(b_in) % sizeof(__m128i)) == 0); //assert(count > 0); const size_t maxInnerLoops = 255; const size_t nNestedLoops = count / maxInnerLoops; const size_t nRemainderLoops = count % maxInnerLoops; const __m128i zero = _mm_setzero_si128(); __m128i sum16xU8 = zero; __m128i sum2xU64 = zero; for(size_t i = 0;i < nNestedLoops;++i) { for(size_t j = 0;j < maxInnerLoops;++j) { sum16xU8 = _mm_sub_epi8(sum16xU8,_mm_cmpeq_epi8(*a_in++,*b_in++)); } sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(sum16xU8,zero)); sum16xU8 = zero; } for(size_t j = 0;j < nRemainderLoops;++j) { sum16xU8 = _mm_sub_epi8(sum16xU8,_mm_cmpeq_epi8(*a_in++,*b_in++)); } sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(sum16xU8,zero)); sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8)); #if UINTPTR_MAX == UINT64_MAX return _mm_cvtsi128_si64(sum2xU64); #elif UINTPTR_MAX == UINT32_MAX return _mm_cvtsi128_si32(sum2xU64); #else # error "macro PTR_(32|64) is not set" #endif } 
+7
source

Of course it is possible.

pcmpeqb compares two vectors of 16 bytes and creates a vector with zeros where they differ, and -1 where they match. Use this to compare 16 bytes at a time by adding the result to the battery vector (be sure to copy the results to a maximum of 255 vector to avoid overflow). When you are done, there will be 16 results in the battery. Sum them and negate to get the number of equal elements.

If the lengths are very short, it will be difficult to get significant acceleration from this approach. If the lengths are long, then it will be worth it.

+9
source

Auto-vectorization in current gcc is to help the compiler understand that it is easy to vectorize code. In your case: it will understand the vectorization request if you delete the conditional expression and rewrite the code in a more imperative way:

  static inline int count(const char* string1, const char* string2, int size) { int r = 0; bool b; for (int j = 0; j < size; ++j) { b = (string1[j] == string2[j]); r += b; } return r; } 

In this case:

 movdqa 16(%rsp), %xmm1 movl $.LC2, %esi pxor %xmm2, %xmm2 movzbl 416(%rsp), %edx movdqa .LC1(%rip), %xmm3 pcmpeqb 224(%rsp), %xmm1 cmpb %dl, 208(%rsp) movzbl 417(%rsp), %eax movl $1, %edi pand %xmm3, %xmm1 movdqa %xmm1, %xmm5 sete %dl movdqa %xmm1, %xmm4 movzbl %dl, %edx punpcklbw %xmm2, %xmm5 punpckhbw %xmm2, %xmm4 pxor %xmm1, %xmm1 movdqa %xmm5, %xmm6 movdqa %xmm5, %xmm0 movdqa %xmm4, %xmm5 punpcklwd %xmm1, %xmm6 

(etc.).

+3
source

All Articles