Here is an example using SSE2 intrinsics to exploit the maskmovdqu instruction. The SIMD version appears to run at about twice the speed of the original version on a Haswell CPU (code compiled with clang):
#include <stdint.h>     // uint8_t
#include <stdio.h>
#include <stdlib.h>     // malloc, free, rand
#include <string.h>
#include <emmintrin.h>  // SSE2
#include <sys/time.h>   // gettimeofday

/*
 * Reference (scalar) implementation: copy src[i] to dest[i] for every index
 * whose source byte differs from `ignore`; other dest bytes are left untouched.
 */
void copy_if_ref(const uint8_t *src, uint8_t *dest, size_t size, uint8_t ignore)
{
    for (size_t i = 0; i < size; ++i) {
        if (src[i] != ignore) {
            dest[i] = src[i];
        }
    }
}

/*
 * SSE2 implementation of copy_if_ref using a byte-masked store.
 *
 * Processes 16 bytes per iteration: builds a per-byte mask that is set where
 * src != ignore and lets _mm_maskmoveu_si128 write only those bytes. The
 * remaining (size % 16) bytes are handled by a scalar tail loop.
 */
void copy_if_SSE(const uint8_t *src, uint8_t *dest, size_t size, uint8_t ignore)
{
    const __m128i vignore = _mm_set1_epi8((char)ignore);
    const __m128i vones = _mm_set1_epi8(-1);  // loop-invariant, used to invert the mask
    size_t i;

    for (i = 0; i + 16 <= size; i += 16) {
        __m128i v = _mm_loadu_si128((const __m128i *)&src[i]);
        // cmpeq yields 0xFF where src == ignore; XOR with all-ones flips it
        // so that the mask selects the bytes that must be copied.
        __m128i vmask = _mm_xor_si128(_mm_cmpeq_epi8(v, vignore), vones);
        _mm_maskmoveu_si128(v, vmask, (char *)&dest[i]);
    }
    for (; i < size; ++i) {
        if (src[i] != ignore) {
            dest[i] = src[i];
        }
    }
}

/*
 * Time `copy_if(src, dest, size, ignore)` over 1000 runs and print the mean
 * cost in ns per element. `init` is a statement executed before each run,
 * outside the timed region (used to reset the destination buffer).
 */
#define TIME_IT(init, copy_if, src, dest, size, ignore)                       \
    do {                                                                      \
        const int kLoops = 1000;                                              \
        struct timeval t0, t1;                                                \
        double t_ms = 0.0;                                                    \
                                                                              \
        for (int i = 0; i < kLoops; ++i) {                                    \
            init;                                                             \
            gettimeofday(&t0, NULL);                                          \
            copy_if((src), (dest), (size), (ignore));                         \
            gettimeofday(&t1, NULL);                                          \
            t_ms += ((double)(t1.tv_sec - t0.tv_sec) +                        \
                     (double)(t1.tv_usec - t0.tv_usec) * 1.0e-6) * 1.0e3;     \
        }                                                                     \
        printf("%s: %.3g ns / element\n", #copy_if,                           \
               t_ms * 1.0e6 / ((double)kLoops * (double)(size)));             \
    } while (0)

/*
 * Checks copy_if_SSE against copy_if_ref on random data, then benchmarks both.
 * Returns EXIT_FAILURE only if the buffers cannot be allocated.
 */
int main(void)
{
    const size_t N = 10000000;
    int rc = EXIT_FAILURE;
    uint8_t *src = malloc(N);
    uint8_t *dest_ref = malloc(N);
    uint8_t *dest_init = malloc(N);
    uint8_t *dest_test = malloc(N);

    if (src == NULL || dest_ref == NULL || dest_init == NULL || dest_test == NULL) {
        fprintf(stderr, "out of memory\n");
        goto out;
    }

    for (size_t i = 0; i < N; ++i) {
        src[i] = (uint8_t)rand();
        dest_init[i] = (uint8_t)rand();
    }

    memcpy(dest_ref, dest_init, N);
    copy_if_ref(src, dest_ref, N, 0x42);

    memcpy(dest_test, dest_init, N);
    copy_if_SSE(src, dest_test, N, 0x42);
    printf("copy_if_SSE: %s\n", memcmp(dest_ref, dest_test, N) == 0 ? "PASS" : "FAIL");

    // Each timed run resets the buffer it actually writes to, so both
    // candidates start every iteration from the same destination contents.
    TIME_IT(memcpy(dest_ref, dest_init, N), copy_if_ref, src, dest_ref, N, 0x42);
    TIME_IT(memcpy(dest_test, dest_init, N), copy_if_SSE, src, dest_test, N, 0x42);

    rc = EXIT_SUCCESS;

out:
    free(dest_test);
    free(dest_init);
    free(dest_ref);
    free(src);
    return rc;
}
Compile and test:
$ gcc -Wall -msse2 -O3 copy_if.c && ./a.out
copy_if_SSE: PASS
copy_if_ref: 0.416 ns / element
copy_if_SSE: 0.239 ns / element
(Note: an earlier version of this answer had a stray factor of 16 in the timing code, so the earlier numbers were 16 times higher than they should have been.)
UPDATE
Inspired by @EOF's solution and the code generated by the compiler, I tried a different approach using SSE4.1 and got much better results:
#include <stdint.h>     // uint8_t
#include <stdio.h>
#include <stdlib.h>     // malloc, free, rand
#include <string.h>
#include <smmintrin.h>  // SSE4.1
#include <sys/time.h>   // gettimeofday

/*
 * Reference (scalar) implementation: copy src[i] to dest[i] for every index
 * whose source byte differs from `ignore`; other dest bytes are left untouched.
 */
void copy_if_ref(const uint8_t *src, uint8_t *dest, size_t size, uint8_t ignore)
{
    for (size_t i = 0; i < size; ++i) {
        if (src[i] != ignore) {
            dest[i] = src[i];
        }
    }
}

/*
 * Branch-free scalar variant: every dest byte is rewritten, either with the
 * source byte or with its own previous value when the source byte equals
 * `ignore`. The store is unconditional on purpose.
 */
void copy_if_EOF(const uint8_t *src, uint8_t *dest, size_t size, uint8_t ignore)
{
    for (size_t i = 0; i < size; ++i) {
        // Keep the bytes as uint8_t: with a plain (possibly signed) char, a
        // value >= 0x80 would be promoted to a negative int and never compare
        // equal to `ignore`, so such bytes would be copied instead of skipped.
        const uint8_t s = src[i];
        const uint8_t d = dest[i];
        dest[i] = (s == ignore) ? d : s;
    }
}

/*
 * SSE4.1 implementation: load 16 bytes of src and dest, blend them so that
 * dest is kept where src == ignore and src is taken elsewhere, then store all
 * 16 bytes back. The remaining (size % 16) bytes use a scalar tail loop.
 */
void copy_if_SSE(const uint8_t *src, uint8_t *dest, size_t size, uint8_t ignore)
{
    const __m128i vignore = _mm_set1_epi8((char)ignore);
    size_t i;

    for (i = 0; i + 16 <= size; i += 16) {
        __m128i vsrc = _mm_loadu_si128((const __m128i *)&src[i]);
        __m128i vdest = _mm_loadu_si128((const __m128i *)&dest[i]);
        // Mask is 0xFF where src == ignore; blendv picks its second operand
        // (the old dest byte) for those lanes and the src byte otherwise.
        __m128i vmask = _mm_cmpeq_epi8(vsrc, vignore);
        vdest = _mm_blendv_epi8(vsrc, vdest, vmask);
        _mm_storeu_si128((__m128i *)&dest[i], vdest);
    }
    for (; i < size; ++i) {
        if (src[i] != ignore) {
            dest[i] = src[i];
        }
    }
}

/*
 * Time `copy_if(src, dest, size, ignore)` over 1000 runs and print the mean
 * cost in ns per element. `init` is a statement executed before each run,
 * outside the timed region (used to reset the destination buffer).
 */
#define TIME_IT(init, copy_if, src, dest, size, ignore)                       \
    do {                                                                      \
        const int kLoops = 1000;                                              \
        struct timeval t0, t1;                                                \
        double t_ms = 0.0;                                                    \
                                                                              \
        for (int i = 0; i < kLoops; ++i) {                                    \
            init;                                                             \
            gettimeofday(&t0, NULL);                                          \
            copy_if((src), (dest), (size), (ignore));                         \
            gettimeofday(&t1, NULL);                                          \
            t_ms += ((double)(t1.tv_sec - t0.tv_sec) +                        \
                     (double)(t1.tv_usec - t0.tv_usec) * 1.0e-6) * 1.0e3;     \
        }                                                                     \
        printf("%s: %.3g ns / element\n", #copy_if,                           \
               t_ms * 1.0e6 / ((double)kLoops * (double)(size)));             \
    } while (0)

/*
 * Checks copy_if_EOF and copy_if_SSE against copy_if_ref on random data, then
 * benchmarks all three. Returns EXIT_FAILURE only if allocation fails.
 */
int main(void)
{
    const size_t N = 10000000;
    int rc = EXIT_FAILURE;
    uint8_t *src = malloc(N);
    uint8_t *dest_ref = malloc(N);
    uint8_t *dest_init = malloc(N);
    uint8_t *dest_test = malloc(N);

    if (src == NULL || dest_ref == NULL || dest_init == NULL || dest_test == NULL) {
        fprintf(stderr, "out of memory\n");
        goto out;
    }

    for (size_t i = 0; i < N; ++i) {
        src[i] = (uint8_t)rand();
        dest_init[i] = (uint8_t)rand();
    }

    memcpy(dest_ref, dest_init, N);
    copy_if_ref(src, dest_ref, N, 0x42);

    memcpy(dest_test, dest_init, N);
    copy_if_EOF(src, dest_test, N, 0x42);
    printf("copy_if_EOF: %s\n", memcmp(dest_ref, dest_test, N) == 0 ? "PASS" : "FAIL");

    memcpy(dest_test, dest_init, N);
    copy_if_SSE(src, dest_test, N, 0x42);
    printf("copy_if_SSE: %s\n", memcmp(dest_ref, dest_test, N) == 0 ? "PASS" : "FAIL");

    // Each timed run resets the buffer it actually writes to, so all
    // candidates start every iteration from the same destination contents.
    TIME_IT(memcpy(dest_ref, dest_init, N), copy_if_ref, src, dest_ref, N, 0x42);
    TIME_IT(memcpy(dest_test, dest_init, N), copy_if_EOF, src, dest_test, N, 0x42);
    TIME_IT(memcpy(dest_test, dest_init, N), copy_if_SSE, src, dest_test, N, 0x42);

    rc = EXIT_SUCCESS;

out:
    free(dest_test);
    free(dest_init);
    free(dest_ref);
    free(src);
    return rc;
}
Compile and test:
$ gcc -Wall -msse4 -O3 copy_if_2.c && ./a.out
copy_if_EOF: PASS
copy_if_SSE: PASS
copy_if_ref: 0.419 ns / element
copy_if_EOF: 0.114 ns / element
copy_if_SSE: 0.114 ns / element
Conclusion: while _mm_maskmoveu_si128 seems like a good fit for this problem in terms of functionality, it does not appear to be as efficient as using explicit loads, masking and stores. Furthermore, the compiler-generated code (see @EOF's answer) seems to be just as fast as explicitly coded SIMD in this case.