This is the best I could think of for immediate left / right shifts using SSE2:
#include <stdio.h>
#include <emmintrin.h>

/*
 * SHL128(v, n) - logical left shift of a whole 128-bit vector by n bits.
 *
 *   v  __m128i value to shift. Evaluated exactly once.
 *   n  bit count. It is expanded more than once, so pass a side-effect-free
 *      expression - ideally a compile-time constant, which also lets the
 *      compiler fold the if/else away.
 *
 * Yields the shifted __m128i; vacated bits are zero.
 *
 * SSE2 has no single instruction for this: _mm_slli_epi64 shifts the two
 * 64-bit lanes independently (bits do not cross the lane boundary) and
 * _mm_slli_si128 shifts the whole register, but only by whole bytes.
 * The two are combined here.
 *
 * Implemented as a GNU statement expression. The internal names carry a
 * macro-specific prefix and trailing underscore so that an argument such
 * as `v1` or `v2` cannot be captured by the macro's own locals.
 */
#define SHL128(v, n) \
({ \
    const __m128i shl128_src_ = (v); \
    __m128i shl128_res_; \
 \
    if ((n) >= 64) \
    { \
        /* Move the low lane into the high lane (8 bytes), then shift */ \
        /* the remaining n - 64 bits within the lane. */ \
        shl128_res_ = _mm_slli_si128(shl128_src_, 8); \
        shl128_res_ = _mm_slli_epi64(shl128_res_, (n) - 64); \
    } \
    else \
    { \
        /* Carry: the top n bits of the low lane, repositioned at the */ \
        /* bottom of the high lane. For n == 0 the count is 64, which */ \
        /* the SSE2 shift defines as an all-zero result. */ \
        __m128i shl128_carry_ = _mm_slli_si128(shl128_src_, 8); \
        shl128_carry_ = _mm_srli_epi64(shl128_carry_, 64 - (n)); \
        /* Per-lane shift, then merge in the bits that crossed lanes. */ \
        shl128_res_ = _mm_slli_epi64(shl128_src_, (n)); \
        shl128_res_ = _mm_or_si128(shl128_res_, shl128_carry_); \
    } \
    shl128_res_; \
})

/*
 * SHR128(v, n) - logical right shift of a whole 128-bit vector by n bits.
 *
 * Mirror image of SHL128: same arguments, same evaluation rules (v once,
 * n more than once), zero fill from the top.
 */
#define SHR128(v, n) \
({ \
    const __m128i shr128_src_ = (v); \
    __m128i shr128_res_; \
 \
    if ((n) >= 64) \
    { \
        /* Move the high lane into the low lane (8 bytes), then shift */ \
        /* the remaining n - 64 bits within the lane. */ \
        shr128_res_ = _mm_srli_si128(shr128_src_, 8); \
        shr128_res_ = _mm_srli_epi64(shr128_res_, (n) - 64); \
    } \
    else \
    { \
        /* Carry: the bottom n bits of the high lane, repositioned at */ \
        /* the top of the low lane (zero when n == 0, count == 64). */ \
        __m128i shr128_carry_ = _mm_srli_si128(shr128_src_, 8); \
        shr128_carry_ = _mm_slli_epi64(shr128_carry_, 64 - (n)); \
        /* Per-lane shift, then merge in the bits that crossed lanes. */ \
        shr128_res_ = _mm_srli_epi64(shr128_src_, (n)); \
        shr128_res_ = _mm_or_si128(shr128_res_, shr128_carry_); \
    } \
    shr128_res_; \
})

/*
 * Demo: shift a vector holding bytes 0x00..0x0f by 4 bits (the n < 64 path)
 * and by 68 bits (the n >= 64 path) in both directions, and print the
 * results.
 */
int main(void)
{
    __m128i va = _mm_setr_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                               0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f);
    __m128i vb, vc;

    /*
     * NOTE: "%vx" is a non-standard printf conversion for vector types.
     * It is not part of ISO C, so with a C library that lacks it these
     * printf calls must be replaced by a custom print routine.
     */

    /* n < 64: exercises the four-instruction path. */
    vb = SHL128(va, 4);
    vc = SHR128(va, 4);

    printf("va = %02vx\n", va);
    printf("vb = %02vx\n", vb);
    printf("vc = %02vx\n", vc);

    printf("\n");

    /* n >= 64: exercises the two-instruction path. */
    vb = SHL128(va, 68);
    vc = SHR128(va, 68);

    printf("va = %02vx\n", va);
    printf("vb = %02vx\n", vb);
    printf("vc = %02vx\n", vc);

    return 0;
}
Test:
$ gcc -Wall -msse2 shift128.c && ./a.out
va = 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
vb = 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
vc = 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0 00

va = 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
vb = 00 00 00 00 00 00 00 00 00 10 20 30 40 50 60 70
vc = 90 a0 b0 c0 d0 e0 f0 00 00 00 00 00 00 00 00 00
$
Please note that the SHL128 / SHR128 macros are implemented using the GNU statement-expression extension (`({ ... })`), which is supported by gcc, clang and some other compilers; they will need to be adapted if your compiler does not support this extension.
Please also note that the printf extension for SIMD types (the `%vx` conversion) used in the test harness works with Apple gcc, clang and others, but again, if your compiler does not support it and you want to test the code, you will need to implement your own SIMD printing routines.
Performance note: the if / else branch will be optimized away if n is a compile-time constant (which it should be anyway for immediate shifts), so you get 2 instructions for the n >= 64 case and 4 instructions for the n < 64 case.