Efficient way to rotate a byte inside an AVX register

Summary / tl;dr: Is there any way to rotate a byte inside a YMM register bitwise (using AVX), other than doing two shifts and blending the results together?

For every 8 bytes in a YMM register, I need to rotate 7 of them to the left. Each byte must be rotated one bit further to the left than the previous one, so the first byte is rotated by 0 bits and the seventh by 6 bits.

Currently I have an implementation that does this (using a 1-bit rotation as the example here) by shifting the register 1 bit to the left and 7 bits to the right separately. I then use a blend (the intrinsic _mm256_blend_epi16) to select the correct bits from the two temporary results and obtain the final rotated byte.
This costs 2 shift operations and 1 blend per byte, and 6 bytes have to be rotated, so 18 operations per register (shifts and blends have roughly the same performance).
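For illustration, this is roughly what the two-shift-and-mask idea looks like when every byte is rotated by the same single bit; a simplified sketch, not the exact per-byte blend sequence described above (assumes AVX2 and <immintrin.h>):

__m256i rotl1_every_byte(__m256i v)
{
    /* Shift within 16-bit lanes, mask off the bits that crossed a byte
     * boundary, and combine the two partial results. */
    __m256i lo = _mm256_and_si256(_mm256_slli_epi16(v, 1), _mm256_set1_epi8((char) 0xFE));
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 7), _mm256_set1_epi8(0x01));
    return _mm256_or_si256(lo, hi);
}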

There must be a faster way to do this than spending 18 operations just to rotate six bytes!

In addition, I need to assemble all the bytes into a new register afterwards. I do this by loading 7 masks into registers with the set intrinsic, so I can extract the correct byte from each rotated register: I AND each mask with its register to isolate that byte, and then XOR the single-byte registers together to build the new register containing all the bytes. This takes 7 + 7 + 6 operations, so another 20 operations (per register).
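As a hypothetical fragment of that assembly step, shown for just two of the seven rotated registers (the names and byte positions are made up for the illustration):

static __m256i combine_two(__m256i rot1, __m256i rot2)
{
    /* Masks that keep byte 1 from rot1 and byte 2 from rot2. */
    const __m256i m1 = _mm256_set_epi64x(0, 0, 0, 0x000000000000FF00LL);
    const __m256i m2 = _mm256_set_epi64x(0, 0, 0, 0x0000000000FF0000LL);
    /* AND out one byte from each register, then XOR the pieces together. */
    return _mm256_xor_si256(_mm256_and_si256(rot1, m1),
                            _mm256_and_si256(rot2, m2));
}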

I could use the extract intrinsic (_mm256_extract_epi8) to pull out the individual bytes and then _mm256_set_epi8 to build the new register, but I don't know yet whether that would be faster. (The Intel Intrinsics Guide lists no performance numbers for these, so maybe I am misunderstanding something here.)

That gives a total of 38 operations per register, which seems far from optimal just to rotate 6 bytes differently inside a register.

I hope someone more experienced with AVX/SIMD can tell me whether I am going about this the wrong way, because I suspect that I am.


The XOP instruction set does have _mm_rot_epi8() (which is not Microsoft-specific; it has been available in GCC since 4.4, and should be in recent clang too). It can be used to do the desired task in 128-bit units. Unfortunately, I don't have a CPU with XOP support, so I could not test it.
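For what it's worth, a minimal sketch of that XOP path (untested, since I have no XOP hardware; the per-byte rotate counts are my assumption, chosen to match the example further down):

#include <x86intrin.h>   /* XOP intrinsics; compile with -mxop */

static __m128i rotate_bytes_xop(__m128i v)
{
    /* Byte k of each 8-byte group is rotated left by k bits. */
    const __m128i counts = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0,
                                        7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_rot_epi8(v, counts);   /* VPROTB: per-byte variable rotate */
}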

On AVX2, splitting the 256-bit register into two halves, one holding the even bytes and the other the odd bytes shifted right by 8 bits, lets a 16-bit vector multiply do the trick. Given the following constants (in GCC's 64-bit component array format):

static const __m256i epi16_highbyte = { 0xFF00FF00FF00FF00ULL,
                                        0xFF00FF00FF00FF00ULL,
                                        0xFF00FF00FF00FF00ULL,
                                        0xFF00FF00FF00FF00ULL };
static const __m256i epi16_lowbyte  = { 0x00FF00FF00FF00FFULL,
                                        0x00FF00FF00FF00FFULL,
                                        0x00FF00FF00FF00FFULL,
                                        0x00FF00FF00FF00FFULL };
static const __m256i epi16_oddmuls  = { 0x4040101004040101ULL,
                                        0x4040101004040101ULL,
                                        0x4040101004040101ULL,
                                        0x4040101004040101ULL };
static const __m256i epi16_evenmuls = { 0x8080202008080202ULL,
                                        0x8080202008080202ULL,
                                        0x8080202008080202ULL,
                                        0x8080202008080202ULL };
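/* Note: each 16-bit multiplier is (0x0101 << n), where n is the rotate count
 * for that byte position. Multiplying a byte by 0x0101 duplicates it into both
 * halves of the 16-bit lane, and the extra factor of 2^n then leaves the
 * left-rotated byte in the high half of the product. */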

__m256i byteshift(__m256i value)
{
    return _mm256_or_si256(_mm256_srli_epi16(_mm256_mullo_epi16(_mm256_and_si256(value, epi16_lowbyte), epi16_oddmuls), 8),
                           _mm256_and_si256(_mm256_mullo_epi16(_mm256_and_si256(_mm256_srai_epi16(value, 8), epi16_lowbyte), epi16_evenmuls), epi16_highbyte));
}

This has been verified to yield correct results on an Intel Core i5-4200U with GCC 4.8.4. As an example, here is an input vector (written as a single 256-bit hexadecimal number), followed by the rotated result it produces:

88 87 86 85 84 83 82 81 38 37 36 35 34 33 32 31 28 27 26 25 24 23 22 21 FF FE FD FC FB FA F9 F8

44 E1 D0 58 24 0E 05 81 1C CD C6 53 A1 CC 64 31 14 C9 C4 52 21 8C 44 21 FF BF BF CF DF EB F3 F8

where the leftmost octet of each 8-byte group is rotated left by 7 bits, the next by 6 bits, and so on, down to the rightmost octet of the group, which is not rotated at all; the same pattern holds across all 32 octets.

I am not sure whether this function definition compiles to optimal machine code (that depends on the compiler), but I am certainly happy with its performance.

Since you may not like the terse one-liner above, here is the same function in procedural, expanded form:

static __m256i byteshift(__m256i value)
{
    __m256i low, high;
    high = _mm256_srai_epi16(value, 8);
    low = _mm256_and_si256(value, epi16_lowbyte);
    high = _mm256_and_si256(high, epi16_lowbyte);
    low = _mm256_mullo_epi16(low, epi16_oddmuls);
    high = _mm256_mullo_epi16(high, epi16_evenmuls);
    low = _mm256_srli_epi16(low, 8);
    high = _mm256_and_si256(high, epi16_highbyte);
    return _mm256_or_si256(low, high);
}

In a comment, Peter Cordes suggested replacing the srai + and with an srli, and possibly the final and + or with a blendv. The former makes sense, since it is purely an optimization; the latter, however, may not actually be faster (yet, on current Intel CPUs!).

I tried some microbenchmarking, but could not get reliable results. On x86-64 I normally use the TSC, and for a handful of function calls like these the elapsed time is simply too short to measure reliably.

So, rather than rely on a shaky benchmark, I simply list all three variants below, so that anyone interested can benchmark them on their own hardware and with their own workloads.

Note that odd and even now refer to byte offsets within the vector, the first byte being at offset 0 (even); these names replace the high and low used above.

#include <immintrin.h>

static const __m256i epi16_oddmask  = { 0xFF00FF00FF00FF00ULL,
                                        0xFF00FF00FF00FF00ULL,
                                        0xFF00FF00FF00FF00ULL,
                                        0xFF00FF00FF00FF00ULL };
static const __m256i epi16_evenmask = { 0x00FF00FF00FF00FFULL,
                                        0x00FF00FF00FF00FFULL,
                                        0x00FF00FF00FF00FFULL,
                                        0x00FF00FF00FF00FFULL };
static const __m256i epi16_evenmuls = { 0x4040101004040101ULL,
                                        0x4040101004040101ULL,
                                        0x4040101004040101ULL,
                                        0x4040101004040101ULL };
static const __m256i epi16_oddmuls  = { 0x8080202008080202ULL,
                                        0x8080202008080202ULL,
                                        0x8080202008080202ULL,
                                        0x8080202008080202ULL };

/* Original version suggested by Nominal Animal. */
__m256i original(__m256i value)
{
    return _mm256_or_si256(_mm256_srli_epi16(_mm256_mullo_epi16(_mm256_and_si256(value, epi16_evenmask), epi16_evenmuls), 8),
                           _mm256_and_si256(_mm256_mullo_epi16(_mm256_and_si256(_mm256_srai_epi16(value, 8), epi16_evenmask), epi16_oddmuls), epi16_oddmask));
}

/* Optimized as suggested by Peter Cordes, without blendv */
__m256i no_blendv(__m256i value)
{
    return _mm256_or_si256(_mm256_srli_epi16(_mm256_mullo_epi16(_mm256_and_si256(value, epi16_evenmask), epi16_evenmuls), 8),
                           _mm256_and_si256(_mm256_mullo_epi16(_mm256_srli_epi16(value, 8), epi16_oddmuls), epi16_oddmask));
}

/* Optimized as suggested by Peter Cordes, with blendv.
 * This is the recommended version. */
__m256i optimized(__m256i value)
{
    return _mm256_blendv_epi8(_mm256_srli_epi16(_mm256_mullo_epi16(_mm256_and_si256(value, epi16_evenmask), epi16_evenmuls), 8),
                              _mm256_mullo_epi16(_mm256_srli_epi16(value, 8), epi16_oddmuls), epi16_oddmask);
}

If you prefer more verbose code, here are the same three functions written out with named temporaries. Because the temporaries are const, a decent compiler generates exactly the same machine code for both forms.

__m256i original_verbose(const __m256i value)
{
    const __m256i odd1  = _mm256_srai_epi16(value, 8);
    const __m256i even1 = _mm256_and_si256(value, epi16_evenmask);
    const __m256i odd2  = _mm256_and_si256(odd1, epi16_evenmask);
    const __m256i even2 = _mm256_mullo_epi16(even1, epi16_evenmuls);
    const __m256i odd3  = _mm256_mullo_epi16(odd2, epi16_oddmuls);
    const __m256i even3 = _mm256_srli_epi16(even2, 8);
    const __m256i odd4  = _mm256_and_si256(odd3, epi16_oddmask);
    return _mm256_or_si256(even3, odd4);
}

__m256i no_blendv_verbose(const __m256i value)
{
    const __m256i even1 = _mm256_and_si256(value, epi16_evenmask);
    const __m256i odd1  = _mm256_srli_epi16(value, 8);
    const __m256i even2 = _mm256_mullo_epi16(even1, epi16_evenmuls);
    const __m256i odd2  = _mm256_mullo_epi16(odd1, epi16_oddmuls);
    const __m256i even3 = _mm256_srli_epi16(even2, 8);
    const __m256i odd3  = _mm256_and_si256(odd2, epi16_oddmask);
    return _mm256_or_si256(even3, odd3);
}

__m256i optimized_verbose(const __m256i value)
{
    const __m256i even1 = _mm256_and_si256(value, epi16_evenmask);
    const __m256i odd1  = _mm256_srli_epi16(value, 8);
    const __m256i even2 = _mm256_mullo_epi16(even1, epi16_evenmuls);
    const __m256i odd2  = _mm256_mullo_epi16(odd1, epi16_oddmuls);
    const __m256i even3 = _mm256_srli_epi16(even2, 8);
    return _mm256_blendv_epi8(even3, odd2, epi16_oddmask);
}

In short, I recommend the optimized version. All three variants produce the same results, and whatever speed difference exists between them is small (and will depend on the exact CPU), so benchmark on the hardware you actually target. Credit for the optimizations goes to Peter Cordes.


[Note: the original idea is kept below; an improved version follows in the update.]

The trick is to use a multiply by a power of 2 as the shift. Thanks to @harold, who pointed out that the bits "shifted out" of a byte do not disappear; they simply land in the upper half of the 16-bit product.

  • unpack the bytes into 16-bit words, duplicating each byte: [... d c b a] -> [... dd cc bb aa]
  • multiply the 16-bit words by the powers of two [128 64 32 16 8 4 2 1]
  • take the high byte of each 16-bit product, which now holds the rotated byte, and pack the results back down to bytes

In __m128i code (this only handles 8 bytes at a time; maybe there is a better way?):

__m128i duped = _mm_unpacklo_epi8(src, src);
__m128i res = _mm_mullo_epi16(duped, power_of_two_vector);
__m128i repacked = _mm_packus_epi16(_mm_srli_epi16(res, 8), _mm_setzero_si128());

[Update]

Updated version: it needs only 2 multiplies for all 16 bytes, with no unpacking or packing. The steps:

  • split the even and odd bytes into their own 16-bit lanes.
  • multiply the 16-bit words by the powers of two [128 64 32 16 8 4 2 1]
  • shift each 16-bit product by 8 bits
  • OR the shifted copy back into the 16-bit product, so the rotated byte is back in place,
  • OR the even and odd halves together.

This works because shifting an 8-bit value inside a 16-bit lane does not lose the bits pushed out of the byte; they end up in the other byte of the 16-bit product, so folding the two bytes of each product back together yields the rotated byte, with the multiplier selecting the per-byte shift amount as before.
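A minimal sketch of how I read those steps, for SSE2/__m128i; the rotate counts (byte k of each 8-byte group rotated left by k bits) and the exact fold with a shift and OR are my assumptions, not code from the original post:

#include <emmintrin.h>

static __m128i rotate_bytes_sse2(__m128i v)
{
    const __m128i lo_mask   = _mm_set1_epi16(0x00FF);
    /* Powers of two for the even bytes (rotate counts 0, 2, 4, 6)... */
    const __m128i even_pows = _mm_set_epi16(64, 16, 4, 1, 64, 16, 4, 1);
    /* ...and for the odd bytes (rotate counts 1, 3, 5, 7). */
    const __m128i odd_pows  = _mm_set_epi16(128, 32, 8, 2, 128, 32, 8, 2);

    /* Split the even and odd bytes into their own 16-bit lanes and multiply. */
    __m128i even_p = _mm_mullo_epi16(_mm_and_si128(v, lo_mask), even_pows);
    __m128i odd_p  = _mm_mullo_epi16(_mm_srli_epi16(v, 8), odd_pows);

    /* Fold the two halves of each product so the rotated byte lands back
     * in its original position, then merge the even and odd results. */
    __m128i even_rot = _mm_and_si128(_mm_or_si128(even_p, _mm_srli_epi16(even_p, 8)), lo_mask);
    __m128i odd_rot  = _mm_andnot_si128(lo_mask, _mm_or_si128(odd_p, _mm_slli_epi16(odd_p, 8)));
    return _mm_or_si128(even_rot, odd_rot);
}

This keeps the two 16-bit multiplies and otherwise uses only shifts, ANDs and ORs; I have not benchmarked it against the AVX2 version in the other answer.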

