Here are the test results for the proposed methods. The Intel Core i7 2600K processor is clocked at 4 GHz. Units are nanoseconds per loop (less is better). Measurements include pseudo-random test data generation and overhead function calls, in addition to the actual min3 code being tested.
gcc cmov conditional classical sequence branches branchless pseudo-random data 2.24 6.31 2.39 fixed data 2.24 2.99 2.39
Control code source
functions.s: 3 solutions are evaluated:
//---------------------------------------------------------------------------- .text .intel_syntax noprefix //----------------------------------------------------------------------------- // uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8) .globl min3_a min3_a: mov rax, rcx cmp rax, rdx cmovg rax, rdx cmp rax, r8 cmovg rax, r8 retq //----------------------------------------------------------------------------- // uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8) .globl min3_b min3_b: mov rax, rcx cmp rax, rdx jle skip1 mov rax, rdx skip1: cmp rax, r8 jle skip2 mov rax, r8 skip2: retq //----------------------------------------------------------------------------- // uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8) .globl min3_c min3_c: sub rdx, rcx sbb rax, rax and rax, rdx add rcx, rax sub r8, rcx sbb rax, rax and rax, r8 add rax, rcx retq //-----------------------------------------------------------------------------
min3test.c, the main program (mingw64 / windows):
#define __USE_MINGW_ANSI_STDIO 1 #include <windows.h> #include <stdio.h> #include <stdint.h> uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8); uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8); uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8); //----------------------------------------------------------------------------- // // queryPerformanceCounter - similar to QueryPerformanceCounter, but returns // count directly. uint64_t queryPerformanceCounter (void) { LARGE_INTEGER int64; QueryPerformanceCounter (&int64); return int64.QuadPart; } //----------------------------------------------------------------------------- // // queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly. uint64_t queryPerformanceFrequency (void) { LARGE_INTEGER int64; QueryPerformanceFrequency (&int64); return int64.QuadPart; } //---------------------------------------------------------------------------- // // lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation // static uint64_t lfsr64gpr (uint64_t data, uint64_t mask) { uint64_t carryOut = data >> 63; uint64_t maskOrZ = -carryOut; return (data << 1) ^ (maskOrZ & mask); } //--------------------------------------------------------------------------- uint64_t min3 (uint64_t a, uint64_t b, uint64_t c) { uint64_t smallest; smallest = a; if (smallest > b) smallest = b; if (smallest > c) smallest = c; return smallest; } //--------------------------------------------------------------------------- static void runtests (uint64_t pattern, uint64_t mask) { uint64_t startCount, elapsed, index, loops = 800000000; double ns; startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); } //--------------------------------------------------------------------------- int main (void) { uint64_t index, pattern, mask; uint64_t count_a = 0, count_b = 0, count_c = 0; mask = 0xBEFFFFFFFFFFFFFF; pattern = 1; // sanity check the asm functions for (index = 0; index < 1000000; index++) { uint64_t expected, result_a, result_b, result_c; uint64_t pattern1 = (pattern >> 0) & 0xFFFFF; uint64_t pattern2 = (pattern >> 20) & 0xFFFFF; uint64_t pattern3 = (pattern >> 40) & 0xFFFFF; pattern = lfsr64gpr (pattern, mask); expected = min3 (pattern1, pattern2, pattern3); result_a = min3_a (pattern1, pattern2, pattern3); result_b = min3_b (pattern1, pattern2, pattern3); result_c = min3_c (pattern1, pattern2, pattern3); if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3); if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3); if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3); if (expected == pattern1) count_a++; if (result_b == pattern2) count_b++; if (result_c == pattern3) count_c++; } printf ("pseudo-random distribution: %llu, %llu, %llu\n", count_a, count_b, count_c); // raise our priority to increase measurement accuracy SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS); printf ("using pseudo-random data\n"); runtests (1, mask); printf ("using fixed data\n"); runtests (0, mask); return 0; } //---------------------------------------------------------------------------
build command line:
gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s