Fastest code to compute the minimum of three values

I am looking at the code generated by GCC 4.8 for x86_64 and wondering whether there is a better (faster) way to compute the minimum of three values.

Here is an excerpt from the Python collections module, which computes the minimum of m , rightindex + 1 and leftindex :

  /* Excerpt (from the CPython collections module): computes
     m = min(n, rightindex + 1, leftindex) via two sequential
     conditional assignments. */
  ssize_t m = n;
  if (m > rightindex + 1)
      m = rightindex + 1;
  if (m > leftindex)
      m = leftindex;

GCC generates sequentially dependent code with CMOV:

 # GCC 4.8 output (AT&T syntax): a sequentially dependent CMOV chain —
 # each cmov consumes the flags of the preceding cmp, and each cmp
 # consumes the result of the preceding cmov, so nothing here can
 # execute in parallel.
 # (Register-to-variable mapping presumed from context — verify against
 # the full listing.)
 leaq   1(%rbp), %rdx     # rdx = rightindex + 1
 cmpq   %rsi, %rdx
 cmovg  %rsi, %rdx        # rdx = min(rdx, rsi)
 cmpq   %rbx, %rdx
 cmovg  %rbx, %rdx        # rdx = min(rdx, rbx)

Is there faster code that can exploit the processor's out-of-order parallel execution by removing the data dependencies? I am wondering whether there are any known tricks for computing the minimum of several values without using conditional expressions or predicated instructions. I also wonder whether any saturating-arithmetic properties could help in this situation.

EDIT:

+7
optimization c assembly x86-64 intrinsics
source share
3 answers

A minimum of two unsigned numbers has a classic solution:

 ; eax = min(eax, ebx), ecx - scratch register.
 ; Branchless unsigned minimum via the borrow trick:
 ;   sub sets CF iff ebx < eax; sbb turns that borrow into a 0 / -1 mask;
 ;   and selects (ebx - eax) or 0; add then yields ebx or eax accordingly.
 ; NOTE: clobbers ebx and ecx; valid for unsigned operands only.
 .min2:
    sub  ebx, eax       ; ebx -= eax, CF = (old ebx < eax)
    sbb  ecx, ecx       ; ecx = CF ? -1 : 0
    and  ecx, ebx       ; ecx = CF ? (ebx - eax) : 0
    add  eax, ecx       ; eax += (min - eax)  ->  eax = min

This approach is probably faster than the cmov solution, but to gain speed the instructions should be interleaved with other work so they can execute in parallel.

The implementation of this method for three numbers is possible:

 ; eax = min(eax, ebx, edx), ecx - scratch register.
 ; Two chained applications of the branchless unsigned-min trick.
 ; NOTE: the second sub depends on the result of the first add, so this
 ; chain is still sequentially dependent, like the cmov version.
 ; Clobbers ebx, edx and ecx; valid for unsigned operands only.
 .min3:
    sub  ebx, eax       ; CF = (ebx < eax)
    sbb  ecx, ecx       ; ecx = CF ? -1 : 0
    and  ecx, ebx
    add  eax, ecx       ; eax = min(eax, ebx)
    sub  edx, eax       ; CF = (edx < eax)
    sbb  ecx, ecx
    and  ecx, edx
    add  eax, ecx       ; eax = min(eax, edx)

Another option is a version with conditional jumps. On modern processors it can be even faster, especially when the branches are highly predictable:

 ; eax = min(eax, ebx, edx) using conditional branches.
 ; @@/@f are anonymous forward labels (FASM/MASM syntax).
 ; NOTE: jle is a signed comparison, unlike the sbb trick above —
 ; fine as long as the values fit in the non-negative signed range.
 .min3:
    cmp  eax, ebx
    jle  @f
    mov  eax, ebx       ; eax = min(eax, ebx)
 @@:
    cmp  eax, edx
    jle  @f
    mov  eax, edx       ; eax = min(eax, edx)
 @@:
+5
source share

Here are the test results for the proposed methods. The Intel Core i7 2600K processor is clocked at 4 GHz. Units are nanoseconds per loop (less is better). Measurements include pseudo-random test data generation and overhead function calls, in addition to the actual min3 code being tested.

                        gcc cmov     conditional    classic
                        sequence     branches       branchless
  pseudo-random data      2.24         6.31           2.39
  fixed data              2.24         2.99           2.39

Control code source

functions.s: 3 solutions are evaluated:

//-----------------------------------------------------------------------------
// functions.s - the three min3 implementations being benchmarked.
// Windows x64 calling convention: arguments in rcx, rdx, r8; result in rax.
        .text
        .intel_syntax noprefix
//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
// CMOV chain, mirroring the GCC output shown in the question.
// NOTE: cmovg is a signed comparison; safe here because the benchmark
// masks its inputs to 20 bits, so all values are non-negative.
        .globl min3_a
min3_a:
        mov     rax, rcx
        cmp     rax, rdx
        cmovg   rax, rdx        // rax = min(rcx, rdx)
        cmp     rax, r8
        cmovg   rax, r8         // rax = min(rax, r8)
        retq
//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
// Conditional-branch version (signed compares; same caveat as min3_a).
        .globl min3_b
min3_b:
        mov     rax, rcx
        cmp     rax, rdx
        jle     skip1
        mov     rax, rdx        // rax = min(rcx, rdx)
skip1:
        cmp     rax, r8
        jle     skip2
        mov     rax, r8         // rax = min(rax, r8)
skip2:
        retq
//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
// Branchless sub/sbb/and/add version (unsigned minimum).
        .globl min3_c
min3_c:
        sub     rdx, rcx        // CF = (rdx < rcx), unsigned
        sbb     rax, rax        // rax = CF ? -1 : 0
        and     rax, rdx
        add     rcx, rax        // rcx = min(rcx, rdx)
        sub     r8, rcx         // CF = (r8 < rcx)
        sbb     rax, rax
        and     rax, r8
        add     rax, rcx        // rax = min(rcx, r8)
        retq
//-----------------------------------------------------------------------------

min3test.c, the main program (mingw64 / windows):

 #define __USE_MINGW_ANSI_STDIO 1 #include <windows.h> #include <stdio.h> #include <stdint.h> uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8); uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8); uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8); //----------------------------------------------------------------------------- // // queryPerformanceCounter - similar to QueryPerformanceCounter, but returns // count directly. uint64_t queryPerformanceCounter (void) { LARGE_INTEGER int64; QueryPerformanceCounter (&int64); return int64.QuadPart; } //----------------------------------------------------------------------------- // // queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly. uint64_t queryPerformanceFrequency (void) { LARGE_INTEGER int64; QueryPerformanceFrequency (&int64); return int64.QuadPart; } //---------------------------------------------------------------------------- // // lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation // static uint64_t lfsr64gpr (uint64_t data, uint64_t mask) { uint64_t carryOut = data >> 63; uint64_t maskOrZ = -carryOut; return (data << 1) ^ (maskOrZ & mask); } //--------------------------------------------------------------------------- uint64_t min3 (uint64_t a, uint64_t b, uint64_t c) { uint64_t smallest; smallest = a; if (smallest > b) smallest = b; if (smallest > c) smallest = c; return smallest; } //--------------------------------------------------------------------------- static void runtests (uint64_t pattern, uint64_t mask) { uint64_t startCount, elapsed, index, loops = 800000000; double ns; startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; 
printf ("%7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); } //--------------------------------------------------------------------------- int main (void) { uint64_t index, pattern, mask; uint64_t count_a = 0, count_b = 0, count_c = 0; mask = 0xBEFFFFFFFFFFFFFF; pattern = 1; // sanity check the asm functions for (index = 0; index < 1000000; index++) { uint64_t expected, result_a, result_b, result_c; uint64_t pattern1 = (pattern >> 0) & 0xFFFFF; uint64_t pattern2 = (pattern >> 20) & 0xFFFFF; uint64_t pattern3 = (pattern >> 40) & 0xFFFFF; pattern = lfsr64gpr (pattern, mask); expected = min3 (pattern1, pattern2, pattern3); result_a = min3_a (pattern1, pattern2, pattern3); result_b = min3_b (pattern1, pattern2, pattern3); result_c = min3_c (pattern1, pattern2, pattern3); if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3); if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3); if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3); if (expected == pattern1) count_a++; if (result_b == pattern2) count_b++; if (result_c == pattern3) count_c++; } printf ("pseudo-random distribution: %llu, 
%llu, %llu\n", count_a, count_b, count_c); // raise our priority to increase measurement accuracy SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS); printf ("using pseudo-random data\n"); runtests (1, mask); printf ("using fixed data\n"); runtests (0, mask); return 0; } //--------------------------------------------------------------------------- 

build command line:

 gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s 
+8
source share

The function min(x,y,z) is continuous, but its derivative is not. That derivative has norm 1 everywhere it is defined. It is therefore impossible to express min as a plain arithmetic function, since those have continuous derivatives.

Saturating arithmetic has discontinuities of its own, so the previous reasoning does not rule it out. However, the saturation point is fixed and independent of the input, which implies that you would need to rescale the inputs first — and after that, I am fairly sure the resulting code would be no faster.

This is, of course, not a complete proof that no faster code exists, but you would probably need an exhaustive search to settle the question.

+1
source share

All Articles