Did you mean heaviside = 0.5*(sign(1,x)+1) ? In any case, testing with gcc 4.8.1 fortran shows that the idea of ββa High Performance Mark should be useful. Here are 3 possibilities:
heavyiside1 - original heavyiside2 - high-performance idea heavyiside3 - another variation
function heaviside1 (x) double precision heaviside1, x heaviside1 = 0.5 * (sign(1d0,x) + 1) end function heaviside2 (x) double precision heaviside2, x heaviside2 = sign(0.5d0,x) + 0.5 end function heaviside3 (x) double precision heaviside3, x heaviside3 = 0 if (x .ge. 0) heaviside3 = 1 end program demo double precision heaviside1, heaviside2, heaviside3, x, a, b, c do x = 0.5 - RAND(0) a = heaviside1(x) b = heaviside2(x) c = heaviside3(x) print *, "x=", x, "heaviside(x)=", a, b, c enddo end
When compiled, gcc generates these 3 autonomous functions:
<heaviside1_>: vmovsd xmm0,QWORD PTR [rcx] vandpd xmm0,xmm0,XMMWORD PTR [rip+0x2d824] vorpd xmm0,xmm0,XMMWORD PTR [rip+0x2d80c] vaddsd xmm0,xmm0,QWORD PTR [rip+0x2d7f4] vmulsd xmm0,xmm0,QWORD PTR [rip+0x2d81c] ret <heaviside2_>: vmovsd xmm0,QWORD PTR [rcx] vandpd xmm0,xmm0,XMMWORD PTR [rip+0x2d844] vorpd xmm0,xmm0,XMMWORD PTR [rip+0x2d85c] vaddsd xmm0,xmm0,QWORD PTR [rip+0x2d844] ret <heaviside3_>: vxorpd xmm0,xmm0,xmm0 vmovsd xmm1,QWORD PTR [rip+0x2d844] vcmplesd xmm0,xmm0,QWORD PTR [rcx] vandpd xmm0,xmm1,xmm0 ret
When compiled with gcc, heavyiside1 generates a multiplication that can slow down execution. heavyiside2 precludes multiplication. The heavyiside3 command has the same number of instructions as heavy2, but uses 2 more memory accesses.
For autonomous functions:
instruction memory reference count count heaviside1 6 5 heaviside2 5 4 heaviside3 5 2
The built-in code for these functions avoids the need for a return command and ideally passes arguments to registers and preloads other registers with the necessary constants. The exact result depends on the compiler used and the calling code. Embed code rating:
instruction memory reference count count heaviside1 4 0 heaviside2 3 0 heaviside3 2 0
It seems that the function can be processed with only two instructions generated by the compiler: vcmplesd + vandpd. The first command creates a mask of all zeros if the argument is negative, or a mask of all others. The second instruction applies the mask to the value of the unit register constant to get a result value equal to zero or one.
Although I have not tested these functions, it seems that the function should not take much time.
--- 09/23/2013: adding x86_64 versions of the assembler language and C language test ---
file functions.s
//---------------------------------------------------------------------------- .intel_syntax noprefix .text //----------------------------------------------------------------------------- // this heaviside function generates its own register constants // double heaviside_a1 (double arg); .globl heaviside_a1 heaviside_a1: mov rax,0x3ff0000000000000 xorpd xmm1,xmm1 # xmm1: constant 0.0 cmplesd xmm1,xmm0 # xmm1: mask (all Fs or all 0s) movq xmm0,rax # xmm0: constant 1.0 andpd xmm0,xmm1 retq //----------------------------------------------------------------------------- // this heaviside function uses register constants passed from caller // double heaviside_a2 (double arg, double const0, double const1); .globl heaviside_a2 heaviside_a2: cmplesd xmm1,xmm0 # xmm1: mask (all Fs or all 0s) movsd xmm0,xmm2 # xmm0: constant 1.0 andpd xmm0,xmm1 retq //-----------------------------------------------------------------------------
ctest.c file
#define __USE_MINGW_ANSI_STDIO 1 #include <windows.h> #include <stdio.h> #include <stdint.h> // functions.s double heaviside_a1 (double x); double heaviside_a2 (double arg, double const0, double const1); //----------------------------------------------------------------------------- static double heaviside_c1 (double x) { double result = 0; if (x >= 0) result = 1; return result; } //----------------------------------------------------------------------------- // // queryPerformanceCounter - similar to QueryPerformanceCounter, but returns // count directly. uint64_t queryPerformanceCounter (void) { LARGE_INTEGER int64; QueryPerformanceCounter (&int64); return int64.QuadPart; } //----------------------------------------------------------------------------- // // queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly. uint64_t queryPerformanceFrequency (void) { LARGE_INTEGER int64; QueryPerformanceFrequency (&int64); return int64.QuadPart; } //---------------------------------------------------------------------------- // // lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation // static uint64_t lfsr64gpr (uint64_t data, uint64_t mask) { uint64_t carryOut = data >> 63; uint64_t maskOrZ = -carryOut; return (data << 1) ^ (maskOrZ & mask); } //--------------------------------------------------------------------------- int runtests (uint64_t pattern, uint64_t mask) { uint64_t startCount, elapsed, index, loops = 800000000; double ns; double total = 0; startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { double x, result; pattern = lfsr64gpr (pattern, mask); x = (double) (int64_t) pattern; result = heaviside_c1 (x); total += result; } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("heaviside_c1: %7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { double x, result; pattern = lfsr64gpr (pattern, mask); x = (double) (int64_t) pattern; result = heaviside_a1 (x); //printf ("heaviside_a1 (%lf): %lf\n", x, result); total += result; } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("heaviside_a1: %7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { double x, result; const double const0 = 0.0; const double const1 = 1.0; pattern = lfsr64gpr (pattern, mask); x = (double) (int64_t) pattern; result = heaviside_a2 (x, const0, const1); //printf ("heaviside_a2 (%lf): %lf\n", x, result); total += result; } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("heaviside_a2: %7.2f ns\n", ns); return total; } //--------------------------------------------------------------------------- int main (void) { uint64_t mask; mask = 0xBEFFFFFFFFFFFFFF; // raise our priority to increase measurement accuracy SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS); printf ("using pseudo-random data\n"); runtests (1, mask); return 0; } //---------------------------------------------------------------------------
mingw64 build command: gcc -Wall -Wextra -O3 -octest.exe ctest.c functions.s
Software output from Intel Core i7-2600K with a frequency of 4.0 GHz:
using pseudo-random data heaviside_c1: 2.24 ns heaviside_a1: 2.00 ns heaviside_a2: 2.00 ns
These synchronization results include the generation of pseudo-random arguments and a summing code for the results necessary so that the optimizer does not remove the unnecessary localized function badiside_c1.
heavyiside_c1 is the original fortran clause ported to C. The heavyiside_a1 function is an assembly language implementation. heavyiside_a2 is a modification of the assembly language version that uses register constants passed by the caller to avoid the overhead of creating them. For my processor benchmarking does not give advantages to transmit constants.
The assembly language functions assume that xmm0 returns the result, and xmm1 and xmm2 are available as scratch registers. This is true for the x64 agreement used by Windows. This assumption must be confirmed for other calling arrangements.
To avoid memory access, the assembly language version expects the argument to be passed by value in the register (XMM0). Since this is not a fortran value, a special declaration is required. This seems to work correctly for gfortran 64-bit:
interface real(c_double) function heaviside_a1(x) use iso_c_binding, only: c_double real(c_double), VALUE :: x end function heaviside_a1 end interface