Short storage:
I have a function to follow the link of output variables
void acum( float2 dW, float4 dFE, float2 *W, float4 *FE )
which is the expected increment variable * W, * FE, by dW, dFE, if some codes are executed. I want to make this function generic - output varibales can be local or global.
acum( dW, dFE, &W , &FE ); // __local acum acum( W, FE, &Wout[idOut], &FEout[idOut] ); // __global acum
When I try to compile it, I got an error
error: illegal implicit conversion between two pointers with different address spaces
Can this be done somehow ??? I was thinking if I could use a macro instead of a function (but I'm not very familiar with macros in C).
Perhaps there will be another possibility:
- use a function that returns the structure {Wnew, FEnew}
- or (float8) (Wnew, FEnew, 0,0), but I don't like it because it makes the code more confusing.
- Naturally, I also do not want to just copy the body of "acum" in all places (for example, manually paste it :))
Backbround (No need to read):
I am trying to do some thermodynamic sampling using OpenCL. Since the statistical weight W = exp (-E / kT) can easily overflow with float (2 ^ 64) for low temperature, I created a complex data type W = float2 (W_digits, W_exponent) and I defined functions to control these "acum" numbers.
I am trying to minimize the number of global memory operations, so I assume that work_items go to Vsurf, not FEOT, becauese. I expect that only a few points in Vsurf will have significant statistical weight, so the FEOT accumulation will be called only at a time for each product item. While iteratin on FEout will require the operation sizeof (FEout) * sizeof (Vsurf). All code is here (any recommendations on how to make it more efficient are welcome):
// ===== function :: FF_vdW - Lenard-Jones Van Der Waals forcefield float4 FF_vdW ( float3 R ){ const float C6 = 1.0; const float C12 = 1.0; float ir2 = 1.0/ dot( R, R ); float ir6 = ir2*ir2*ir2; float ir12 = ir6*ir6; float E6 = C6*ir6; float E12 = C12*ir12; return (float4)( ( 6*E6 - 12*E12 ) * ir2 * R , E12 - E6 );} // ===== function :: FF_spring - harmonic forcefield float4 FF_spring( float3 R){ const float3 k = (float3)( 1.0, 1.0, 1.0 ); float3 F = k*R; return (float4)( F, 0.5*dot(F,R) );} // ===== function :: EtoW - compute statistical weight float2 EtoW( float EkT ){ float Wexp = floor( EkT); return (float2)( exp(EkT - Wexp) , Wexp ); } // ===== procedure : addExpInplace -- acumulate F,E with statistical weight dW void acum( float2 dW, float4 dFE, float2 *W, float4 *FE ) { float dExp = dW.y - (*W).y; // log(dW)-log(W) if(dExp>-22){ // e^22 = 2^32 , single_float = 2^+64 float fac = exp(dExp); if (dExp<0){ // log(dW)<log(W) dW.x *= fac; (*FE) += dFE*dW.x; (*W ).x += dW.x; }else{ // log(dW)>log(W) (*FE) = dFE + fac*(*FE); (*W ).x = dW.x + fac*(*W).x; (*W ).y = dW.y; } } } // ===== __kernel :: sampler __kernel void sampler( __global float * Vsurf, // in : surface potential (including vdW) // may be faster to precomputed endpoints positions like float8 __global float4 * FEout, // out : Fx,Fy,Fy, E __global float2 * Wout, // out : W_digits, W_exponent int3 nV , float3 dV , int3 nOut , int3 iOut0 , // shift of Fout in respect to Vsurf int3 nCopy , // number of copies of int3 nSample , // dimension of sampling in each dimension around R0 +nSample,-nSample float3 RXe0 , // postion Xe relative to Tip float EcutSurf , float EcutTip , float logWcut , // accumulate only when log(W) > logWcut float kT // maximal energy which should be sampled ) { int id = get_global_id(0); // loop over potential grid points int idx = id/nV.x; int3 iV = (int3)( idx/nV.y , idx%nV.y , id %nV.x ); float V = Vsurf[id]; float3 RXe = dV*iV; if (V<EcutSurf){ // loop over tip position for (int iz=0;iz<nOut.z;iz++ ){ for (int iy=0;iy<nOut.y;iy++ ){ for (int ix=0;ix<nOut.x;ix++ ){ int3 iTip = (int3)( iz, iy, ix ); float3 Rtip = dV*iTip; float4 FE = 0; float2 W = 0; // loop over images of potential for (int ix=0;ix<nCopy.x;ix++ ){ for (int iy=0;iy<nCopy.y;iy++ ){ float3 dR = RXe - Rtip; float4 dFE = FF_vdW( dR ); dFE += FF_spring( dR - RXe0 ); dFE.w += V; if( dFE.w < EcutTip ){ float2 dW = EtoW( - FE.w / kT ); acum( dW, dFE, &W , &FE ); // __local acum } } } if( Wy > logWcut ){ // accumulate force int idOut = iOut0.x + iOut0.y*nOut.x + iOut0.z*nOut.x*nOut.y; acum( W, FE, &Wout[idOut], &FEout[idOut] ); // __global acum } }}}} }
I am using pyOpenCL on ubuntu 12.04 64bit, but I think it is not related to the problem