OpenCL: dereferencing a pointer into another address space

Short version:

I have a function that writes its results through pointers to output variables:

void acum( float2 dW, float4 dFE, float2 *W, float4 *FE ) 

which is expected to increment the variables *W, *FE by dW, dFE when certain conditions are met. I want to make this function generic - the output variables may be either local or global:

 acum( dW, dFE, &W , &FE );                    // __local  acum
 acum( W, FE, &Wout[idOut], &FEout[idOut] );   // __global acum

When I try to compile it, I get this error:

 error: illegal implicit conversion between two pointers with different address spaces 

Can this be done somehow? I was thinking I could use a macro instead of a function (though I'm not very familiar with macros in C).
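
To make the macro idea concrete, this is roughly what I imagine (untested sketch; ACUM is just an illustrative name, the body is the same logic as acum):

// Macro version: the body is expanded textually at the call site,
// so W and FE keep whatever address space they have there
// (no pointer arguments at all).
#define ACUM( dW, dFE, W, FE ) do {                       \
    float _dExp = (dW).y - (W).y;   /* log(dW)-log(W) */  \
    if( _dExp > -22 ){                                    \
        float _fac = exp(_dExp);                          \
        if( _dExp < 0 ){                                  \
            (FE)  += (dFE) * ((dW).x * _fac);             \
            (W).x += (dW).x * _fac;                       \
        }else{                                            \
            (FE)   = (dFE)  + _fac*(FE);                  \
            (W).x  = (dW).x + _fac*(W).x;                 \
            (W).y  = (dW).y;                              \
        }                                                 \
    }                                                     \
} while(0)

// usage:  ACUM( dW, dFE, W, FE );                       // local accumulation
//         ACUM( W, FE, Wout[idOut], FEout[idOut] );     // global accumulation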

Perhaps there is another possibility:

  • use a function that returns the structure {Wnew, FEnew},
  • or return (float8)(Wnew, FEnew, 0, 0) - I don't like this much because it makes the code more confusing (a rough sketch of what I mean follows this list),
  • naturally, I also do not want to simply copy the body of "acum" everywhere (i.e. paste it in manually :)).
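
For the float8 variant, I mean something like this (untested sketch; acum8 is just an illustrative name wrapping acum):

// Hypothetical wrapper: acum() works on __private copies, and the updated
// pair is returned packed into a float8 ( W in .s01, FE in .s2345 ).
float8 acum8( float2 dW, float4 dFE, float2 W, float4 FE ){
    acum( dW, dFE, &W, &FE );              // W, FE are __private copies here
    return (float8)( W, FE, 0.0f, 0.0f );
}

// at the __global call site:
//   float8 r = acum8( W, FE, Wout[idOut], FEout[idOut] );
//   Wout[idOut]  = r.s01;
//   FEout[idOut] = r.s2345;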

Background (no need to read):

I am trying to do some thermodynamic sampling using OpenCL. Since the statistical weight W = exp(-E/kT) can easily overflow a single-precision float (2^64) at low temperature, I created a compound data type W = float2(W_digits, W_exponent) - i.e. the weight is stored as W.x * exp(W.y) - and I defined functions such as "acum" to accumulate these numbers.

I am trying to minimize the number of global memory operations, so I map work_items over Vsurf rather than over FEout, because I expect that only a few points in Vsurf will have a significant statistical weight, so the FEout accumulation is called only occasionally for each work item; iterating over FEout instead would require on the order of size(FEout) * size(Vsurf) operations. All the code is below (any recommendation on how to make it more efficient is welcome):

// ===== function :: FF_vdW - Lennard-Jones Van der Waals forcefield
float4 FF_vdW ( float3 R ){
    const float C6  = 1.0;
    const float C12 = 1.0;
    float ir2  = 1.0/ dot( R, R );
    float ir6  = ir2*ir2*ir2;
    float ir12 = ir6*ir6;
    float E6   = C6*ir6;
    float E12  = C12*ir12;
    return (float4)( ( 6*E6 - 12*E12 ) * ir2 * R , E12 - E6 );
}

// ===== function :: FF_spring - harmonic forcefield
float4 FF_spring( float3 R ){
    const float3 k = (float3)( 1.0, 1.0, 1.0 );
    float3 F = k*R;
    return (float4)( F, 0.5*dot(F,R) );
}

// ===== function :: EtoW - compute statistical weight
float2 EtoW( float EkT ){
    float Wexp = floor( EkT );
    return (float2)( exp(EkT - Wexp) , Wexp );
}

// ===== procedure :: acum (addExpInplace) - accumulate F,E with statistical weight dW
void acum( float2 dW, float4 dFE, float2 *W, float4 *FE ){
    float dExp = dW.y - (*W).y;           // log(dW) - log(W)
    if( dExp > -22 ){                     // e^22 = 2^32 , single_float = 2^+64
        float fac = exp(dExp);
        if( dExp < 0 ){                   // log(dW) < log(W)
            dW.x   *= fac;
            (*FE)  += dFE*dW.x;
            (*W).x += dW.x;
        }else{                            // log(dW) > log(W)
            (*FE)   = dFE  + fac*(*FE);
            (*W).x  = dW.x + fac*(*W).x;
            (*W).y  = dW.y;
        }
    }
}

// ===== __kernel :: sampler
__kernel void sampler(
    __global float  * Vsurf,    // in  : surface potential (including vdW)
                                //       (may be faster to precompute endpoint positions, e.g. as float8)
    __global float4 * FEout,    // out : Fx, Fy, Fz, E
    __global float2 * Wout,     // out : W_digits, W_exponent
    int3   nV,
    float3 dV,
    int3   nOut,
    int3   iOut0,      // shift of FEout with respect to Vsurf
    int3   nCopy,      // number of copies (periodic images) of the potential
    int3   nSample,    // sampling dimension in each direction around R0: -nSample..+nSample
    float3 RXe0,       // position of Xe relative to the tip
    float  EcutSurf,
    float  EcutTip,
    float  logWcut,    // accumulate only when log(W) > logWcut
    float  kT          // maximal energy which should be sampled
){
    int id  = get_global_id(0);    // loop over potential grid points
    int idx = id/nV.x;
    int3 iV = (int3)( idx/nV.y , idx%nV.y , id%nV.x );
    float  V   = Vsurf[id];
    float3 RXe = dV*convert_float3(iV);
    if( V < EcutSurf ){
        // loop over tip positions
        for (int iz=0; iz<nOut.z; iz++ ){
        for (int iy=0; iy<nOut.y; iy++ ){
        for (int ix=0; ix<nOut.x; ix++ ){
            int3   iTip = (int3)( iz, iy, ix );
            float3 Rtip = dV*convert_float3(iTip);
            float4 FE = 0;
            float2 W  = 0;
            // loop over images of the potential
            for (int icx=0; icx<nCopy.x; icx++ ){
            for (int icy=0; icy<nCopy.y; icy++ ){
                float3 dR  = RXe - Rtip;
                float4 dFE = FF_vdW( dR );
                dFE       += FF_spring( dR - RXe0 );
                dFE.w     += V;
                if( dFE.w < EcutTip ){
                    float2 dW = EtoW( - FE.w / kT );
                    acum( dW, dFE, &W, &FE );                     // __local acum
                }
            }}
            if( W.y > logWcut ){
                // accumulate force
                int idOut = iOut0.x + iOut0.y*nOut.x + iOut0.z*nOut.x*nOut.y;
                acum( W, FE, &Wout[idOut], &FEout[idOut] );       // __global acum
            }
        }}}
    }
}

I am using PyOpenCL on Ubuntu 12.04 64-bit, but I don't think that is related to the problem.

1 answer

OK, here is what's going on, from the OpenCL man pages:

http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/global.html

"If the type of the object is determined by the name of the address space, the object is allocated in the specified address name, otherwise the object is allocated in the general address space"

...

"The common name of the address space for function arguments in the program or local variables of the __private function. All function arguments must be in the __private address space."

So the arguments of your acum(...) are in the __private address space.
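
In other words, under OpenCL 1.x rules the pointer parameters of acum are implicitly __private pointers; these two prototypes declare the same function:

// function arguments default to the __private address space, so this:
void acum( float2 dW, float4 dFE,           float2 *W,           float4 *FE );
// is equivalent to:
void acum( float2 dW, float4 dFE, __private float2 *W, __private float4 *FE );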

So the compiler is correctly complaining that

 acum( ..., &Wout[idOut], &FEout[idOut] )

is called with &Wout[idOut] and &FEout[idOut] in the __global address space, while the function arguments must be in the __private address space.

The solution is to do the conversion between global and private yourself:

  • create two private temporary variables to receive the results,
  • call acum(...) with those temporaries,
  • assign the temporaries to the global values after the acum(..) call.

The code will look a bit messy.
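
Something along these lines (a sketch based on the kernel in the question, untested):

if( W.y > logWcut ){    // accumulate force into the __global output arrays
    int idOut = iOut0.x + iOut0.y*nOut.x + iOut0.z*nOut.x*nOut.y;
    float2 Wtmp  = Wout[idOut];          // copy __global -> __private
    float4 FEtmp = FEout[idOut];
    acum( W, FE, &Wtmp, &FEtmp );        // all pointers are __private now
    Wout[idOut]  = Wtmp;                 // copy __private -> __global
    FEout[idOut] = FEtmp;
}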

Remember that you have many address spaces on the GPU; you cannot magically jump between them by casting. You must explicitly move data between address spaces by assignment.

