As Newuff said, the main problem was the manual interpolation and normalization. After switching from a CUDA surface to a CUDA texture, the texture hardware performs the interpolation: the kernel calls tex2D(...) instead of surf2Dread(...).
The modified CUDA code is now almost identical to the OpenGL shader and performs equally well.
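For context, here is a minimal sketch of the setup the kernel below relies on. The binding helper is my reconstruction, not the original code, and it uses the legacy texture-reference API (removed in CUDA 12) that matches the tex2D(texRef, ...) calls. The key points are cudaFilterModeLinear, so the texture unit interpolates in hardware, and cudaReadModeNormalizedFloat, so uchar4 texels come back as float4 in [0, 1]:

```
// Declarations matching the names used in the kernel (texRef, surfaceWrite).
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> texRef;
surface<void, cudaSurfaceType2D> surfaceWrite;

// Hypothetical setup helper: bind the input array for filtered reads and the
// output array for surface writes.
void bindTextureAndSurface(cudaArray* inArray, cudaArray* outArray)
{
    texRef.filterMode     = cudaFilterModeLinear;  // hardware bilinear interpolation
    texRef.normalized     = false;                 // coordinates in texels, not [0,1]
    texRef.addressMode[0] = cudaAddressModeClamp;
    texRef.addressMode[1] = cudaAddressModeClamp;

    cudaBindTextureToArray(texRef, inArray);
    cudaBindSurfaceToArray(surfaceWrite, outArray);
}
```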
```
__global__ void filter_fxaa2(TextureData data)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= data.w || y >= data.h) {
        return;
    }

    uchar4 out_color;

    const float FXAA_SPAN_MAX   = 8.0f;
    const float FXAA_REDUCE_MUL = 1.0f / 8.0f;
    const float FXAA_REDUCE_MIN = 1.0f / 128.0f;

    // Sample at texel centers (+0.5f) so the texture unit interpolates correctly.
    float u = x + 0.5f;
    float v = y + 0.5f;

    float4 rgbNW = tex2D(texRef, u - 1.0f, v - 1.0f);
    float4 rgbNE = tex2D(texRef, u + 1.0f, v - 1.0f);
    float4 rgbSW = tex2D(texRef, u - 1.0f, v + 1.0f);
    float4 rgbSE = tex2D(texRef, u + 1.0f, v + 1.0f);
    float4 rgbM  = tex2D(texRef, u, v);

    // Luma weights (Rec. 601).
    const float4 luma = make_float4(0.299f, 0.587f, 0.114f, 0.0f);
    float lumaNW = dot(rgbNW, luma);
    float lumaNE = dot(rgbNE, luma);
    float lumaSW = dot(rgbSW, luma);
    float lumaSE = dot(rgbSE, luma);
    float lumaM  = dot(rgbM,  luma);

    float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
    float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));

    // Edge direction from the luma gradient.
    float2 dir;
    dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
    dir.y =  ((lumaNW + lumaSW) - (lumaNE + lumaSE));

    float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25f * FXAA_REDUCE_MUL),
                          FXAA_REDUCE_MIN);
    float rcpDirMin = 1.0f / (min(abs(dir.x), abs(dir.y)) + dirReduce);

    float2 test = dir * rcpDirMin;
    dir = clamp(test, -FXAA_SPAN_MAX, FXAA_SPAN_MAX);

    // Blend samples along the edge direction.
    float4 rgbA = (1.0f / 2.0f) * (
        tex2D(texRef, u + dir.x * (1.0f / 3.0f - 0.5f), v + dir.y * (1.0f / 3.0f - 0.5f)) +
        tex2D(texRef, u + dir.x * (2.0f / 3.0f - 0.5f), v + dir.y * (2.0f / 3.0f - 0.5f)));
    float4 rgbB = rgbA * (1.0f / 2.0f) + (1.0f / 4.0f) * (
        tex2D(texRef, u + dir.x * (0.0f / 3.0f - 0.5f), v + dir.y * (0.0f / 3.0f - 0.5f)) +
        tex2D(texRef, u + dir.x * (3.0f / 3.0f - 0.5f), v + dir.y * (3.0f / 3.0f - 0.5f)));

    float lumaB = dot(rgbB, luma);

    // Fall back to the two-tap result if the four-tap result left the local luma range.
    if ((lumaB < lumaMin) || (lumaB > lumaMax)) {
        out_color = toChar(rgbA);
    } else {
        out_color = toChar(rgbB);
    }

    surf2Dwrite<uchar4>(out_color, surfaceWrite, x * sizeof(uchar4), y);
}
```
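Launching it is an ordinary 2D grid over the image; this host-side sketch is an assumption (the 16x16 block size in particular), only the w/h fields of TextureData are taken from the kernel:

```
// Hypothetical launch helper: one thread per output pixel.
void run_fxaa(TextureData data)
{
    dim3 block(16, 16);
    dim3 grid((data.w + block.x - 1) / block.x,
              (data.h + block.y - 1) / block.y);
    filter_fxaa2<<<grid, block>>>(data);
    cudaDeviceSynchronize();  // wait here so launch errors surface during debugging
}
```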
Update:
Performance, measured with CUDA events (a timing sketch follows the numbers):
- Old version: ~12.8 ms
- New version: ~1.2 ms
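The measurement brackets the kernel launch with events, roughly like this (standard runtime API calls; grid/block as in the launch sketch above):

```
// Times one kernel launch with CUDA events; returns elapsed milliseconds.
float time_fxaa(TextureData data, dim3 grid, dim3 block)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    filter_fxaa2<<<grid, block>>>(data);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);  // block until the kernel has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}
```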
Conclusion:
Use CUDA surfaces only for writing; for reading, use textures!