OpenGL Shader vs CUDA

I used this FXAA shader to smooth out the output of my OpenGL program. I have now re-implemented the code in CUDA and tested it. The resulting images are identical, but the CUDA version is much slower: the shader runs at 60 FPS with vsync, while the CUDA version drops to ~40 FPS.

Here is the CUDA code:

    __device__ uchar4 readChar(int x, int y){
        return surf2Dread<uchar4>(surfaceRead, x*sizeof(uchar4), y, cudaBoundaryModeClamp);
    }

    __device__ uchar4 readFloatBilin2(float x, float y){
        // Manual bilinear interpolation from four integer-coordinate surface reads.
        int x1 = floor(x);
        int y1 = floor(y);
        uchar4 z11 = readChar(x1,   y1);
        uchar4 z12 = readChar(x1,   y1+1);
        uchar4 z21 = readChar(x1+1, y1);
        uchar4 z22 = readChar(x1+1, y1+1);
        float u_ratio    = x - x1;
        float v_ratio    = y - y1;
        float u_opposite = 1 - u_ratio;
        float v_opposite = 1 - v_ratio;
        uchar4 result = (z11 * u_opposite + z21 * u_ratio) * v_opposite +
                        (z12 * u_opposite + z22 * u_ratio) * v_ratio;
        return result;
    }

    __device__ float fluma(const uchar4 &c){
        // Luma from RGB, normalized to [0,1].
        return c.x*0.299 * (1.0/255) + c.y*0.587 * (1.0/255) + c.z*0.114 * (1.0/255);
    }

    __global__ void filter_fxaa_opt(TextureData data)
    {
        int x = blockIdx.x*blockDim.x + threadIdx.x;
        int y = blockIdx.y*blockDim.y + threadIdx.y;
        if(x >= data.w || y >= data.h)
        {
            return;
        }

        uchar4 out_color;

        const float FXAA_SPAN_MAX   = 8.0;
        const float FXAA_REDUCE_MUL = 1.0/8.0;
        const float FXAA_REDUCE_MIN = (1.0/128.0);

        float lumaNW = fluma(readChar(x-1, y-1));
        float lumaNE = fluma(readChar(x+1, y-1));
        float lumaSW = fluma(readChar(x-1, y+1));
        float lumaSE = fluma(readChar(x+1, y+1));
        float lumaM  = fluma(readChar(x,   y));

        float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
        float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));

        float2 dir;
        dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
        dir.y =  ((lumaNW + lumaSW) - (lumaNE + lumaSE));

        float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25 * FXAA_REDUCE_MUL),
                              FXAA_REDUCE_MIN);

        float rcpDirMin = 1.0/(min(abs(dir.x), abs(dir.y)) + dirReduce);

        dir = clamp(dir * rcpDirMin, -FXAA_SPAN_MAX, FXAA_SPAN_MAX);

        uchar4 rgbA = (readFloatBilin2(x + dir.x * (1.0/3.0 - 0.5), y + dir.y * (1.0/3.0 - 0.5))*0.5f +
                       readFloatBilin2(x + dir.x * (2.0/3.0 - 0.5), y + dir.y * (2.0/3.0 - 0.5))*0.5f);
        uchar4 rgbB = rgbA * (1.0/2.0) +
                      (readFloatBilin2(x + dir.x * (0.0/3.0 - 0.5), y + dir.y * (0.0/3.0 - 0.5))*0.25f +
                       readFloatBilin2(x + dir.x * (3.0/3.0 - 0.5), y + dir.y * (3.0/3.0 - 0.5))*0.25f);

        float lumaB = fluma(rgbB);

        if((lumaB < lumaMin) || (lumaB > lumaMax)){
            out_color = rgbA;
        } else {
            out_color = rgbB;
        }

        surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
    }
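The kernel above depends on declarations the question does not include. The float2 helpers (clamp, operator*) look like the CUDA SDK's helper_math.h, but uchar4 arithmetic is not provided there, so file-scope surface references and overloads roughly like the following sketch must also exist (names and truncation behavior are assumptions, not from the original post):

    // Sketch (assumed): surface references and uchar4 arithmetic used above.
    surface<void, cudaSurfaceType2D> surfaceRead;
    surface<void, cudaSurfaceType2D> surfaceWrite;

    __device__ uchar4 operator*(const uchar4 &a, float s){
        // Scale each channel; the result is truncated back to unsigned char.
        return make_uchar4((unsigned char)(a.x*s), (unsigned char)(a.y*s),
                           (unsigned char)(a.z*s), (unsigned char)(a.w*s));
    }

    __device__ uchar4 operator+(const uchar4 &a, const uchar4 &b){
        return make_uchar4((unsigned char)(a.x+b.x), (unsigned char)(a.y+b.y),
                           (unsigned char)(a.z+b.z), (unsigned char)(a.w+b.w));
    }

Note that this truncation on every intermediate uchar4 operation also loses precision compared to the shader, on top of being slow.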

Setup:

    // Called once at the beginning for the 'src' and 'dst' textures:
    checked_cuda( cudaGraphicsGLRegisterImage(&res, gl_buffer, gl_target,
                                              cudaGraphicsRegisterFlagsSurfaceLoadStore));

    // Called for the 'src' and 'dst' textures every frame:
    checked_cuda( cudaGraphicsMapResources(1, &res, 0));
    checked_cuda( cudaGraphicsSubResourceGetMappedArray(&array, res, 0, 0));

    // Kernel call every frame; round the grid up so edge pixels are covered
    // when the image size is not a multiple of the block size.
    dim3 block_size(8, 8);
    dim3 grid_size;
    grid_size.x = (src->w + block_size.x - 1) / block_size.x;
    grid_size.y = (src->h + block_size.y - 1) / block_size.y;

    checked_cuda(cudaBindSurfaceToArray(surfaceRead,  (cudaArray *)src->d_data));
    checked_cuda(cudaBindSurfaceToArray(surfaceWrite, (cudaArray *)dst->d_data));
    filter_fxaa_opt<<<grid_size, block_size>>>(*src);
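Not shown in the question is the per-frame unmap (and eventual unregister) that the interop API requires; it would look something like this:

    // Per frame, after the kernel has finished: release the resources
    // back to OpenGL.
    checked_cuda( cudaGraphicsUnmapResources(1, &res, 0));

    // Once at shutdown:
    checked_cuda( cudaGraphicsUnregisterResource(res));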

System:

  • Ubuntu 14.04
  • OpenGL version: 4.4.0 NVIDIA 331.113
  • Renderer: GeForce GTX 760M/PCIe/SSE2
  • CUDA 5.5

Question: What makes the OpenGL shader better, and why is it so much faster?

Answer:

As Newuff said, the main problem was the manual interpolation and normalization. By reading through a CUDA texture instead of a CUDA surface, the texture hardware performs the interpolation: calling tex2D(...) instead of surf2Dread(...) returns a filtered, normalized value directly.
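For reference, this switch means binding the mapped cudaArray to a texture reference with normalized-float reads and linear filtering, roughly like this sketch (only texRef is named in the kernel below; the rest is assumed):

    // Texture reference: uchar4 texels are returned by tex2D as a
    // normalized float4 in [0,1], with hardware bilinear filtering.
    texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> texRef;

    // Every frame, after cudaGraphicsSubResourceGetMappedArray:
    texRef.filterMode     = cudaFilterModeLinear;
    texRef.addressMode[0] = cudaAddressModeClamp;
    texRef.addressMode[1] = cudaAddressModeClamp;
    checked_cuda(cudaBindTextureToArray(texRef, (cudaArray *)src->d_data));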

The modified CUDA code is now almost identical to the OpenGL shader and indeed performs equally well.

    __global__ void filter_fxaa2(TextureData data)
    {
        int x = blockIdx.x*blockDim.x + threadIdx.x;
        int y = blockIdx.y*blockDim.y + threadIdx.y;
        if(x >= data.w || y >= data.h)
        {
            return;
        }

        uchar4 out_color;

        const float FXAA_SPAN_MAX   = 8.0f;
        const float FXAA_REDUCE_MUL = 1.0f/8.0f;
        const float FXAA_REDUCE_MIN = (1.0f/128.0f);

        // Offset by 0.5 so unnormalized texture coordinates hit texel centers.
        float u = x + 0.5f;
        float v = y + 0.5f;

        float4 rgbNW = tex2D(texRef, u-1.0f, v-1.0f);
        float4 rgbNE = tex2D(texRef, u+1.0f, v-1.0f);
        float4 rgbSW = tex2D(texRef, u-1.0f, v+1.0f);
        float4 rgbSE = tex2D(texRef, u+1.0f, v+1.0f);
        float4 rgbM  = tex2D(texRef, u, v);

        const float4 luma = make_float4(0.299f, 0.587f, 0.114f, 0.0f);
        float lumaNW = dot(rgbNW, luma);
        float lumaNE = dot(rgbNE, luma);
        float lumaSW = dot(rgbSW, luma);
        float lumaSE = dot(rgbSE, luma);
        float lumaM  = dot(rgbM,  luma);

        float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
        float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));

        float2 dir;
        dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
        dir.y =  ((lumaNW + lumaSW) - (lumaNE + lumaSE));

        float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25f * FXAA_REDUCE_MUL),
                              FXAA_REDUCE_MIN);

        float rcpDirMin = 1.0f/(min(abs(dir.x), abs(dir.y)) + dirReduce);
        float2 test = dir * rcpDirMin;
        dir = clamp(test, -FXAA_SPAN_MAX, FXAA_SPAN_MAX);

        float4 rgbA = (1.0f/2.0f) *
                      (tex2D(texRef, u + dir.x * (1.0f/3.0f - 0.5f), v + dir.y * (1.0f/3.0f - 0.5f)) +
                       tex2D(texRef, u + dir.x * (2.0f/3.0f - 0.5f), v + dir.y * (2.0f/3.0f - 0.5f)));
        float4 rgbB = rgbA * (1.0f/2.0f) + (1.0f/4.0f) *
                      (tex2D(texRef, u + dir.x * (0.0f/3.0f - 0.5f), v + dir.y * (0.0f/3.0f - 0.5f)) +
                       tex2D(texRef, u + dir.x * (3.0f/3.0f - 0.5f), v + dir.y * (3.0f/3.0f - 0.5f)));

        float lumaB = dot(rgbB, luma);

        if((lumaB < lumaMin) || (lumaB > lumaMax)){
            out_color = toChar(rgbA);
        } else {
            out_color = toChar(rgbB);
        }

        surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
    }
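The toChar helper is not shown in the answer; a plausible implementation that converts the normalized float4 back to a uchar4 for the surface write:

    __device__ uchar4 toChar(const float4 &c){
        // Map [0,1] floats back to [0,255] bytes.
        return make_uchar4((unsigned char)(c.x * 255.0f),
                           (unsigned char)(c.y * 255.0f),
                           (unsigned char)(c.z * 255.0f),
                           (unsigned char)(c.w * 255.0f));
    }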

Update:

Performance measured with cudaEvents (a timing sketch follows the list):

  • Old version: ~12.8 ms
  • New version: ~1.2 ms
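A minimal sketch of how such cudaEvent timing is typically done (the event names here are illustrative):

    cudaEvent_t start, stop;
    checked_cuda(cudaEventCreate(&start));
    checked_cuda(cudaEventCreate(&stop));

    checked_cuda(cudaEventRecord(start, 0));
    filter_fxaa2<<<grid_size, block_size>>>(*src);
    checked_cuda(cudaEventRecord(stop, 0));
    checked_cuda(cudaEventSynchronize(stop));

    float ms = 0.0f;
    checked_cuda(cudaEventElapsedTime(&ms, start, stop));  // elapsed time in ms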

Conclusion:

Use CUDA surfaces only for writing; for reading, use CUDA textures, not surfaces!
