I had a simple CUDA problem for class assignment, but the professor added an optional task to implement the same algorithm using shared memory. I could not finish it before the deadline (for example, the start date was a week ago), but I'm still curious, so now I will ask the Internet;).
The main purpose was to implement the inherited version of the red-black sequential reinstallation both sequentially and in CUDA, make sure that you get the same result in both, and then compare the acceleration. As I said, doing this with shared memory was an optional + 10% addition.
I am going to publish my working version and pseudo-code, which I tried to do, since I do not have the code in my hands at the moment, but I can update it later with the actual code if someone needs it.
Before anyone says this: Yes, I know that using CUtil is lame, but it made comparisons and timers easier.
Working version of global memory:
#include <stdlib.h>
#include <stdio.h>
#include <cutil_inline.h>
#define N 1024
__global__ void kernel(int *d_A, int *d_B) {
unsigned int index_x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int index_y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int grid_width = gridDim.x * blockDim.x;
unsigned int index = index_y * grid_width + index_x;
if((index_x > 0) && (index_y > 0) && (index_x < N-1) && (index_y < N-1))
d_B[index] = (d_A[index-1]+d_A[index+1]+d_A[index+N]+d_A[index-N])/4;
}
main (int argc, char **argv) {
int A[N][N], B[N][N];
int *d_A, *d_B;
int *h_B;
int i, j;
int num_bytes = N * N * sizeof(int);
for(i=0;i<N;i++) {
for(j=0;j<N;j++) {
A[i][j] = rand()/1795831;
}
}
cudaEvent_t start_event0, stop_event0;
float elapsed_time0;
CUDA_SAFE_CALL( cudaEventCreate(&start_event0) );
CUDA_SAFE_CALL( cudaEventCreate(&stop_event0) );
cudaEventRecord(start_event0, 0);
for(i=1;i<N-1;i++) {
for(j=1;j<N-1;j++) {
B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4;
}
}
cudaEventRecord(stop_event0, 0);
cudaEventSynchronize(stop_event0);
CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time0,start_event0, stop_event0) );
h_B = (int *)malloc(num_bytes);
memset(h_B, 0, num_bytes);
cudaMalloc((void**)&d_A, num_bytes);
cudaMalloc((void**)&d_B, num_bytes);
cudaMemset(d_A, 0, num_bytes);
cudaMemset(d_B, 0, num_bytes);
cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice);
cudaEvent_t start_event, stop_event;
float elapsed_time;
CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
cudaEventRecord(start_event, 0);
dim3 block_size(256,1,1);
dim3 grid_size;
grid_size.x = N / block_size.x;
grid_size.y = N / block_size.y;
kernel<<<grid_size,block_size>>>(d_A,d_B);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time,start_event, stop_event) );
cudaMemcpy(h_B, d_B, num_bytes, cudaMemcpyDeviceToHost);
CUTBoolean res = cutComparei( (int *)B, (int *)h_B, N*N);
printf("Test %s\n",(1 == res)?"Passed":"Failed");
printf("Elapsed Time for Sequential: \t%.2f ms\n", elapsed_time0);
printf("Elapsed Time for CUDA:\t%.2f ms\n", elapsed_time);
printf("CUDA Speedup:\t%.2fx\n",(elapsed_time0/elapsed_time));
cudaFree(d_A);
cudaFree(d_B);
free(h_B);
cutilDeviceReset();
}
For the shared memory version, this is what I have tried so far:
#define N 1024
__global__ void kernel(int *d_A, int *d_B, int width) {
__shared__ int A_sh[3][66];
for(i=0; i < width+2; i++)
A_sh[index_y%3][i] = d_A[index+i-1];
__syncthreads();
for(i=0; i < width; i++)
d_B[index+i] = A_sh[index_y%3][i] + A_sh[index_y%3][i+2] + A_sh[index_y%3-1][i+1] + A_sh[index_y%3+1][i+1];
}
main(){
dim3 threadsperblk(32,16);
dim3 numblks(32,64);
kernel<<<numblks,threadsperblk>>>(d_A,d_B,64);
}
mem ( " - " ), , , . , , ( SDK), , mem, .
, , , (GTX 560 Ti - 0.121ms), : P
2: , , , .