I am trying to research the `__ldg` intrinsic. I looked at the NVIDIA documentation, but did not find a satisfactory explanation of its use and implementation. Following the linked answer (THIS), I tried to use `__ldg` in a simple example: multiplication of two 1024 × 1024 matrices.
#include<stdio.h>
#include<stdlib.h>
// Naive dense matrix multiply: cd = ad * bd for square N x N row-major matrices.
// Expected launch: a 2-D grid of 2-D blocks covering the N x N output
// (one thread per output element), e.g. 16x16 blocks.
// NOTE: __ldg routes loads through the read-only data cache and requires
// compute capability 3.5+; with const __restrict__ parameters the compiler
// can usually generate the same load on its own.
__global__ void matrix_mul(const float* __restrict__ ad,
                           const float* __restrict__ bd,
                           float* __restrict__ cd, int N)
{
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against grids that overhang the matrix edge (N not a
    // multiple of the block dimensions).
    if (Row >= N || Col >= N)
        return;
    float pvalue = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float m = __ldg(&ad[Row * N + i]);
        float n = __ldg(&bd[i * N + Col]);
        pvalue += m * n;
    }
    cd[Row * N + Col] = pvalue;
}
/* Abort with a diagnostic if a CUDA runtime call fails. Without this,
 * a failed launch (e.g. __ldg on a pre-SM35 device) silently produces
 * garbage results and meaningless timings. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/* Host driver: fills A with 2s and B with 1s (so every element of
 * C = A*B should equal 2*N), multiplies on the GPU, times the kernel
 * with CUDA events, and dumps A, B, C and the time to a text file. */
int main()
{
    int N = 1024, i, j;
    float *a, *b, *c;
    float *ad, *bd, *cd;
    FILE *f;

    f = fopen("Parallel Multiply_ldg.txt", "w");
    if (f == NULL) {
        fprintf(stderr, "Could not open output file\n");
        return EXIT_FAILURE;
    }

    size_t size = sizeof(float) * N * N;
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            a[i * N + j] = 2.0f;
            b[i * N + j] = 1.0f;
        }
    }

    CUDA_CHECK(cudaMalloc(&ad, size));
    CUDA_CHECK(cudaMalloc(&bd, size));
    CUDA_CHECK(cudaMalloc(&cd, size));
    CUDA_CHECK(cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice));
    printf("\nAfter HostToDevice Memcpy\n%s\n",
           cudaGetErrorString(cudaGetLastError()));

    dim3 blocksize(16, 16);
    /* Ceil-division so the grid covers N even when it is not a
     * multiple of 16 (the kernel bounds-checks the overhang). */
    dim3 gridsize((N + 15) / 16, (N + 15) / 16);

    float time;
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start, 0));
    matrix_mul <<< gridsize, blocksize >>> (ad, bd, cd, N);
    /* Kernel launches return no status directly: this catches bad
     * launch configurations; execution errors surface at the sync below. */
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop, 0));
    /* Blocks until the kernel and the stop event complete — no separate
     * cudaDeviceSynchronize() is needed for the timing to be valid. */
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    CUDA_CHECK(cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost));
    printf("\nAfter DeviceToHost Memcpy\n%s\n",
           cudaGetErrorString(cudaGetLastError()));

    fprintf(f, "Array A was---\n");
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++)
            fprintf(f, "%f ", a[i * N + j]);
        fprintf(f, "\n");
    }
    fprintf(f, "\nArray B was---\n");
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++)
            fprintf(f, "%f ", b[i * N + j]);
        fprintf(f, "\n");
    }
    fprintf(f, "\nMultiplication of A and B gives C----\n");
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++)
            fprintf(f, "%f ", c[i * N + j]);
        fprintf(f, "\n");
    }

    /* Report the file that was actually written (old message named a
     * non-existent "Parallel Mutiply.txt"). */
    printf("\nYou can see output in Parallel Multiply_ldg.txt file in project directory");
    printf("\n\nTime taken is %f (ms)\n", time);
    fprintf(f, "\n\nTime taken is %f (ms)\n", time);
    fclose(f);

    /* Release device memory (was leaked) and host memory. */
    CUDA_CHECK(cudaFree(ad));
    CUDA_CHECK(cudaFree(bd));
    CUDA_CHECK(cudaFree(cd));
    free(a);
    free(b);
    free(c);
    /* cudaThreadExit() is deprecated; cudaDeviceReset() is the modern call. */
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
I ran the kernel both ways: once using the `__ldg` loads shown above, and once with `__ldg` removed so the loads go through plain global-memory reads. In both cases it gives the correct multiplication result. What confuses me is the time difference between the two runs, because it is huge — more than 100×!
In the case of __ldg, this gives me: Time taken is 0.014432 (ms)
And in the case of normal execution without __ldg, it gives me: Time taken is 36.858398 (ms)
What is the `__ldg` intrinsic, when should it be used, and can it really explain a 100× difference in measured kernel time?