Getting started with a CUDA kernel

I have a big problem with the code I am writing. I am not an expert, and I asked many people before coming here, but it is still not fixed. So I think I am ready to show you the code and ask my questions. I will put all of the code here so that you can see exactly what my problem is. What I want to do is this: since ARRAY_SIZE is too big for THREAD_SIZE, I copy the data of the large array into a smaller array created with a size of THREAD_SIZE, then I send it to the kernel and do whatever I need there. But I have a problem on the line

 isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y]; 

where the code crashes. At first I used a double pointer for big_matrix, but the people in the #cuda channel on the freenode IRC network told me that it was too big for the processor memory to handle and that I had to create a linear pointer instead. I did that, but I still had a stack overflow problem. So here it is, updated after some changes that still have not worked (the stack overflow stopped, but the linking and manifest problem is still not solved):

    #define ARRAY_SIZE 2048
    #define THREAD_SIZE 32
    #define PI 3.14

    int main(int argc, char** argv)
    {
        int array_plus=0,x,y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array size is not compatible with the thread size, it won't work.
        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0))
        {
            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)

            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);

            while(array_plus<ARRAY_SIZE)
            {
                //putting the big array values into the sub-matrix
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus)*ARRAY_SIZE+y];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel
                twiddle_factor<<<1,256>>>(isub_matrix,osub_matrix);//<----

                cudaMemcpy(osub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                array_plus=array_plus+THREAD_SIZE;
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[x*THREAD_SIZE+array_plus+y]=osub_matrix[x*THREAD_SIZE+y];
                }

                array_rest=array_plus+(ARRAY_SIZE);

                cudaFree(isub_matrix);
                cudaFree(osub_matrix);
                system("PAUSE");
            }

            //Stop the time
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU

            printf("The processing time took... %fms to finish",time);
            system("PAUSE");
        }

        printf("The processing time took...NAO ENTROU!");
        system("PAUSE");
        return 0;
    }

    //things to do: TRANSPOSITION!!!!
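For reference, this is roughly what I understood the "linear pointer" advice to mean (just a sketch of my understanding, the function name is made up): the full matrix is 2048*2048*4 bytes = 16 MB, far more than the default stack, so it has to live on the heap and be indexed as a flat 1D array.

    #define ARRAY_SIZE 2048

    void allocate_linear_example(void)
    {
        // float big_matrix[ARRAY_SIZE][ARRAY_SIZE];           // ~16 MB on the stack -> stack overflow
        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];  // heap, one linear block

        // element (x,y) of the conceptual 2D matrix lives at offset x*ARRAY_SIZE+y
        big_matrix[3*ARRAY_SIZE+5] = 1.0f;

        delete[] big_matrix;
    }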

Another question, about the parallel part: the compiler (Visual Studio) says that I am using too many pow() and exp() calls at once. How do I solve this problem?

    if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
    {
        block[xIndex][yIndex]=exp(sum_sin[xIndex][yIndex])+exp(sum_cos[xIndex][yIndex]);
    }
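(Just a guess on my part, since all of my data is float: maybe I should be calling the single-precision math functions expf(), powf() and sqrtf() instead of the double-precision exp() and pow()? Something like this sketch:)

    // hedged sketch, not a confirmed fix: single-precision variants for float data
    if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
    {
        block[xIndex][yIndex]=expf(sum_sin[xIndex][yIndex])+expf(sum_cos[xIndex][yIndex]);
        // and sqrtf(...) instead of pow(...,0.5) where the square root is taken
    }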

The kernel source code is below. I commented part of it out because I wanted to see whether my code was at least producing some values on the GPU. But it does not even launch the kernel... so sad.)

    __global__ void twiddle_factor(float *isub_matrix, float *osub_matrix)
    {
        __shared__ float block[THREAD_SIZE][THREAD_SIZE];

        // int x,y,z;
        unsigned int xIndex = threadIdx.x;
        unsigned int yIndex = threadIdx.y;

        /*
        int sum_sines=0.0;
        int sum_cosines=0.0;
        float sum_sin[THREAD_SIZE],sum_cos[THREAD_SIZE];
        float angle=(2*PI)/THREAD_SIZE;

        //put into shared memory the FFT calculation (F(u))
        for(x=0;x<THREAD_SIZE;x++)
        {
            for(y=0;y<THREAD_SIZE;y++)
            {
                for(z=0;z<THREAD_SIZE;z++)
                {
                    sum_sines=sum_sin+sin(isub_matrix[y*THREAD_SIZE+z]*(angle*z));
                    sum_cosines=sum_cos+cos(isub_matrix[y*THREAD_SIZE+z]*(angle*z));
                }
                sum_sin[x][y]=sum_sines/THREAD_SIZE;
                sum_cos[x][y]=sum_cosines/THREAD_SIZE;
            }
        }
        */

        if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
            block[xIndex][yIndex]=pow(THREAD_SIZE,0.5);
            //block[xIndex][yIndex]=pow(exp(sum_sin[xIndex*THREAD_SIZE+yIndex])+exp(sum_cos[xIndex*THREAD_SIZE+yIndex]),0.5);

        __syncthreads();

        //transposition X x Y
        //transfer back the results into another sub-matrix that is allocated in CPU
        if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
            osub_matrix[yIndex*THREAD_SIZE+xIndex]=block[xIndex][yIndex];

        __syncthreads();
    }

Thanks for reading all this!

Below is the whole code:

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <math.h>

    #define ARRAY_SIZE 2048
    #define THREAD_SIZE 32
    #define PI 3.14

    __global__ void twiddle_factor(float *isub_matrix, float *osub_matrix)
    {
        __shared__ float block[THREAD_SIZE][THREAD_SIZE];

        int x,y,z;
        unsigned int xIndex = threadIdx.x;
        unsigned int yIndex = threadIdx.y;

        float sum_sines=0.0;
        //float expo_sums;
        float sum_cosines=0.0;
        float sum_sin[THREAD_SIZE][THREAD_SIZE],sum_cos[THREAD_SIZE][THREAD_SIZE];
        float angle=(2*PI)/THREAD_SIZE;

        //put into shared memory the FFT calculation (F(u))
        for(x=0;x<THREAD_SIZE;x++)
        {
            for(y=0;y<THREAD_SIZE;y++)
            {
                for(z=0;z<THREAD_SIZE;z++)
                {
                    sum_sines=sum_sines+sin(isub_matrix[y*THREAD_SIZE+z]*(angle*z));
                    sum_cosines=sum_cosines+cos(isub_matrix[y*THREAD_SIZE+z]*(angle*z));
                }
                sum_sin[x][y]=sum_sines/THREAD_SIZE;
                sum_cos[x][y]=sum_cosines/THREAD_SIZE;
            }
        }

        if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
        {
            block[xIndex][yIndex]=exp(sum_sin[xIndex][yIndex])+exp(sum_cos[xIndex][yIndex]);
        }

        __syncthreads();

        //transposition X x Y
        //transfer back the results into another sub-matrix that is allocated in CPU
        if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
            osub_matrix[yIndex*THREAD_SIZE+xIndex]=block[xIndex][yIndex];

        __syncthreads();
    }

    int main(int argc, char** argv)
    {
        int array_plus=0,x,y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix,*d_osub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array size is not compatible with the thread size, it won't work.
        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
        {
            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)

            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);

            while(array_plus<ARRAY_SIZE)
            {
                //putting the big array values into the sub-matrix
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel
                twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);//<----

                cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                array_plus=array_plus+THREAD_SIZE;
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[x*THREAD_SIZE+array_plus+y]=osub_matrix[x*THREAD_SIZE+y];
                }

                cudaFree(isub_matrix);
                cudaFree(osub_matrix);
                cudaFree(d_osub_matrix);
                cudaFree(d_isub_matrix);
            }

            //Stop the time
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU
2 answers

I see a lot of problems in this code.

  • You do not allocate memory for isub_matrix before copying data from big_matrix to isub_matrix:

      for(x=0;x<THREAD_SIZE;x++)
      {
          for(y=0;y<THREAD_SIZE;y++)
              isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
      }
  • You are not doing cudaMemcpy from the host to the device for isub_matrix. After allocating memory on the device for isub_matrix, you need to copy the data.

  • I see that inside the while loop you are working on the same data every iteration:

      //putting the big array values into the sub-matrix
      for(x=0;x<THREAD_SIZE;x++)
      {
          for(y=0;y<THREAD_SIZE;y++)
              isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
      }

The for loop should depend on array_plus.

I suggest you do this:

      for(x=0;x<THREAD_SIZE;x++)
      {
          for(y=0;y<THREAD_SIZE;y++)
              isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus)*ARRAY_SIZE+y];
      }
  • Also, I do not see the point of array_rest. What is it used for?

Based on the updated version:

I see a mistake:

  • You use osub_matrix as both a host and a device pointer. I would suggest that you create another float pointer and use it for the device:

      float *d_osub_matrix;
      cudaMalloc((void**)&d_osub_matrix, THREAD_SIZE*THREAD_SIZE*sizeof(float));

and make the call:

 twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix); 

Then do:

 cudaMemcpy(osub_matrix,d_osub_matrix, ((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost); 
  • By the way, the kernel call is not

      twiddle_factor<<<1,256>>>(isub_matrix, osub_matrix);

It should be

      twiddle_factor<<<1,256>>>(d_isub_matrix, osub_matrix);
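Also note that <<<1,256>>> launches a one-dimensional block, so threadIdx.y is always 0 and only one row of the 32x32 tile ever gets written. The kernel indexes with both threadIdx.x and threadIdx.y, so it needs a 2D block, as in the final code below (a sketch, assuming the card allows 1024 threads per block):

      dim3 block(THREAD_SIZE, THREAD_SIZE);   // 32 x 32 = 1024 threads, 2D indices
      twiddle_factor<<<1,block>>>(d_isub_matrix, d_osub_matrix);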

Final and complete code:

    int main(int argc, char** argv)
    {
        int array_plus=0,x,y;
        int array_plus_x, array_plus_y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix,*d_osub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array size is not compatible with the thread size, it won't work.
        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
        {
            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)

            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);

            for(array_plus_x = 0; array_plus_x < ARRAY_SIZE; array_plus_x += THREAD_SIZE)
              for(array_plus_y = 0; array_plus_y < ARRAY_SIZE; array_plus_y += THREAD_SIZE)
              {
                //putting the big array values into the sub-matrix
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel
                dim3 block(32,32);
                twiddle_factor<<<1,block>>>(d_isub_matrix,d_osub_matrix);//<----

                cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)]=osub_matrix[x*THREAD_SIZE+y];
                }

                cudaFree(d_osub_matrix);
                cudaFree(d_isub_matrix);
              }

            //Stop the time
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU
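If the kernel still does not seem to run, check the return codes of the CUDA calls and of the launch itself; a failed launch otherwise fails silently. A minimal sketch (the CHECK_CUDA macro name is just illustrative; <stdio.h> and <stdlib.h> are already included above):

    #define CHECK_CUDA(call)                                              \
        do {                                                              \
            cudaError_t err = (call);                                     \
            if (err != cudaSuccess) {                                     \
                printf("CUDA error: %s at %s:%d\n",                       \
                       cudaGetErrorString(err), __FILE__, __LINE__);      \
                exit(1);                                                  \
            }                                                             \
        } while (0)

    // usage around the launch:
    // CHECK_CUDA(cudaMemcpy(d_isub_matrix, isub_matrix,
    //                       THREAD_SIZE*THREAD_SIZE*sizeof(float), cudaMemcpyHostToDevice));
    // twiddle_factor<<<1,block>>>(d_isub_matrix, d_osub_matrix);
    // CHECK_CUDA(cudaGetLastError());       // did the launch itself fail?
    // CHECK_CUDA(cudaDeviceSynchronize());  // did the kernel run to completion?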

I think the problem is in this line:

  cudaMemcpy(osub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost); 

This is because you allocated both osub_matrix and isub_matrix on the device.
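With a separate device pointer (the same fix the other answer suggests), the copy back to the host would look roughly like this:

    // d_osub_matrix is the device buffer written by the kernel,
    // osub_matrix is the plain host array allocated with new
    cudaMemcpy(osub_matrix, d_osub_matrix,
               (THREAD_SIZE*THREAD_SIZE)*sizeof(float), cudaMemcpyDeviceToHost);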

