GPU code runs slower than the CPU version

I am working on an application that splits a string of text into pieces and assigns one piece to each block. Within a block the text is scanned character by character, and a shared int array D is updated by the threads in parallel depending on the character read. At the end of each iteration the last element of D is checked, and if it satisfies the condition, a global int array matched is set to 1 at the position corresponding to that point in the text. This code runs on an NVIDIA GeForce 550 (Fermi) and ends up slower than the CPU version. I have included just the kernel here:

__global__ void match(uint32_t* BB_d, const char* text_d, int n, int m, int k, int J,
                      int lc, int start_addr, int tBlockSize, int overlap, int* matched)
{
    __shared__ int  D[MAX_THREADS + 2];
    __shared__ char Text_S[MAX_PATTERN_SIZE];
    __shared__ int  DNew[MAX_THREADS + 2];
    __shared__ int  BB_S[4][MAX_THREADS];

    int w = threadIdx.x + 1;

    for (int i = 0; i < 4; i++) {
        BB_S[i][threadIdx.x] = BB_d[i * J + threadIdx.x];
    }

    D[threadIdx.x] = 0;
    D[w] = (1 << (k + 1)) - 1;
    for (int i = 0; i < lc - 1; i++) {
        D[w] = (D[w] << (k + 2)) + (1 << (k + 1)) - 1;
    }
    D[J + 1] = (1 << ((k + 2) * lc)) - 1;

    int startblock = (blockIdx.x == 0 ? start_addr
                                      : start_addr + blockIdx.x * (tBlockSize - overlap));
    int size      = ((startblock + tBlockSize) > n) ? (n - startblock) : tBlockSize;
    int copyBlock = (size / J) + ((size % J) == 0 ? 0 : 1);

    if ((threadIdx.x * copyBlock) <= size)
        memcpy(Text_S + threadIdx.x * copyBlock,
               text_d + startblock + threadIdx.x * copyBlock,
               ((threadIdx.x * copyBlock + copyBlock) > size)
                   ? (size - threadIdx.x * copyBlock) : copyBlock);

    memcpy(DNew, D, (J + 2) * sizeof(int));
    __syncthreads();

    uint32_t initial = D[1];
    uint32_t x;
    uint32_t mask = 1;
    for (int i = 0; i < lc - 1; i++)
        mask = (mask << (k + 2)) + 1;

    for (int i = 0; i < size; i++) {
        x = ((D[w] >> (k + 2)) | (D[w - 1] << ((k + 2) * (lc - 1)))
             | BB_S[(((int)Text_S[i]) / 2) % 4][w - 1])
            & ((1 << (k + 2) * lc) - 1);

        DNew[w] = ((D[w] << 1) | mask)
                & ((D[w] << (k + 3)) | mask | ((D[w + 1] >> ((k + 2) * (lc - 1))) << 1))
                & (((x + mask) ^ x) >> 1)
                & initial;

        __syncthreads();
        memcpy(D, DNew, (J + 2) * sizeof(int));

        if (!(D[J] & 1 << (k + (k + 2) * (lc * J - m + k)))) {
            matched[startblock + i] = 1;
            D[J] |= ((1 << (k + 1 + (k + 2) * (lc * J - m + k))) - 1);
        }
    }
}

I am not very familiar with CUDA, and I do not fully understand issues such as shared memory bank conflicts. Could that be the bottleneck here?

For completeness, this is the host code from which I launch the kernel:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda.h>

#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000

void calculateBBArray(uint32_t** BB, const char* pattern_h, int m, int k, int lc, int J) {}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

char* getTextString()
{
    FILE *input;
    int c;
    char *inputbuffer = (char *)malloc(sizeof(char) * TEXT_MAX_LENGTH);
    int numchars = 0;

    input = fopen("sequence.fasta", "r");
    c = fgetc(input);
    while (c != EOF) {
        inputbuffer[numchars] = c;
        numchars++;
        c = fgetc(input);
    }
    fclose(input);
    inputbuffer[numchars] = '\0';
    return inputbuffer;
}

int main(void)
{
    const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
    char *text_h = getTextString();   // reading text from file, supported up to 200MB currently
    int k = 13;
    int i;
    int count = 0;
    char *pattern_d, *text_d;         // pointers to device memory
    char *text_new_d;
    int *matched_d;
    int *matched_new_d;
    uint32_t *BB_d;
    uint32_t *BB_new_d;
    int *matched_h = (int *)malloc(sizeof(int) * strlen(text_h));

    cudaMalloc((void **)&pattern_d, sizeof(char) * strlen(pattern_h) + 1);
    cudaMalloc((void **)&text_d, sizeof(char) * strlen(text_h) + 1);
    cudaMalloc((void **)&matched_d, sizeof(int) * strlen(text_h));

    cudaMemcpy(pattern_d, pattern_h, sizeof(char) * strlen(pattern_h) + 1, cudaMemcpyHostToDevice);
    cudaMemcpy(text_d, text_h, sizeof(char) * strlen(text_h) + 1, cudaMemcpyHostToDevice);
    cudaMemset(matched_d, 0, sizeof(int) * strlen(text_h));

    int m = strlen(pattern_h);
    int n = strlen(text_h);
    uint32_t *BB_h[4];
    unsigned int maxLc = ((((m - k) * (k + 2)) > 31) ? (31 / (k + 2)) : (m - k));
    unsigned int lc = 2;   // determines the number of threads per block;
                           // can be varied up to maxLc to tune performance
    if (lc > maxLc) {
        exit(0);
    }
    unsigned int noWordorNfa = ((m - k) / lc) + (((m - k) % lc) == 0 ? 0 : 1);

    cudaMalloc((void **)&BB_d, sizeof(int) * noWordorNfa * 4);
    if (noWordorNfa >= MAX_THREADS) {
        printf("Error: max threads\n");
        exit(0);
    }

    calculateBBArray(BB_h, pattern_h, m, k, lc, noWordorNfa);   // body not included here
    for (i = 0; i < 4; i++) {
        cudaMemcpy(BB_d + i * noWordorNfa, BB_h[i], sizeof(int) * noWordorNfa, cudaMemcpyHostToDevice);
    }

    int overlap = m;
    int textBlockSize = (((m + k + 1) > n) ? n : (m + k + 1));
    cudaStream_t stream[MAX_STREAMS];
    for (i = 0; i < MAX_STREAMS; i++) {
        cudaStreamCreate(&stream[i]);
    }

    int start_addr = 0, index = 0, maxNoBlocks = 0;
    if (textBlockSize > n) {
        maxNoBlocks = 1;
    } else {
        maxNoBlocks = (1 + ((n - textBlockSize) / (textBlockSize - overlap))
                         + (((n - textBlockSize) % (textBlockSize - overlap)) == 0 ? 0 : 1));
    }
    int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS) ? MAX_BLOCKS : maxNoBlocks);
    int blocksRemaining = maxNoBlocks;
    printf(" maxNoBlocks %d kernel Blocks %d \n", maxNoBlocks, kernelBlocks);

    while (blocksRemaining > 0) {
        kernelBlocks = ((blocksRemaining > MAX_BLOCKS) ? MAX_BLOCKS : blocksRemaining);
        printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",
               kernelBlocks, start_addr, textBlockSize);
        match<<<kernelBlocks, noWordorNfa, 0, stream[(index++) % MAX_STREAMS]>>>
             (BB_d, text_d, n, m, k, noWordorNfa, lc, start_addr, textBlockSize, overlap, matched_d);
        start_addr += kernelBlocks * (textBlockSize - overlap);
        blocksRemaining -= kernelBlocks;
    }

    cudaMemcpy(matched_h, matched_d, sizeof(int) * strlen(text_h), cudaMemcpyDeviceToHost);
    checkCUDAError("Matched Function");
    for (i = 0; i < MAX_STREAMS; i++)
        cudaStreamSynchronize(stream[i]);

    // do stuff with matched
    // ....
    // ....

    free(matched_h);
    cudaFree(pattern_d);
    cudaFree(text_d);
    cudaFree(matched_d);
    return 0;
}

The number of threads per block depends on the length of pattern_h (lc can be varied up to maxLc above); in this case I expect it to be around 30. Is that enough to get a reasonable amount of concurrency? As for blocks, I see no reason to launch more than MAX_BLOCKS (= 10) at a time, since the hardware can only schedule 8 simultaneously anyway.
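For reference, the relevant per-device limits can be queried at runtime; this is just a small sanity-check sketch using the standard CUDA runtime call, separate from the code above:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);   // device 0
printf("SMs: %d, max threads per block: %d, shared memory per block: %lu bytes\n",
       prop.multiProcessorCount, prop.maxThreadsPerBlock,
       (unsigned long)prop.sharedMemPerBlock);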

NOTE: I do not have access to a graphical interface.

+6
3 answers

I found that I was copying the entire DNew array into D in every thread, instead of having each thread copy back only the part it had to update, D[w]. This forces the threads to execute in serialized batches, although I am not sure whether it strictly counts as a shared memory bank conflict. After fixing it I now get an 8-9x speedup for sufficiently large patterns (which means more threads). That is still much less than I expected. I will try increasing the number of blocks as suggested; I do not know how to increase the number of threads.
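To make the fix concrete, this is roughly the change inside the scan loop (a sketch only, reusing the variable names from the kernel above):

// Before: every thread copied the whole array, serializing the loop.
//   memcpy(D, DNew, (J + 2) * sizeof(int));

// After: each thread writes back only the word it owns, then synchronizes
// so neighbouring words are visible in the next iteration.
D[w] = DNew[w];
__syncthreads();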

+1

With all the shared memory you are using, you may be running into bank conflicts if consecutive threads do not read from consecutive addresses in the shared arrays... that serializes the memory accesses, which in turn kills the parallel performance of your algorithm.
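To illustrate, here is a minimal stand-alone sketch (the kernel name and array layout are made up, not taken from the code above); on Fermi, shared memory has 32 four-byte banks, so a stride of 32 ints across a warp hits a single bank:

__global__ void bank_access_demo(int* out)
{
    __shared__ int s[32 * 32];
    int t = threadIdx.x;                 // assume one warp of 32 threads

    for (int i = t; i < 32 * 32; i += 32)
        s[i] = i;
    __syncthreads();

    // Conflict-free: consecutive threads read consecutive words,
    // i.e. 32 different banks are touched in one transaction.
    int fast = s[t];

    // 32-way bank conflict: all 32 addresses map to the same bank,
    // so the hardware replays the access 32 times.
    int slow = s[t * 32];

    out[t] = fast + slow;
}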

+3

I only glanced over your code, but it looks like you are shipping data back and forth between host and GPU, which can make the bus the bottleneck. Have you tried profiling it?
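Since you say you have no graphical interface, one command-line option is to bracket the transfers and the kernel launches with CUDA events; the snippet below is only a sketch of the idea, not your exact call sequence:

cudaEvent_t t0, t1;
cudaEventCreate(&t0);
cudaEventCreate(&t1);
float ms = 0.0f;

// Time the host-to-device copy of the text.
cudaEventRecord(t0, 0);
cudaMemcpy(text_d, text_h, sizeof(char) * strlen(text_h) + 1, cudaMemcpyHostToDevice);
cudaEventRecord(t1, 0);
cudaEventSynchronize(t1);
cudaEventElapsedTime(&ms, t0, t1);
printf("H2D copy: %.3f ms\n", ms);

// Time all kernel launches as one batch.
cudaEventRecord(t0, 0);
/* ... the while loop that launches match<<<...>>> goes here ... */
cudaEventRecord(t1, 0);
cudaEventSynchronize(t1);
cudaEventElapsedTime(&ms, t0, t1);
printf("kernels:  %.3f ms\n", ms);

cudaEventDestroy(t0);
cudaEventDestroy(t1);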

+1
