CUDA Primes Generation

My CUDA program stops working (it does not print anything) once the data size is increased beyond about 260 thousand.

Can someone tell me why this is happening? This is my first CUDA program. And if I need large primes, how do I use a data type that is larger than the long long int on CUDA?

Graphics Card - GT425M.

#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>

#define SIZE 250000
#define BLOCK_NUM 96
#define THREAD_NUM 1024

/* Abort with a readable message when a CUDA runtime call fails, instead of
 * silently continuing with invalid device state. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Host copy of the candidate table; static storage, so it does not consume stack. */
int data[SIZE];

/*
 * Odd-number sieve.
 *
 * Slot k of num (k >= 1) holds the odd value 2*k+1 (see GenerateNumbers).
 * For every t < SIZE handled by this thread, the odd value p = 2*t+3 lives in
 * slot t+1, and the slots p*i + t + 1 (i = 1, 2, ...) hold the odd multiples
 * 3p, 5p, ... of p; those slots are overwritten with 0.
 *
 * Launch layout: BLOCK_NUM blocks of THREAD_NUM threads (1-D). Each thread
 * starts at its global index and advances by BLOCK_NUM*THREAD_NUM.
 *
 * num  : device array of SIZE ints, modified in place.
 * time : device array of 2*BLOCK_NUM clock_t; thread 0 of block b stores
 *        clock() at entry in time[b] and at exit in time[b+BLOCK_NUM].
 *
 * Several threads may store 0 to the same slot; every writer stores the same
 * value, so the result does not depend on ordering.
 */
__global__ static void sieve(int *num,clock_t* time){
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if(tid==0) time[bid] = clock();

    /* 64-bit indices: (2*t+3)*i + t + 1 can exceed INT_MAX once SIZE is
     * raised far enough, and signed int overflow would index out of bounds. */
    const long long stride = (long long)BLOCK_NUM*THREAD_NUM;
    for(long long t=(long long)bid*THREAD_NUM+tid; t<SIZE; t+=stride){
        const long long p = 2*t+3;
        /* First slot is p*1 + t + 1; consecutive odd multiples are p slots apart. */
        for(long long slot=p+t+1; slot<SIZE; slot+=p)
            num[slot] = 0;
    }

    if(tid==0) time[bid+BLOCK_NUM] = clock();
}

/*
 * Fill number[0..size) with the sieve candidates: number[0] = 2 and
 * number[i] = 2*i+1 for i >= 1. Does nothing when size <= 0.
 */
void GenerateNumbers(int *number,int size){
    for(int i=0;i<size;i++)
        number[i] = 2*i+1;
    if(size>0)
        number[0] = 2;   /* slot 0 would otherwise hold 1, which is not prime */
}

/*
 * Build the candidate table, run the sieve on the GPU, print every surviving
 * (non-zero) entry and the span between the earliest block start and the
 * latest block end as reported by the device clock() counter.
 */
int main(){
    GenerateNumbers(data,SIZE);

    int *gpudata = NULL;
    clock_t* time = NULL;

    /* The result buffer lives on the heap. As a local array it needed
     * SIZE*sizeof(int) bytes of stack (about 1 MB at SIZE = 250000), so a
     * slightly larger SIZE overflowed a typical 1 MB default stack and the
     * program died before printing anything. */
    int *cpudata = (int*)malloc(sizeof(int)*SIZE);
    if(cpudata==NULL){
        fprintf(stderr,"Failed to allocate %lu bytes of host memory\n",
                (unsigned long)(sizeof(int)*SIZE));
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void**)&gpudata,sizeof(int)*SIZE));
    CUDA_CHECK(cudaMalloc((void**)&time,sizeof(clock_t)*BLOCK_NUM*2));
    CUDA_CHECK(cudaMemcpy(gpudata,data,sizeof(int)*SIZE,cudaMemcpyHostToDevice));

    sieve<<<BLOCK_NUM,THREAD_NUM,0>>>(gpudata,time);
    CUDA_CHECK(cudaGetLastError());        /* invalid launch configuration */
    CUDA_CHECK(cudaDeviceSynchronize());   /* faults raised while the kernel ran */

    clock_t time_used[BLOCK_NUM * 2];
    CUDA_CHECK(cudaMemcpy(cpudata,gpudata,sizeof(int)*SIZE,cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(time_used,time,sizeof(clock_t)*BLOCK_NUM*2,cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(gpudata));
    CUDA_CHECK(cudaFree(time));

    for(int i=0;i<SIZE;i++)
        if(cpudata[i]!=0)
            printf("%d\t",cpudata[i]);

    /* Earliest recorded start and latest recorded end over all blocks. */
    clock_t min_start = time_used[0];
    clock_t max_end = time_used[BLOCK_NUM];
    for(int i=1;i<BLOCK_NUM;i++){
        if(min_start>time_used[i])
            min_start=time_used[i];
        if(max_end<time_used[i+BLOCK_NUM])
            max_end=time_used[i+BLOCK_NUM];
    }
    /* clock_t is not int on every platform; cast so the format matches. */
    printf("\nTime Cost: %ld\n",(long)(max_end-min_start));

    free(cpudata);
    return 0;
}
+6
source share
1 answer

(unsigned) long long int provides 64 bits. There is no built-in non-vector integer type whose width exceeds 64 bits. However, you can easily create your own 128-bit integer type. For instance:

 /* Unsigned 128-bit integer stored as two 64-bit halves. */
 typedef struct {
     unsigned long long int lo;   /* least-significant 64 bits */
     unsigned long long int hi;   /* most-significant 64 bits */
 } my_uint128;

 /* Returns a + b, wrapping modulo 2^128. */
 my_uint128 add_uint128 (my_uint128 a, my_uint128 b)
 {
     unsigned long long int low = a.lo + b.lo;
     /* Unsigned addition wraps, so the low sum is smaller than an operand
      * exactly when a carry out of the low half occurred. */
     unsigned long long int carry = (low < a.lo) ? 1ULL : 0ULL;
     my_uint128 sum;
     sum.hi = a.hi + b.hi + carry;
     sum.lo = low;
     return sum;
 }

If you need a higher-performance solution, consider mapping the 128-bit integer onto a uint4 and using inline PTX to handle carry propagation between the four 32-bit chunks more efficiently. a source

+1
source

All Articles