CudaDeviceSynchronize () error code 77: cudaErrorIllegalAddress

Question

CudaDeviceSynchronize () error code 77: cudaErrorIllegalAddress

Thanks so much for reading my topics.

I am doing CUDA work, but keep getting the error code cudaDeviceSynchronize () 77: cudaErrorIllegalAddress, without any idea why. I did a search for code and functions, which is surprising, only a few entries appeared. Very strange.

I basically summarize all the pixels in the images. So that my questions have as many references as possible, I show all the CUDA code here:

#include "cuda_runtime.h" #include "device_launch_parameters.h" #include "thorcalgpu.h" #include <stdio.h> #include "math.h" #include <vector> #include <algorithm> #include <stdlib.h> #include <stdio.h> #include <vector> #include <numeric> #include <iostream> using namespace std; float random_float(void) { return static_cast<float>(rand()) / RAND_MAX; } __global__ void reduceSum(unsigned short *input, unsigned long long *per_block_results, const int n) { extern __shared__ unsigned long long sdata[]; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; // load input into __shared__ memory unsigned short x = 0; if(i < n) { x = input[i]; } sdata[threadIdx.x] = x; __syncthreads(); // contiguous range pattern for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) { if(threadIdx.x < offset) { // add a partial sum upstream to our own sdata[threadIdx.x] += sdata[threadIdx.x + offset]; } // wait until all threads in the block have // updated their partial sums __syncthreads(); } // thread 0 writes the final result if(threadIdx.x == 0) { per_block_results[blockIdx.x] = sdata[0]; } } // Helper function for using CUDA to add vectors in parallel. //template <class T> cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist) { int size = N*N; unsigned long long* dev_sum = 0; unsigned short* dev_img = 0; cudaError_t cudaStatus; const int block_size = 512; const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0); int L = filelist.size(); // Choose which GPU to run on, change this on a multi-GPU system. double totalgpuinittime = 0; StartCounter(7); cudaStatus = cudaSetDevice(0); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); goto Error; } // Allocate GPU buffers for three vectors (two input, one output) . cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } totalgpuinittime = GetCounter(7); unsigned short* img; unsigned short* pimg; unsigned long long* sum = new unsigned long long[num_blocks]; unsigned long long* psum = sum; cout<<endl; cout << "gpu looping starts, and in progress ..." << endl; StartCounter(6); double totalfileiotime = 0; double totalh2dcpytime = 0; double totalkerneltime = 0; double totald2hcpytime = 0; double totalcpusumtime = 0; double totalloopingtime = 0; for (int k = 0; k < L; k++) { StartCounter(1); img = (unsigned short*)LoadTIFF(filelist[k].c_str()); totalfileiotime += GetCounter(1); psum = sum; pimg = img; float gpumean = 0; memset(psum, 0, sizeof(unsigned long long)*num_blocks); StartCounter(2); // Copy input vectors from host memory to GPU buffers. cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } totalh2dcpytime += GetCounter(2); StartCounter(3); //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size); //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size); reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size); totalkerneltime += GetCounter(3); // Check for any errors launching the kernel cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; } // cudaDeviceSynchronize waits for the kernel to finish, and returns // any errors encountered during the launch. // !!!!!! following is where the code 77 error occurs!!!!!!! cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); goto Error; } // Copy output vector from GPU buffer to host memory. StartCounter(4); cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } totald2hcpytime += GetCounter(4); StartCounter(5); for (int i = 0; i < num_blocks; i++) { gpumean += *psum; psum++; } gpumean /= N*N; totalcpusumtime += GetCounter(5); delete img; img = NULL; cout<<gpumean<<endl; } int S = 1e+6; int F = filelist.size(); float R = S/F; totalloopingtime = GetCounter(6); cout<<"gpu looping ends."<<endl<<endl; cout<< "analysis:"<<endl; cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl; cout<<"file I/O time: "<<endl; cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl; cout<<"host-to-device copy time: "<<endl; cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl; cout<<"pure gpu kerneling time: "<<endl; cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl; cout<<"device-to-host copy time: "<<endl; cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl; /*cout<<"cpu summing time: "<<endl; cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/ /*cout <<"gpu looping time: " << endl; cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/ Error: cudaFree(dev_sum); cudaFree(dev_img); delete sum; sum = NULL; return cudaStatus; } void kernel(float* &mean, int N, vector<string> filelist) { // wrapper and kernel cudaError_t cudaStatus = gpuWrapper(mean, N, filelist); if (cudaStatus != cudaSuccess) { fprintf(stderr, "gpuWapper failed!"); } // printf("mean is: %f\n", mean); // cudaDeviceReset must be called before exiting in order for profiling and // tracing tools such as Nsight and Visual Profiler to show complete traces. StartCounter(8); cudaStatus = cudaDeviceReset(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceReset failed!"); } cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl; //return *mean; }

I have allocated enough and equivalent memory space for the host and device memory. Any comments are welcome.

+6

c ++ memory gpu cuda

Ono Apr 14 '14 at 20:13

source share

1 answer

talonmies · Accepted Answer · 2014-04-15T04:36:35+0000

Although this may not be the only source of errors in the code, you do not allocate any dynamic shared memory for the recovery kernel, which leads to the illegal addressing error that you see. A proper kernel launch should be something like

 size_t shm_size = block_size * sizeof(unsigned long long); reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);

This sets aside the equivalent of one unsigned long long for each thread running in the shrink kernel, which (in my very cursory reading of your code) should make the sdata shared memory sdata correct size for the kernel to run without out-of-bounds access to this array.

CudaDeviceSynchronize () error code 77: cudaErrorIllegalAddress

More articles: