GPU memory allocation for a dynamic array of structures

I have a problem passing an array of structs to a GPU kernel. I based my code on this topic - "cudaMemcpy segmentation error" - and wrote something like this:

#include <stdio.h>
#include <stdlib.h>

// One record holding a pointer to a separately allocated character buffer.
struct Test {
    char *array;
};

// Prints the first five characters reached through dev_test[0].array.
// Launched below with a single block of a single thread.
__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

// Builds n structs on the host, tries to mirror them on the device, and runs
// the kernel. NOTE(review): this is the version that prints incorrect values;
// the comments inside the last loop mark where it goes wrong.
int main(void) {
    int n = 4, size = 5;
    Test *dev_test, *test;

    // Host array of n structs, each given its own host buffer of `size` chars.
    test = (Test*)malloc(sizeof(Test)*n);
    for(int i = 0; i < n; i++)
        test[i].array = (char*)malloc(size * sizeof(char));

    // Fill every host buffer with 'a'..'e'.
    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    // Device array of structs; the structs are copied as-is, so at this point
    // the device copies hold the host malloc pointers in their array fields.
    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);

    for(int i=0; i < n; i++) {
        // Overwrites test[i].array (the host buffer pointer) with a fresh
        // device pointer: the 'a'..'e' data is no longer reachable from test.
        cudaMalloc((void**)&(test[i].array), size * sizeof(char));
        // Copies `size` bytes starting at the pointer field itself, not
        // sizeof(char *) bytes, and never copies any characters into the
        // newly allocated device buffer.
        cudaMemcpy(&(dev_test[i].array), &(test[i].array), size * sizeof(char), cudaMemcpyHostToDevice);
    }

    kernel<<<1, 1>>>(dev_test);
    // Wait for the kernel so its printf output is flushed before exit.
    cudaDeviceSynchronize();

    // memory free
    return 0;
}

There is no error, but the values displayed in the kernel are incorrect. What am I doing wrong? Thanks in advance for any help.

+2
c struct cuda dynamic-memory-allocation
source share
1 answer
  • This allocates host memory and stores the new pointer:

     test[i].array = (char*)malloc(size * sizeof(char)); 
  • This copies data into that region of host memory:

     memcpy(test[i].array, temp, size * sizeof(char)); 
  • This overwrites the previously allocated pointer to host memory (from step 1 above) with a new pointer to device memory:

     cudaMalloc((void**)&(test[i].array), size * sizeof(char)); 

After step 3, the data that you set up in step 2 is completely lost and no longer accessible. Refer to steps 3 and 4 in the question/answer you linked:

3. Create a separate int pointer on the host, call it myhostptr

4. cudaMalloc int storage on device for myhostptr

You have not done this. You did not create a separate pointer. You reused (erased, overwrote) an existing pointer that pointed to data you cared about on the host. This question/answer, also related to the answer you linked, gives almost exactly what you need, in code.

Here is a modified version of your code that properly implements steps 3 and 4 (and 5), which you did not implement correctly according to the linked question/answer (see the comments marking steps 3, 4 and 5):

 $ cat t755.cu
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 // Abort with file/line and the CUDA error string if a runtime call fails.
 // Without this, a failed cudaMalloc/cudaMemcpy goes unnoticed and the kernel
 // just prints garbage.
 #define CUDA_CHECK(call)                                                  \
     do {                                                                  \
         cudaError_t err_ = (call);                                        \
         if (err_ != cudaSuccess) {                                        \
             fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                     cudaGetErrorString(err_));                            \
             exit(EXIT_FAILURE);                                           \
         }                                                                 \
     } while (0)

 // One record holding a pointer to a separately allocated character buffer.
 struct Test {
     char *array;
 };

 // Prints the first five characters of dev_test[0].array.
 // Expects dev_test[0].array to be a device pointer to at least 5 chars.
 // Launched with one block of one thread; device printf is for debugging only.
 __global__ void kernel(Test *dev_test) {
     for (int i = 0; i < 5; i++) {
         printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
     }
 }

 // Builds n structs with embedded buffers on the host, mirrors them on the
 // device (fixing up the embedded pointers), runs the kernel, and releases
 // everything. Returns 0 on success; exits with EXIT_FAILURE on any error.
 int main(void) {
     int n = 4, size = 5;
     Test *dev_test, *test;

     // Host array of structs, each with its own host buffer filled with a..e.
     test = (Test*)malloc(sizeof(Test) * n);
     if (test == NULL) {
         fprintf(stderr, "host allocation failed\n");
         return EXIT_FAILURE;
     }
     for (int i = 0; i < n; i++) {
         char temp[] = { 'a', 'b', 'c', 'd', 'e' };
         test[i].array = (char*)malloc(size * sizeof(char));
         if (test[i].array == NULL) {
             fprintf(stderr, "host allocation failed\n");
             return EXIT_FAILURE;
         }
         memcpy(test[i].array, temp, size * sizeof(char));
     }

     // Device array of structs. This copy carries the host pointers along;
     // they are replaced with device pointers in step 5.
     CUDA_CHECK(cudaMalloc((void**)&dev_test, n * sizeof(Test)));
     CUDA_CHECK(cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice));

     // Step 3: separate host-side pointers that will hold the device addresses.
     // Heap-allocated because an array sized by a runtime int is not standard C++.
     char **temp_data = (char**)malloc(n * sizeof(char*));
     if (temp_data == NULL) {
         fprintf(stderr, "host allocation failed\n");
         return EXIT_FAILURE;
     }

     // Step 4: allocate device storage for each embedded buffer.
     for (int i = 0; i < n; i++)
         CUDA_CHECK(cudaMalloc((void**)&(temp_data[i]), size * sizeof(char)));

     // Step 5: store each device pointer in the device-side struct.
     // &(dev_test[i].array) is only address arithmetic on the host; nothing
     // on the device is dereferenced here.
     for (int i = 0; i < n; i++)
         CUDA_CHECK(cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice));

     // now copy the embedded data:
     for (int i = 0; i < n; i++)
         CUDA_CHECK(cudaMemcpy(temp_data[i], test[i].array, size * sizeof(char), cudaMemcpyHostToDevice));

     kernel<<<1, 1>>>(dev_test);
     CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
     CUDA_CHECK(cudaDeviceSynchronize());   // execution errors; flushes printf

     // memory free
     for (int i = 0; i < n; i++) {
         CUDA_CHECK(cudaFree(temp_data[i]));
         free(test[i].array);
     }
     CUDA_CHECK(cudaFree(dev_test));
     free(temp_data);
     free(test);
     return 0;
 }
 $ nvcc -o t755 t755.cu
 $ cuda-memcheck ./t755
 ========= CUDA-MEMCHECK
 Kernel[0][i]: a 
 Kernel[0][i]: b 
 Kernel[0][i]: c 
 Kernel[0][i]: d 
 Kernel[0][i]: e 
 ========= ERROR SUMMARY: 0 errors
 $ 

Since the above methodology can be tricky for beginners, the usual recommendation is not to do this, but instead to flatten your data structures. Flattening generally means rearranging the data storage so as to remove embedded pointers that have to be allocated separately.

A trivial example of flattening this data structure would be:

 // Flattened layout: the five characters are stored inside the struct itself
 // rather than behind a pointer, so an array of Test is one contiguous block.
 struct Test { char array[5]; }; 

It's recognized, of course, that this particular approach would not serve every purpose, but it should illustrate the general idea/intent. With this modification, for example, the code becomes much simpler:

 $ cat t755.cu
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 // Abort with file/line and the CUDA error string if a runtime call fails.
 #define CUDA_CHECK(call)                                                  \
     do {                                                                  \
         cudaError_t err_ = (call);                                        \
         if (err_ != cudaSuccess) {                                        \
             fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                     cudaGetErrorString(err_));                            \
             exit(EXIT_FAILURE);                                           \
         }                                                                 \
     } while (0)

 // Flattened record: the characters live inside the struct, so there is no
 // embedded pointer to fix up when the array is copied to the device.
 struct Test {
     char array[5];
 };

 // Prints the five characters stored in dev_test[0].array.
 // Launched with one block of one thread; device printf is for debugging only.
 __global__ void kernel(Test *dev_test) {
     for (int i = 0; i < 5; i++) {
         printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
     }
 }

 // Fills n structs on the host, copies them to the device in a single
 // transfer, runs the kernel, and releases everything. Returns 0 on success;
 // exits with EXIT_FAILURE on any error.
 int main(void) {
     int n = 4, size = 5;   // size matches the length of Test::array
     Test *dev_test, *test;

     test = (Test*)malloc(sizeof(Test) * n);
     if (test == NULL) {
         fprintf(stderr, "host allocation failed\n");
         return EXIT_FAILURE;
     }
     for (int i = 0; i < n; i++) {
         char temp[] = { 'a', 'b', 'c', 'd', 'e' };
         memcpy(test[i].array, temp, size * sizeof(char));
     }

     // One allocation and one copy move the structs and their data together.
     CUDA_CHECK(cudaMalloc((void**)&dev_test, n * sizeof(Test)));
     CUDA_CHECK(cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice));

     kernel<<<1, 1>>>(dev_test);
     CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
     CUDA_CHECK(cudaDeviceSynchronize());   // execution errors; flushes printf

     // memory free
     CUDA_CHECK(cudaFree(dev_test));
     free(test);
     return 0;
 }
 $ nvcc -o t755 t755.cu
 $ cuda-memcheck ./t755
 ========= CUDA-MEMCHECK
 Kernel[0][i]: a 
 Kernel[0][i]: b 
 Kernel[0][i]: c 
 Kernel[0][i]: d 
 Kernel[0][i]: e 
 ========= ERROR SUMMARY: 0 errors
 $ 
+5
source share

All Articles