I have found a solution. I just needed to use an atomic function, that is, a function that performs its read-modify-write on a memory address as a single, indivisible operation, without interference from other threads. In other words, no other thread can access that address until the operation is complete.
The code:
#include <cstdlib>
#include <iostream>

using namespace std;

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess,
// prints the file/line and the runtime's error string, then exits.
// Without this, a failed allocation, copy, or launch would go unnoticed
// and the program would print a meaningless count.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::cerr << "CUDA error " << __FILE__ << ':' << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << '\n';        \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

// Kernel: every launched thread adds 1 to the single integer at *foo.
// atomicAdd performs the read-modify-write as one indivisible operation,
// so concurrent increments from different threads are not lost.
// foo must point to device-accessible memory.
__global__ void inc(int *foo) {
    atomicAdd(foo, 1);
}

// Copies a zero-initialized counter to the device, launches `inc` with
// 100 blocks of 25 threads (2500 threads in total), copies the counter
// back, and prints its value before and after the launch.
int main() {
    int count = 0;
    int *cuda_count = NULL;

    CUDA_CHECK(cudaMalloc((void**)&cuda_count, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(cuda_count, &count, sizeof(int), cudaMemcpyHostToDevice));
    cout << "count: " << count << '\n';

    inc<<<100, 25>>>(cuda_count);
    // A kernel launch returns no status itself; launch-configuration
    // errors have to be fetched explicitly.
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy is also the point where the host waits for the
    // kernel, so errors raised during kernel execution surface here.
    CUDA_CHECK(cudaMemcpy(&count, cuda_count, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(cuda_count));

    cout << "count: " << count << '\n';
    return 0;
}
Output:
count: 0
count: 2500
Thank you for making me understand the mistake I made.
Renato rodrigues
source share