When I came across this question, I was curious to know the answer, so I wrote the code snippet below to test atomic performance in different scenarios. The OS is Ubuntu 12.04 with CUDA 5.5, and the device is a GeForce GTX 780 (Kepler architecture). I compiled the code with the -O3 flag and for compute capability 3.5.
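For reference, the compile command was along these lines (the file name is just a placeholder):

nvcc -O3 -arch=sm_35 -o atomic_test atomic_test.cu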
#include <stdio.h>

static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess)
    {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32

__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+i, 6);  //arbitrary number to add
    }
}

__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i&(RESTRICTION_SIZE-1)), 6);   //arbitrary number to add
    }
}

__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i>>5), 6); //arbitrary number to add
    }
}

__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data, 6);    //arbitrary number to add
    }
}

__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+threadIdx.x, data[i]);
    }
}

__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x&(RESTRICTION_SIZE-1)), data[i&(RESTRICTION_SIZE-1)]);
    }
}

__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x>>5), data[i>>5]);
    }
}

__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data, data[0]);
    }
}

int main(void)
{
    const int n = 2 << 24;
    int* data = new int[n];
    int i;
    for(i=0; i<n; i++) {
        data[i] = i%1024+1;
    }

    int* dev_data;
    HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemcpy( dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        CoalescedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        AddressRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        WarpRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        SameAddressAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        CoalescedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        AddressRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        WarpRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE);    //12 SMX ON GTX780 each can have 2048 threads
        SameAddressAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    HANDLE_ERROR( cudaDeviceReset() );
    printf("Program finished without error.\n");
    return 0;
}
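One note on the launch configuration: the grid size is hard-coded for the GTX 780 (12 SMX, up to 2048 resident threads each). Purely as an illustration (not what produced the timings below, and FullOccupancyGrid is just a name I made up), the same "fill the GPU" grid could be derived at run time from the device properties:

// Illustration only: derive the grid size at run time instead of hard-coding
// 12*2048 for the GTX 780. Reuses the HANDLE_ERROR macro from the listing above.
#include <cuda_runtime.h>

static dim3 FullOccupancyGrid(int blockSize)
{
    cudaDeviceProp prop;
    HANDLE_ERROR( cudaGetDeviceProperties(&prop, 0) );        // device 0
    int residentThreads = prop.multiProcessorCount            // 12 SMX on the GTX 780
                        * prop.maxThreadsPerMultiProcessor;   // 2048 on Kepler
    return dim3(residentThreads / blockSize);                 // e.g. 96 blocks for BLOCK_SIZE == 256
}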
Basically, in the above code there are 8 kernels in which all threads perform atomicAdd over all the data:
- Coalesced atomic addition on global memory.
- Atomic addition on a restricted address space in global memory (32 addresses, set by RESTRICTION_SIZE in the code).
- Atomic addition by the lanes of each warp on the same address in global memory.
- Atomic addition by all threads on the same address in global memory.
Items 5 to 8 are the shared-memory counterparts of the items above, obtained by replacing global memory with shared memory. The chosen block size is 256; see the sketch below for the exact address pattern of each case.
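To make the access patterns explicit, here is a small sketch (illustration only, not part of the benchmark; the function names are made up) of which address a thread with global index i updates in the four global-memory kernels. The shared-memory kernels follow the same patterns with threadIdx.x and a per-block buffer.

// Illustration only: the address each thread with global index i hits,
// assuming a warp size of 32 (RESTRICTION_SIZE == 32, as in the code above).
__device__ int* CoalescedTarget(int* data, unsigned int i)         { return data + i; }        // one slot per thread, no conflicts
__device__ int* AddressRestrictedTarget(int* data, unsigned int i) { return data + (i & 31); } // all warps reuse the same 32 slots; within a warp each lane hits a different one
__device__ int* WarpRestrictedTarget(int* data, unsigned int i)    { return data + (i >> 5); } // the 32 lanes of one warp share one slot; different warps use different slots
__device__ int* SameAddressTarget(int* data, unsigned int)         { return data; }            // every thread in the grid hits the same slot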
I used nvprof to profile the program. The output is:
Time(%)      Time     Calls       Avg       Min       Max  Name
 44.33%  2.35113s        50  47.023ms  46.987ms  47.062ms  SameAddressAtomicOnSharedMem(int*, int)
 31.89%  1.69104s        50  33.821ms  33.818ms  33.826ms  SameAddressAtomicOnGlobalMem(int*, int)
 10.10%  535.88ms        50  10.718ms  10.707ms  10.738ms  WarpRestrictedAtomicOnSharedMem(int*, int)
  3.96%  209.95ms        50  4.1990ms  4.1895ms  4.2103ms  AddressRestrictedAtomicOnSharedMem(int*, int)
  3.95%  209.47ms        50  4.1895ms  4.1893ms  4.1900ms  AddressRestrictedAtomicOnGlobalMem(int*, int)
  3.33%  176.48ms        50  3.5296ms  3.5050ms  3.5498ms  WarpRestrictedAtomicOnGlobalMem(int*, int)
  1.08%  57.428ms        50  1.1486ms  1.1460ms  1.1510ms  CoalescedAtomicOnGlobalMem(int*, int)
  0.84%  44.784ms        50  895.68us  888.65us  905.77us  CoalescedAtomicOnSharedMem(int*, int)
  0.51%  26.805ms         1  26.805ms  26.805ms  26.805ms  [CUDA memcpy HtoD]
  0.01%  543.61us         1  543.61us  543.61us  543.61us  [CUDA memset]
Obviously, the coalesced conflict-free atomic operations had the best performance, and the same-address case had the worst. What I cannot explain is why same-address atomics in shared memory (within a block) are slower than in global memory (shared by all threads). When all the lanes of a warp access the same location in shared memory, performance is very poor, but (surprisingly) this is not the case when they do it in global memory; I can't explain why. Another confusing case is that address-restricted atomics on global memory perform worse than having all the threads inside a warp update the same address, even though the memory contention in the first case appears to be lower.
In any case, I would be happy if someone could explain the above profiling results.