Prior to CUDA 4.0, multi-GPU programming required multi-threaded CPU programming. This can be tricky, especially if you need to synchronize and / or communicate between threads or GPUs. And if all of your parallelism is in your GPU, then having multiple CPU threads can add to the complexity of your software without improving performance beyond what the GPU does.
So, starting with CUDA 4.0, you can easily program multiple GPUs from a single-threaded host program. Here are a few slides that I presented last year about this topic.
Programming multiple GPUs can be simple:
// Query how many CUDA devices are present. Note: the runtime API call is
// cudaGetDeviceCount -- there is no cudaGetNumDevices in the CUDA runtime.
int numDevs = 0;
cudaGetDeviceCount(&numDevs);
...
// Launch the same kernel on every device. cudaSetDevice() selects the
// device targeted by subsequent runtime calls and kernel launches.
// Kernel launches are asynchronous, so this loop does not serialize the
// GPUs: each launch returns to the host immediately.
for (int d = 0; d < numDevs; d++) {
    cudaSetDevice(d);
    kernel<<<blocks, threads>>>(args);
}
For example, suppose we want to compute the inner product of two large host vectors using thrust::inner_product, with the work split across all available GPUs. Each device processes its own contiguous chunk of the input:
int numDevs = 0;
cudaGetDeviceCount(&numDevs);  // was cudaGetNumDevices, which does not exist
...
float result = 0.f;
// Split the input across devices: device d processes elements
// [d*chunk, (d+1)*chunk). The chunk size is n/numDevs -- the original
// slide divided by the loop index d, which is wrong and divides by zero
// on the first iteration. The device_vector range constructor takes an
// exclusive end iterator, so there is no "-1" (that dropped the last
// element of every chunk). The final device absorbs the remainder when
// numDevs does not evenly divide n.
for (int d = 0; d < numDevs; d++) {
    cudaSetDevice(d);
    int chunk = n / numDevs;
    int first = d * chunk;
    int last  = (d == numDevs - 1) ? n : first + chunk;  // exclusive end
    device_vector<float> vecA(h_vecA + first, h_vecA + last);
    device_vector<float> vecB(h_vecB + first, h_vecB + last);
    // inner_product synchronizes to return its scalar result to the host.
    result += thrust::inner_product(vecA.begin(), vecA.end(),
                                    vecB.begin(), 0.f);
}
(Here I assume, for clarity, that n is evenly divisible by numDevs, but you get the idea. :)
This is simple, and it works. But it is not fast — the loop is effectively sequential, so the GPUs never actually work at the same time.

The reason is that every step in the loop is synchronous: the device_vector constructors copy data to each device with blocking cudaMemcpy calls, those transfers contend for PCI-e bandwidth with no concurrency between them, and thrust::inner_product itself synchronizes in order to return its result to the host. To overlap the devices you need asynchronous transfers with cudaMemcpyAsync (which device_vector does not use — it calls plain cudaMemcpy), or "zero-copy" mapped pinned memory that the GPUs can read directly over PCI-e. The pinned memory must also be allocated as portable so that every device — not just the one that was current at allocation time — can access it. Since kernel launches are asynchronous, launching one kernel per device and recording an event after each launch then gives us the concurrency we want, with the host waiting on the events to collect the partial results.
Here is a version that uses zero-copy mapped memory and asynchronous kernel launches so that all GPUs work concurrently:
int bytes = sizeof(float) * n;
// Allocate pinned (page-locked) host memory that is both mapped into the
// device address space (cudaHostAllocMapped, enabling zero-copy access)
// and portable (cudaHostAllocPortable, so every device's context can use
// it, not just the one current at allocation time). cudaHostAlloc writes
// the allocation through its first argument, so the ADDRESS of each
// pointer must be passed -- the original passed the pointers by value.
cudaHostAlloc((void**)&h_vecA, bytes, cudaHostAllocMapped | cudaHostAllocPortable);
cudaHostAlloc((void**)&h_vecB, bytes, cudaHostAllocMapped | cudaHostAllocPortable);
cudaHostAlloc((void**)&results, numDevs * sizeof(float),
              cudaHostAllocMapped | cudaHostAllocPortable);
for (int d = 0; d < numDevs; d++) {
    cudaSetDevice(d);
    // cudaEventCreate takes a cudaEvent_t* (the original had a stray ')'
    // and passed the event by value).
    cudaEventCreate(&event[d]);
    // Get each device's own device-side alias of the mapped host buffers;
    // the kernel on device d must use these, not the host pointers.
    cudaHostGetDevicePointer(&dptrsA[d], h_vecA, 0);
    cudaHostGetDevicePointer(&dptrsB[d], h_vecB, 0);
    cudaHostGetDevicePointer(&dresults[d], results, 0);
}
...
// Launch one partial inner product per device, each reading its slice of
// the mapped (zero-copy) host buffers through that device's pointers.
// Chunk size is n/numDevs (the original used n/d, which divides by zero
// for d == 0), and "last" is an exclusive end pointer, so no "-1" (that
// dropped an element per chunk). The original also passed vecA/vecB,
// names that do not exist here -- the per-device aliases obtained from
// cudaHostGetDevicePointer are dptrsA[d]/dptrsB[d]. Launches are
// asynchronous, so all GPUs run concurrently; the recorded event lets
// the host later wait for each device's result.
for (int d = 0; d < numDevs; d++) {
    cudaSetDevice(d);
    int first = d * (n / numDevs);
    int last  = (d == numDevs - 1) ? n : (d + 1) * (n / numDevs);
    my_inner_product<<<grid, block>>>(&dresults[d],
                                      dptrsA[d] + first,
                                      dptrsA[d] + last,
                                      dptrsB[d] + first, 0.f);
    cudaEventRecord(event[d], 0);
}
float total = 0.0f;
// Wait for each device's kernel to finish, then accumulate its partial
// result. The original looped to an undeclared 'devs' (should be
// numDevs) and always read results[numDevs] -- one past the end of the
// array -- instead of the per-device slot results[d].
for (int d = 0; d < numDevs; d++) {
    cudaEventSynchronize(event[d]);
    total += results[d];
}