The following is a snippet of source code in our sample gallery, which you can find in the Alea GPU sample gallery http://www.quantalea.com/gallery .
. , .
public static class FloydWarshallSingleStage
{
const int BlockWidth = 16;
public static void KernelSingleStage(int u, int[,] d, int[,] p)
{
var n = d.GetLength(0);
var v1 = blockDim.y * blockIdx.y + threadIdx.y;
var v2 = blockDim.x * blockIdx.x + threadIdx.x;
if (v1 < n && v2 < n)
{
var newPath = d[v1, u] + d[u, v2];
var oldPath = d[v1, v2];
if (oldPath > newPath)
{
d[v1, v2] = newPath;
p[v1, v2] = p[u, v2];
}
}
}
[GpuManaged]
public static void Run(Gpu gpu, int[,] d, int[,] p)
{
var n = d.GetLength(0);
var gridDim = new dim3((n - 1) / BlockWidth + 1, (n - 1) / BlockWidth + 1, 1);
var blockDim = new dim3(BlockWidth, BlockWidth, 1);
var lp = new LaunchParam(gridDim, blockDim);
for (var u = 0; u < n; u++)
{
gpu.Launch(KernelSingleStage, lp, u, d, p);
}
}
}
, . , , , . .
public class FloydWarshallMultiStage
{
private const int None = -1;
private const int Inf = 1061109567;
private const int BlockSize = 16;
private const int ThreadSize = 2;
private const int VirtualBlockSize = BlockSize*ThreadSize;
public FloydWarshallMultiStage(int blockSize, int threadSize)
{
}
public void KernelPhaseOne(int block, int n, int pitch, int[,] d, int[,] p)
{
var newPred = 0;
var tx = threadIdx.x;
var ty = threadIdx.y;
var v1 = VirtualBlockSize*block + ty;
var v2 = VirtualBlockSize*block + tx;
var primaryD = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
var primaryP = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
if (v1 < n && v2 < n)
{
primaryD[ty, tx] = d[v1, v2];
primaryP[ty, tx] = p[v1, v2];
newPred = primaryP[ty, tx];
}
else
{
primaryD[ty, tx] = Inf;
primaryP[ty, tx] = None;
}
DeviceFunction.SyncThreads();
for (var i = 0; i < VirtualBlockSize; i++)
{
var newPath = primaryD[ty, i] + primaryD[i, tx];
DeviceFunction.SyncThreads();
if (newPath < primaryD[ty, tx])
{
primaryD[ty, tx] = newPath;
newPred = primaryP[i, tx];
}
DeviceFunction.SyncThreads();
primaryP[ty, tx] = newPred;
}
if (v1 < n && v2 < n)
{
d[v1, v2] = primaryD[ty, tx];
p[v1, v2] = primaryP[ty, tx];
}
}
public void KernelPhaseTwo(int block, int n, int pitch, int[,] d, int[,] p)
{
if (blockIdx.x == block) return;
var newPath = 0;
var newPred = 0;
var tx = threadIdx.x;
var ty = threadIdx.y;
var v1 = VirtualBlockSize*block + ty;
var v2 = VirtualBlockSize*block + tx;
var primaryD = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
var currentD = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
var primaryP = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
var currentP = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
if (v1 < n && v2 < n)
{
primaryD[ty, tx] = d[v1, v2];
primaryP[ty, tx] = p[v1, v2];
}
else
{
primaryD[ty, tx] = Inf;
primaryP[ty, tx] = None;
}
if (blockIdx.y == 0)
{
v1 = VirtualBlockSize*block + ty;
v2 = VirtualBlockSize*blockIdx.x + tx;
}
else
{
v1 = VirtualBlockSize*blockIdx.x + ty;
v2 = VirtualBlockSize*block + tx;
}
if (v1 < n && v2 < n)
{
currentD[ty, tx] = d[v1, v2];
currentP[ty, tx] = p[v1, v2];
newPred = currentP[ty, tx];
}
else
{
currentD[ty, tx] = Inf;
currentP[ty, tx] = None;
}
DeviceFunction.SyncThreads();
if (blockIdx.y == 0)
{
for (var i = 0; i < VirtualBlockSize; i++)
{
newPath = primaryD[ty, i] + currentD[i, tx];
DeviceFunction.SyncThreads();
if (newPath < currentD[ty, tx])
{
currentD[ty, tx] = newPath;
newPred = currentP[i, tx];
}
DeviceFunction.SyncThreads();
currentP[ty, tx] = newPred;
}
}
else
{
for (var i = 0; i < VirtualBlockSize; i++)
{
newPath = currentD[ty, i] + primaryD[i, tx];
DeviceFunction.SyncThreads();
if (newPath < currentD[ty, tx])
{
currentD[ty, tx] = newPath;
currentP[ty, tx] = primaryP[i, tx];
}
DeviceFunction.SyncThreads();
}
}
if (v1 < n && v2 < n)
{
d[v1, v2] = currentD[ty, tx];
p[v1, v2] = currentP[ty, tx];
}
}
public void KernelPhaseThree(int block, int n, int pitch, int[,] d, int[,] p)
{
if (blockIdx.x == block || blockIdx.y == block) return;
var tx = threadIdx.x*ThreadSize;
var ty = threadIdx.y*ThreadSize;
var v1 = blockDim.y*blockIdx.y*ThreadSize + ty;
var v2 = blockDim.x*blockIdx.x*ThreadSize + tx;
var primaryRowD = __shared__.Array2D<int>(BlockSize*ThreadSize, BlockSize*ThreadSize);
var primaryColD = __shared__.Array2D<int>(BlockSize*ThreadSize, BlockSize*ThreadSize);
var primaryRowP = __shared__.Array2D<int>(BlockSize*ThreadSize, BlockSize*ThreadSize);
var v1Row = BlockSize*block*ThreadSize + ty;
var v2Col = BlockSize*block*ThreadSize + tx;
for (var i = 0; i < ThreadSize; i++)
{
for (var j = 0; j < ThreadSize; j++)
{
var idx = tx + j;
var idy = ty + i;
if (v1Row + i < n && v2 + j < n)
{
primaryRowD[idy, idx] = d[v1Row + i, v2 + j];
primaryRowP[idy, idx] = p[v1Row + i, v2 + j];
}
else
{
primaryRowD[idy, idx] = Inf;
primaryRowP[idy, idx] = None;
}
if (v1 + i < n && v2Col + j < n)
{
primaryColD[idy, idx] = d[v1 + i, v2Col + j];
}
else
{
primaryColD[idy, idx] = Inf;
}
}
}
DeviceFunction.SyncThreads();
for (var i = 0; i < ThreadSize; i++)
{
for (var j = 0; j < ThreadSize; j++)
{
if (v1 + i < n && v2 + j < n)
{
var path = d[v1 + i, v2 + j];
var predecessor = p[v1 + i, v2 + j];
var idy = ty + i;
var idx = tx + j;
for (var k = 0; k < BlockSize*ThreadSize; k++)
{
var newPath = primaryColD[idy, k] + primaryRowD[k, idx];
if (path > newPath)
{
path = newPath;
predecessor = primaryRowP[k, idx];
}
}
d[v1 + i, v2 + j] = path;
p[v1 + i, v2 + j] = predecessor;
}
}
}
}
public void Run(Gpu gpu, int[,] d, int[,] p, bool verbose = false)
{
var n = d.GetLength(0);
var gridDim1 = new dim3(1, 1, 1);
var gridDim2 = new dim3((n - 1)/VirtualBlockSize + 1, 2, 1);
var gridDim3 = new dim3((n - 1)/VirtualBlockSize + 1, (n - 1)/VirtualBlockSize + 1, 1);
var blockDim1 = new dim3(VirtualBlockSize, VirtualBlockSize, 1);
var blockDim2 = new dim3(VirtualBlockSize, VirtualBlockSize, 1);
var blockDim3 = new dim3(BlockSize, BlockSize, 1);
var numOfBlock = (n - 1)/VirtualBlockSize + 1;
var pitchInt = n;
if (verbose)
{
Console.WriteLine($"|V| {n}");
Console.WriteLine($"Phase 1: grid dim {gridDim1} block dim {blockDim1}");
Console.WriteLine($"Phase 2: grid dim {gridDim2} block dim {blockDim2}");
Console.WriteLine($"Phase 3: grid dim {gridDim3} block dim {blockDim3}");
}
for (var block = 0; block < numOfBlock; block++)
{
gpu.Launch(KernelPhaseOne, new LaunchParam(gridDim1, blockDim1), block, n, pitchInt, d, p);
gpu.Launch(KernelPhaseTwo, new LaunchParam(gridDim2, blockDim2), block, n, pitchInt, d, p);
gpu.Launch(KernelPhaseThree, new LaunchParam(gridDim3, blockDim3), block, n, pitchInt, d, p);
}
}
}
http://www.quantalea.com/gallery Floyd-Warshall.
, .
:
, , CUDA -, 2010.
https://arxiv.org/abs/1001.4108