I am developing a genetic cellular automaton using PyCUDA. Each cell will carry a lot of genome data as well as cell parameters. I am wondering what the most efficient way is to 1) transfer the cell data to the CUDA cores and then 2) process this data.
I started with a particularly bad (in my opinion) but still working solution. It passed each parameter in a separate array and then processed them using a switch and a lot of duplicated code.
Then I realized that I would quickly end up with a fairly large number of parameters for each kernel function, so I decided to rewrite it.
The second solution was to store all the bundles of cell parameters in one array with an extra dimension. It was much more elegant in the code but, surprisingly, the code runs 10x slower!
To make this clearer, a complete list of the data that needs to be stored for each cell:
- (Fc, Mc, Tc): 3x (int) - cell current, mass and temperature.
- (Rfc, Rmc, Rtc): 3x (int) - cell current registers
- (Fi, Mi, Ti) for each neighbor: 8 * 3x (int) - input values
- (Rfi, Rmi, Rti) for each neighbor: 8 * 3x (int) - input register values
- gate orientation: 1x (uchar)
- progress bar: 1x (uchar)
- current microoperation memory: 32x (uchar)
- microoperation memory of the last step: 32x (uchar)
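Each step consists of 2 phases. Emit (the first phase) computes (Fi, Mi, Ti) for every neighbor. Absorb (the second phase) merges the 8x (Fi, Mi, Ti) received from the neighbors into the cell's current state. [The remainder of this paragraph was lost in extraction.]
First version, with a separate array for each neighbor direction: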
# Simulation constants. They are substituted into the CUDA preambles as
# preprocessor macros, so the device code can refer to them by name.
Mk = 64    # divisor of the |dF| * min(Mc, Mi) term in the absorb phase
Tk = 1000  # temperature ceiling; min(1, T / Tk) is the emitted fraction

# Emit phase (version 1: one int3 buffer per neighbour direction).
#
# For every cell i the kernel
#   * removes the emitted mass / temperature from cells[i], and
#   * writes one (F, M, T) packet for each of the 8 neighbours into the
#     buffer that belongs to the direction the packet travels in.
#
# Directions are numbered 0..8 row-major over the 3x3 neighbourhood, 4 being
# the cell itself; directions 5..8 map to dcells4..dcells7.
# The grid is a torus: coordinates wrap around at both edges.
emit_gpu = ElementwiseKernel(
    "int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h",
    """
    // Linear index layout: i = x * h + y.
    int x = i / h;
    int y = i % h;

    // Snapshot of the cell before it is modified: x = F, y = M, z = T.
    int3 cell = cells[i];
    float M = (float) cell.y;
    float T = (float) cell.z;

    // Total emitted mass is the fraction min(1, T / Tk) of the cell's mass.
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i].y -= Mi;
    cells[i].z -= (int) (T * fmin(1, T / Tk));

    int Fi = cell.x;
    // Split Mi into 8 near-equal shares; the first (Mi % 8) directions
    // visited receive one extra unit so that the shares sum to Mi.
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int Ti = cell.z;  // temperature taken from the pre-emission snapshot
    int ii, xo, yo;

    // Start the sweep at direction (Fi % 9) so the remainder units are
    // handed out beginning with a direction that depends on F.
    for (int cc = 0; cc < 9; cc++) {
        int c = (cc + Fi) % 9;
        if (c == 4) continue;  // centre of the 3x3 block: the cell itself

        xo = x + c%3 - 1;
        if (xo < 0) xo = w + xo;
        if (xo >= w) xo = xo - w;

        yo = y + c/3 - 1;
        if (yo < 0) yo = h + yo;
        // Wrap the bottom edge. This used to test (xo >= w), which is never
        // true after the x wrap above, so yo == h slipped through and ii
        // addressed the wrong cell (or one past the end of the buffer).
        if (yo >= h) yo = yo - h;

        ii = xo * h + yo;

        if (Mpart > 0) { Madd = 1; Mpart--; } else Madd = 0;

        switch (c) {
            case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            default: break;
        }
    }
    """, "ca_prepare", preamble="""
#define Tk %s
""" % Tk)
# The preamble must contain exactly one format specifier for `% Tk`; without
# it the formatting raises TypeError and Tk is undefined in the device code.
# Absorb phase (version 1: one int3 buffer per neighbour direction).
#
# For every cell i the kernel folds the eight incoming packets
# dcells0[i] .. dcells7[i] into cells[i], one after another, and then stores
# the colour computed by hsv2rgb() in img[i].
#
# Change from the earlier copy-pasted version: the packet from dcells0 is now
# followed by the same "T <= Tk" clamp as the other seven packets (the first
# stanza was the only one without it).
#
# Notes on the preamble:
#   * It is %-formatted, so a literal C modulo operator must be written `%%`.
#   * Mk and Tk are injected as macros because the device code uses them.
#   * RAM is not defined in the visible part of the file and is not used by
#     the device code shown here; its macro is emitted only so that the
#     existing format tuple stays unchanged - TODO confirm it is still needed.
absorb_gpu = ElementwiseKernel(
    "int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h",
    """
    int3 cell = cells[i];

    // Packets are absorbed sequentially: each one sees the state left
    // behind by the previous one.
    cell = absorb_one(cell, dcells0[i]);
    cell = absorb_one(cell, dcells1[i]);
    cell = absorb_one(cell, dcells2[i]);
    cell = absorb_one(cell, dcells3[i]);
    cell = absorb_one(cell, dcells4[i]);
    cell = absorb_one(cell, dcells5[i]);
    cell = absorb_one(cell, dcells6[i]);
    cell = absorb_one(cell, dcells7[i]);

    cells[i] = cell;
    img[i] = hsv2rgb(cell);
    """, "ca_calc", preamble="""
#define Mk %s
#define Tk %s
#define RAM %s

// Component-wise sum of two int3 values.
__device__ int3 operator+(const int3 &a, const int3 &b) {
    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}

// Delta (dF, dM, dT) that an incoming packet (Fi, Mi, Ti) applies to a cell
// whose current state is (Fc, Mc, Tc).
__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
    // Move Fc by one full turn when the two angles are more than 180 apart,
    // so that (Fi - Fc) below is the shorter way round the 360 circle.
    int dF = Fi - Fc;
    if (dF > 180) Fc += 360;
    if (dF < -180) Fc -= 360;

    // Weight of the incoming mass in the combined mass (0 when both are 0).
    float sM = Mi + Mc;
    if (sM != 0) sM = Mi / sM; else sM = 0;

    dF = (int) (Fi - Fc) * sM;
    int dM = Mi;
    int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
    return make_int3(dF, dM, dT);
}

// Absorb a single packet: apply its delta, bring F back into [0, 360) and
// cap T at Tk.
__device__ int3 absorb_one(int3 cell, int3 dcell) {
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x %% 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;
    return cell;
}

__device__ uint hsv2rgb(int3 pixel) {
// skipped for brevity
}
""" % (Mk, Tk, RAM))
Second version, with all cell parameters in a single array:
# Simulation constants. They are substituted into the CUDA preambles as
# preprocessor macros (Mk, Tk, and CELL_LEN under the name CN).
Mk = 64    # divisor of the |dF| * min(Mc, Mi) term in the absorb phase
Tk = 1000  # temperature ceiling; min(1, T / Tk) is the emitted fraction
CELL_LEN = 120  # number of parameters per cell

# Emit phase (version 2: every cell is one record of CN ints in `cells`).
#
# Record layout as used by this kernel (offsets from i * CN):
#   0, 1, 2            Fc, Mc, Tc - the cell's own state
#   6 + 3*k + {0,1,2}  (F, M, T) packet received from direction k, k = 0..7
#
# Directions are numbered 0..8 row-major over the 3x3 neighbourhood with the
# centre (4) skipped, then compacted to 0..7. The grid is a torus.
#
# NOTE(review): consecutive work-items touch addresses CN ints apart
# (ii = i * CN), i.e. a stride of 120 ints between neighbouring threads.
# Strided global-memory access of this kind is the usual reason such a layout
# runs much slower than one array per field - worth profiling.
emit_gpu = ElementwiseKernel("int *cells, int w, int h", """
    // Linear index layout: i = x * h + y.
    int x = i / h;
    int y = i % h;
    int ii = i * CN;  // start of this cell's record

    // Snapshot of the cell's own state before it is modified.
    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];
    float M = (float) Mc;
    float T = (float) Tc;

    // Total emitted mass is the fraction min(1, T / Tk) of the cell's mass.
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[ii+1] = Mc - Mi;
    cells[ii+2] = Tc - (int) (T * fmin(1, T / Tk));

    // Split Mi into 8 near-equal shares; the first (Mi % 8) directions
    // visited receive one extra unit so that the shares sum to Mi.
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int iii, xo, yo;

    // Start the sweep at direction (Fc % 9).
    for (int cc = 0; cc < 9; cc++) {
        int c = (cc + Fc) % 9;
        if (c == 4) continue;  // centre of the 3x3 block: the cell itself

        xo = x + c%3 - 1;
        if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;

        yo = y + c/3 - 1;
        // Wrap the bottom edge. This used to test (xo >= w), which is never
        // true after the x wrap above, so yo == h slipped through and iii
        // addressed the wrong record (or one past the end of the buffer).
        if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;

        if (Mpart > 0) { Madd = 1; Mpart--; } else Madd = 0;

        // Compact direction 5..8 to slot 4..7 (direction 4 was skipped).
        if (c > 4) c--;

        // Slot c of the neighbour's record receives this cell's packet.
        iii = (xo * h + yo) * CN + 6 + c*3;
        cells[iii] = Fc;
        cells[iii+1] = Mbase + Madd;
        cells[iii+2] = Tc;
    }
    """, "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))
# The preamble needs one format specifier per element of (Tk, CELL_LEN);
# without them the formatting raises TypeError and Tk / CN are undefined in
# the device code.
# Absorb phase (version 2: every cell is one record of CN ints in `cells`).
#
# For every cell i the kernel reads the cell's own state (offsets 0..2 of its
# record), folds the eight packets stored at offsets 6 + 3*k (k = 0..7) into
# it one after another, writes the state back and stores the colour computed
# by hsv2rgb() in img[i].
#
# The preamble is %-formatted with (Mk, Tk, CELL_LEN); the three #define lines
# supply the matching specifiers and make Mk, Tk and CN visible to the device
# code. A literal C modulo operator inside the preamble must be written `%%`.
absorb_gpu = ElementwiseKernel("int *cells, int *img, int w, int h", """
    int ii = i * CN;  // start of this cell's record

    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];

    for (int c = 0; c < 8; c++) {
        // Packet received from direction c.
        int iii = ii + c * 3 + 6;
        int Fi = cells[iii];
        int Mi = cells[iii+1];
        int Ti = cells[iii+2];

        // Move Fc by one full turn when the two angles are more than 180
        // apart, so that (Fi - Fc) is the shorter way round the 360 circle.
        int dF = Fi - Fc;
        if (dF > 180) Fc += 360;
        if (dF < -180) Fc -= 360;

        // Weight of the incoming mass in the combined mass (0 if both 0).
        float sM = Mi + Mc;
        if (sM != 0) sM = Mi / sM; else sM = 0;

        dF = (int) (Fi - Fc) * sM;
        int dM = Mi;
        int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;

        Fc += dF;
        Mc += dM;
        Tc += dT;

        // Bring F back into [0, 360) and cap T at Tk.
        Fc = Fc % 360;
        if (Fc < 0) Fc += 360;
        if (Tc > Tk) Tc = Tk;
    }

    cells[ii] = Fc;
    cells[ii+1] = Mc;
    cells[ii+2] = Tc;

    // NOTE(review): offset 18 equals 6 + 4*3, i.e. the F field of input
    // slot 4, which the emit kernel overwrites - confirm this counter is
    // meant to live at this offset.
    cells[ii+18] = (cells[ii+18] + 1) % 8;

    img[i] = hsv2rgb(Fc, Tc, Mc);
    """, "ca_absorb", preamble="""
#define Mk %s
#define Tk %s
#define CN %s
__device__ uint hsv2rgb(int hue, int sat, int val) {
// skipped for brevity
}
""" % (Mk, Tk, CELL_LEN))
CA, .
GTX Titan:
- Field size: 1900x1080.
- Solution №1: ~200 steps/s
- Solution №2: ~20 steps/s
GT 630M:
- Field size: 1600x900.
- Solution №1: ~7.8 steps/s
- Solution №2: ~1.5 steps/s
, , :
№ № 1
№2
: