Genetic cellular automata with PyCuda, how to efficiently transfer a lot of data per cell to the CUDA core?

Question

Genetic cellular automata with PyCuda, how to efficiently transfer a lot of data per cell to the CUDA core?

I am developing genetic cellular automata using PyCuda. Each cell will have a lot of genome data, as well as cell parameters. I am wondering what could be the most efficient way: 1) transfer the cell data to the CUDA core, then 2) process this data.

I started with a particularly bad (imo) but still working solution. He passed each parameter in a separate array, then processed them using a switch and a lot of duplicate code.

Then I realized that I could quickly get a fairly large number of parameters for each kernel function and decide to rewrite it.

The second solution was to store all the bundles of cell parameters in one array with an extra size. It was much more elegant in the code, but surprisingly, the code runs 10x slower !

To make this clearer, a complete list of the data that needs to be stored for each cell:

(Fc, Mc, Tc): 3x (int) - cell current, mass and temperature.
(Rfc, Rmc, Rtc): 3x (int) - cell current registers
(Fi, Mi, Ti) for each neighbor: 8 * 3x (int) - input values
(Rfi, Rmi, Rti) for each neighbor: 8 * 3x (int) - input values
gate orientation: 1x (uchar)
progress bar: 1x (uchar)
current microoperation memory: 32x (uchar)
microoperation memory of the last step: 32x (uchar)

2 . ( ) (Fi, Mi, Ti) . ( ) - 8x (Fi, Mi, Ti) . , , .

, :

Mk = 64
Tk = 1000

emit_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h", """
    int x = i / h;
    int y = i % h;

    int3 cell = cells[i];
    float M = (float) cell.y;
    float T = (float) cell.z;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i].y -= Mi;
    cells[i].z -= (int) (T * fmin(1, T / Tk) / 1);

    int Fi = cell.x;
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int Ti = cell.z;
    int ii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      int c = (cc + Fi) % 9;
      if (c == 4) continue;
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo;
      if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo;
      if (xo >= w) yo = yo - h;
      ii = xo * h + yo;
      if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
      switch(c) {
        case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        default: break;
      }

    } 
""", "ca_prepare", preamble="""
#define Tk %s
""" % Tk)

absorb_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h", """
    int3 cell = cells[i];

    int3 dcell = dcells0[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;

    dcell = dcells1[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells2[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells3[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells4[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells5[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells6[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells7[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    cells[i] = cell;
    img[i] = hsv2rgb(cell);

""", "ca_calc", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

__device__ int3 operator+(const int3 &a, const int3 &b) {
    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}

__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
    int dF = Fi - Fc;
    if (dF > 180) Fc += 360;
    if (dF < -180) Fc -= 360;
    float sM = Mi + Mc;
    if (sM != 0) sM = Mi / sM; else sM = 0;
    dF = (int) (Fi - Fc) * sM;
    int dM = Mi;
    int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
    return make_int3(dF, dM, dT);
}

__device__ uint hsv2rgb(int3 pixel) {
    // skipped for brevity
}
""" % (Mk, Tk, RAM))

:

Mk = 64
Tk = 1000
CELL_LEN = 120 # number of parameters per cell

emit_gpu = ElementwiseKernel("int *cells, int w, int h", """
    int x = i / h;
    int y = i % h;
    int ii = i * CN;

    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];
    float M = (float) Mc;
    float T = (float) Tc;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[ii+1] = Mc - Mi;
    cells[ii+2] = Tc - (int) (T * fmin(1, T / Tk));

    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int iii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      int c = (cc + Fc) % 9;
      if (c == 4) continue;
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo; else if (xo >= w) yo = yo - h;
      if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
      if (c > 4) c--;
      iii = (xo * h + yo) * CN + 6 + c*3;

      cells[iii] = Fc;
      cells[iii+1] = Mbase + Madd;
      cells[iii+2] = Tc;

    } 
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))

absorb_gpu = ElementwiseKernel("int *cells, int *img, int w, int h", """
    int ii = i * CN;
    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];

    for (int c=0; c < 8; c++){
      int iii = ii + c * 3 + 6;
      int Fi = cells[iii];
      int Mi = cells[iii+1];
      int Ti = cells[iii+2];

      int dF = Fi - Fc;
      if (dF > 180) Fc += 360;
      if (dF < -180) Fc -= 360;
      float sM = Mi + Mc;
      if (sM != 0) sM = Mi / sM; else sM = 0;
      dF = (int) (Fi - Fc) * sM;
      int dM = Mi;
      int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
      Fc += dF;
      Mc += dM;
      Tc += dT;
      Fc = Fc % 360;
      if (Fc < 0) Fc += 360;
      if (Tc > Tk) Tc = Tk;
    }      

    cells[ii] = Fc;
    cells[ii+1] = Mc;
    cells[ii+2] = Tc;
    cells[ii+18] = (cells[ii+18] + 1) % 8;

    img[i] = hsv2rgb(Fc, Tc, Mc);

""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
#define CN %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // skipped for brevity
}
""" % (Mk, Tk, CELL_LEN))

CA, .

GTX Titan:

: 1900x1080.
№1: ~ 200 /
№2: ~ 20 /

GT 630M:

: 1600x900.
№1: ~ 7.8 /
№2: ~ 1,5 /

, , :

№ № 1

№2

:

?
№2 # 1?
?

+4

optimization c python cuda pycuda

a5kin 16 . '14 9:06

1

a5kin · Answer 1 · 2014-12-16T23:43:56+0000

, 15 . :

int int4. , int3. (.w ). [3 ]
, Repack, WIDHEIGHT. , (WIDTH, HEIGHT, N) (N, WIDTH, HEIGHT). , , , . [5x speedup]

:

Mk = 64
Tk = 1000

emit_gpu = ElementwiseKernel("int4 *cells, int w, int h, int cn", """
    int x = i / h;
    int y = i % h;

    int4 cell = cells[i];
    int Fc = cell.x;
    int Mc = cell.y;
    int Tc = cell.z;
    float M = (float) Mc;
    float T = (float) Tc;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i] = make_int4(Fc, Mc - Mi, Tc - (int) (T * fmin(1, T / Tk)), 0);

    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int ii;
    int xo, yo;

    int cnn = 0;
    for (int dx = -1; dx < 2; dx++) {
      xo = x + dx;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      for (int dy = -1; dy < 2; dy++) {
        if (dx == 0 && dy == 0) continue;
        cnn += cn;
        yo = y + dy;
        if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
        if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
        ii = (xo * h + yo) + cnn;
        cells[ii] = make_int4(Fc, Mbase + Madd, Tc, 0);
      }
    } 
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))

absorb_gpu = ElementwiseKernel("int4 *cells, int *img, int w, int h, int cn", """
    int ii = i;
    int4 cell = cells[i];
    int Fc = cell.x;
    int Mc = cell.y;
    int Tc = cell.z;

    for (int c=0; c < 8; c++){
      ii += cn;
      cell = cells[ii];
      int Fi = cell.x;
      int Mi = cell.y;
      int Ti = cell.z;

      int dF = Fi - Fc;
      if (dF > 180) Fc += 360;
      if (dF < -180) Fc -= 360;
      float sM = Mi + Mc;
      if (sM != 0) sM = Mi / sM; else sM = 0;
      dF = (int) (Fi - Fc) * sM;
      int dM = Mi;
      int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
      Fc += dF;
      Mc += dM;
      Tc += dT;
      Fc = Fc % 360;
      if (Fc < 0) Fc += 360;
      if (Tc > Tk) Tc = Tk;
    }      

    cells[i] = make_int4(Fc, Mc, Tc, 0);
    img[i] = hsv2rgb(Fc, Tc, Mc);

""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // skipped for brevity
}
""" % (Mk, Tk))

Park Young-Bae , .

Genetic cellular automata with PyCuda, how to efficiently transfer a lot of data per cell to the CUDA core?

More articles: