PBLAS matrix multiplication - parallelization with MPI

I am currently developing C code with MPI for matrix multiplication. I already have functions such as mult and multadd, defined in another file, which work well. My file pblas.c compiles, but the program crashes at startup.

I run my project on the university server, on which MPI is installed.

Where is the mistake in my pblas code?
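
For context, mult and multadd work on column-major blockSize x blockSize blocks, roughly like this (just a sketch of their behavior; the real implementations live in the other file):

void mult(double *A, double *B, double *C, int n) {
  /* C = A * B, column-major n x n blocks: element (i,j) sits at index i + j*n */
  int i, j, k;
  for (j = 0; j < n; j++)
    for (i = 0; i < n; i++) {
      double s = 0.0;
      for (k = 0; k < n; k++)
        s += A[i + k*n] * B[k + j*n];
      C[i + j*n] = s;
    }
}

void multadd(double *A, double *B, double *C, int n) {
  /* C += A * B, same layout */
  int i, j, k;
  for (j = 0; j < n; j++)
    for (i = 0; i < n; i++)
      for (k = 0; k < n; k++)
        C[i + j*n] += A[i + k*n] * B[k + j*n];
}

Here is pblas.c: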

/**********************************************************************

This file is just a pattern for pblas parallel multiplication

There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !

*********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>

#include "commfct.h"
#include "toolsfct.h"

void usage() {

  fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
  exit(1);
}

int main(int argc, char **argv) {

  int me,nbProc;
  int ligMe,colMe;
  int blockSize;
  int i,j;
  double t;

  if (argc != 2) {
    usage();
  }

  blockSize = atoi(argv[1]);

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nbProc);

  int P = (int)sqrt(nbProc); // P = the number of rows of proc.
  int Q = P; // Q = the number of columns of proc.
  if ((P*Q) != nbProc) {
    if (me == 0) {
      fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
    }
    exit(1);
  }

  createGridComm(me,P,Q);

  ligMe = me / Q;
  colMe = me % Q;

  // allocate memory for matrices
  double *A,*Btmp, *B,*C,*CC;
  A = (double *)malloc(blockSize*blockSize*sizeof(double));
  B = (double *)malloc(blockSize*blockSize*sizeof(double));
  Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
  C = (double *)malloc(blockSize*blockSize*sizeof(double));
  CC = (double *)malloc(blockSize*blockSize*sizeof(double));

  /* fill blocks with pseudo values

     NOTE : these values should not be changed so that
     the check below is valid
   */

  for(i=0;i<blockSize*blockSize;i++) {
    A[i] = 2.0+(double)me;
    B[i] = 1.0+(double)colMe;
    C[i] = (double)me / 10.0;
  }


  /* CAUTION : in the following, A,B C are supposed to be stored
     column after column, with each column of size blockSize.
     Thus A(0,0) and A(1,0) are contiguous in memory, but
     A(0,0) and A(0,1) are separated by blockSize cells.
  */

  t = dclock(CLOCK_S);

  MPI_Status status;
  // main loop
  for(i=0;i<P;i++) {
    /*************************************
      Steps 1 and 2: transpose column i (in step i) of the B-blocks; store it in Btmp.
    **************************************/

    if(colMe==i) {

      if(ligMe==colMe) {
        MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,ligMe,commCol);
        multadd(A,B,C,blockSize);
      }
      else {
        int dest = colMe * Q + ligMe;
        MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
        MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,dest%Q,commCol);
        mult(A,Btmp,CC,blockSize);
      }
    }
    else {

      int dest = colMe*Q + ligMe;
      if(dest%Q == i) {
        MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
        // Broadcast on the column
        MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,colMe,commCol);
        multadd(A,Btmp,C,blockSize);
      }
      else {
        MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
        mult(A,Btmp,CC,blockSize);
      }

    }

    if(colMe == i)
      MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
    else
      MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);

  }
  t = dclock(CLOCK_S) -t;

  printf("timing for %d : %f sec\n",me,t);

  // checking for result correctness
  int correct = 1;
  double sum = 0.0;
  for(i=0;i<P;i++) {
    sum += 2.0+(ligMe*Q)+(double)i;
  }

  for(i=0;i<blockSize;i++) {
    for(j=0;j<blockSize;j++) {
      if (C[i+j*blockSize] != ((double)me/10.0 + sum*blockSize*(colMe+1.0))) {
    correct = 0;
      }
    }
  }
  if (correct != 1) {
    printf("multiplication result is not correct\n");
  }

  // free memory
  free(A);
  free(B);
  free(C);
  free(CC);

  releaseGridComm();

  MPI_Finalize();

  return 0;
}
1 answer

The problem comes from the way you mix MPI_Send(), MPI_Recv() and MPI_Bcast(). Every MPI_Send() must be matched by exactly one MPI_Recv() on the destination rank, and every MPI_Recv() by a corresponding MPI_Send(); likewise, every rank of a communicator must call MPI_Bcast() with the same root. In your loop the roots and the source/destination ranks do not line up, so some processes wait for messages that never arrive and the program hangs or aborts.
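
For example, at step i the transpose exchange and the column broadcast have to look like this (condensed excerpt of the corrected listing below, reusing its variables and the commCol communicator defined further down):

    if (colMe == i && ligMe == i) {
        /* diagonal block: it is its own transpose, so just share our B block */
        memcpy(Btmp, B, blockSize*blockSize*sizeof(double));
    } else if (colMe == i) {
        /* I hold a block of column i of B: exactly one send to the transposed spot */
        MPI_Send(B, blockSize*blockSize, MPI_DOUBLE,
                 ligMe*Q + i, TAG_TRANSPOSE, MPI_COMM_WORLD);
    } else if (ligMe == i) {
        /* transposed spot: exactly one matching receive */
        MPI_Recv(Btmp, blockSize*blockSize, MPI_DOUBLE,
                 i*Q + colMe, TAG_TRANSPOSE, MPI_COMM_WORLD, &status);
    }
    /* every rank of commCol must pass the SAME root (here i), otherwise it hangs */
    MPI_Bcast(Btmp, blockSize*blockSize, MPI_DOUBLE, i, commCol);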

If I understood your data layout correctly, the blocks of A (and likewise B) are distributed over the process grid, e.g. for a 2x2 grid:

A0 | A1

...........

A2 | A3

and, taking the first block column of C as an example, the products that have to be formed and summed are:

A0xB0 | A1xB2

...................

A2xB0 | A3xB2

So, at each step i of the main loop:

the i-th block column of B is transposed into the i-th block row and stored in Btmp;

each block of that i-th Btmp row is then broadcast down its process column, so every process gets the Btmp block it needs for its local product.

I do not have your commfct.h and toolsfct.h, so to test with a plain MPI installation I commented out those includes and the calls that depend on them (createGridComm/releaseGridComm, dclock, mult/multadd) and created the row and column communicators directly with MPI_Comm_split:

/**********************************************************************

This file is just a pattern for pblas parallel multiplication

There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !

 *********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "mpi.h"

//#include "commfct.h"
//#include "toolsfct.h"

#define TAG_TRANSPOSE 42

void usage() {

    fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
    exit(1);
}

int main(int argc, char **argv) {

    int me,nbProc;
    int ligMe,colMe;
    int blockSize;
    int i,j;
    double t = 0.0; /* timing disabled below: dclock() comes from toolsfct.h */

    if (argc != 2) {
        usage();
    }

    blockSize = atoi(argv[1]);

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    MPI_Comm_size(MPI_COMM_WORLD, &nbProc);

    int P = (int)sqrt(nbProc); // P = the number of rows of proc.
    int Q = P; // Q = the number of columns of proc.
    if ((P*Q) != nbProc) {
        if (me == 0) {
            fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
        }
        exit(1);
    }

    //createGridComm(me,P,Q);

    colMe = me / Q;
    ligMe = me % Q;

    MPI_Comm commCol, commLine; 
    //comes from http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
    /* Split comm into row and column comms */ 
    MPI_Comm_split(MPI_COMM_WORLD, ligMe, colMe, &commLine); 
    /* color by row, rank by column */ 
    MPI_Comm_split(MPI_COMM_WORLD, colMe, ligMe, &commCol); 
    /* color by column, rank by row */ 

    printf("[%d]:My coordinates are i j (%d,%d)\n",me,ligMe,colMe); 

    // allocate memory for matrices
    double *A,*Btmp, *B,*C,*CC;
    A = (double *)malloc(blockSize*blockSize*sizeof(double));
    B = (double *)malloc(blockSize*blockSize*sizeof(double));
    Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
    C = (double *)malloc(blockSize*blockSize*sizeof(double));
    CC = (double *)malloc(blockSize*blockSize*sizeof(double));

    /* fill blocks with pseudo values

     NOTE : these values should not be changed so that
     the check below is valid
     */

    for(i=0;i<blockSize*blockSize;i++) {
        A[i] = 2.0+(double)me;
        B[i] = 1.0+(double)colMe;
        C[i] = (double)me / 10.0;
    }


    /* CAUTION : in the following, A,B C are supposed to be stored
     column after column, with each column of size blockSize.
     Thus A(0,0) and A(1,0) are contiguous in memory, but
     A(0,0) and A(0,1) are separated by blockSize cells.
     */

    // t = dclock(CLOCK_S);

    MPI_Status status;
    //main Loop
    for(i=0;i<Q;i++) {
        /*************************************
         Steps 1 and 2: transpose column i (in step i) of the B-blocks; store it in Btmp.
         **************************************/

        if(colMe==i){

            if(ligMe==colMe) {
                /* diagonal block: it is its own transpose, so share our B block */
                memcpy(Btmp,B,blockSize*blockSize*sizeof(double));
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // multadd(A,B,C,blockSize);
            }
            else {
                int dest = ligMe * Q + i;//transpose !
                MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // multadd(A,Btmp,C,blockSize); // root of the row-reduce: accumulate into C
            }
        }
        else {

            int from = i*Q + colMe;// transpose !
            if(ligMe == i) {
                MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,from,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
                // Broadcast on the column
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // mult(A,Btmp,CC,blockSize); // non-root of the row-reduce: contribute via CC
            }
            else {
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // mult(A,Btmp,CC,blockSize);
            }

        }

        if(colMe == i)
            MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
        else
            MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);

    }
    //t = dclock(CLOCK_S) -t;

    printf("timing for %d : %f sec\n",me,t);

    // checking for result correctness
    int correct = 1;
    double sum = 0.0;
    for(i=0;i<P;i++) {
        sum += 2.0+(ligMe*Q)+(double)i;
    }

    for(i=0;i<blockSize;i++) {
        for(j=0;j<blockSize;j++) {
            if (C[i+j*blockSize] <0.99999*((double)me/10.0 + sum*blockSize*(colMe+1.0)) || C[i+j*blockSize] >1.00001*((double)me/10.0 + sum*blockSize*(colMe+1.0)) ) {
                correct = 0;
            }
        }
    }
    if (correct != 1) {
        printf("multiplication result is not correct\n");
    }

    // free memory
    free(A);
    free(B);
    free(C);
    free(CC);

    //releaseGridComm();

    MPI_Finalize();

    return 0;
}

Your createGridComm(me, P, Q) presumably builds commLine and commCol in the same way; the technique is described at http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
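
In case it helps, here is a guess at what commfct.c might contain for those two functions (a purely hypothetical sketch, your real file may differ):

#include "mpi.h"

MPI_Comm commLine, commCol;   /* one communicator per process row / column */

void createGridComm(int me, int P, int Q) {
  (void)P;                    /* grid assumed square, P == Q              */
  int lig = me / Q;           /* row index in the process grid            */
  int col = me % Q;           /* column index in the process grid         */
  MPI_Comm_split(MPI_COMM_WORLD, lig, col, &commLine);  /* same row       */
  MPI_Comm_split(MPI_COMM_WORLD, col, lig, &commCol);   /* same column    */
}

void releaseGridComm(void) {
  MPI_Comm_free(&commLine);
  MPI_Comm_free(&commCol);
}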

Put back your own includes, the mult/multadd calls and the timing, and it should work. Check the result!

Hope this helps! Bye,
