Next: About this document ... Up: annexe1 Previous: A new code for
Strassen algorithm: a skeleton

/*
File name: out-of-core-strassen.c
/usr/local/mpi/bin/mpicc -O out-of-core-strassen.c
*/

#include "sys/types.h"
#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include <argz.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <unistd.h>
#include "mpi.h"

#define MPI_OFFSET_ZERO 0

#define P          4              /* number of tasks       */
#define Nb         8              /* slice size            */
#define Ng         ( Nb * P )     /* global size of matrix */

static char fileA[] = "fileA";
static char fileB[] = "fileB";
static char fileC[] = "fileC";

void
fill_file_A(char *FNAME, int N)
{
  int             i, data;
  FILE            *f;

  /*  sprintf(s, FNAME, 0);*/
  if ((f = (FILE *) fopen(FNAME, "w+")) == NULL) {
    perror("io2: error in opening auxiliary files");
    exit(1);
  }
  for (i = 0; i < N * N; i++) {
    data = i + 1;
    if (fwrite(&data, sizeof(int), 1, f) != 1) {
      perror("io3: can't write in an auxiliary file");
      fclose(f);
      exit(1);
    }
  }
  fclose(f);
}

void
fill_file_B(char *FNAME, int N)
{
  int             i, j, data;
  FILE            *f;

  /*  sprintf(s, FNAME, 0);*/
  if ((f = (FILE *) fopen(FNAME, "w+")) == NULL) {
    perror("io2: error in opening auxiliary files");
    exit(1);
  }
  for (i = 0; i < N; i++) {
    for(j=0; j < N; j++) {
    data = (i+1) + N*j;
    if (fwrite(&data, sizeof(int), 1, f) != 1) {
      perror("io3: can't write in an auxiliary file");
      fclose(f);
      exit(1);
    }
    }
  }
  fclose(f);
}

/*
 * Print screen fname file (which much contain integers)
 */
void 
aff(char *fname, int N)
{
  FILE           *fp;
  int             i,count=0;

  fp = fopen(fname, "r");
  fread(&i, 4, 1, fp);
  while (0 == feof(fp)) {
    printf("%6.5d ", i);
    if(((count+1) % N) == 0) {
      count = 0; printf("\n");
    } else count++;
    fread(&i, 4, 1, fp);
  }
  printf("\n");
  fclose(fp);
}

void
aff_in_core(int *t,int n){
  int i,j,count=0;

  for(i=0;i<n*n;i++) {
    printf("%6.5d",*(t+i));
    if(((count+1) % n) == 0) {
      printf("\n");count = 0;
    }  else {
      count++;
    }
  }
}  

/* c = a+b */
void add_in_core(int n, int *a, int *b, int *c)
{

  int i, j;

  for (i = 0; i < n; i++) 
    for (j = 0; j < n; j++)
      *(c+(i*n)+j) = *(a+(i*n)+j) + *(b+(i*n)+j);
}

/* c = a-b */
void sub_in_core(int n, int *a, int *b, int *c)
{
  int i, j;
  
  for (i = 0; i < n; i++) 
    for (j = 0; j < n; j++)
      *(c+(i*n)+j) = (*(a+(i*n)+j)) - (*(b+(i*n)+j));
}

/* c = a*b */
void multiply_in_core(int n, int *a, int *b, int *c)
{
  int i, j, k;
  
  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      *(c+(i*n)+j) = 0;
      for(k=0;k<n;k++) 
        *(c+(i*n)+j) += *(a+(i*n)+k) * *(b+(k*n)+j);
    }
}

/* c = a*b (Strassen in-core Algorithm) */
void multiply_strassen(int NN, int *a, int *b, int *c)
{

  int i,j,k,n;
  int *a11, *a12, *a21, *a22;
  int *b11, *b12, *b21, *b22;
  int *M0, *M1, *M2, *M3, *M4, *M5, *M6;
  int *c1, *c2, *d1, *d2;
  int *c11, *c12, *c21, *c22;

  n = NN/2;

  /* initial step: isolate sub-matrices */

  a11 = (int *)malloc(n*n*sizeof(int));
  a12 = (int *)malloc(n*n*sizeof(int));
  a21 = (int *)malloc(n*n*sizeof(int));
  a22 = (int *)malloc(n*n*sizeof(int));
  b11 = (int *)malloc(n*n*sizeof(int));
  b12 = (int *)malloc(n*n*sizeof(int));
  b21 = (int *)malloc(n*n*sizeof(int));
  b22 = (int *)malloc(n*n*sizeof(int));

  for(i=0;i<n;i++)
    for(j=0;j<n;j++) {
      k = 2*(i*n) + j;
      *(a11 + (i*n) + j) = *(a + k);
      *(b11 + (i*n) + j) = *(b + k);
    }

  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      k = (i*n) + j + (n*(i+1));
      *(a12 + (i*n) + j) = *(a + k);
      *(b12 + (i*n) + j) = *(b + k);
    }

  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      k = 2*(i*n) + j + (n * Nb);
      *(a21 + (i*n) + j) = *(a + k);
      *(b21 + (i*n) + j) = *(b + k);
    }

  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      k = 2*(i*n) + j + (n*Nb) + n;
      *(a22 + (i*n) + j) = *(a + k);
      *(b22 + (i*n) + j) = *(b + k);
    }

  /*
    aff(a11,n);printf("\n");aff(a12,n);printf("\n");
    aff(a21,n);printf("\n");aff(a22,n);printf("\n");
  */
  /* first part */

  d1  = (int *)malloc(n*n*sizeof(int));
  d2  = (int *)malloc(n*n*sizeof(int));
  M0  = (int *)malloc(n*n*sizeof(int));
  M1  = (int *)malloc(n*n*sizeof(int));
  M2  = (int *)malloc(n*n*sizeof(int));
  M3  = (int *)malloc(n*n*sizeof(int));
  M4  = (int *)malloc(n*n*sizeof(int));
  M5  = (int *)malloc(n*n*sizeof(int));
  M6  = (int *)malloc(n*n*sizeof(int));
  c1  = (int *)malloc(n*n*sizeof(int));
  c2  = (int *)malloc(n*n*sizeof(int));
  c11 = (int *)malloc(n*n*sizeof(int));
  c12 = (int *)malloc(n*n*sizeof(int));
  c21 = (int *)malloc(n*n*sizeof(int));
  c22 = (int *)malloc(n*n*sizeof(int));


  add_in_core(n, a11, a22, d1);
  add_in_core(n, b11, b22, d2);
  multiply_in_core(n, d1, d2, M0);

  sub_in_core(n, a12, a22, d1);
  add_in_core(n, b21, b22, d2);
  multiply_in_core(n, d1, d2, M1);

  sub_in_core(n, b21, b11, d1);
  multiply_in_core(n, a22, d1, M2);

  add_in_core(n, a11, a12, d1);
  multiply_in_core(n, d1, b22, M3);

  add_in_core(n, a21, a22, d1);
  multiply_in_core(n, d1, b11, M4);

  sub_in_core(n, b12, b22, d1);
  multiply_in_core(n, a11, d1, M5);

  sub_in_core(n, a21, a11, d1);
  add_in_core(n, b11, b12, d2);
  multiply_in_core(n, d1, d2, M6);

  /* Second part */

  add_in_core(n, M0, M1, c1);
  sub_in_core(n, M2, M3, c2);
  add_in_core(n,c1,c2,c11);

  add_in_core(n, M3, M5, c12);

  add_in_core(n, M2, M4, c21);

  sub_in_core(n, M0, M4, c1);
  add_in_core(n, M5, M6, c2);
  add_in_core(n, c1, c2, c22);

  /* Third step: copy into the final destination */

  for(i=0;i<n;i++)
    for(j=0;j<n;j++) {
      k = 2*(i*n) + j;
      *(c + k) = *(c11 + (i*n) + j);
    }

  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      k = (i*n) + j + (n*(i+1));
      *(c + k) = *(c12 + (i*n) + j);
    }

  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      k = 2*(i*n) + j + (n * Nb);
      *(c + k) = *(c21 + (i*n) + j);
    }

  for(i=0;i<n;i++)
    for(j=0;j<n;j++){
      k = 2*(i*n) + j + (n*Nb) + n;
      *(c + k) = *(c22 + (i*n) + j);
    }
}


int
main( int argc, char *argv[] )
{
  int i, j, k;
  int iv;
  int myrank, commsize;
  int extent;
  int colext;
  int slicelen;
  int sliceext;
  int sizes[2], subsizes[2], starts[2]; 
  int length[ 3 ];
  MPI_Aint disp[ 3 ];
  MPI_Datatype type[ 3 ];
  MPI_Datatype ftypeA;
  MPI_Datatype ftypeB;
  MPI_Datatype ftypeC;
  MPI_Datatype slice_typeB;
  MPI_Datatype block_typeC;
  MPI_Datatype slice_typeC;
  int mode;
  MPI_File fh_A, fh_B, fh_C;
  MPI_Offset offsetA;
  MPI_Offset offsetB;
  MPI_Offset offsetC;
  MPI_Status status;
  int val_A[ Nb ][ Ng ], val_B[ Ng ][ Nb ], val_C[ Nb ][ Nb ];

  /* initialize MPI */
  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
  MPI_Comm_size( MPI_COMM_WORLD, &commsize );

  /* We fill and we print the file contain */    
  fill_file_A(fileA,Nb);
  fill_file_B(fileB,Nb);
  if(myrank == 0) {
    printf("File A:\n");
    aff(fileA,Nb);
    printf("File B:\n");
    aff(fileB,Nb);
  }
  /*  exit(0);*/

  /*****************************************/

  MPI_Type_extent( MPI_INT, &extent );
  colext = Nb * extent;
  slicelen = Ng * Nb;
  sliceext = slicelen * extent;

  /*
  The subarray type constructor creates an MPI datatype describing 
  an n-dimensional subarray of an n-dimensional array. The subarray may 
  be situated anywhere  within the full array, and may be of any nonzero 
  size up to the size of the larger array as long as it is confined 
  within this array. This type constructor facilitates creating
  filetypes to access arrays distributed in blocks among processes to a 
  single file that contains the global array. 
  
  Note: { MPI_ORDER_C}
     The ordering used by C arrays, (i.e., row-major order) 

  */
  /* create filetype for matrix A */
  sizes[0]=Nb; sizes[1]=Nb; 
  subsizes[0]= Nb/2; subsizes[1]=Nb/2; 
  starts[0]=(myrank % (P/2))*subsizes[0]; 
  starts[1]=((myrank / (P/2))*subsizes[1]); 
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, 
                           MPI_INT, &ftypeA); 

  MPI_Type_commit( &ftypeA );

  /* create filetype for matrix B */
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, 
                           MPI_INT, &ftypeB); 
  MPI_Type_commit( &ftypeB );

  /* create filetype for matrix C */
  MPI_Type_vector( Nb, Nb, Ng, MPI_INT, &block_typeC );
  MPI_Type_hvector( P, 1, colext, block_typeC, &slice_typeC );
  length[ 0 ] = 1;
  length[ 1 ] = 1;
  length[ 2 ] = 1;
  disp[ 0 ] = 0;
  disp[ 1 ] = sliceext * myrank;
  disp[ 2 ] = sliceext * P;
  type[ 0 ] = MPI_LB;
  type[ 1 ] = slice_typeC;
  type[ 2 ] = MPI_UB;
  MPI_Type_struct( 3, length, disp, type, &ftypeC );
  MPI_Type_commit( &ftypeC );

  /* open files */
  mode = MPI_MODE_RDONLY;
  MPI_File_open( MPI_COMM_WORLD, fileA, mode, MPI_INFO_NULL, &fh_A );
  mode = MPI_MODE_RDONLY;
  MPI_File_open( MPI_COMM_WORLD, fileB, mode, MPI_INFO_NULL, &fh_B );
  mode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
  MPI_File_open( MPI_COMM_WORLD, fileC, mode, MPI_INFO_NULL, &fh_C );

  /* set views */
  MPI_File_set_view (fh_A, MPI_OFFSET_ZERO, MPI_INT, ftypeA, 
                     "native", MPI_INFO_NULL );
  MPI_File_set_view( fh_B, MPI_OFFSET_ZERO, MPI_INT, ftypeB, 
                     "native", MPI_INFO_NULL );
  MPI_File_set_view( fh_C, MPI_OFFSET_ZERO, MPI_INT, ftypeC, 
                     "native", MPI_INFO_NULL );

  offsetA = MPI_OFFSET_ZERO;
  offsetB = MPI_OFFSET_ZERO;
  offsetC = MPI_OFFSET_ZERO;

  /* read a block of A */
  MPI_File_read_at_all( fh_A, offsetA, val_A, (Nb/2)*(Nb/2), 
                        MPI_INT, &status ); 
  printf("Block of A (PID=%d)\n",myrank);
  aff_in_core(val_A,Nb/2);

  /* read a block of B */
  MPI_File_read_at_all( fh_B, offsetB, val_B, (Nb/2)*(Nb/2), 
                        MPI_INT, &status ); 
  printf("Block of B (%d)\n",myrank);
  aff_in_core(val_B,Nb/2);

  /* close files */
  MPI_File_close( &fh_A );
  MPI_File_close( &fh_B );
  MPI_File_close( &fh_C );

  /* free filetypes */
  MPI_Type_free( &ftypeA );
  MPI_Type_free( &ftypeB );
  MPI_Type_free( &ftypeC );

  if(myrank == 0) {
    printf("File C:\n");
    /* aff(fileC,Nb); */
  }

  /* finalize MPI */
  MPI_Finalize();
}

/****************************
 * Strassen in-core algorithm
 ****************************
int
main(){

  int *A,*B,*C;
  int i,j,k;

  A = (int *)malloc(Nb*Nb*sizeof(int));
  B = (int *)malloc(Nb*Nb*sizeof(int));
  C = (int *)malloc(Nb*Nb*sizeof(int));

  fill_A(A,Nb);
  printf("Matrix A:\n");
  aff(A,Nb);

  fill_B(B,Nb);
  printf("Matrix B:\n");
  aff(B,Nb);

  multiply_strassen(Nb,A,B,C);

  printf("Matrix C:\n");
  aff(C,Nb);
}

************************/


/******* Execution Trace
[cerin@fatboy cerin]$ /usr/local/mpi/bin/mpicc -O out-of-core-strassen.c
[cerin@fatboy cerin]$ /usr/local/mpi/bin/mpirun -np 4 a.out
File A:
 00001  00002  00003  00004  00005  00006  00007  00008
 00009  00010  00011  00012  00013  00014  00015  00016
 00017  00018  00019  00020  00021  00022  00023  00024
 00025  00026  00027  00028  00029  00030  00031  00032
 00033  00034  00035  00036  00037  00038  00039  00040
 00041  00042  00043  00044  00045  00046  00047  00048
 00049  00050  00051  00052  00053  00054  00055  00056
 00057  00058  00059  00060  00061  00062  00063  00064

 File B:
 00001  00009  00017  00025  00033  00041  00049  00057
 00002  00010  00018  00026  00034  00042  00050  00058
 00003  00011  00019  00027  00035  00043  00051  00059
 00004  00012  00020  00028  00036  00044  00052  00060
 00005  00013  00021  00029  00037  00045  00053  00061
 00006  00014  00022  00030  00038  00046  00054  00062
 00007  00015  00023  00031  00039  00047  00055  00063
 00008  00016  00024  00032  00040  00048  00056  00064
                                                          
 Block of A (PID=0)
 00001 00002 00003 00004
 00009 00010 00011 00012
 00017 00018 00019 00020
 00025 00026 00027 00028
 Block of B (PID=0)
 00001 00009 00017 00025
 00002 00010 00018 00026
 00003 00011 00019 00027
 00004 00012 00020 00028
 File C:
 Block of A (PID=3)
 00037 00038 00039 00040
 00045 00046 00047 00048
 00053 00054 00055 00056
 00061 00062 00063 00064
 Block of B (PID=3)
 00037 00045 00053 00061
 00038 00046 00054 00062
 00039 00047 00055 00063
 00040 00048 00056 00064
 Block of A (PID=1)
 00033 00034 00035 00036  
 00041 00042 00043 00044
 00049 00050 00051 00052
 00057 00058 00059 00060
 Block of B (PID=1)
 00005 00013 00021 00029
 00006 00014 00022 00030
 00007 00015 00023 00031
 00008 00016 00024 00032
 Block of A (PID=2)
 00005 00006 00007 00008
 00013 00014 00015 00016
 00021 00022 00023 00024
 00029 00030 00031 00032
 Block of B (PID=2)
 00033 00041 00049 00057
 00034 00042 00050 00058
 00035 00043 00051 00059
 00036 00044 00052 00060
[cerin@fatboy cerin]$    

*****************************/
Christophe Cérin 2002-09-16