/*
File name: out-of-core-strassen.c
/usr/local/mpi/bin/mpicc -O out-of-core-strassen.c
*/
#include "sys/types.h"
#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include <argz.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <unistd.h>
#include "mpi.h"
#define MPI_OFFSET_ZERO 0
#define P 4 /* number of tasks */
#define Nb 8 /* slice size */
#define Ng ( Nb * P ) /* global size of matrix */
static char fileA[] = "fileA";
static char fileB[] = "fileB";
static char fileC[] = "fileC";
void
fill_file_A(char *FNAME, int N)
{
int i, data;
FILE *f;
/* sprintf(s, FNAME, 0);*/
if ((f = (FILE *) fopen(FNAME, "w+")) == NULL) {
perror("io2: error in opening auxiliary files");
exit(1);
}
for (i = 0; i < N * N; i++) {
data = i + 1;
if (fwrite(&data, sizeof(int), 1, f) != 1) {
perror("io3: can't write in an auxiliary file");
fclose(f);
exit(1);
}
}
fclose(f);
}
void
fill_file_B(char *FNAME, int N)
{
int i, j, data;
FILE *f;
/* sprintf(s, FNAME, 0);*/
if ((f = (FILE *) fopen(FNAME, "w+")) == NULL) {
perror("io2: error in opening auxiliary files");
exit(1);
}
for (i = 0; i < N; i++) {
for(j=0; j < N; j++) {
data = (i+1) + N*j;
if (fwrite(&data, sizeof(int), 1, f) != 1) {
perror("io3: can't write in an auxiliary file");
fclose(f);
exit(1);
}
}
}
fclose(f);
}
/*
* Print screen fname file (which much contain integers)
*/
void
aff(char *fname, int N)
{
FILE *fp;
int i,count=0;
fp = fopen(fname, "r");
fread(&i, 4, 1, fp);
while (0 == feof(fp)) {
printf("%6.5d ", i);
if(((count+1) % N) == 0) {
count = 0; printf("\n");
} else count++;
fread(&i, 4, 1, fp);
}
printf("\n");
fclose(fp);
}
void
aff_in_core(int *t,int n){
int i,j,count=0;
for(i=0;i<n*n;i++) {
printf("%6.5d",*(t+i));
if(((count+1) % n) == 0) {
printf("\n");count = 0;
} else {
count++;
}
}
}
/* c = a+b */
void add_in_core(int n, int *a, int *b, int *c)
{
int i, j;
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
*(c+(i*n)+j) = *(a+(i*n)+j) + *(b+(i*n)+j);
}
/* c = a-b */
void sub_in_core(int n, int *a, int *b, int *c)
{
int i, j;
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
*(c+(i*n)+j) = (*(a+(i*n)+j)) - (*(b+(i*n)+j));
}
/* c = a*b */
void multiply_in_core(int n, int *a, int *b, int *c)
{
int i, j, k;
for(i=0;i<n;i++)
for(j=0;j<n;j++){
*(c+(i*n)+j) = 0;
for(k=0;k<n;k++)
*(c+(i*n)+j) += *(a+(i*n)+k) * *(b+(k*n)+j);
}
}
/* c = a*b (Strassen in-core Algorithm) */
void multiply_strassen(int NN, int *a, int *b, int *c)
{
int i,j,k,n;
int *a11, *a12, *a21, *a22;
int *b11, *b12, *b21, *b22;
int *M0, *M1, *M2, *M3, *M4, *M5, *M6;
int *c1, *c2, *d1, *d2;
int *c11, *c12, *c21, *c22;
n = NN/2;
/* initial step: isolate sub-matrices */
a11 = (int *)malloc(n*n*sizeof(int));
a12 = (int *)malloc(n*n*sizeof(int));
a21 = (int *)malloc(n*n*sizeof(int));
a22 = (int *)malloc(n*n*sizeof(int));
b11 = (int *)malloc(n*n*sizeof(int));
b12 = (int *)malloc(n*n*sizeof(int));
b21 = (int *)malloc(n*n*sizeof(int));
b22 = (int *)malloc(n*n*sizeof(int));
for(i=0;i<n;i++)
for(j=0;j<n;j++) {
k = 2*(i*n) + j;
*(a11 + (i*n) + j) = *(a + k);
*(b11 + (i*n) + j) = *(b + k);
}
for(i=0;i<n;i++)
for(j=0;j<n;j++){
k = (i*n) + j + (n*(i+1));
*(a12 + (i*n) + j) = *(a + k);
*(b12 + (i*n) + j) = *(b + k);
}
for(i=0;i<n;i++)
for(j=0;j<n;j++){
k = 2*(i*n) + j + (n * Nb);
*(a21 + (i*n) + j) = *(a + k);
*(b21 + (i*n) + j) = *(b + k);
}
for(i=0;i<n;i++)
for(j=0;j<n;j++){
k = 2*(i*n) + j + (n*Nb) + n;
*(a22 + (i*n) + j) = *(a + k);
*(b22 + (i*n) + j) = *(b + k);
}
/*
aff(a11,n);printf("\n");aff(a12,n);printf("\n");
aff(a21,n);printf("\n");aff(a22,n);printf("\n");
*/
/* first part */
d1 = (int *)malloc(n*n*sizeof(int));
d2 = (int *)malloc(n*n*sizeof(int));
M0 = (int *)malloc(n*n*sizeof(int));
M1 = (int *)malloc(n*n*sizeof(int));
M2 = (int *)malloc(n*n*sizeof(int));
M3 = (int *)malloc(n*n*sizeof(int));
M4 = (int *)malloc(n*n*sizeof(int));
M5 = (int *)malloc(n*n*sizeof(int));
M6 = (int *)malloc(n*n*sizeof(int));
c1 = (int *)malloc(n*n*sizeof(int));
c2 = (int *)malloc(n*n*sizeof(int));
c11 = (int *)malloc(n*n*sizeof(int));
c12 = (int *)malloc(n*n*sizeof(int));
c21 = (int *)malloc(n*n*sizeof(int));
c22 = (int *)malloc(n*n*sizeof(int));
add_in_core(n, a11, a22, d1);
add_in_core(n, b11, b22, d2);
multiply_in_core(n, d1, d2, M0);
sub_in_core(n, a12, a22, d1);
add_in_core(n, b21, b22, d2);
multiply_in_core(n, d1, d2, M1);
sub_in_core(n, b21, b11, d1);
multiply_in_core(n, a22, d1, M2);
add_in_core(n, a11, a12, d1);
multiply_in_core(n, d1, b22, M3);
add_in_core(n, a21, a22, d1);
multiply_in_core(n, d1, b11, M4);
sub_in_core(n, b12, b22, d1);
multiply_in_core(n, a11, d1, M5);
sub_in_core(n, a21, a11, d1);
add_in_core(n, b11, b12, d2);
multiply_in_core(n, d1, d2, M6);
/* Second part */
add_in_core(n, M0, M1, c1);
sub_in_core(n, M2, M3, c2);
add_in_core(n,c1,c2,c11);
add_in_core(n, M3, M5, c12);
add_in_core(n, M2, M4, c21);
sub_in_core(n, M0, M4, c1);
add_in_core(n, M5, M6, c2);
add_in_core(n, c1, c2, c22);
/* Third step: copy into the final destination */
for(i=0;i<n;i++)
for(j=0;j<n;j++) {
k = 2*(i*n) + j;
*(c + k) = *(c11 + (i*n) + j);
}
for(i=0;i<n;i++)
for(j=0;j<n;j++){
k = (i*n) + j + (n*(i+1));
*(c + k) = *(c12 + (i*n) + j);
}
for(i=0;i<n;i++)
for(j=0;j<n;j++){
k = 2*(i*n) + j + (n * Nb);
*(c + k) = *(c21 + (i*n) + j);
}
for(i=0;i<n;i++)
for(j=0;j<n;j++){
k = 2*(i*n) + j + (n*Nb) + n;
*(c + k) = *(c22 + (i*n) + j);
}
}
int
main( int argc, char *argv[] )
{
int i, j, k;
int iv;
int myrank, commsize;
int extent;
int colext;
int slicelen;
int sliceext;
int sizes[2], subsizes[2], starts[2];
int length[ 3 ];
MPI_Aint disp[ 3 ];
MPI_Datatype type[ 3 ];
MPI_Datatype ftypeA;
MPI_Datatype ftypeB;
MPI_Datatype ftypeC;
MPI_Datatype slice_typeB;
MPI_Datatype block_typeC;
MPI_Datatype slice_typeC;
int mode;
MPI_File fh_A, fh_B, fh_C;
MPI_Offset offsetA;
MPI_Offset offsetB;
MPI_Offset offsetC;
MPI_Status status;
int val_A[ Nb ][ Ng ], val_B[ Ng ][ Nb ], val_C[ Nb ][ Nb ];
/* initialize MPI */
MPI_Init( &argc, &argv );
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
MPI_Comm_size( MPI_COMM_WORLD, &commsize );
/* We fill and we print the file contain */
fill_file_A(fileA,Nb);
fill_file_B(fileB,Nb);
if(myrank == 0) {
printf("File A:\n");
aff(fileA,Nb);
printf("File B:\n");
aff(fileB,Nb);
}
/* exit(0);*/
/*****************************************/
MPI_Type_extent( MPI_INT, &extent );
colext = Nb * extent;
slicelen = Ng * Nb;
sliceext = slicelen * extent;
/*
The subarray type constructor creates an MPI datatype describing
an n-dimensional subarray of an n-dimensional array. The subarray may
be situated anywhere within the full array, and may be of any nonzero
size up to the size of the larger array as long as it is confined
within this array. This type constructor facilitates creating
filetypes to access arrays distributed in blocks among processes to a
single file that contains the global array.
Note: { MPI_ORDER_C}
The ordering used by C arrays, (i.e., row-major order)
*/
/* create filetype for matrix A */
sizes[0]=Nb; sizes[1]=Nb;
subsizes[0]= Nb/2; subsizes[1]=Nb/2;
starts[0]=(myrank % (P/2))*subsizes[0];
starts[1]=((myrank / (P/2))*subsizes[1]);
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C,
MPI_INT, &ftypeA);
MPI_Type_commit( &ftypeA );
/* create filetype for matrix B */
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C,
MPI_INT, &ftypeB);
MPI_Type_commit( &ftypeB );
/* create filetype for matrix C */
MPI_Type_vector( Nb, Nb, Ng, MPI_INT, &block_typeC );
MPI_Type_hvector( P, 1, colext, block_typeC, &slice_typeC );
length[ 0 ] = 1;
length[ 1 ] = 1;
length[ 2 ] = 1;
disp[ 0 ] = 0;
disp[ 1 ] = sliceext * myrank;
disp[ 2 ] = sliceext * P;
type[ 0 ] = MPI_LB;
type[ 1 ] = slice_typeC;
type[ 2 ] = MPI_UB;
MPI_Type_struct( 3, length, disp, type, &ftypeC );
MPI_Type_commit( &ftypeC );
/* open files */
mode = MPI_MODE_RDONLY;
MPI_File_open( MPI_COMM_WORLD, fileA, mode, MPI_INFO_NULL, &fh_A );
mode = MPI_MODE_RDONLY;
MPI_File_open( MPI_COMM_WORLD, fileB, mode, MPI_INFO_NULL, &fh_B );
mode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
MPI_File_open( MPI_COMM_WORLD, fileC, mode, MPI_INFO_NULL, &fh_C );
/* set views */
MPI_File_set_view (fh_A, MPI_OFFSET_ZERO, MPI_INT, ftypeA,
"native", MPI_INFO_NULL );
MPI_File_set_view( fh_B, MPI_OFFSET_ZERO, MPI_INT, ftypeB,
"native", MPI_INFO_NULL );
MPI_File_set_view( fh_C, MPI_OFFSET_ZERO, MPI_INT, ftypeC,
"native", MPI_INFO_NULL );
offsetA = MPI_OFFSET_ZERO;
offsetB = MPI_OFFSET_ZERO;
offsetC = MPI_OFFSET_ZERO;
/* read a block of A */
MPI_File_read_at_all( fh_A, offsetA, val_A, (Nb/2)*(Nb/2),
MPI_INT, &status );
printf("Block of A (PID=%d)\n",myrank);
aff_in_core(val_A,Nb/2);
/* read a block of B */
MPI_File_read_at_all( fh_B, offsetB, val_B, (Nb/2)*(Nb/2),
MPI_INT, &status );
printf("Block of B (%d)\n",myrank);
aff_in_core(val_B,Nb/2);
/* close files */
MPI_File_close( &fh_A );
MPI_File_close( &fh_B );
MPI_File_close( &fh_C );
/* free filetypes */
MPI_Type_free( &ftypeA );
MPI_Type_free( &ftypeB );
MPI_Type_free( &ftypeC );
if(myrank == 0) {
printf("File C:\n");
/* aff(fileC,Nb); */
}
/* finalize MPI */
MPI_Finalize();
}
/****************************
* Strassen in-core algorithm
****************************
int
main(){
int *A,*B,*C;
int i,j,k;
A = (int *)malloc(Nb*Nb*sizeof(int));
B = (int *)malloc(Nb*Nb*sizeof(int));
C = (int *)malloc(Nb*Nb*sizeof(int));
fill_A(A,Nb);
printf("Matrix A:\n");
aff(A,Nb);
fill_B(B,Nb);
printf("Matrix B:\n");
aff(B,Nb);
multiply_strassen(Nb,A,B,C);
printf("Matrix C:\n");
aff(C,Nb);
}
************************/
/******* Execution Trace
[cerin@fatboy cerin]$ /usr/local/mpi/bin/mpicc -O out-of-core-strassen.c
[cerin@fatboy cerin]$ /usr/local/mpi/bin/mpirun -np 4 a.out
File A:
00001 00002 00003 00004 00005 00006 00007 00008
00009 00010 00011 00012 00013 00014 00015 00016
00017 00018 00019 00020 00021 00022 00023 00024
00025 00026 00027 00028 00029 00030 00031 00032
00033 00034 00035 00036 00037 00038 00039 00040
00041 00042 00043 00044 00045 00046 00047 00048
00049 00050 00051 00052 00053 00054 00055 00056
00057 00058 00059 00060 00061 00062 00063 00064
File B:
00001 00009 00017 00025 00033 00041 00049 00057
00002 00010 00018 00026 00034 00042 00050 00058
00003 00011 00019 00027 00035 00043 00051 00059
00004 00012 00020 00028 00036 00044 00052 00060
00005 00013 00021 00029 00037 00045 00053 00061
00006 00014 00022 00030 00038 00046 00054 00062
00007 00015 00023 00031 00039 00047 00055 00063
00008 00016 00024 00032 00040 00048 00056 00064
Block of A (PID=0)
00001 00002 00003 00004
00009 00010 00011 00012
00017 00018 00019 00020
00025 00026 00027 00028
Block of B (PID=0)
00001 00009 00017 00025
00002 00010 00018 00026
00003 00011 00019 00027
00004 00012 00020 00028
File C:
Block of A (PID=3)
00037 00038 00039 00040
00045 00046 00047 00048
00053 00054 00055 00056
00061 00062 00063 00064
Block of B (PID=3)
00037 00045 00053 00061
00038 00046 00054 00062
00039 00047 00055 00063
00040 00048 00056 00064
Block of A (PID=1)
00033 00034 00035 00036
00041 00042 00043 00044
00049 00050 00051 00052
00057 00058 00059 00060
Block of B (PID=1)
00005 00013 00021 00029
00006 00014 00022 00030
00007 00015 00023 00031
00008 00016 00024 00032
Block of A (PID=2)
00005 00006 00007 00008
00013 00014 00015 00016
00021 00022 00023 00024
00029 00030 00031 00032
Block of B (PID=2)
00033 00041 00049 00057
00034 00042 00050 00058
00035 00043 00051 00059
00036 00044 00052 00060
[cerin@fatboy cerin]$
*****************************/