/* File name: out-of-core-strassen.c /usr/local/mpi/bin/mpicc -O out-of-core-strassen.c */ #include "sys/types.h" #include "stdlib.h" #include "stdio.h" #include "string.h" #include <argz.h> #include <stdlib.h> #include <limits.h> #include <string.h> #include <math.h> #include <sys/time.h> #include <sys/stat.h> #include <unistd.h> #include "mpi.h" #define MPI_OFFSET_ZERO 0 #define P 4 /* number of tasks */ #define Nb 8 /* slice size */ #define Ng ( Nb * P ) /* global size of matrix */ static char fileA[] = "fileA"; static char fileB[] = "fileB"; static char fileC[] = "fileC"; void fill_file_A(char *FNAME, int N) { int i, data; FILE *f; /* sprintf(s, FNAME, 0);*/ if ((f = (FILE *) fopen(FNAME, "w+")) == NULL) { perror("io2: error in opening auxiliary files"); exit(1); } for (i = 0; i < N * N; i++) { data = i + 1; if (fwrite(&data, sizeof(int), 1, f) != 1) { perror("io3: can't write in an auxiliary file"); fclose(f); exit(1); } } fclose(f); } void fill_file_B(char *FNAME, int N) { int i, j, data; FILE *f; /* sprintf(s, FNAME, 0);*/ if ((f = (FILE *) fopen(FNAME, "w+")) == NULL) { perror("io2: error in opening auxiliary files"); exit(1); } for (i = 0; i < N; i++) { for(j=0; j < N; j++) { data = (i+1) + N*j; if (fwrite(&data, sizeof(int), 1, f) != 1) { perror("io3: can't write in an auxiliary file"); fclose(f); exit(1); } } } fclose(f); } /* * Print screen fname file (which much contain integers) */ void aff(char *fname, int N) { FILE *fp; int i,count=0; fp = fopen(fname, "r"); fread(&i, 4, 1, fp); while (0 == feof(fp)) { printf("%6.5d ", i); if(((count+1) % N) == 0) { count = 0; printf("\n"); } else count++; fread(&i, 4, 1, fp); } printf("\n"); fclose(fp); } void aff_in_core(int *t,int n){ int i,j,count=0; for(i=0;i<n*n;i++) { printf("%6.5d",*(t+i)); if(((count+1) % n) == 0) { printf("\n");count = 0; } else { count++; } } } /* c = a+b */ void add_in_core(int n, int *a, int *b, int *c) { int i, j; for (i = 0; i < n; i++) for (j = 0; j < n; j++) *(c+(i*n)+j) = *(a+(i*n)+j) + *(b+(i*n)+j); } /* c = a-b */ void sub_in_core(int n, int *a, int *b, int *c) { int i, j; for (i = 0; i < n; i++) for (j = 0; j < n; j++) *(c+(i*n)+j) = (*(a+(i*n)+j)) - (*(b+(i*n)+j)); } /* c = a*b */ void multiply_in_core(int n, int *a, int *b, int *c) { int i, j, k; for(i=0;i<n;i++) for(j=0;j<n;j++){ *(c+(i*n)+j) = 0; for(k=0;k<n;k++) *(c+(i*n)+j) += *(a+(i*n)+k) * *(b+(k*n)+j); } } /* c = a*b (Strassen in-core Algorithm) */ void multiply_strassen(int NN, int *a, int *b, int *c) { int i,j,k,n; int *a11, *a12, *a21, *a22; int *b11, *b12, *b21, *b22; int *M0, *M1, *M2, *M3, *M4, *M5, *M6; int *c1, *c2, *d1, *d2; int *c11, *c12, *c21, *c22; n = NN/2; /* initial step: isolate sub-matrices */ a11 = (int *)malloc(n*n*sizeof(int)); a12 = (int *)malloc(n*n*sizeof(int)); a21 = (int *)malloc(n*n*sizeof(int)); a22 = (int *)malloc(n*n*sizeof(int)); b11 = (int *)malloc(n*n*sizeof(int)); b12 = (int *)malloc(n*n*sizeof(int)); b21 = (int *)malloc(n*n*sizeof(int)); b22 = (int *)malloc(n*n*sizeof(int)); for(i=0;i<n;i++) for(j=0;j<n;j++) { k = 2*(i*n) + j; *(a11 + (i*n) + j) = *(a + k); *(b11 + (i*n) + j) = *(b + k); } for(i=0;i<n;i++) for(j=0;j<n;j++){ k = (i*n) + j + (n*(i+1)); *(a12 + (i*n) + j) = *(a + k); *(b12 + (i*n) + j) = *(b + k); } for(i=0;i<n;i++) for(j=0;j<n;j++){ k = 2*(i*n) + j + (n * Nb); *(a21 + (i*n) + j) = *(a + k); *(b21 + (i*n) + j) = *(b + k); } for(i=0;i<n;i++) for(j=0;j<n;j++){ k = 2*(i*n) + j + (n*Nb) + n; *(a22 + (i*n) + j) = *(a + k); *(b22 + (i*n) + j) = *(b + k); } /* aff(a11,n);printf("\n");aff(a12,n);printf("\n"); aff(a21,n);printf("\n");aff(a22,n);printf("\n"); */ /* first part */ d1 = (int *)malloc(n*n*sizeof(int)); d2 = (int *)malloc(n*n*sizeof(int)); M0 = (int *)malloc(n*n*sizeof(int)); M1 = (int *)malloc(n*n*sizeof(int)); M2 = (int *)malloc(n*n*sizeof(int)); M3 = (int *)malloc(n*n*sizeof(int)); M4 = (int *)malloc(n*n*sizeof(int)); M5 = (int *)malloc(n*n*sizeof(int)); M6 = (int *)malloc(n*n*sizeof(int)); c1 = (int *)malloc(n*n*sizeof(int)); c2 = (int *)malloc(n*n*sizeof(int)); c11 = (int *)malloc(n*n*sizeof(int)); c12 = (int *)malloc(n*n*sizeof(int)); c21 = (int *)malloc(n*n*sizeof(int)); c22 = (int *)malloc(n*n*sizeof(int)); add_in_core(n, a11, a22, d1); add_in_core(n, b11, b22, d2); multiply_in_core(n, d1, d2, M0); sub_in_core(n, a12, a22, d1); add_in_core(n, b21, b22, d2); multiply_in_core(n, d1, d2, M1); sub_in_core(n, b21, b11, d1); multiply_in_core(n, a22, d1, M2); add_in_core(n, a11, a12, d1); multiply_in_core(n, d1, b22, M3); add_in_core(n, a21, a22, d1); multiply_in_core(n, d1, b11, M4); sub_in_core(n, b12, b22, d1); multiply_in_core(n, a11, d1, M5); sub_in_core(n, a21, a11, d1); add_in_core(n, b11, b12, d2); multiply_in_core(n, d1, d2, M6); /* Second part */ add_in_core(n, M0, M1, c1); sub_in_core(n, M2, M3, c2); add_in_core(n,c1,c2,c11); add_in_core(n, M3, M5, c12); add_in_core(n, M2, M4, c21); sub_in_core(n, M0, M4, c1); add_in_core(n, M5, M6, c2); add_in_core(n, c1, c2, c22); /* Third step: copy into the final destination */ for(i=0;i<n;i++) for(j=0;j<n;j++) { k = 2*(i*n) + j; *(c + k) = *(c11 + (i*n) + j); } for(i=0;i<n;i++) for(j=0;j<n;j++){ k = (i*n) + j + (n*(i+1)); *(c + k) = *(c12 + (i*n) + j); } for(i=0;i<n;i++) for(j=0;j<n;j++){ k = 2*(i*n) + j + (n * Nb); *(c + k) = *(c21 + (i*n) + j); } for(i=0;i<n;i++) for(j=0;j<n;j++){ k = 2*(i*n) + j + (n*Nb) + n; *(c + k) = *(c22 + (i*n) + j); } } int main( int argc, char *argv[] ) { int i, j, k; int iv; int myrank, commsize; int extent; int colext; int slicelen; int sliceext; int sizes[2], subsizes[2], starts[2]; int length[ 3 ]; MPI_Aint disp[ 3 ]; MPI_Datatype type[ 3 ]; MPI_Datatype ftypeA; MPI_Datatype ftypeB; MPI_Datatype ftypeC; MPI_Datatype slice_typeB; MPI_Datatype block_typeC; MPI_Datatype slice_typeC; int mode; MPI_File fh_A, fh_B, fh_C; MPI_Offset offsetA; MPI_Offset offsetB; MPI_Offset offsetC; MPI_Status status; int val_A[ Nb ][ Ng ], val_B[ Ng ][ Nb ], val_C[ Nb ][ Nb ]; /* initialize MPI */ MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &myrank ); MPI_Comm_size( MPI_COMM_WORLD, &commsize ); /* We fill and we print the file contain */ fill_file_A(fileA,Nb); fill_file_B(fileB,Nb); if(myrank == 0) { printf("File A:\n"); aff(fileA,Nb); printf("File B:\n"); aff(fileB,Nb); } /* exit(0);*/ /*****************************************/ MPI_Type_extent( MPI_INT, &extent ); colext = Nb * extent; slicelen = Ng * Nb; sliceext = slicelen * extent; /* The subarray type constructor creates an MPI datatype describing an n-dimensional subarray of an n-dimensional array. The subarray may be situated anywhere within the full array, and may be of any nonzero size up to the size of the larger array as long as it is confined within this array. This type constructor facilitates creating filetypes to access arrays distributed in blocks among processes to a single file that contains the global array. Note: { MPI_ORDER_C} The ordering used by C arrays, (i.e., row-major order) */ /* create filetype for matrix A */ sizes[0]=Nb; sizes[1]=Nb; subsizes[0]= Nb/2; subsizes[1]=Nb/2; starts[0]=(myrank % (P/2))*subsizes[0]; starts[1]=((myrank / (P/2))*subsizes[1]); MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &ftypeA); MPI_Type_commit( &ftypeA ); /* create filetype for matrix B */ MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &ftypeB); MPI_Type_commit( &ftypeB ); /* create filetype for matrix C */ MPI_Type_vector( Nb, Nb, Ng, MPI_INT, &block_typeC ); MPI_Type_hvector( P, 1, colext, block_typeC, &slice_typeC ); length[ 0 ] = 1; length[ 1 ] = 1; length[ 2 ] = 1; disp[ 0 ] = 0; disp[ 1 ] = sliceext * myrank; disp[ 2 ] = sliceext * P; type[ 0 ] = MPI_LB; type[ 1 ] = slice_typeC; type[ 2 ] = MPI_UB; MPI_Type_struct( 3, length, disp, type, &ftypeC ); MPI_Type_commit( &ftypeC ); /* open files */ mode = MPI_MODE_RDONLY; MPI_File_open( MPI_COMM_WORLD, fileA, mode, MPI_INFO_NULL, &fh_A ); mode = MPI_MODE_RDONLY; MPI_File_open( MPI_COMM_WORLD, fileB, mode, MPI_INFO_NULL, &fh_B ); mode = MPI_MODE_CREATE | MPI_MODE_WRONLY; MPI_File_open( MPI_COMM_WORLD, fileC, mode, MPI_INFO_NULL, &fh_C ); /* set views */ MPI_File_set_view (fh_A, MPI_OFFSET_ZERO, MPI_INT, ftypeA, "native", MPI_INFO_NULL ); MPI_File_set_view( fh_B, MPI_OFFSET_ZERO, MPI_INT, ftypeB, "native", MPI_INFO_NULL ); MPI_File_set_view( fh_C, MPI_OFFSET_ZERO, MPI_INT, ftypeC, "native", MPI_INFO_NULL ); offsetA = MPI_OFFSET_ZERO; offsetB = MPI_OFFSET_ZERO; offsetC = MPI_OFFSET_ZERO; /* read a block of A */ MPI_File_read_at_all( fh_A, offsetA, val_A, (Nb/2)*(Nb/2), MPI_INT, &status ); printf("Block of A (PID=%d)\n",myrank); aff_in_core(val_A,Nb/2); /* read a block of B */ MPI_File_read_at_all( fh_B, offsetB, val_B, (Nb/2)*(Nb/2), MPI_INT, &status ); printf("Block of B (%d)\n",myrank); aff_in_core(val_B,Nb/2); /* close files */ MPI_File_close( &fh_A ); MPI_File_close( &fh_B ); MPI_File_close( &fh_C ); /* free filetypes */ MPI_Type_free( &ftypeA ); MPI_Type_free( &ftypeB ); MPI_Type_free( &ftypeC ); if(myrank == 0) { printf("File C:\n"); /* aff(fileC,Nb); */ } /* finalize MPI */ MPI_Finalize(); } /**************************** * Strassen in-core algorithm **************************** int main(){ int *A,*B,*C; int i,j,k; A = (int *)malloc(Nb*Nb*sizeof(int)); B = (int *)malloc(Nb*Nb*sizeof(int)); C = (int *)malloc(Nb*Nb*sizeof(int)); fill_A(A,Nb); printf("Matrix A:\n"); aff(A,Nb); fill_B(B,Nb); printf("Matrix B:\n"); aff(B,Nb); multiply_strassen(Nb,A,B,C); printf("Matrix C:\n"); aff(C,Nb); } ************************/ /******* Execution Trace [cerin@fatboy cerin]$ /usr/local/mpi/bin/mpicc -O out-of-core-strassen.c [cerin@fatboy cerin]$ /usr/local/mpi/bin/mpirun -np 4 a.out File A: 00001 00002 00003 00004 00005 00006 00007 00008 00009 00010 00011 00012 00013 00014 00015 00016 00017 00018 00019 00020 00021 00022 00023 00024 00025 00026 00027 00028 00029 00030 00031 00032 00033 00034 00035 00036 00037 00038 00039 00040 00041 00042 00043 00044 00045 00046 00047 00048 00049 00050 00051 00052 00053 00054 00055 00056 00057 00058 00059 00060 00061 00062 00063 00064 File B: 00001 00009 00017 00025 00033 00041 00049 00057 00002 00010 00018 00026 00034 00042 00050 00058 00003 00011 00019 00027 00035 00043 00051 00059 00004 00012 00020 00028 00036 00044 00052 00060 00005 00013 00021 00029 00037 00045 00053 00061 00006 00014 00022 00030 00038 00046 00054 00062 00007 00015 00023 00031 00039 00047 00055 00063 00008 00016 00024 00032 00040 00048 00056 00064 Block of A (PID=0) 00001 00002 00003 00004 00009 00010 00011 00012 00017 00018 00019 00020 00025 00026 00027 00028 Block of B (PID=0) 00001 00009 00017 00025 00002 00010 00018 00026 00003 00011 00019 00027 00004 00012 00020 00028 File C: Block of A (PID=3) 00037 00038 00039 00040 00045 00046 00047 00048 00053 00054 00055 00056 00061 00062 00063 00064 Block of B (PID=3) 00037 00045 00053 00061 00038 00046 00054 00062 00039 00047 00055 00063 00040 00048 00056 00064 Block of A (PID=1) 00033 00034 00035 00036 00041 00042 00043 00044 00049 00050 00051 00052 00057 00058 00059 00060 Block of B (PID=1) 00005 00013 00021 00029 00006 00014 00022 00030 00007 00015 00023 00031 00008 00016 00024 00032 Block of A (PID=2) 00005 00006 00007 00008 00013 00014 00015 00016 00021 00022 00023 00024 00029 00030 00031 00032 Block of B (PID=2) 00033 00041 00049 00057 00034 00042 00050 00058 00035 00043 00051 00059 00036 00044 00052 00060 [cerin@fatboy cerin]$ *****************************/