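/* Matrix multiplication on a "2-D grid" process topology.
 */
/* This example assumes that the number of rows of matrix A and the number
 * of columns of matrix B are evenly divisible by the corresponding
 * dimensions of the process grid.
 * It is meant to be run on 4 processes arranged as a 2x2 grid.
 */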
   #include<stdio.h>
   #include<stdlib.h>
   #include<mpi.h>
   #include<time.h>
   #include<sys/time.h>
/* NUM_DIMS - dimensionality of the Cartesian process topology:
 * a "2-D grid" of P0 x P1 processes. */
   #define NUM_DIMS 2
   #define P0 2
   #define P1 2
/* Matrix sizes: A is MxN, B is NxK and C is MxK.
 * All three matrices are stored row-major in flat arrays.
 */
   #define M 64
   #define N 64
   #define K 64
/* Element accessors for the flat arrays. The arguments are parenthesized
 * so that expressions such as A(i+1, j) index the intended element. */
   #define A(i,j) A[N*(i)+(j)]
   #define B(i,j) B[K*(i)+(j)]
   #define C(i,j) C[K*(i)+(j)]

/* Routine that performs the parallel matrix multiplication C = A * B */

/* Allocate `count` elements of `elem_size` bytes each (at least one element,
 * so that a zero-sized request never yields a spurious NULL).
 * A failed allocation cannot be recovered from inside a collective
 * algorithm, so the whole job is aborted on `comm`. */
static void *pmatmat_alloc(size_t count, size_t elem_size, MPI_Comm comm)
{
    void *ptr = malloc((count ? count : 1) * elem_size);

    if (ptr == NULL) {
        fprintf(stderr, "PMATMAT_2: out of memory\n");
        MPI_Abort(comm, 1);
    }
    return ptr;
}

/* Parallel matrix multiplication C = A * B on a p[0] x p[1] process grid.
 *
 * Parameters:
 *   n    - matrix sizes: A is n[0] x n[1], B is n[1] x n[2], C is n[0] x n[2].
 *          Input on rank 0 of comm; overwritten on every rank by the broadcast.
 *   A, B - input matrices (row-major), read on rank 0 only.
 *   C    - result matrix (row-major), written on rank 0 only.
 *   p    - grid dimensions; p[0] must divide n[0] and p[1] must divide n[2].
 *          Input on rank 0 of comm; overwritten on every rank by the broadcast.
 *   comm - communicator containing at least p[0]*p[1] processes. The call is
 *          collective over comm; ranks that do not fit into the grid return
 *          immediately after the grid is created.
 *
 * Returns 0 on success, or -1 (on every rank of comm) when the sizes are
 * negative, the grid dimensions are not positive, the grid does not fit into
 * comm, or the divisibility requirement is violated.
 */
int PMATMAT_2(int *n, double *A, double *B, double *C, int *p, MPI_Comm comm)
{
    double *AA, *BB, *CC;     /* local strips of A and B, local block of C */
    int nn[2];                /* size of the local C block: nn[0] x nn[1] */
    int coords[2];            /* this process' coordinates in the grid */
    int rank, nprocs;
    /* Counts and displacements for scattering B and gathering C.
     * They are only allocated (and only significant) on the root. */
    int *countc = NULL, *dispc = NULL, *countb = NULL, *dispb = NULL;
    /* Derived datatypes describing a column strip of B and a block of C
     * inside the full matrices; only created on the root. */
    MPI_Datatype typeb = MPI_DATATYPE_NULL, typec = MPI_DATATYPE_NULL;
    int i, j, k;
    int periods[2], remains[2];
    /* The 2-D grid, its two 1-D sub-grids and a private copy of comm */
    MPI_Comm comm_2D, comm_1D[2], pcomm;

 /* Work on a duplicate so our traffic cannot interfere with the caller's */
    MPI_Comm_dup(comm, &pcomm);
 /* Make the problem and grid sizes known to every process */
    MPI_Bcast(n, 3, MPI_INT, 0, pcomm);
    MPI_Bcast(p, 2, MPI_INT, 0, pcomm);

 /* Every rank now holds the same n[] and p[], so every rank takes the
  * same decision here and nobody is left waiting in a collective. */
    MPI_Comm_size(pcomm, &nprocs);
    if (n[0] < 0 || n[1] < 0 || n[2] < 0 ||
        p[0] <= 0 || p[1] <= 0 || p[0] > nprocs / p[1] ||
        n[0] % p[0] != 0 || n[2] % p[1] != 0)
      { MPI_Comm_free(&pcomm);
        return -1;
      }

 /* Create the non-periodic 2-D grid of p[0]*p[1] processes. Ranks are not
  * reordered, so rank 0 of comm is also rank 0 of the grid. */
    periods[0] = 0;
    periods[1] = 0;
    MPI_Cart_create(pcomm, 2, p, periods, 0, &comm_2D);
    if (comm_2D == MPI_COMM_NULL)
      { /* This process is not part of the grid: nothing to compute. */
        MPI_Comm_free(&pcomm);
        return 0;
      }
 /* Find our rank and coordinates in the grid */
    MPI_Comm_rank(comm_2D, &rank);
    MPI_Cart_coords(comm_2D, rank, 2, coords);

 /* Split the grid into 1-D communicators: comm_1D[0] keeps dimension 0
  * (processes sharing coords[1]), comm_1D[1] keeps dimension 1
  * (processes sharing coords[0]). */
    for(i = 0; i < 2; i++)
      { for(j = 0; j < 2; j++)
          remains[j] = (i == j);
        MPI_Cart_sub(comm_2D, remains, &comm_1D[i]);
      }

 /* Size of the block of C computed by each process */
    nn[0] = n[0]/p[0];
    nn[1] = n[2]/p[1];

  #define AA(i,j) AA[n[1]*(i)+(j)]
  #define BB(i,j) BB[nn[1]*(i)+(j)]
  #define CC(i,j) CC[nn[1]*(i)+(j)]

    AA = pmatmat_alloc((size_t)nn[0] * (size_t)n[1], sizeof *AA, pcomm);
    BB = pmatmat_alloc((size_t)n[1] * (size_t)nn[1], sizeof *BB, pcomm);
    CC = pmatmat_alloc((size_t)nn[0] * (size_t)nn[1], sizeof *CC, pcomm);

 /* The root prepares the datatypes and layout tables */
    if(rank == 0)
      { MPI_Datatype strip;           /* temporary, un-resized vector type */
        MPI_Aint lb, dbl_extent, block_extent;

        MPI_Type_get_extent(MPI_DOUBLE, &lb, &dbl_extent);
     /* Both derived types get an extent of nn[1] doubles, so displacement j
      * means "start j blocks of nn[1] columns to the right". */
        block_extent = dbl_extent * nn[1];

     /* A column strip of B: n[1] rows of nn[1] doubles, one row of the full
      * matrix (n[2] doubles) apart. */
        MPI_Type_vector(n[1], nn[1], n[2], MPI_DOUBLE, &strip);
        MPI_Type_create_resized(strip, 0, block_extent, &typeb);
        MPI_Type_commit(&typeb);
        MPI_Type_free(&strip);

     /* Strip j of B starts j extents from the beginning of B */
        dispb  = pmatmat_alloc((size_t)p[1], sizeof *dispb, pcomm);
        countb = pmatmat_alloc((size_t)p[1], sizeof *countb, pcomm);
        for(j = 0; j < p[1]; j++)
          { dispb[j] = j;
            countb[j] = 1;
          }

     /* A block of C: nn[0] rows of nn[1] doubles, n[2] doubles apart */
        MPI_Type_vector(nn[0], nn[1], n[2], MPI_DOUBLE, &strip);
        MPI_Type_create_resized(strip, 0, block_extent, &typec);
        MPI_Type_commit(&typec);
        MPI_Type_free(&strip);

     /* Block (i,j) of C starts at row i*nn[0], column j*nn[1]; measured in
      * extents of nn[1] doubles that is i*p[1]*nn[0] + j. The table is
      * indexed by grid rank, which is i*p[1]+j. */
        dispc  = pmatmat_alloc((size_t)p[0] * (size_t)p[1], sizeof *dispc, pcomm);
        countc = pmatmat_alloc((size_t)p[0] * (size_t)p[1], sizeof *countc, pcomm);
        for(i = 0; i < p[0]; i++)
          { for(j = 0; j < p[1]; j++)
              { dispc[i*p[1]+j] = (i*p[1]*nn[0] + j);
                countc[i*p[1]+j] = 1;
              }
          }
      }

 /* 1. The root scatters horizontal strips of A along dimension 0 to the
  *    processes with coords[1] == 0 */
 /*GPRD SECTION step_1 */
    if(coords[1] == 0)
      { MPI_Scatter(A, nn[0]*n[1], MPI_DOUBLE, AA, nn[0]*n[1], MPI_DOUBLE, 0, comm_1D[0]);
      }

 /* 2. The root scatters vertical strips of B along dimension 1 to the
  *    processes with coords[0] == 0 */
 /*GPRD SECTION step_2 */

    if(coords[0] == 0)
      { MPI_Scatterv(B, countb, dispb, typeb, BB, n[1]*nn[1], MPI_DOUBLE, 0, comm_1D[1]);
      }

 /* 3. Broadcast each strip AA along dimension 1 */
 /*GPRD SECTION step_3 */
     MPI_Bcast(AA, nn[0]*n[1], MPI_DOUBLE, 0, comm_1D[1]);
 /* 4. Broadcast each strip BB along dimension 0 */
 /*GPRD SECTION step_4 */
    MPI_Bcast(BB, n[1]*nn[1], MPI_DOUBLE, 0, comm_1D[0]);

 /* 5. Every process computes its own block of C */

    for(i = 0; i < nn[0]; i++)
      { for(j = 0; j < nn[1]; j++)
          { double sum = 0.0;
            for(k = 0; k < n[1]; k++)
              { sum += AA(i,k) * BB(k,j);
              }
            CC(i,j) = sum;
          }
      }

 /* 6. Gather all blocks of C on the root */
 /*GPRD SECTION step_6 */
    MPI_Gatherv(CC, nn[0]*nn[1], MPI_DOUBLE, C, countc, dispc, typec, 0, comm_2D);
 /*GPRD SECTION GPRD_MAIN_SECT */

 /* Release everything acquired above */
    free(AA);
    free(BB);
    free(CC);

    MPI_Comm_free(&pcomm);
    MPI_Comm_free(&comm_2D);
    for(i = 0; i < 2; i++)
      { MPI_Comm_free(&comm_1D[i]);
      }
    if(rank == 0)
      { MPI_Type_free(&typeb);
        MPI_Type_free(&typec);
      }
 /* The tables are NULL on non-root ranks, and free(NULL) is a no-op */
    free(countb);
    free(dispb);
    free(countc);
    free(dispc);

    return 0;
}

 /* Main program */

int main(int argc, char **argv)
{
    /* Driver: builds the test matrices on rank 0, runs and times
     * PMATMAT_2 on every rank, and prints the product on rank 0.
     * Returns 0 on success and 1 if the process count is wrong or the
     * multiplication routine reports an error. */
    int        size, MyP, n[3] = {0, 0, 0}, p[2] = {0, 0}, i, j, k;
    int        dims[NUM_DIMS], periods[NUM_DIMS];
    /* The matrices exist on rank 0 only; other ranks pass NULL */
    double     *A = NULL, *B = NULL, *C = NULL;
    int        reorder = 0;
    int        rc;
    struct timeval tv1, tv2;        /* wall-clock time before/after the call */
    long long  dt1;                 /* elapsed time in microseconds */
    MPI_Comm   comm;

 /* Start MPI */
    MPI_Init(&argc, &argv);
 /* Number of processes in the job */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
 /* Rank of this process */
    MPI_Comm_rank(MPI_COMM_WORLD, &MyP);

 /* The grid size is fixed at compile time. Every rank sees the same size,
  * so all of them leave together and nobody hangs in a collective. */
    if(size != P0 * P1)
      { if(MyP == 0)
          fprintf(stderr, "This program must be run on exactly %d processes (got %d)\n",
                  P0 * P1, size);
        MPI_Finalize();
        return 1;
      }

 /* Zero dims so that MPI_Dims_create chooses them; the grid is not periodic */
    for(i = 0; i < NUM_DIMS; i++) { dims[i] = 0; periods[i] = 0; }
 /* Let MPI pick a balanced factorisation of size */
    MPI_Dims_create(size, NUM_DIMS, dims);
 /* Create the "2-D grid" communicator comm */
    MPI_Cart_create(MPI_COMM_WORLD, NUM_DIMS, dims, periods, reorder, &comm);

 /* Only rank 0 knows the sizes and owns the matrices */
    if(MyP == 0)
      {
        n[0] = M;
        n[1] = N;
        n[2] = K;
        p[0] = P0;
        p[1] = P1;

        A = malloc((size_t)M * N * sizeof *A);
        B = malloc((size_t)N * K * sizeof *B);
        C = malloc((size_t)M * K * sizeof *C);
        if(A == NULL || B == NULL || C == NULL)
          { /* The other ranks are about to enter a collective: abort the job */
            fprintf(stderr, "main: out of memory\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
          }

     /* Fill A and B with test data and clear C */
        for(i = 0; i < M; i++)
          for(j = 0; j < N; j++)
            A(i,j) = i+1;
        for(j = 0; j < N; j++)
          for(k = 0; k < K; k++)
            B(j,k) = 21+j;
        for(i = 0; i < M; i++)
          for(k = 0; k < K; k++)
            C(i,k) = 0.0;
      }

 /* Start the timer */
    gettimeofday(&tv1, (struct timezone*)0);
 /* Multiply. The measured interval covers the whole call, i.e. it includes
  * distributing A and B and collecting C, not just the arithmetic. */
    rc = PMATMAT_2(n, A, B, C, p, comm);
    gettimeofday(&tv2, (struct timezone*)0);
    dt1 = (long long)(tv2.tv_sec - tv1.tv_sec) * 1000000LL
        + (long long)(tv2.tv_usec - tv1.tv_usec);
    printf("MyP = %d Time = %lld\n", MyP, dt1);

 /* Rank 0 prints the result */
    if(MyP == 0 && rc == 0)
      { for(i = 0; i < M; i++)
          { for(j = 0; j < K; j++)
              printf(" %3.1f",C(i,j));
            printf("\n");
          }
      }
    if(MyP == 0 && rc != 0)
      fprintf(stderr, "PMATMAT_2 failed (rc = %d)\n", rc);

 /* Release the matrices (NULL on non-root ranks) and the communicator */
    free(A);
    free(B);
    free(C);

    MPI_Comm_free(&comm);
    MPI_Finalize();
    return rc == 0 ? 0 : 1;
}

