MPI_Gether returns SUCCESS But Data in the Buffer M_Div is ZERO

Hi,

I am following the tutorial at:

http://mpitutorial.com/tutorials/mpi-scatter-gather-and-allgather/

I have done MPI_Scatter(...) to distribute data. After that I am doing division using a function computerDivion. computerDivision is returning a pointer and storing it in div_buffer.computeDivison is returning a pointer to array of double values of size num_cols.After that I am calling MPI_Gather(...). I want to store the data returned by each process into a pointer to pointer of 2Darray but the contents of 2D Array M_div is zero. Also MPI_Gather is return a SUCCESS value.

My code is given below:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#include <iostream>
#include <cmath>
#include <mpi.h>
#include <fstream>
#include <ctime>
#include <vector>
#define N 16

int rank, size;
double* computeDivisions(double* recv_buffer, int, int);
void GaussianElimination(double **,double *b ,double *y);
int main(int argc, char * argv[])
{
  double sTime, eTime, rTime;
  std::ifstream inFile;
  int num_rows = 4;
  int num_cols = 4;
  int num_processors =4;
  int cur_control = 0;
  double * send_buffer = NULL;
  double * recv_buffer = NULL;
  double ** data = NULL;
  double determinant;
  double *div_buffer=NULL;
  int irow =0; int icol=0; int iIndex =0;
  std::vector<double> file_buffer;
   double **M_div=NULL;


  double **M_A, *I_A, *I_B, *I_Y;
  double *Output, Pivot;
  I_B = NULL;
  I_B = new double[N];
  if(I_B == NULL){
    std::cout<< " I_A can't be allocated memory";
    MPI_Finalize();
    return -2;
  }
  I_A = NULL;
  I_A = new double[N];
  if(I_A == NULL){
    std::cout<< " I_A can't be allocated memory";
    MPI_Finalize();
    return -2;
  }
  I_Y = NULL;
  I_Y = new double[N];
  if(I_Y== NULL){
    std::cout<< " I_B can't be allocated memory";
    MPI_Finalize();
    return -2;
  }
  
 recv_buffer = new double[N];
 if(recv_buffer== NULL){
    std::cout<< " recv_buffer can't be allocated memory";
    return -2;
  }
   if(!rank)
  {
    
       
       std::cout<<"WE have "<<size <<" processors " <<std::endl;
       M_A[0][0] =2.0;M_A[0][1] =1.0; M_A[0][2] = -1.0; M_A[0][3] =2.0;  //A[0][3] = 12.0;
       M_A[1][0] =4.0;M_A[1][1] =5.0; M_A[1][2] =-3.0;   M_A[1][3] = 6.0;//A[1][3] = 0.0;
       M_A[2][0] =-2.0;M_A[2][1] =5.0; M_A[2][2] = -2.0; M_A[2][3]=6.0;//A[0][3] = -9;
       M_A[3][0] =4.0;M_A[3][1] =11.0; M_A[3][2] = -4.0; M_A[3][3]=8.0;//A[0][3] = -9;
       I_B[0] = 5; I_B[1] = 9; I_B[2] = 4; I_B[3]=2;
       
       //2d to 1d array is giving core dumped
       
       for(irow=0; irow<num_rows; irow++)
	  	  for(icol=0; icol<num_cols; icol++)
			  I_A[iIndex++] = M_A[irow][icol];

  

   }//if(!rank)

  

   MPI_Bcast (&num_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);

   MPI_Scatter(I_A, num_cols, MPI_DOUBLE, recv_buffer, num_cols, MPI_DOUBLE, 0, MPI_COMM_WORLD);//data goes to each process

   //http://mpitutorial.com/tutorials/mpi-scatter-gather-and-allgather/
   
   div_buffer = computeDivisions(recv_buffer, rank, num_cols);

   if(rank==0){
     std::cout<<"printing div_buffer"<<std::endl;
     for(int j=0; j<num_cols; j++)
           std::cout<<div_buffer[j]<<" ";

    std::cout<<"printing of div_buffer ends"<<std::endl;
   }
   

  int ret_val = MPI_Gather(M_div, num_cols, MPI_DOUBLE, div_buffer, num_cols, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  
  if(ret_val == MPI_SUCCESS)
  std::cout<<"MPI_Gather Success";
  else if(ret_val == MPI_ERR_COMM)
     std::cout<<"MPI_Gather : MPI_ERR_COMM";
  else if(ret_val == MPI_ERR_COUNT)
    std::cout<<"MPI_Gather : MPI_ERR_COUNT";
  else if(ret_val == MPI_ERR_TYPE)
    std::cout<<"MPI_Gather : MPI_ERR_TYPE";
  else if(ret_val == MPI_ERR_BUFFER)
    std::cout<<"MPI_Gather : MPI_ERR_BUFFER";


   if(rank==0){
     std::cout<<"printing M_div"<<std::endl;
     for(int i=0; i<num_rows;i++){   
        for(int j=0; j<num_cols; j++)
           std::cout<<M_div[i][j]<<" ";
        std::cout<<std::endl;
     }
     std::cout<<"printing of M_div ends"<<std::endl;
   }
  
    for(int i = 0; i < num_rows; i++)
       delete [] M_div[i];
    delete [ ] M_div;
    delete [ ] div_buffer;
    delete [] I_Y;
    delete [ ]I_B;
    for(int i = 0; i < num_rows; i++)
       delete [] M_A[i];
    delete [ ] M_A;// No need to delete A because A is not created dynamically
    MPI_Finalize();
    return 0;
}


double* computeDivisions(double* recv_buffer, int rank, int num_cols){
       
            for(int j= rank+1; j <=num_cols-1; ++j)
               recv_buffer[j] = recv_buffer[j]/recv_buffer[rank];
              
            
            return recv_buffer;
}

My output is:


$ mpirun -np 4 ./a.out
WE have 4 processors
printing div_buffer
2 0.5 -0.5 1 printing of div_buffer ends
MPI_Gather Successprinting M_div
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
printing of M_div ends
MPI_Gather SuccessMPI_Gather SuccessMPI_Gather Success



Some body please guide me.

Zulfi.


@zak100
The code you have presented won't produce the output you claim. It doesn't even have statements to initialise the MPI system (MPI_Init etc.)

If you want help then:
- present ALL your code properly, so that we have a chance of running it and reproducing your problem;
- remove all the lines completely unnecessary to your problem, as otherwise we are just looking at an obscure walll of code;
- use proper indentation and stop mixing spaces and tabs randomly.

I suspect you have the send and receive buffers the wrong way round in MPI_Gather, but who knows: your code won't run.

For reference (noting positions of the sending and receiving buffers):
1
2
3
int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
               void *recvbuf, int recvcount, MPI_Datatype recvtype,
               int root, MPI_Comm comm)

Obviously, only the root processor needs a valid receive buffer here.


In another thread @mbozzi asked you why you aren't using std::vector., and he has a point. If you need to get at the contiguous data of a std::vector V then you can use V.data() - see
http://www.cplusplus.com/reference/vector/vector/data/
Then you won't have to be continually creating arrays on the heap with new.
Last edited on
Hi,
Thanks for your response. I would take care of removing all unnecessary code next time




asked you why you aren't using std::vector., and he has a point

Do you mean I have to do it for all my declarations :
 
double *I_A, *I_B, *I_Y, *recv_buffer, *div_buffer;


For example for I_A I have to write following code:
1
2
3
std::vector<int> myvector_I_A (N);

int* I_A = myvector_I_A.data();

and then again for I_B like this:
1
2
std::vector<int> myvector_I_B (num_rows);
int* I_B = myvector_I_B.data();


Some body please guide me.

Zulfi.
Last edited on
Here you go @zak100 (EDITED)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include "mpi.h"
using namespace std;

const double SMALL = 1.0E-30;     // Used to stop divide-by-zero
const double PZERO = 1.0E-10;     // Interpretation of "zero" in output

using vec = vector<double>;
using veci = vector<int>;
using matrix = vector<vec>;   

// Function prototypes
bool GaussElimination( matrix A, vec &X );
ostream &operator << ( ostream &strm, const vec &V );
matrix readData();
matrix randomData( int N );

// MPI stuff
int tag = 99;
MPI_Status stat;
int myrank, size;
veci procRow;   // Processor owning a given row
veci localRow;  // Number of row on its own processor

//======================================

int main( int argc, char * argv[] )
{
   MPI_Init( &argc, &argv );
   MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
   MPI_Comm_size( MPI_COMM_WORLD, &size   );

   const int NTIMES = 1000;     // Repeat count
   int N = 128;                 // Generated size
   bool random = false;
   unsigned seed = 42;
// unsigned seed = time( 0 );
   srand( seed );
   matrix A;
   vec X;
   bool ok;


   // Set up matrix and RHS
   if ( random ) A = randomData( N );       // Randomly generated
   else          A = readData();            // Read from (e.g.) file


   // Solve (time multiple runs)
   clock_t start  = clock();
   for ( int times = 1; times <= NTIMES; times++ ) ok = GaussElimination( A, X );
   clock_t finish = clock();

   if ( myrank == 0 )
   {
      cout << "Time: " << (double)( finish - start ) / CLOCKS_PER_SEC << "\n\n";
      if ( ok ) cout << "Solution: " << X << '\n';
      else      cout << "Unable to solve\n";
   }

   MPI_Finalize();
}

//======================================

bool GaussElimination( matrix A, vec &X )
//-------------------------------------------------------------------
// Solve linear system by Gaussian elimination
// Note: A is the AUGMENTED matrix (last column is actually B); it has N rows and N+1 columns
// Note: Copy of A because first argument is passed by value
// Note: Each processor only has SOME of the rows
//-------------------------------------------------------------------
{
   int N = A[0].size() - 1;
   vec row( N + 1 );          
   X = vec( N );
   int proc;                           // Processor in charge of pivot row
   bool doPivoting = true;             // Slower, but more reliable
   struct ptype{ double Amax; int r; };

   //**********************
   // Forward elimination *
   //**********************
   for ( int i = 0; i < N; i++ )
   {
      proc = procRow[i];

      //************************************************************************
      // Partial pivoting: find row r below i with largest element in column i *
      //************************************************************************
      if ( doPivoting && i < N - 1 )
      {
         // Each processor updates its own maxA
         ptype p{ 0.0, i }, pmax;
         for ( int k = i; k < N; k++ )
         {
            if ( myrank == procRow[k] )
            {
               double val = abs( A[localRow[k]][i] );
               if ( val > p.Amax ) p = ptype{ val, k };
            }
         }                                                              
   
         // Global reduction to get the row to swap with
         MPI_Allreduce( &p, &pmax, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD );
         int r = pmax.r;
   
         // Swap (tails of) ith and rth rows
         if ( r != i )
         {
            int remote = procRow[r];
            if ( myrank == proc && myrank == remote )      // Swap rows on same processor
            {
               int ii = localRow[i];
               int rr = localRow[r];
               for ( int j = i; j <= N; j++ ) swap( A[ii][j], A[rr][j] );
            }
            else if ( myrank == proc )
            {
               int ii = localRow[i];
               MPI_Sendrecv_replace( A[ii].data() + i, N + 1 - i, MPI_DOUBLE, remote, tag, remote, tag, MPI_COMM_WORLD, &stat );
            }
            else if ( myrank == remote )
            {
               int rr = localRow[r];
               MPI_Sendrecv_replace( A[rr].data() + i, N + 1 - i, MPI_DOUBLE, proc  , tag, proc  , tag, MPI_COMM_WORLD, &stat );
            }
         }
      }

      //**************************
      // Reduction of lower rows *
      //**************************
      // Send tail of pivot row to other processors
      if ( myrank == proc ) row = A[localRow[i]];
      MPI_Bcast( row.data() + i, N + 1 - i, MPI_DOUBLE, proc, MPI_COMM_WORLD );
      if ( abs( row[i] ) < SMALL ) return false;           // Singular matrix

      for ( int r = i + 1; r < N; r++ )         
      {
         if ( myrank == procRow[r] )
         {
            int rr = localRow[r];   
            double multiple = A[rr][i] / row[i];
            for ( int j = i; j < N + 1; j++ ) A[rr][j] -= multiple * row[j];
         }
      }
   }

   //***********************
   // Backward elimination *
   //***********************
   for ( int i = N - 1; i >= 0; i-- )
   {
      proc = procRow[i];
      if ( myrank == proc )
      {
         int ii = localRow[i];
         X[i] = A[ii][N] / A[ii][i];   // Solution for the pivot row (also a multiplier)
      }
      MPI_Bcast( &X[i], 1, MPI_DOUBLE, proc, MPI_COMM_WORLD );
      for ( int r = i - 1; r >= 0; r-- )
      {
         if ( myrank == procRow[r] )
         {
            int rr = localRow[r];
            A[rr][N] -= A[rr][i] * X[i];
            A[rr][i] = 0.0;
         }
      }
   }

   return true;
}

//======================================

ostream &operator << ( ostream &strm, const vec &V )
{
   for ( auto x : V ) strm << setw( 12 ) << ( abs( x ) < PZERO ? 0.0 : x ) << ' ';
   return strm;
}

//======================================

matrix readData()
{
// ifstream in( "in.txt" );
   stringstream in( "4"
                    " 1  2  3  4     1"
                    "-2  5  5  7     2"
                    " 1  9 10  3     3"
                    " 2  2  4  3     4" );

   int N;
   if ( !myrank ) in >> N;
   MPI_Bcast( &N, 1, MPI_INT, 0, MPI_COMM_WORLD );
   procRow  = veci( N );
   localRow = veci( N );

   matrix A;
   vec row( N + 1 );

   for ( int i = 0; i < N; i++ )
   {
      procRow [i] = i % size;
      localRow[i] = i / size;

      int proc = procRow[i];
      if ( myrank == 0 )
      {
         for ( int j = 0; j < N + 1; j++ ) in >> row[j];
         if ( myrank == proc ) A.push_back( row );
         else                  MPI_Send( row.data(), N + 1, MPI_DOUBLE, proc, tag, MPI_COMM_WORLD );
      }
      else if ( myrank == proc ) 
      {
         MPI_Recv( row.data(), N + 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &stat );
         A.push_back( row );
      }
   }

   return A;
}

//======================================

matrix randomData( int N )
{
   procRow  = veci( N );
   localRow = veci( N );

   matrix A;
   vec row( N + 1 );

   for ( int i = 0; i < N; i++ )
   {
      procRow [i] = i % size;
      localRow[i] = i / size;

      int proc = procRow[i];
      if ( myrank == 0 )
      {
         for ( int j = 0; j < N + 1; j++ ) row[j] = rand() % 10;
         if ( myrank == proc ) A.push_back( row );
         else                  MPI_Send( row.data(), N + 1, MPI_DOUBLE, proc, tag, MPI_COMM_WORLD );
      }
      else if ( myrank == proc ) 
      {
         MPI_Recv( row.data(), N + 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &stat );
         A.push_back( row );
      }
   }

   return A;
}

//====================================== 

Last edited on
Hi,

Thanks for your help. I found a sol of MPI_Gather(....). I have used different buffers. Thanks for this program. Actually I don't have to use pivoting or partial pivoting.I have to use a specific alg from the book, My code basically tries to implement that alg.

Zulfi.
Registered users can post here. Sign in or register to post.