NeoPZ
decompose.cpp
Go to the documentation of this file.
1 
8 #ifdef HAVE_CONFIG_H
9 #include <pz_config.h>
10 #endif
11 
12 #include <iostream>
13 #include <cstdlib>
14 
15 #include "pzbfilestream.h" // TPZBFileStream, TPZFileStream
16 #include "pzmd5stream.h"
17 
18 #include <fstream>
19 #include <string>
20 #ifdef USING_LIBNUMA
21 #include <numa.h>
22 #endif // USING_LIBNUMA
23 
24 #include "pzlog.h"
25 
26 #ifdef LOG4CXX
27 static LoggerPtr loggerconverge(Logger::getLogger("pz.converge"));
28 static LoggerPtr logger(Logger::getLogger("main"));
29 #endif
30 
31 #include "pzskylmat.h"
32 
33 //#include "timing_analysis.h"
34 #include "arglib.h"
35 #include "run_stats_table.h"
36 
37 #ifdef HAS_GETRUSAGE
38 #include <sys/resource.h> // getrusage
39 #endif
40 
41 #ifdef USING_TBB
42 #include "tbb/task_scheduler_init.h"
43 using namespace tbb;
44 // If you have issues with: dyld: Library not loaded: libtbb.dylib
45 // try setting the LD path. Ex:
46 // export DYLD_FALLBACK_LIBRARY_PATH=/Users/borin/Desktop/neopz/tbb40_297oss/lib/
47 #endif
48 
49 using namespace std;
50 
51 void help(const char* prg)
52 {
53  cout << "Compute the Decompose_LDLt method for the matrix" << endl;
54  cout << endl;
55  cout << "Usage: " << prg << "-if file [-v verbose_level] [-b] "
56  << "[-tot_rdt rdt_file] [-op matrix_operation] [-h]" << endl << endl;
57  cout << "matrix_operation:" << endl;
58  cout << " 0: Decompose_LDLt()" << endl;
59  cout << " 1: Decompose_LDLt2() -- deprecated (not working)" << endl;
60  cout << " 2: Decompose_Cholesky()" << endl;
61  clarg::arguments_descriptions(cout, " ", "\n");
62 }
63 
64 clarg::argString ifn("-ifn", "input matrix file name (use -bi to read from binary files)", "matrix.txt");
65 clarg::argInt affinity("-af", "affinity mode (0=no affinity, 1=heuristi 1)", 0);
66 clarg::argInt verb_level("-v", "verbosity level", 0);
67 int verbose = 0;
68 /* Verbose macro. */
69 #define VERBOSE(level,...) if (level <= verbose) cout << __VA_ARGS__
70 
71 clarg::argInt mop("-op", "Matrix operation", 1);
72 clarg::argBool br("-br", "binary reference. Reference decomposed matrix file format == binary.", false);
73 clarg::argBool bi("-bi", "binary input. Input file format == binary.", false);
74 clarg::argBool bd("-bd", "binary dump. Dump file format == binary.", false);
75 clarg::argBool h("-h", "help message", false);
76 clarg::argBool rea("-rea", "reallocate matrix inside matrix.", false);
77 clarg::argInt mstats("-mstats", "Matrix statistics vebosity level.", 0);
78 clarg::argInt maxcol("-maxcol", "Limit computation to max column (Use Resize(maxcol)).", 0);
79 clarg::argString gen_dm_sig("-gen_dm_md5", "generates MD5 signature for decomposed matrix into file.", "decomposed_matrix.md5");
80 clarg::argString chk_dm_sig("-chk_dm_md5", "compute MD5 signature for decomposed matrix and check against MD5 at file.", "decomposed_matrix.md5");
81 clarg::argString chk_dm_error("-chk_dm_error", "check the decomposed matrix error against a reference matrix. (use -br to read from binary files)", "ref_decomposed_matrix.txt");
82 clarg::argDouble error_tol("-error_tol", "error tolerance.", 1.e-12);
83 clarg::argString dump_dm("-dump_dm", "dump decomposed matrix. (use -bd for binary format)", "dump_matrix.txt");
84 clarg::argInt cholesky_blk("-chol_blk", "Cholesky blocking factor", 256);
85 
86 #ifdef USING_LIBNUMA
87 clarg::argBool naa("-naa", "NUMA aware allocation.", false);
88 clarg::argBool nats("-nats", "NUMA aware thread scheduling.", false);
89 
90 unsigned num_nodes = 0;
91 #endif
92 
93 
94 
95 /* Run statistics. */
96 RunStatsTable total_rst("-tot_rdt",
97  "Whole program (total) statistics raw data table");
98 
99 clarg::argInt nmats("-nmats", "Number of matrizes to decompose simultaneously.", 1);
100 
102 {
103 public:
105  {}
107 
108  void OpenWrite(const std::string& fn)
109  {
110  if (binary)
111  bfs.OpenWrite(fn);
112  else
113  fs.OpenWrite(fn);
114  }
115 
116  void OpenRead(const std::string& fn)
117  {
118  if (binary)
119  bfs.OpenRead(fn);
120  else
121  fs.OpenRead(fn);
122  }
123 
124  operator TPZStream&()
125  {
126  if (binary)
127  return bfs;
128  else
129  return fs;
130  }
131 
132 protected:
133 
134  bool binary;
137 };
138 
139 #include <sched.h> //sched_getcpu
140 
141 std::vector< TPZAutoPointer<TPZSkylMatrix<REAL> > > matrices;
142 
143 #ifdef USING_LIBNUMA
144 cpu_set_t dies_mask_array[8];
145 cpu_set_t mask_core0;
146 cpu_set_t mask_L20;
147 cpu_set_t mask_die0;
148 cpu_set_t mask_proc0;
149 cpu_set_t mask_oddcores;
150 cpu_set_t mask_evencores;
151 
152 void print_mask(cpu_set_t* mask)
153 {
154  for (int i=0; i<64; i++) {
155  cout << CPU_ISSET(i, mask)?"1":"0";
156  }
157 }
158 
159 // 4 processors
160 // 2 dies / processor
161 // 4 L2 caches per die
162 // 2 cores per L2 cache
163 
164 // FPU is shared among the 2 cores under the same L2 cache
165 
166 // Best Assign policy
167 // # of threads : policy
168 // 1 : any core
169 // 2, 3, 4 : different processors
170 // 5 - 16 : different dies.
171 void setup_masks()
172 {
173 #define SET_RANGE(mskp,start,end) CPU_ZERO(mskp); \
174 for (int i=start; i<end; i++) CPU_SET(i,mskp)
175 
176  SET_RANGE(&dies_mask_array[0],0,8);
177  SET_RANGE(&dies_mask_array[1],8,16);
178  SET_RANGE(&dies_mask_array[2],16,24);
179  SET_RANGE(&dies_mask_array[3],24,32);
180  SET_RANGE(&dies_mask_array[4],32,40);
181  SET_RANGE(&dies_mask_array[5],40,48);
182  SET_RANGE(&dies_mask_array[6],48,56);
183  SET_RANGE(&dies_mask_array[7],56,64);
184 
185  SET_RANGE(&mask_proc0,0,16);
186  SET_RANGE(&mask_die0,0,8);
187  SET_RANGE(&mask_L20,0,2);
188  SET_RANGE(&mask_core0,0,1);
189 
190  CPU_ZERO(&mask_oddcores);
191  CPU_ZERO(&mask_evencores);
192  for(int i=0; i<64; i+=2) {
193  CPU_SET(i,&mask_evencores);
194  CPU_SET(i+1,&mask_oddcores);
195  }
196 
197  if (verbose >= 1) {
198  cout << "mask core 0 : "; print_mask(&mask_core0); cout << endl;
199  cout << "mask core L2 0 : "; print_mask(&mask_L20); cout << endl;
200  cout << "mask core die 0 : "; print_mask(&mask_die0); cout << endl;
201  cout << "mask core proc 0: "; print_mask(&mask_proc0); cout << endl;
202  cout << "mask evencores : "; print_mask(&mask_evencores); cout << endl;
203  cout << "mask oddcores : "; print_mask(&mask_oddcores); cout << endl;
204  }
205 }
206 
207 
208 // CPU_SET(cpus[idx],&mask);
209 //int cpus[] = {0, 16, 32, 48, 8, 24, 40, 54};
210 void set_affinity(int af, int tidx)
211 {
212  cpu_set_t* msk = NULL;
213  switch (af) {
214 
215  case 1: {
216  msk = dies_mask_array + (tidx%8);
217  break;
218  }
219  case 2: {
220  msk = &mask_proc0;
221  break;
222  }
223  case 3: {
224  msk = &mask_die0;
225  break;
226  }
227  case 4: {
228  msk = &mask_L20;
229  break;
230  }
231  case 5: {
232  msk = &mask_core0;
233  break;
234  }
235  case 6: {
236  msk = &mask_evencores;
237  break;
238  }
239  case 7: {
240  msk = &mask_oddcores;
241  break;
242  }
243  case 0: // Do not set affinity
244  return;
245  default: {
246  VERBOSE(2, "Warning: -af " << af
247  << " has not been defined. Not setting affinity");
248  return;
249  }
250  }
251 
252  if (verbose >= 2) {
253  cout << "Thread " << tidx << " affinity mask = ";
254  print_mask(msk);
255  cout << endl;
256  }
257 
258  sched_setaffinity(0, sizeof(cpu_set_t), msk);
259 }
260 
261 
262 void* compute_decompose(void* m)
263 {
264  int64_t idx = (int64_t) m;
265 
266  // cpu_set_t mask;
267  // CPU_ZERO(&mask);
268  // CPU_SET(cpus[idx],&mask);
269  if (affinity.get_value() > 0) {
270  set_affinity(affinity.get_value(), idx);
271  }
272 
273  if (nats.was_set()) {
274  struct bitmask* nodemask = numa_allocate_nodemask();
275  numa_bitmask_clearall(nodemask);
276  numa_bitmask_setbit(nodemask,idx%num_nodes);
277  numa_bind(nodemask);
278  numa_free_nodemask(nodemask);
279  }
280 
281  if (verbose >= 2) {
282  int cpuid = sched_getcpu();
283  //int tid = pthread_self();
284  cout << "Thread " << idx << " at cpu " << cpuid << endl;
285  }
286 
287  TPZAutoPointer<TPZSkylMatrix<REAL> > matrix = matrices[idx];
288 
289  if (rea.was_set()) {
290  matrix.ReallocForNuma(-1);
291  }
292 
293 #define CASE_OP(opid,method) \
294 case opid: \
295 matrix->method; \
296 break
297 
298  switch (mop.get_value()) {
299  CASE_OP(0,Decompose_LDLt());
300  case 1:
301  std::cerr << "ERROR: deprecated operation -- decompose LDLt2 is no longer implemented." << std::endl;
302  break;
303  CASE_OP(2,Decompose_Cholesky());
304  CASE_OP(3,Decompose_Cholesky_blk(cholesky_blk.get_value()));
305  default:
306  std::cerr << "ERROR: Invalid matrix operation type." << std::endl;
307  }
308 
309  return NULL;
310 }
311 
312 #endif
313 
314 int main(int argc, char *argv[])
315 {
316 #ifdef USING_TBB
317  task_scheduler_init init;
318 #endif
319 
320 #ifdef USING_LIBNUMA
321  setup_masks();
322 #endif
323 
324  /* Parse the arguments */
325  if (clarg::parse_arguments(argc, argv)) {
326  cerr << "Error when parsing the arguments!" << endl;
327  return 1;
328  }
329 
330  verbose = verb_level.get_value();
331 
332  if (h.get_value() == true) {
333  help(argv[0]);
334  return 1;
335  }
336 
337  if (nmats.get_value() < 1) {
338  cerr << "Error, nmats must be >= 1" << endl;
339  return 1;
340  }
341 
342  if (verbose >= 1) {
343  std::cout << "- Arguments -----------------------" << std::endl;
344  clarg::values(std::cout, false);
345  std::cout << "-----------------------------------" << std::endl;
346  }
347 
348  /* Read the matrix. */
350 
351  VERBOSE(1,"Reading input file: " << ifn.get_value() << std::endl);
352  FileStreamWrapper input_file(bi.get_value());
353  input_file.OpenRead(ifn.get_value());
354  matrix.Read(input_file,0);
355  VERBOSE(1,"Reading input file: " << ifn.get_value()
356  << " [DONE]" << std::endl);
357 
358  if (maxcol.was_set())
359  matrix.Resize(maxcol.get_value(),0);
360 
361  int nthreads = nmats.get_value();
362  std::vector<pthread_t> threads(nthreads);
363 
364  /* Duplicate matrices. */
365  VERBOSE(1, "Copying matrices" << endl);
366 
367 #ifdef USING_LIBNUMA
368  num_nodes = numa_max_node()+1;
369  cout << "Max nodes = " << num_nodes << endl;
370  if (naa.was_set()) {
371  struct bitmask* nodemask = numa_allocate_nodemask();
372  for(int t=0; t<nthreads; t++) {
373  numa_bitmask_clearall(nodemask);
374  unsigned node = t%num_nodes;
375  numa_bitmask_setbit(nodemask,node);
376  numa_set_membind(nodemask);
378  matrices.push_back(mp);
379  }
380  numa_bitmask_setall(nodemask);
381  numa_set_membind(nodemask);
382  numa_free_nodemask(nodemask);
383  }
384  else {
385  for(int t=0; t<nthreads; t++) {
387  matrices.push_back(mp);
388  }
389  }
390 #else
391  for(int t=0; t<nthreads; t++) {
392 
394  matrices.push_back(mp);
395  }
396 #endif
397 
398  VERBOSE(1, "Copying matrices [DONE]" << endl);
399 
400 #ifdef USING_LIBNUMA
401  total_rst.start();
402 
403  for(int t=1; t<nthreads; t++) {
404  PZ_PTHREAD_CREATE(&threads[t], NULL, compute_decompose,
405  (void*) t, __FUNCTION__);
406  }
407 
408  // Perform thread 0 at the main thread
409  compute_decompose((void*) 0);
410 
411  /* Sync. */
412  if (nthreads>0) {
413  for(int t=0; t<nthreads; t++) {
414  PZ_PTHREAD_JOIN(threads[t], NULL, __FUNCTION__);
415  }
416  }
417  total_rst.stop();
418 #endif
419 
420  if (mstats.get_value() > 0) {
421  unsigned n = matrix.Dim();
422  uint64_t n_sky_items = 0;
423  uint64_t max_height = 0;
424  for (unsigned i=0; i<n; i++) {
425  unsigned height = matrix.SkyHeight(i);
426  if (mstats.get_value() > 1) {
427  cout << "col " << i << " height = " << height << endl;
428  }
429  n_sky_items += height;
430  if (height > max_height) max_height = height;
431  }
432  uint64_t n2 = n * n;
433  double av_height = (double) n_sky_items / (double) n;
434  cout << "N = " << n << endl;
435  cout << "N^2 = " << n2 << endl;
436  cout << "Sky items = " << n_sky_items << endl;
437  cout << "N^2 / Sky items = " << (double) n2 / (double) n_sky_items << endl;
438  cout << "Avg. Height = " << av_height << endl;
439  cout << "Max. Height = " << max_height << endl;
440  }
441 
443  if (dump_dm.was_set()) {
444  VERBOSE(1, "Dumping decomposed matrix into: " <<
445  dump_dm.get_value() << endl);
446  FileStreamWrapper dump_file(bd.get_value());
447  dump_file.OpenWrite(dump_dm.get_value());
448  matrix.Write(dump_file, 0);
449  }
450 
451  /* Gen/Check MD5 signature */
452  if (gen_dm_sig.was_set() || chk_dm_sig.was_set()) {
453  TPZMD5Stream sig;
454  matrix.Write(sig, 1);
455  int ret;
456  if (chk_dm_sig.was_set()) {
457  if ((ret=sig.CheckMD5(chk_dm_sig.get_value()))) {
458  cerr << "ERROR(ret=" << ret << ") : MD5 Signature for "
459  << "decomposed matrixdoes not match." << endl;
460  return 1;
461  }
462  else {
463  cout << "Checking decomposed matrix MD5 signature: [OK]" << endl;
464  }
465  }
466  if (gen_dm_sig.was_set()) {
467  if ((ret=sig.WriteMD5(gen_dm_sig.get_value()))) {
468  cerr << "ERROR (ret=" << ret << ") when writing the "
469  << "decomposed matrix MD5 signature to file: "
470  << gen_dm_sig.get_value() << endl;
471  return 1;
472  }
473  }
474  }
475 
476  int ret=0; // Ok
477 
479  if (chk_dm_error.was_set()) {
480  VERBOSE(1, "Checking decomposed matrix error: " <<
481  chk_dm_error.get_value() << endl);
482  FileStreamWrapper ref_file(br.get_value());
483  ref_file.OpenRead(chk_dm_error.get_value());
484  /* Reference matrix. */
485  TPZSkylMatrix<REAL> ref_matrix;
486  ref_matrix.Read(ref_file,0);
487  int max_j = matrix.Cols();
488  if (max_j != ref_matrix.Cols()) {
489  cerr << "Decomposed matrix has " << max_j
490  << " cols while reference matrix has "
491  << ref_matrix.Cols() << endl;
492  return 1;
493  }
494  REAL error_tolerance = error_tol.get_value();
495  REAL max_error = 0.0;
496  for (int j=0; j<max_j; j++) {
497  int col_height = matrix.SkyHeight(j);
498  if (col_height != ref_matrix.SkyHeight(j)) {
499  cerr << "Column " << j << " of decomposed matrix has " << col_height
500  << " non zero rows while reference matrix has "
501  << ref_matrix.SkyHeight(j) << endl;
502  return 1;
503  }
504  int min_i = (j+1) - col_height;
505  for (int i=min_i; i<=j; i++) {
506 
507  REAL dm_ij = matrix.s(i,j);
508  REAL rm_ij = ref_matrix.s(i,j);
509  if (dm_ij != rm_ij) {
510  REAL diff = abs(dm_ij - rm_ij);
511  if (diff >= error_tolerance) {
512  VERBOSE(1, "diff(" << diff << ") tolerance (" << error_tolerance
513  << "). dm[" << i << "][" << j << "] (" << dm_ij
514  << ") != rm[" << i << "][" << j << "] (" << rm_ij
515  << ")." << endl);
516  ret = 1;
517  max_error = (max_error < diff)?diff:max_error;
518  }
519  }
520  }
521  }
522  if (ret != 0) {
523  cerr << "Error ("<< max_error <<") > error tolerance ("
524  << error_tolerance <<")" << endl;
525  }
526  }
527 
528  return ret;
529 }
clarg::argString dump_dm("-dump_dm", "dump decomposed matrix. (use -bd for binary format)", "dump_matrix.txt")
Contains a class to record running statistics on CSV tables.
void set_affinity(int af, int tidx)
Definition: numatst.cpp:202
int Resize(const int64_t newDim, const int64_t) override
Redimensions a matriz keeping the previous values.
Definition: pzskylmat.cpp:2487
RunStatsTable total_rst("-tot_rdt", "Whole program (total) statistics raw data table")
clarg::argString chk_dm_sig("-chk_dm_md5", "compute MD5 signature for decomposed matrix and check against MD5 at file.", "decomposed_matrix.md5")
Contains definitions to LOGPZ_DEBUG, LOGPZ_INFO, LOGPZ_WARN, LOGPZ_ERROR and LOGPZ_FATAL, and the implementation of the inline InitializePZLOG(string) function using log4cxx library or not. It must to be called out of "#ifdef LOG4CXX" scope.
Contains declaration of the TPZMD5Stream class which implements the interface to write and check md5 ...
list threads
Definition: test.py:140
clarg::argInt nmats("-nmats", "Number of matrizes to decompose simultaneously.", 1)
void values(ostream &os, bool defined_only)
Definition: arglib.cpp:183
void setup_masks()
Definition: numatst.cpp:161
void OpenRead(const std::string &fn)
Definition: decompose.cpp:116
void help(const char *prg)
Definition: decompose.cpp:51
clarg::argBool br("-br", "binary reference. Reference decomposed matrix file format == binary.", false)
void Write(TPZStream &buf, int withclassid) const override
Packs the object structure in a stream of bytes.
Definition: pzskylmat.cpp:3287
clarg::argBool bi("-bi", "binary input. Input file format == binary.", false)
int64_t SkyHeight(int64_t col)
return the height of the skyline for a given column
Definition: pzskylmat.h:422
clarg::argBool h("-h", "help message", false)
TinyFad< 8, T > abs(const TinyFad< 8, T > &in)
Definition: tinyfadeight.h:846
void OpenWrite(const std::string &fn)
Definition: decompose.cpp:108
clarg::argInt verb_level("-v", "verbosity level", 0)
#define VERBOSE(level,...)
Definition: decompose.cpp:69
void compute_decompose(int idx)
Definition: numatst.cpp:268
Implements a skyline storage format. A Skyline matrix is symmetric so square. Matrix.
Definition: pzskylmat.h:394
fn
Definition: test.py:253
TPZSkylMatrix< REAL > matrix
Definition: numatst.cpp:255
clarg::argString ifn("-ifn", "input matrix file name (use -bi to read from binary files)", "matrix.txt")
#define PZ_PTHREAD_JOIN(thread, val, fn)
Definition: pz_pthread.h:34
virtual TVar & s(const int64_t row, const int64_t col) override
The operators check on the bounds if the DEBUG variable is defined.
Definition: pzskylmat.cpp:1871
clarg::argBool naa("-naDALora", "NUMA aware Dohrman Assembly List thread work objects re-allocation.", false)
clarg::argString mp("-mp", "starts execution from beginning - read a \alha_predio\input file", "../8andares02.txt")
clarg::argInt mstats("-mstats", "Matrix statistics vebosity level.", 0)
int CheckMD5(const std::string &filename)
Check Stream MD5 signature against MD5 signature store on file.
Definition: pzmd5stream.h:65
clarg::argInt affinity("-af", "affinity mode (0=no affinity, 1=heuristi 1)", 0)
int nthreads
Definition: numatst.cpp:315
Implements reading from and writing to an ascii file. Persistency.
Definition: TPZFileStream.h:15
clarg::argBool rea("-rea", "reallocate matrix inside matrix.", false)
TPZBFileStream bfs
Definition: decompose.cpp:136
int parse_arguments(int argc, char *argv[])
Definition: arglib.cpp:195
#define SET_RANGE(mskp, start, end)
TPZFileStream fs
Definition: decompose.cpp:135
Contains TPZSkyline class which implements a skyline storage format.
void arguments_descriptions(ostream &os, string prefix, string suffix)
Definition: arglib.cpp:189
std::vector< TPZAutoPointer< TPZSkylMatrix< REAL > > > matrices
Definition: decompose.cpp:141
int main(int argc, char *argv[])
Definition: decompose.cpp:314
virtual int64_t Dim() const
Returns the dimension of the matrix if the matrix is square.
Definition: pzmatrix.h:892
#define CASE_OP(opid, method)
clarg::argString chk_dm_error("-chk_dm_error", "check the decomposed matrix error against a reference matrix. (use -br to read from binary files)", "ref_decomposed_matrix.txt")
void Read(TPZStream &buf, void *context) override
Unpacks the object structure from a stream of bytes.
Definition: pzskylmat.cpp:3270
int WriteMD5(const std::string &filename)
Write computed MD5 signature to file.
Definition: pzmd5stream.h:121
FileStreamWrapper(bool b)
Definition: decompose.cpp:104
int verbose
Definition: decompose.cpp:67
clarg::argInt maxcol("-maxcol", "Limit computation to max column (Use Resize(maxcol)).", 0)
bool was_set() const
Definition: arglib.h:138
clarg::argString gen_dm_sig("-gen_dm_md5", "generates MD5 signature for decomposed matrix into file.", "decomposed_matrix.md5")
const T & get_value() const
Definition: arglib.h:177
clarg::argBool bd("-bd", "binary dump. Dump file format == binary.", false)
#define PZ_PTHREAD_CREATE(thread, attr, routine, args, fn)
Definition: pz_pthread.h:31
int64_t Cols() const
Returns number of cols.
Definition: pzmatrix.h:809
Defines the interface for saving and reading data. Persistency.
Definition: TPZStream.h:50
clarg::argString m("-m", "input matrix file name (text format)", "matrix.txt")
clarg::argDouble error_tol("-error_tol", "error tolerance.", 1.e-12)
clarg::argInt cholesky_blk("-chol_blk", "Cholesky blocking factor", 256)
void ReallocForNuma(int node)
clarg::argInt mop("-op", "Matrix operation", 1)
Implements the interface to write and check MD5 files. Persistency.
Definition: pzmd5stream.h:22
This class implements a reference counter mechanism to administer a dynamically allocated object...