Zoltan2
Zoltan2_TaskMapping.hpp
Go to the documentation of this file.
1 #ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
2 #define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
3 
4 #include <fstream>
5 #include <ctime>
6 #include <vector>
7 #include <set>
8 #include <tuple>
9 
10 #include "Zoltan2_Standards.hpp"
12 #include "Teuchos_ArrayViewDecl.hpp"
15 #include "Teuchos_ReductionOp.hpp"
16 
18 
19 #include "Zoltan2_GraphModel.hpp"
20 
21 #include <Zoltan2_TPLTraits.hpp>
22 
23 #include "Teuchos_Comm.hpp"
24 #ifdef HAVE_ZOLTAN2_MPI
25 #include "Teuchos_DefaultMpiComm.hpp"
26 #endif // HAVE_ZOLTAN2_MPI
27 #include <Teuchos_DefaultSerialComm.hpp>
28 
29 //#define gnuPlot
32 
33 namespace Teuchos{
34 
38 template <typename Ordinal, typename T>
39 class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
40 {
41 
42 private:
43  T _EPSILON;
44 
45 public:
48  Zoltan2_ReduceBestMapping():_EPSILON(std::numeric_limits<T>::epsilon()) {}
49 
52  void reduce(const Ordinal count,
53  const T inBuffer[],
54  T inoutBuffer[]) const {
55 
56  for (Ordinal i = 0; i < count; i++) {
57  if (inBuffer[0] - inoutBuffer[0] < -_EPSILON) {
58  inoutBuffer[0] = inBuffer[0];
59  inoutBuffer[1] = inBuffer[1];
60  }
61  else if (inBuffer[0] - inoutBuffer[0] < _EPSILON &&
62  inBuffer[1] - inoutBuffer[1] < _EPSILON) {
63  inoutBuffer[0] = inBuffer[0];
64  inoutBuffer[1] = inBuffer[1];
65  }
66  }
67  }
68 };
69 
70 } // namespace Teuchos
71 
72 
73 namespace Zoltan2{
74 
75 template <typename it>
76 inline it z2Fact(it x) {
77  return (x == 1 ? x : x * z2Fact<it>(x - 1));
78 }
79 
80 template <typename gno_t, typename part_t>
82 
83 public:
84  gno_t gno;
86 
87 };
88 
89 //returns the ith permutation indices.
90 template <typename IT>
91 void ithPermutation(const IT n, IT i, IT *perm)
92 {
93  IT j, k = 0;
94  IT *fact = new IT[n];
95 
96  // compute factorial numbers
97  fact[k] = 1;
98  while (++k < n)
99  fact[k] = fact[k - 1] * k;
100 
101  // compute factorial code
102  for (k = 0; k < n; ++k)
103  {
104  perm[k] = i / fact[n - 1 - k];
105  i = i % fact[n - 1 - k];
106  }
107 
108  // readjust values to obtain the permutation
109  // start from the end and check if preceding values are lower
110  for (k = n - 1; k > 0; --k)
111  for (j = k - 1; j >= 0; --j)
112  if (perm[j] <= perm[k])
113  perm[k]++;
114 
115  delete [] fact;
116 }
117 
118 template <typename part_t>
120  part_t *&task_comm_xadj,
121  part_t *&task_comm_adj,
122  std::vector<int> grid_dims) {
123 
124  int dim = grid_dims.size();
125  int neighborCount = 2 * dim;
126  task_comm_xadj = allocMemory<part_t>(taskCount + 1);
127  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);
128 
129  part_t neighBorIndex = 0;
130  task_comm_xadj[0] = 0;
131 
132  for (part_t i = 0; i < taskCount; ++i) {
133  part_t prevDimMul = 1;
134 
135  for (int j = 0; j < dim; ++j) {
136  part_t lNeighbor = i - prevDimMul;
137  part_t rNeighbor = i + prevDimMul;
138  prevDimMul *= grid_dims[j];
139 
140  if (lNeighbor >= 0 &&
141  lNeighbor / prevDimMul == i / prevDimMul &&
142  lNeighbor < taskCount) {
143  task_comm_adj[neighBorIndex++] = lNeighbor;
144  }
145 
146  if (rNeighbor >= 0 &&
147  rNeighbor / prevDimMul == i / prevDimMul &&
148  rNeighbor < taskCount) {
149  task_comm_adj[neighBorIndex++] = rNeighbor;
150  }
151  }
152 
153  task_comm_xadj[i + 1] = neighBorIndex;
154  }
155 }
156 
157 //returns the center of the parts.
158 template <typename Adapter, typename scalar_t, typename part_t>
160  const Environment *envConst,
161  const Teuchos::Comm<int> *comm,
163 // const Zoltan2::PartitioningSolution<Adapter> *soln_,
164  const part_t *parts,
165  int coordDim,
166  part_t ntasks,
167  scalar_t **partCenters) {
168 
169  typedef typename Adapter::lno_t lno_t;
170  typedef typename Adapter::gno_t gno_t;
171 
172  typedef StridedData<lno_t, scalar_t> input_t;
173  ArrayView<const gno_t> gnos;
174  ArrayView<input_t> xyz;
175  ArrayView<input_t> wgts;
176  coords->getCoordinates(gnos, xyz, wgts);
177 
178  //local and global num coordinates.
179  lno_t numLocalCoords = coords->getLocalNumCoordinates();
180 // gno_t numGlobalCoords = coords->getGlobalNumCoordinates();
181 
182  //local number of points in each part.
183  gno_t *point_counts = allocMemory<gno_t>(ntasks);
184  memset(point_counts, 0, sizeof(gno_t) * ntasks);
185 
186  //global number of points in each part.
187  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);
188 
189  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);
190 
191  for (int dim = 0; dim < coordDim; dim++) {
192  ArrayRCP<const scalar_t> ar;
193  xyz[dim].getInputArray(ar);
194 
195  //multiJagged coordinate values assignment
196  multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
197  memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
198  }
199 
200  //get parts with parallel gnos.
201 // const part_t *parts = soln_->getPartListView();
202 
203  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");
204 
205  for (lno_t i = 0; i < numLocalCoords; i++) {
206  part_t p = parts[i];
207 
208  //add up all coordinates in each part.
209  for (int j = 0; j < coordDim; ++j) {
210  scalar_t c = multiJagged_coordinates[j][i];
211  partCenters[j][p] += c;
212  }
213  ++point_counts[p];
214  }
215 
216  //get global number of points in each part.
217  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
218  ntasks, point_counts, global_point_counts);
219 
220  for (int j = 0; j < coordDim; ++j) {
221  for (part_t i = 0; i < ntasks; ++i) {
222  if (global_point_counts[i] > 0)
223  partCenters[j][i] /= global_point_counts[i];
224  }
225  }
226 
227  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
228  for (int j = 0; j < coordDim; ++j) {
229  reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
230  ntasks, partCenters[j], tmpCoords);
231 
232  scalar_t *tmp = partCenters[j];
233  partCenters[j] = tmpCoords;
234  tmpCoords = tmp;
235  }
236 
237  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");
238 
239  freeArray<gno_t>(point_counts);
240  freeArray<gno_t>(global_point_counts);
241 
242  freeArray<scalar_t>(tmpCoords);
243  freeArray<scalar_t *>(multiJagged_coordinates);
244 }
245 
246 //returns the coarsend part graph.
247 template <typename Adapter, typename scalar_t, typename part_t>
249  const Environment *envConst,
250  const Teuchos::Comm<int> *comm,
252  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
253  part_t np,
254  const part_t *parts,
255  ArrayRCP<part_t> &g_part_xadj,
256  ArrayRCP<part_t> &g_part_adj,
257  ArrayRCP<scalar_t> &g_part_ew) {
258 
259  typedef typename Adapter::lno_t t_lno_t;
260  typedef typename Adapter::gno_t t_gno_t;
261  typedef typename Adapter::scalar_t t_scalar_t;
262  typedef typename Adapter::offset_t t_offset_t;
263 
264  typedef typename Zoltan2::GraphModel<
265  typename Adapter::base_adapter_t>::input_t t_input_t;
266 
267 // int numRanks = comm->getSize();
268 // int myRank = comm->getRank();
269 
270  //get parts with parallel gnos.
271 /*
272  const part_t *parts = soln_->getPartListView();
273 
274  part_t np = soln_->getActualGlobalNumberOfParts();
275  if (part_t(soln_->getTargetGlobalNumberOfParts()) > np) {
276  np = soln_->getTargetGlobalNumberOfParts();
277  }
278 */
279 
280 
281  t_lno_t localNumVertices = graph->getLocalNumVertices();
282  t_lno_t localNumEdges = graph->getLocalNumEdges();
283 
284  //get the vertex global ids, and weights
285  ArrayView<const t_gno_t> Ids;
286  ArrayView<t_input_t> v_wghts;
287  graph->getVertexList(Ids, v_wghts);
288 
289  //get the edge ids, and weights
290  ArrayView<const t_gno_t> edgeIds;
291  ArrayView<const t_offset_t> offsets;
292  ArrayView<t_input_t> e_wgts;
293  graph->getEdgeList(edgeIds, offsets, e_wgts);
294 
295  std::vector<t_scalar_t> edge_weights;
296  int numWeightPerEdge = graph->getNumWeightsPerEdge();
297 
298  if (numWeightPerEdge > 0) {
299  edge_weights = std::vector<t_scalar_t>(localNumEdges);
300 
301  for (t_lno_t i = 0; i < localNumEdges; ++i) {
302  edge_weights[i] = e_wgts[0][i];
303  }
304  }
305 
306  //create a zoltan dictionary to get the parts of the vertices
307  //at the other end of edges
308  std::vector<part_t> e_parts(localNumEdges);
309 
310 #ifdef HAVE_ZOLTAN2_MPI
311  if (comm->getSize() > 1)
312  {
314 
315  const bool bUseLocalIDs = false; // Local IDs not needed
316  int debug_level = 0;
317  const RCP<const Comm<int> > rcp_comm(comm,false);
318 
319  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
320 
321  directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
322  NULL, directory_t::Update_Mode::Replace);
323 
324  directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
325  NULL, NULL, false);
326  } else
327 #endif
328  {
329 
330 /*
331  std::cout << "localNumVertices:" << localNumVertices
332  << " np:" << np
333  << " globalNumVertices:" << graph->getGlobalNumVertices()
334  << " localNumEdges:" << localNumEdges << std::endl;
335 */
336 
337  for (t_lno_t i = 0; i < localNumEdges; ++i) {
338  t_gno_t ei = edgeIds[i];
339  part_t p = parts[ei];
340  e_parts[i] = p;
341  }
342 
343  //get the vertices in each part in my part.
344  std::vector<t_lno_t> part_begins(np, -1);
345  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
346 
347  //cluster vertices according to their parts.
348  //create local part graph.
349  for (t_lno_t i = 0; i < localNumVertices; ++i) {
350  part_t ap = parts[i];
351  part_nexts[i] = part_begins[ap];
352  part_begins[ap] = i;
353  }
354 
355  g_part_xadj = ArrayRCP<part_t>(np + 1);
356  g_part_adj = ArrayRCP<part_t>(localNumEdges);
357  g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
358 
359  part_t nindex = 0;
360  g_part_xadj[0] = 0;
361 
362  std::vector<part_t> part_neighbors(np);
363  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
364  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
365 
366  //coarsen for all vertices in my part in order with parts.
367  for (t_lno_t i = 0; i < np; ++i) {
368  part_t num_neighbor_parts = 0;
369  t_lno_t v = part_begins[i];
370 
371  //get part i, and first vertex in this part v.
372  while (v != -1) {
373  //now get the neightbors of v.
374  for (t_offset_t j = offsets[v]; j < offsets[v + 1]; ++j) {
375  //get the part of the second vertex.
376  part_t ep = e_parts[j];
377 
378  t_scalar_t ew = 1;
379  if (numWeightPerEdge > 0) {
380  ew = edge_weights[j];
381  }
382 
383 // std::cout << "part:" << i << " v:" << v
384 // << " part2:" << ep << " v2:" << edgeIds[j]
385 // << " w:" << ew << std::endl;
386 
387  //add it to my local part neighbors for part i.
388  if (part_neighbor_weights[ep] < 0.00001) {
389  part_neighbors[num_neighbor_parts++] = ep;
390  }
391 
392  part_neighbor_weights[ep] += ew;
393  }
394 
395  v = part_nexts[v];
396  }
397 
398 
399  //now get the part list.
400  for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
401  part_t neighbor_part = part_neighbors[j];
402  g_part_adj[nindex] = neighbor_part;
403  g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
404  part_neighbor_weights[neighbor_part] = 0;
405  }
406  g_part_xadj[i + 1] = nindex;
407  }
408  return;
409  }
410 
411  // struct for directory data - note more extensive comments in
412  // Zoltan2_GraphMetricsUtility.hpp which I didn't want to duplicate
413  // because this is in progress. Similar concept there.
414  struct part_info {
415  part_info() : weight(0) {
416  }
417 
418  const part_info & operator+=(const part_info & src) {
419  this->weight += src.weight;
420  return *this;
421  }
422 
423  bool operator>(const part_info & src) {
424  return (destination_part > src.destination_part);
425  }
426 
427  bool operator==(const part_info & src) {
428  return (destination_part == src.destination_part);
429  }
430 
431  part_t destination_part;
432  scalar_t weight;
433  };
434 
436  directory_t;
437 
438  bool bUseLocalIDs = false;
439  const int debug_level = 0;
440  const RCP<const Comm<int> > rcp_comm(comm, false);
441 
442  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
443 
444  std::vector<part_t> part_data;
445  std::vector<std::vector<part_info>> user_data;
446 
447  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
448  {
449  //get the vertices in each part in my part.
450  std::vector<t_lno_t> part_begins(np, -1);
451  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
452 
453  //cluster vertices according to their parts.
454  //create local part graph.
455  for (t_lno_t i = 0; i < localNumVertices; ++i) {
456  part_t ap = parts[i];
457  part_nexts[i] = part_begins[ap];
458  part_begins[ap] = i;
459  }
460 
461  std::vector<part_t> part_neighbors(np);
462  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
463  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
464 
465  //coarsen for all vertices in my part in order with parts.
466  for (t_lno_t i = 0; i < np; ++i) {
467  part_t num_neighbor_parts = 0;
468  t_lno_t v = part_begins[i];
469 
470  //get part i, and first vertex in this part v.
471  while (v != -1) {
472  //now get the neightbors of v.
473  for (t_offset_t j = offsets[v]; j < offsets[v + 1]; ++j) {
474  //get the part of the second vertex.
475  part_t ep = e_parts[j];
476 
477  t_scalar_t ew = 1;
478  if (numWeightPerEdge > 0) {
479  ew = edge_weights[j];
480  }
481 
482  //add it to my local part neighbors for part i.
483  if (part_neighbor_weights[ep] < 0.00001) {
484  part_neighbors[num_neighbor_parts++] = ep;
485  }
486 
487  part_neighbor_weights[ep] += ew;
488  }
489 
490  v = part_nexts[v];
491  }
492 
493  //now get the part list.
494  for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
495  part_t neighbor_part = part_neighbors[j];
496  part_neighbor_weights_ordered[j] =
497  part_neighbor_weights[neighbor_part];
498  part_neighbor_weights[neighbor_part] = 0;
499  }
500 
501  //insert it to tpetra crsmatrix.
502  if (num_neighbor_parts > 0) {
503  // TODO: optimize to avoid push_back
504  part_data.push_back(i);
505  std::vector<part_info> new_user_data(num_neighbor_parts);
506 
507  for (int q = 0; q < num_neighbor_parts; ++q) {
508  part_info & info = new_user_data[q];
509  info.weight = part_neighbor_weights_ordered[q];
510  info.destination_part = part_neighbors[q];
511  }
512  // TODO: optimize to avoid push_back
513  user_data.push_back(new_user_data);
514  }
515  }
516  }
517  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");
518 
519  std::vector<part_t> part_indices(np);
520 
521  for (part_t i = 0; i < np; ++i) part_indices[i] = i;
522 
523  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
524  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
525  NULL, directory_t::Update_Mode::AggregateAdd);
526  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");
527 
528  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
529  std::vector<std::vector<part_info>> find_user_data(part_indices.size());
530  directory.find(find_user_data.size(), &part_indices[0], NULL,
531  &find_user_data[0], NULL, NULL, false);
532  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");
533 
534  // Now reconstruct the output data from the directory find data
535  // This code was designed to reproduce the exact format of the original
536  // setup for g_part_xadj, g_part_adj, and g_part_ew but before making any
537  // further changes I wanted to verify if this formatting should be
538  // preserved or potentially changed further.
539 
540  // first thing is get the total number of elements
541  int get_total_length = 0;
542  for (size_t n = 0; n < find_user_data.size(); ++n) {
543  get_total_length += find_user_data[n].size();
544  }
545 
546  // setup data space
547  g_part_xadj = ArrayRCP<part_t>(np + 1);
548  g_part_adj = ArrayRCP<part_t>(get_total_length);
549  g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);
550 
551  // loop through again and fill to match the original formatting
552  int track_insert_index = 0;
553  for (size_t n = 0; n < find_user_data.size(); ++n) {
554  g_part_xadj[n] = track_insert_index;
555  const std::vector<part_info> & user_data_vector = find_user_data[n];
556 
557  for (size_t q = 0; q < user_data_vector.size(); ++q) {
558  const part_info & info = user_data_vector[q];
559  g_part_adj[track_insert_index] = info.destination_part;
560  g_part_ew[track_insert_index] = info.weight;
561  ++track_insert_index;
562  }
563  }
564  g_part_xadj[np] = get_total_length; // complete the series
565 
566  // This is purely for debugging logging and to be deleted
567  // hacky sort here just to make each subset of part destinations ordered
568  // so new and old code will produce identical output for validation.
569  // I think the ordering is arbitrary and no requirement to preserve the
570  // old ordering within each part. This code was written so I could drop
571  // it into the develop branch and validate we were producing identical
572  // results at this step. Would like to discuss this further before
573  // continuing.
574  // TODO: Delete all this
575 
576 /*
577  if (comm->getRank() == 0) {
578 
579  for (size_t i = 0; i < (size_t)g_part_xadj.size() - 1; ++i) {
580  part_t b = g_part_xadj[i];
581  part_t e = g_part_xadj[i + 1];
582  printf("Results for part %d:\n", (int)i);
583  for (part_t scan_part = 0; scan_part < np; ++scan_part) {
584 
585  for (part_t q = b; q < e; ++q) {
586  // Inefficient hack to sort g_part_adj
587  if (g_part_adj[q] == scan_part) {
588  printf( "(%d:%.2f) ", (int)scan_part, (float)g_part_ew[q]);
589  }
590  }
591  }
592 
593  printf("\n");
594  }
595  }
596 */
597 
598 }
599 
600 
603 template <class IT, class WT>
605 
606  IT heapSize;
607  IT *indices;
608  WT *values;
609  WT _EPSILON;
610 
611 public:
612 
614  freeArray<IT>(this->indices);
615  freeArray<WT>(this->values);
616  }
617 
618  void setHeapsize(IT heapsize_) {
619  this->heapSize = heapsize_;
620  this->indices = allocMemory<IT>(heapsize_ );
621  this->values = allocMemory<WT>(heapsize_ );
622  this->_EPSILON = std::numeric_limits<WT>::epsilon();
623  }
624 
625 
626  void addPoint(IT index, WT distance) {
627  WT maxVal = this->values[0];
628  //add only the distance is smaller than the maximum distance.
629 // std::cout << "indeX:" << index
630 // << " distance:" << distance
631 // << " maxVal:" << maxVal << endl;
632 
633  if (distance >= maxVal)
634  return;
635  else {
636  this->values[0] = distance;
637  this->indices[0] = index;
638  this->push_down(0);
639  }
640  }
641 
642  //heap push down operation
643  void push_down(IT index_on_heap) {
644  IT child_index1 = 2 * index_on_heap + 1;
645  IT child_index2 = 2 * index_on_heap + 2;
646 
647  IT biggerIndex = -1;
648  if (child_index1 < this->heapSize && child_index2 < this->heapSize) {
649 
650  if (this->values[child_index1] < this->values[child_index2]) {
651  biggerIndex = child_index2;
652  }
653  else {
654  biggerIndex = child_index1;
655  }
656  }
657  else if (child_index1 < this->heapSize) {
658  biggerIndex = child_index1;
659  }
660  else if (child_index2 < this->heapSize) {
661  biggerIndex = child_index2;
662  }
663 
664  if (biggerIndex >= 0 &&
665  this->values[biggerIndex] > this->values[index_on_heap]) {
666  WT tmpVal = this->values[biggerIndex];
667  this->values[biggerIndex] = this->values[index_on_heap];
668  this->values[index_on_heap] = tmpVal;
669 
670  IT tmpIndex = this->indices[biggerIndex];
671  this->indices[biggerIndex] = this->indices[index_on_heap];
672  this->indices[index_on_heap] = tmpIndex;
673  this->push_down(biggerIndex);
674  }
675  }
676 
677  void initValues() {
678  WT MAXVAL = std::numeric_limits<WT>::max();
679 
680  for (IT i = 0; i < this->heapSize; ++i) {
681  this->values[i] = MAXVAL;
682  this->indices[i] = -1;
683  }
684  }
685 
686  //returns the total distance to center in the cluster.
688 
689  WT nc = 0;
690  for (IT j = 0; j < this->heapSize; ++j) {
691  nc += this->values[j];
692 
693 // std::cout << "index:" << this->indices[j]
694 // << " distance:" << this->values[j] << endl;
695  }
696 
697  return nc;
698  }
699 
700  //returns the new center of the cluster.
701  bool getNewCenters(WT *center, WT **coords, int dimension) {
702  bool moved = false;
703 
704  for (int i = 0; i < dimension; ++i) {
705  WT nc = 0;
706 
707  for (IT j = 0; j < this->heapSize; ++j) {
708  IT k = this->indices[j];
709 // std::cout << "i:" << i
710 // << " dim:" << dimension
711 // << " k:" << k
712 // << " heapSize:" << heapSize << endl;
713  nc += coords[i][k];
714  }
715 
716  nc /= this->heapSize;
717  moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved );
718  center[i] = nc;
719  }
720 
721  return moved;
722  }
723 
724  void copyCoordinates(IT *permutation) {
725  for (IT i = 0; i < this->heapSize; ++i) {
726  permutation[i] = this->indices[i];
727  }
728  }
729 };
730 
733 template <class IT, class WT>
735 
736  int dimension;
737  KmeansHeap<IT,WT> closestPoints;
738 
739 public:
740 
741  WT *center;
742 
744  freeArray<WT>(center);
745  }
746 
747  void setParams(int dimension_, int heapsize) {
748  this->dimension = dimension_;
749  this->center = allocMemory<WT>(dimension_);
750  this->closestPoints.setHeapsize(heapsize);
751  }
752 
753  void clearHeap() {
754  this->closestPoints.initValues();
755  }
756 
757  bool getNewCenters( WT **coords) {
758  return this->closestPoints.getNewCenters(center, coords, dimension);
759  }
760 
761  //returns the distance of the coordinate to the center.
762  //also adds it to the heap.
763  WT getDistance(IT index, WT **elementCoords) {
764  WT distance = 0;
765 
766  for (int i = 0; i < this->dimension; ++i) {
767  WT d = (center[i] - elementCoords[i][index]);
768  distance += d * d;
769  }
770 
771  distance = pow(distance, WT(1.0 / this->dimension));
772  closestPoints.addPoint(index, distance);
773 
774  return distance;
775  }
776 
778  return closestPoints.getTotalDistance();
779  }
780 
781  void copyCoordinates(IT *permutation) {
782  closestPoints.copyCoordinates(permutation);
783  }
784 };
785 
792 template <class IT, class WT>
794 
795  int dim;
796  IT numElements;
797  WT **elementCoords;
798  IT numClusters;
799  IT required_elements;
800  KMeansCluster<IT,WT> *clusters;
801  WT *maxCoordinates;
802  WT *minCoordinates;
803 
804 public:
805 
807  freeArray<KMeansCluster<IT,WT> >(clusters);
808  freeArray<WT>(maxCoordinates);
809  freeArray<WT>(minCoordinates);
810  }
811 
815  int dim_ ,
816  IT numElements_,
817  WT **elementCoords_,
818  IT required_elements_):
819  dim(dim_),
820  numElements(numElements_),
821  elementCoords(elementCoords_),
822  numClusters((1 << dim_) + 1),
823  required_elements(required_elements_) {
824 
825  this->clusters = allocMemory<KMeansCluster<IT,WT> >(this->numClusters);
826 
827  //set dimension and the number of required elements for all clusters.
828  for (int i = 0; i < numClusters; ++i) {
829  this->clusters[i].setParams(this->dim, this->required_elements);
830  }
831 
832  this->maxCoordinates = allocMemory <WT>(this->dim);
833  this->minCoordinates = allocMemory <WT>(this->dim);
834 
835  //obtain the min and max coordiantes for each dimension.
836  for (int j = 0; j < dim; ++j) {
837  this->minCoordinates[j] = this->elementCoords[j][0];
838  this->maxCoordinates[j] = this->elementCoords[j][0];
839 
840  for (IT i = 1; i < numElements; ++i) {
841  WT t = this->elementCoords[j][i];
842 
843  if (t > this->maxCoordinates[j]) {
844  this->maxCoordinates[j] = t;
845  }
846 
847  if (t < minCoordinates[j]) {
848  this->minCoordinates[j] = t;
849  }
850  }
851  }
852 
853 
854  //assign initial cluster centers.
855  for (int j = 0; j < dim; ++j) {
856  int mod = (1 << (j + 1));
857 
858  for (int i = 0; i < numClusters - 1; ++i) {
859  WT c = 0;
860 
861  if ( (i % mod) < mod / 2) {
862  c = this->maxCoordinates[j];
863 // std::cout << "i:" << i << " j:" << j
864 // << " setting max:" << c << endl;
865  }
866  else {
867  c = this->minCoordinates[j];
868  }
869 
870  this->clusters[i].center[j] = c;
871  }
872  }
873 
874  //last cluster center is placed to middle.
875  for (int j = 0; j < dim; ++j) {
876  this->clusters[numClusters - 1].center[j] =
877  (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
878  }
879 
880 
881 /*
882  for (int i = 0; i < numClusters; ++i) {
883 // std::cout << endl << "cluster:" << i << endl << "\t";
884 
885  for (int j = 0; j < dim; ++j) {
886  std::cout << this->clusters[i].center[j] << " ";
887  }
888  }
889 */
890  }
891 
892  // Performs kmeans clustering of coordinates.
893  void kmeans() {
894  for (int it = 0; it < 10; ++it) {
895 // std::cout << "it:" << it << endl;
896 
897  for (IT j = 0; j < this->numClusters; ++j) {
898  this->clusters[j].clearHeap();
899  }
900 
901  for (IT i = 0; i < this->numElements; ++i) {
902 // std::cout << "i:" << i << " numEl:" << this->numElements << endl;
903 
904  for (IT j = 0; j < this->numClusters; ++j) {
905 // std::cout << "j:" << j
906 // << " numClusters:" << this->numClusters << endl;
907 
908  this->clusters[j].getDistance(i,this->elementCoords);
909  }
910  }
911 
912  bool moved = false;
913 
914  for (IT j = 0; j < this->numClusters; ++j) {
915  moved =
916  (this->clusters[j].getNewCenters(this->elementCoords) || moved );
917  }
918  if (!moved) {
919  break;
920  }
921  }
922  }
923 
924  // Finds the cluster in which the coordinates are the closest to each
925  // other.
926  void getMinDistanceCluster(IT *procPermutation) {
927 
928  WT minDistance = this->clusters[0].getDistanceToCenter();
929  IT minCluster = 0;
930 
931 // std::cout << "j:" << 0 << " minDistance:" << minDistance
932 // << " minTmpDistance:" << minDistance
933 // << " minCluster:" << minCluster << endl;
934 
935  for (IT j = 1; j < this->numClusters; ++j) {
936  WT minTmpDistance = this->clusters[j].getDistanceToCenter();
937 
938 // std::cout << "j:" << j << " minDistance:" << minDistance
939 // << " minTmpDistance:" << minTmpDistance
940 // << " minCluster:" << minCluster << endl;
941 
942  if (minTmpDistance < minDistance) {
943  minDistance = minTmpDistance;
944  minCluster = j;
945  }
946  }
947 
948 // std::cout << "minCluster:" << minCluster << endl;
949  this->clusters[minCluster].copyCoordinates(procPermutation);
950  }
951 };
952 
953 
954 #define MINOF(a,b) (((a)<(b))?(a):(b))
955 
963 template <typename T>
964 void fillContinousArray(T *arr, size_t arrSize, T *val) {
965  if (val == NULL) {
966 
967 #ifdef HAVE_ZOLTAN2_OMP
968 #pragma omp parallel for
969 #endif
970  for (size_t i = 0; i < arrSize; ++i) {
971  arr[i] = i;
972  }
973 
974  }
975  else {
976  T v = *val;
977 #ifdef HAVE_ZOLTAN2_OMP
978 #pragma omp parallel for
979 #endif
980  for (size_t i = 0; i < arrSize; ++i) {
981 // std::cout << "writing to i:" << i << " arr:" << arrSize << endl;
982  arr[i] = v;
983  }
984  }
985 }
986 
990 template <typename part_t, typename pcoord_t>
992 
993 protected:
994  double commCost;
995 
996 public:
997 
998  // Number of processors and number of tasks
1001 
1002 
1004  CommunicationModel(part_t no_procs_, part_t no_tasks_):
1005  commCost(),
1006  no_procs(no_procs_),
1007  no_tasks(no_tasks_) {}
1008 
1009  virtual ~CommunicationModel() {}
1010 
1012  return this->no_procs;
1013  }
1014 
1016  return this->no_tasks;
1017  }
1018 
1020  part_t *task_to_proc,
1021  part_t *task_communication_xadj,
1022  part_t *task_communication_adj,
1023  pcoord_t *task_communication_edge_weight) {
1024 
1025  double totalCost = 0;
1026 
1027  part_t commCount = 0;
1028  for (part_t task = 0; task < this->no_tasks; ++task) {
1029  int assigned_proc = task_to_proc[task];
1030 
1031  part_t task_adj_begin = task_communication_xadj[task];
1032  part_t task_adj_end = task_communication_xadj[task + 1];
1033 
1034  commCount += task_adj_end - task_adj_begin;
1035 
1036  for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {
1037 
1038  part_t neighborTask = task_communication_adj[task2];
1039  int neighborProc = task_to_proc[neighborTask];
1040  double distance = getProcDistance(assigned_proc, neighborProc);
1041 
1042  if (task_communication_edge_weight == NULL) {
1043  totalCost += distance ;
1044  }
1045  else {
1046  totalCost += distance * task_communication_edge_weight[task2];
1047  }
1048  }
1049  }
1050 
1051  // commCount
1052  this->commCost = totalCost;
1053  }
1054 
1056  return this->commCost;
1057  }
1058 
1059  virtual double getProcDistance(int procId1, int procId2) const = 0;
1060 
1070  virtual void getMapping(
1071  int myRank,
1072  const RCP<const Environment> &env,
1073  ArrayRCP <part_t> &proc_to_task_xadj,
1074  ArrayRCP <part_t> &proc_to_task_adj,
1075  ArrayRCP <part_t> &task_to_proc,
1076  const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1077  ) const = 0;
1078 };
1079 
1080 
1085 template <typename pcoord_t, typename tcoord_t, typename part_t>
1087  pcoord_t> {
1088 public:
1089  //private:
1090 
1091  // Dimension of the processors
1093  // Processor coordinates (allocated outside of the class)
1094  pcoord_t **proc_coords;
1095  // Dimension of the tasks coordinates.
1097  // Task coordinates (allocated outside of the class)
1098  tcoord_t **task_coords;
1099 
1102 
1106 
1109 
1110  //public:
1112  CommunicationModel<part_t, pcoord_t>(),
1113  proc_coord_dim(0),
1114  proc_coords(0),
1115  task_coord_dim(0),
1116  task_coords(0),
1117  partArraySize(-1),
1118  partNoArray(NULL),
1119  machine_extent(NULL),
1121  machine(NULL),
1122  num_ranks_per_node(1),
1123  divide_to_prime_first(false) {}
1124 
1126 
1138  int pcoord_dim_,
1139  pcoord_t **pcoords_,
1140  int tcoord_dim_,
1141  tcoord_t **tcoords_,
1142  part_t no_procs_,
1143  part_t no_tasks_,
1144  int *machine_extent_,
1145  bool *machine_extent_wrap_around_,
1146  const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
1147  ):
1148  CommunicationModel<part_t, pcoord_t>(no_procs_, no_tasks_),
1149  proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
1150  task_coord_dim(tcoord_dim_), task_coords(tcoords_),
1151  partArraySize(-1),
1152  partNoArray(NULL),
1153  machine_extent(machine_extent_),
1154  machine_extent_wrap_around(machine_extent_wrap_around_),
1155  machine(machine_),
1156  num_ranks_per_node(1),
1157  divide_to_prime_first(false) {
1158  }
1159 
1160 
1161  void setPartArraySize(int psize) {
1162  this->partArraySize = psize;
1163  }
1164 
1165  void setPartArray(part_t *pNo) {
1166  this->partNoArray = pNo;
1167  }
1168 
1178  void getClosestSubset(part_t *proc_permutation, part_t nprocs,
1179  part_t ntasks) const{
1180  // Currently returns a random subset.
1181 
1182  part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1184  minCoordDim, nprocs,
1185  this->proc_coords, ntasks);
1186 
1187  kma.kmeans();
1188  kma.getMinDistanceCluster(proc_permutation);
1189 
1190  for (int i = ntasks; i < nprocs; ++i) {
1191  proc_permutation[i] = -1;
1192  }
1193 
1194 /*
1195  //fill array.
1196  fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
1197 
1198  int _u_umpa_seed = 847449649;
1199  srand (time(NULL));
1200 
1201  int a = rand() % 1000 + 1;
1202  _u_umpa_seed -= a;
1203 
1204  //permute array randomly.
1205  update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
1206 */
1207  }
1208 
1209  // Temporary, necessary for random permutation.
1210  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
1211  {
1212  int a = 16807;
1213  int m = 2147483647;
1214  int q = 127773;
1215  int r = 2836;
1216  int lo, hi, test;
1217  double d;
1218 
1219  lo = _u_umpa_seed % q;
1220  hi = _u_umpa_seed / q;
1221  test = (a * lo) - (r * hi);
1222 
1223  if (test>0)
1224  _u_umpa_seed = test;
1225  else
1226  _u_umpa_seed = test + m;
1227 
1228  d = (double) ((double) _u_umpa_seed / (double) m);
1229 
1230  return (part_t) (d*(double)l);
1231  }
1232 
1233  virtual double getProcDistance(int procId1, int procId2) const {
1234  pcoord_t distance = 0;
1235  if (machine == NULL) {
1236  for (int i = 0 ; i < this->proc_coord_dim; ++i) {
1237  double d =
1238  ZOLTAN2_ABS(proc_coords[i][procId1] - proc_coords[i][procId2]);
1240  if (machine_extent[i] - d < d) {
1241  d = machine_extent[i] - d;
1242  }
1243  }
1244  distance += d;
1245  }
1246  }
1247  else {
1248  this->machine->getHopCount(procId1, procId2, distance);
1249  }
1250 
1251  return distance;
1252  }
1253 
1254 
1255  // Temporary, does random permutation.
1256  void update_visit_order(part_t* visitOrder, part_t n,
1257  int &_u_umpa_seed, part_t rndm) {
1258  part_t *a = visitOrder;
1259 
1260  if (rndm) {
1261  part_t i, u, v, tmp;
1262 
1263  if (n <= 4)
1264  return;
1265 
1266 // srand ( time(NULL) );
1267 // _u_umpa_seed = _u_umpa_seed1 - (rand()%100);
1268 
1269  for (i = 0; i < n; i += 16) {
1270  u = umpa_uRandom(n-4, _u_umpa_seed);
1271  v = umpa_uRandom(n-4, _u_umpa_seed);
1272 
1273  // FIXME (mfh 30 Sep 2015) This requires including
1274  // Zoltan2_AlgMultiJagged.hpp.
1275 
1276  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
1277  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
1278  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
1279  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
1280  }
1281  }
1282  else {
1283  part_t i, end = n / 4;
1284 
1285  for (i = 1; i < end; i++) {
1286  part_t j = umpa_uRandom(n - i, _u_umpa_seed);
1287  part_t t = a[j];
1288  a[j] = a[n - i];
1289  a[n - i] = t;
1290  }
1291  }
1292 // PermuteInPlace(visitOrder, n);
1293  }
1294 
1304  virtual void getMapping(
1305  int myRank,
1306  const RCP<const Environment> &env,
1307  ArrayRCP <part_t> &rcp_proc_to_task_xadj,
1308  ArrayRCP <part_t> &rcp_proc_to_task_adj,
1309  ArrayRCP <part_t> &rcp_task_to_proc,
1310  const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1311  ) const {
1312 
1313  rcp_proc_to_task_xadj = ArrayRCP <part_t>(this->no_procs + 1);
1314  rcp_proc_to_task_adj = ArrayRCP <part_t>(this->no_tasks);
1315  rcp_task_to_proc = ArrayRCP <part_t>(this->no_tasks);
1316 
1317  // Holds the pointer to the task array
1318  part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
1319 
1320  // Holds the indices of task wrt to proc_to_task_xadj array.
1321  part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
1322 
1323  // Holds the processors mapped to tasks.
1324  part_t *task_to_proc = rcp_task_to_proc.getRawPtr();
1325 
1326  part_t invalid = 0;
1327  fillContinousArray<part_t>(proc_to_task_xadj, this->no_procs + 1,
1328  &invalid);
1329 
1330  // Obtain the number of parts that should be divided.
1331  part_t num_parts = MINOF(this->no_procs, this->no_tasks);
1332 
1333  // Obtain the min coordinate dim.
1334  // No more want to do min coord dim. If machine dimension > task_dim,
1335  // we end up with a long line.
1336 // part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1337 
1338  int recursion_depth = partArraySize;
1339 
1340 // if (partArraySize < minCoordDim)
1341 // recursion_depth = minCoordDim;
1342  if (partArraySize == -1) {
1343 
1344  if (divide_to_prime_first) {
1345  // It is difficult to estimate the number of steps in this case
1346  // as each branch will have different depth.
1347  // The worst case happens when all prime factors are 3s.
1348  // P = 3^n, n recursion depth will divide parts to 2x and x
1349  // and n recursion depth with divide 2x into x and x.
1350  // Set it to upperbound here.
1351  // We could calculate the exact value here as well, but the
1352  // partitioning algorithm skips further ones anyways.
1353  recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
1354  }
1355  else {
1356  recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
1357  }
1358  }
1359 
1360  // Get number of different permutations for task dimension ordering
1361  int taskPerm = z2Fact<int>(this->task_coord_dim);
1362  // Get number of different permutations for proc dimension ordering
1363  int procPerm = z2Fact<int>(this->proc_coord_dim);
1364 
1365  // Total number of permutations (both task and proc permuted)
1366  int permutations = taskPerm * procPerm;
1367 
1368  // Add permutations where we divide the processors with longest
1369  // dimension but permute tasks.
1370  permutations += taskPerm;
1371 
1372  // Add permutations where we divide the tasks with longest
1373  // dimension but permute procs.
1374  permutations += procPerm;
1375 
1376  // Add permutation with both tasks and procs divided by longest
1377  // dimension
1378  permutations += 1;
1379 
1380  // Holds the pointers to proc_adjList
1381  part_t *proc_xadj = allocMemory<part_t>(num_parts + 1);
1382 
1383  // Holds the processors in parts according to the result of
1384  // partitioning algorithm.
1385  // The processors assigned to part x is at
1386  // proc_adjList[ proc_xadj[x] : proc_xadj[x + 1] ]
1387  part_t *proc_adjList = allocMemory<part_t>(this->no_procs);
1388 
1389 
1390  part_t used_num_procs = this->no_procs;
1391  if (this->no_procs > this->no_tasks) {
1392  // Obtain the subset of the processors that are closest to each
1393  // other.
1394  this->getClosestSubset(proc_adjList, this->no_procs,
1395  this->no_tasks);
1396  used_num_procs = this->no_tasks;
1397  }
1398  else {
1399  fillContinousArray<part_t>(proc_adjList,this->no_procs, NULL);
1400  }
1401 
1402  // Index of the permutation
1403  int myPermutation = myRank % permutations;
1404  bool task_partition_along_longest_dim = false;
1405  bool proc_partition_along_longest_dim = false;
1406 
1407 
1408  int myProcPerm = 0;
1409  int myTaskPerm = 0;
1410 
1411  if (myPermutation == 0) {
1412  task_partition_along_longest_dim = true;
1413  proc_partition_along_longest_dim = true;
1414  }
1415  else {
1416  --myPermutation;
1417  if (myPermutation < taskPerm) {
1418  proc_partition_along_longest_dim = true;
1419  // Index of the task permutation
1420  myTaskPerm = myPermutation;
1421  }
1422  else {
1423  myPermutation -= taskPerm;
1424 
1425  if (myPermutation < procPerm) {
1426  task_partition_along_longest_dim = true;
1427  // Index of the task permutation
1428  myProcPerm = myPermutation;
1429  }
1430  else {
1431  myPermutation -= procPerm;
1432  // Index of the proc permutation
1433  myProcPerm = myPermutation % procPerm;
1434  // Index of the task permutation
1435  myTaskPerm = myPermutation / procPerm;
1436  }
1437  }
1438  }
1439 
1440 /*
1441  if (task_partition_along_longest_dim &&
1442  proc_partition_along_longest_dim) {
1443  std::cout <<"me:" << myRank << " task:longest proc:longest"
1444  << " numPerms:" << permutations << std::endl;
1445  }
1446  else if (proc_partition_along_longest_dim) {
1447  std::cout <<"me:" << myRank << " task:" << myTaskPerm
1448  << " proc:longest" << " numPerms:" << permutations << std::endl;
1449  }
1450  else if (task_partition_along_longest_dim) {
1451  std::cout <<"me:" << myRank << " task: longest" << " proc:"
1452  << myProcPerm << " numPerms:" << permutations << std::endl;
1453  }
1454  else {
1455  std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:"
1456  << myProcPerm << " numPerms:" << permutations << std::endl;
1457  }
1458 */
1459 
1460  int *permutation =
1461  allocMemory<int>((this->proc_coord_dim > this->task_coord_dim) ?
1462  this->proc_coord_dim : this->task_coord_dim);
1463 
1464  // Get the permutation order from the proc permutation index.
1465  ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
1466 
1467 /*
1468  // Reorder the coordinate dimensions.
1469  pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
1470  for (int i = 0; i < this->proc_coord_dim; ++i) {
1471  pcoords[i] = this->proc_coords[permutation[i]];
1472 // std::cout << permutation[i] << " ";
1473  }
1474 */
1475 
1476  int procdim = this->proc_coord_dim;
1477  pcoord_t **pcoords = this->proc_coords;
1478 
1479 /*
1480  int procdim = this->proc_coord_dim;
1481  procdim = 6;
1482  //reorder the coordinate dimensions.
1483  pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
1484  for (int i = 0; i < procdim; ++i) {
1485  pcoords[i] = new pcoord_t[used_num_procs] ;
1486 // this->proc_coords[permutation[i]];
1487  }
1488 
1489  for (int k = 0; k < used_num_procs ; k++) {
1490  pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
1491  pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8 ;
1492 
1493  pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
1494  pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
1495  pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
1496  pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;
1497 
1498  //if (this->proc_coords[0][k] == 40 &&
1499  // this->proc_coords[1][k] == 8 &&
1500  // this->proc_coords[2][k] == 48) {
1501  if (this->proc_coords[0][k] == 5 &&
1502  this->proc_coords[1][k] == 0 &&
1503  this->proc_coords[2][k] == 10) {
1504  std::cout << "pcoords[0][k]:" << pcoords[0][k]
1505  << "pcoords[1][k]:" << pcoords[1][k]
1506  << "pcoords[2][k]:" << pcoords[2][k]
1507  << "pcoords[3][k]:" << pcoords[3][k]
1508  << "pcoords[4][k]:" << pcoords[4][k]
1509  << "pcoords[5][k]:" << pcoords[5][k] << std::endl;
1510  }
1511  else if (pcoords[0][k] == 64 &&
1512  pcoords[1][k] == 0 &&
1513  pcoords[2][k] == 160 &&
1514  pcoords[3][k] == 16 &&
1515  pcoords[4][k] == 0 &&
1516  pcoords[5][k] == 10) {
1517  std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k]
1518  << "this->proc_coords[1][k]:" << this->proc_coords[1][k]
1519  << "this->proc_coords[2][k]:" << this->proc_coords[2][k]
1520  << std::endl;
1521  }
1522  }
1523 */
1524 
1525 // if (partNoArray == NULL)
1526 // std::cout << "partNoArray is null" << std::endl;
1527 // std::cout << "recursion_depth:" << recursion_depth
1528 // << " partArraySize:" << partArraySize << std::endl;
1529 
1530  // Optimization for Dragonfly Networks, First Level of partitioning
1531  // is imbalanced to ensure procs are divided by first RCA
1532  // coord (a.k.a. group).
1533  part_t num_group_count = 1;
1534  part_t *group_count = NULL;
1535 
1536  if (machine != NULL)
1537  num_group_count = machine->getNumUniqueGroups();
1538 
1539  if (num_group_count > 1) {
1540  group_count = new part_t[num_group_count];
1541  memset(group_count, 0, sizeof(part_t) * num_group_count);
1542 
1543  machine->getGroupCount(group_count);
1544  }
1545 
1546  // Do the partitioning and renumber the parts.
1547  env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
1548  // Partitioning of Processors
1550 
1551  mj_partitioner.sequential_task_partitioning(
1552  env,
1553  this->no_procs,
1554  used_num_procs,
1555  num_parts,
1556  procdim,
1557  //minCoordDim,
1558  pcoords,//this->proc_coords,
1559  proc_adjList,
1560  proc_xadj,
1561  recursion_depth,
1562  partNoArray,
1563  proc_partition_along_longest_dim, //, false
1566  num_group_count,
1567  group_count
1568  );
1569  env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
1570 
1571 // comm_->barrier();
1572 // std::cout << "mj_partitioner.for procs over" << std::endl;
1573 // freeArray<pcoord_t *>(pcoords);
1574 
1575  part_t *task_xadj = allocMemory<part_t>(num_parts + 1);
1576  part_t *task_adjList = allocMemory<part_t>(this->no_tasks);
1577 
1578  // Fill task_adjList st: task_adjList[i] <- i.
1579  fillContinousArray<part_t>(task_adjList,this->no_tasks, NULL);
1580 
1581  // Get the permutation order from the task permutation index.
1582  ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
1583 
1584  // Reorder task coordinate dimensions.
1585  tcoord_t **tcoords = allocMemory<tcoord_t *>(this->task_coord_dim);
1586  for (int i = 0; i < this->task_coord_dim; ++i) {
1587  tcoords[i] = this->task_coords[permutation[i]];
1588  }
1589 
1590  env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
1591  // Partitioning of Tasks
1592  mj_partitioner.sequential_task_partitioning(
1593  env,
1594  this->no_tasks,
1595  this->no_tasks,
1596  num_parts,
1597  this->task_coord_dim,
1598  //minCoordDim,
1599  tcoords, //this->task_coords,
1600  task_adjList,
1601  task_xadj,
1602  recursion_depth,
1603  partNoArray,
1604  task_partition_along_longest_dim,
1607  num_group_count,
1608  group_count
1609  //,"task_partitioning"
1610  //, false // (myRank == 6)
1611  );
1612  env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
1613 
1614 // std::cout << "myrank:" << myRank << std::endl;
1615 // comm_->barrier();
1616 // std::cout << "mj_partitioner.sequential_task_partitioning over"
1617 // << std::endl;
1618 
1619  freeArray<pcoord_t *>(tcoords);
1620  freeArray<int>(permutation);
1621 
1622  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1623  for (part_t i = 0; i < num_parts; ++i) {
1624 
1625  part_t proc_index_begin = proc_xadj[i];
1626  part_t task_begin_index = task_xadj[i];
1627  part_t proc_index_end = proc_xadj[i + 1];
1628  part_t task_end_index = task_xadj[i + 1];
1629 
1630  if (proc_index_end - proc_index_begin != 1) {
1631  std::cerr << "Error at partitioning of processors" << std::endl;
1632  std::cerr << "PART:" << i << " is assigned to "
1633  << proc_index_end - proc_index_begin << " processors."
1634  << std::endl;
1635  exit(1);
1636  }
1637  part_t assigned_proc = proc_adjList[proc_index_begin];
1638  proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
1639  }
1640 
1641  //holds the pointer to the task array
1642  //convert proc_to_task_xadj to CSR index array
1643  part_t *proc_to_task_xadj_work = allocMemory<part_t>(this->no_procs);
1644  part_t sum = 0;
1645  for (part_t i = 0; i < this->no_procs; ++i) {
1646  part_t tmp = proc_to_task_xadj[i];
1647  proc_to_task_xadj[i] = sum;
1648  sum += tmp;
1649  proc_to_task_xadj_work[i] = sum;
1650  }
1651  proc_to_task_xadj[this->no_procs] = sum;
1652 
1653  for (part_t i = 0; i < num_parts; ++i) {
1654 
1655  part_t proc_index_begin = proc_xadj[i];
1656  part_t task_begin_index = task_xadj[i];
1657  part_t task_end_index = task_xadj[i + 1];
1658 
1659  part_t assigned_proc = proc_adjList[proc_index_begin];
1660 
1661  for (part_t j = task_begin_index; j < task_end_index; ++j) {
1662  part_t taskId = task_adjList[j];
1663 
1664  task_to_proc[taskId] = assigned_proc;
1665 
1666  proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
1667  }
1668  }
1669 
1670 /*
1671  if (myPermutation == 0) {
1672  std::ofstream gnuPlotCode ("mymapping.out", std::ofstream::out);
1673 
1674  for (part_t i = 0; i < num_parts; ++i) {
1675 
1676  part_t proc_index_begin = proc_xadj[i];
1677  part_t proc_index_end = proc_xadj[i + 1];
1678 
1679  if (proc_index_end - proc_index_begin != 1) {
1680  std::cerr << "Error at partitioning of processors" << std::endl;
1681  std::cerr << "PART:" << i << " is assigned to "
1682  << proc_index_end - proc_index_begin << " processors."
1683  << std::endl;
1684  exit(1);
1685  }
1686 
1687  part_t assigned_proc = proc_adjList[proc_index_begin];
1688  gnuPlotCode << "Rank:" << i << " "
1689  << this->proc_coords[0][assigned_proc] << " "
1690  << this->proc_coords[1][assigned_proc] << " "
1691  << this->proc_coords[2][assigned_proc] << " "
1692  << pcoords[0][assigned_proc] << " "
1693  << pcoords[1][assigned_proc] << " "
1694  << pcoords[2][assigned_proc] << " "
1695  << pcoords[3][assigned_proc] << std::endl;
1696  }
1697 
1698  gnuPlotCode << "Machine Extent:" << std::endl;
1699  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1700  for (part_t i = 0; i < num_parts; ++i) {
1701 
1702  part_t proc_index_begin = proc_xadj[i];
1703  part_t proc_index_end = proc_xadj[i + 1];
1704 
1705  if (proc_index_end - proc_index_begin != 1) {
1706  std::cerr << "Error at partitioning of processors" << std::endl;
1707  std::cerr << "PART:" << i << " is assigned to "
1708  << proc_index_end - proc_index_begin << " processors."
1709  << std::endl;
1710  exit(1);
1711  }
1712 
1713  part_t assigned_proc = proc_adjList[proc_index_begin];
1714  gnuPlotCode << "Rank:" << i << " "
1715  << this->proc_coords[0][assigned_proc] << " "
1716  << this->proc_coords[1][assigned_proc] << " "
1717  << this->proc_coords[2][assigned_proc] << std::endl;
1718  }
1719  gnuPlotCode.close();
1720  }
1721 */
1722 
1723  freeArray<part_t>(proc_to_task_xadj_work);
1724  freeArray<part_t>(task_xadj);
1725  freeArray<part_t>(task_adjList);
1726  freeArray<part_t>(proc_xadj);
1727  freeArray<part_t>(proc_adjList);
1728  }
1729 
1730 };
1731 
1732 template <typename Adapter, typename part_t>
1734 protected:
1735 
1736 #ifndef DOXYGEN_SHOULD_SKIP_THIS
1737 
1738  typedef typename Adapter::scalar_t pcoord_t;
1739  typedef typename Adapter::scalar_t tcoord_t;
1740  typedef typename Adapter::scalar_t scalar_t;
1741  typedef typename Adapter::lno_t lno_t;
1742 
1743 #endif
1744 
1745 // RCP<const Environment> env;
1746 
1747  // Holds the pointer to the task array
1748  ArrayRCP<part_t> proc_to_task_xadj;
1749 // = allocMemory<part_t> (this->no_procs + 1);
1750 
1751  // Holds the indices of tasks wrt to proc_to_task_xadj array.
1752  ArrayRCP<part_t> proc_to_task_adj;
1753 // = allocMemory<part_t>(this->no_tasks);
1754 
1755  // Holds the processors mapped to tasks.
1756  ArrayRCP<part_t> task_to_proc;
1757 // = allocMemory<part_t>(this->no_procs);
1758 
1759  // Holds the processors mapped to tasks.
1760  ArrayRCP<part_t> local_task_to_rank;
1761 // = allocMemory<part_t>(this->no_procs);
1762 
1767  ArrayRCP<part_t> task_communication_xadj;
1768  ArrayRCP<part_t> task_communication_adj;
1770 
1771 
1775  void doMapping(int myRank,
1776  const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {
1777 
1778  if (this->proc_task_comm) {
1779  this->proc_task_comm->getMapping(
1780  myRank,
1781  this->env,
1782  // Holds the pointer to the task array
1783  this->proc_to_task_xadj,
1784  // Holds the indices of task wrt to proc_to_task_xadj array
1785  this->proc_to_task_adj,
1786  // Holds the processors mapped to tasks
1787  this->task_to_proc,
1788  comm_
1789  );
1790  }
1791  else {
1792  std::cerr << "communicationModel is not specified in the Mapper"
1793  << std::endl;
1794  exit(1);
1795  }
1796  }
1797 
1798 
1802  RCP<Comm<int> > create_subCommunicator() {
1803  int procDim = this->proc_task_comm->proc_coord_dim;
1804  int taskDim = this->proc_task_comm->task_coord_dim;
1805 
1806  // Get the number of different permutations for task dimension ordering
1807  int taskPerm = z2Fact<int>(procDim);
1808  // Get the number of different permutations for proc dimension ordering
1809  int procPerm = z2Fact<int>(taskDim);
1810  // Total number of permutations
1811  int idealGroupSize = taskPerm * procPerm;
1812 
1813  // For the one that does longest dimension partitioning.
1814  idealGroupSize += taskPerm + procPerm + 1;
1815 
1816  int myRank = this->comm->getRank();
1817  int commSize = this->comm->getSize();
1818 
1819  int myGroupIndex = myRank / idealGroupSize;
1820 
1821  int prevGroupBegin = (myGroupIndex - 1)* idealGroupSize;
1822  if (prevGroupBegin < 0) prevGroupBegin = 0;
1823  int myGroupBegin = myGroupIndex * idealGroupSize;
1824  int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
1825  int nextGroupEnd = (myGroupIndex + 2)* idealGroupSize;
1826 
1827  if (myGroupEnd > commSize) {
1828  myGroupBegin = prevGroupBegin;
1829  myGroupEnd = commSize;
1830  }
1831  if (nextGroupEnd > commSize) {
1832  myGroupEnd = commSize;
1833  }
1834  int myGroupSize = myGroupEnd - myGroupBegin;
1835 
1836  part_t *myGroup = allocMemory<part_t>(myGroupSize);
1837  for (int i = 0; i < myGroupSize; ++i) {
1838  myGroup[i] = myGroupBegin + i;
1839  }
1840 // std::cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin
1841 // << " myGroupEnd:" << myGroupEnd << endl;
1842 
1843  ArrayView<const part_t> myGroupView(myGroup, myGroupSize);
1844 
1845  RCP<Comm<int> > subComm =
1846  this->comm->createSubcommunicator(myGroupView);
1847  freeArray<part_t>(myGroup);
1848  return subComm;
1849  }
1850 
1851 
1856  //create the sub group.
1857  RCP<Comm<int> > subComm = this->create_subCommunicator();
1858  //calculate cost.
1859  double myCost = this->proc_task_comm->getCommunicationCostMetric();
1860 // std::cout << "me:" << this->comm->getRank() << " myCost:"
1861 // << myCost << std::endl;
1862  double localCost[2], globalCost[2];
1863 
1864  localCost[0] = myCost;
1865  localCost[1] = double(subComm->getRank());
1866 
1867  globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
1869  reduceAll<int, double>(*subComm, reduceBest,
1870  2, localCost, globalCost);
1871 
1872  int sender = int(globalCost[1]);
1873 
1874 /*
1875  if ( this->comm->getRank() == 0) {
1876  std::cout << "me:" << localCost[1] <<
1877  " localcost:" << localCost[0]<<
1878  " bestcost:" << globalCost[0] <<
1879  " Sender:" << sender <<
1880  " procDim" << proc_task_comm->proc_coord_dim <<
1881  " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
1882  }
1883 */
1884 
1885 // std::cout << "me:" << localCost[1] << " localcost:" << localCost[0]
1886 // << " bestcost:" << globalCost[0] << endl;
1887 // std::cout << "me:" << localCost[1] << " proc:" << globalCost[1]
1888 // << endl;
1889  broadcast(*subComm, sender, this->ntasks,
1890  this->task_to_proc.getRawPtr());
1891  broadcast(*subComm, sender, this->nprocs,
1892  this->proc_to_task_xadj.getRawPtr());
1893  broadcast(*subComm, sender, this->ntasks,
1894  this->proc_to_task_adj.getRawPtr());
1895  }
1896 
1897  //write mapping to gnuPlot code to visualize.
1898  void writeMapping() {
1899  std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
1900 
1901  int mindim = MINOF(proc_task_comm->proc_coord_dim,
1902  proc_task_comm->task_coord_dim);
1903  std::string ss = "";
1904  for (part_t i = 0; i < this->nprocs; ++i) {
1905 
1906  std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
1907  if (i == 0) {
1908  gnuPlotCode << "plot \"" << procFile << "\"\n";
1909  }
1910  else {
1911  gnuPlotCode << "replot \"" << procFile << "\"\n";
1912  }
1913 
1914  std::ofstream inpFile(procFile.c_str(), std::ofstream::out);
1915 
1916  std::string gnuPlotArrow = "set arrow from ";
1917  for (int j = 0; j < mindim; ++j) {
1918  if (j == mindim - 1) {
1919  inpFile << proc_task_comm->proc_coords[j][i];
1920  gnuPlotArrow +=
1921  Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
1922 
1923  }
1924  else {
1925  inpFile << proc_task_comm->proc_coords[j][i] << " ";
1926  gnuPlotArrow +=
1927  Teuchos::toString<float>(proc_task_comm->
1928  proc_coords[j][i]) + ",";
1929  }
1930  }
1931  gnuPlotArrow += " to ";
1932 
1933  inpFile << std::endl;
1934  ArrayView<part_t> a = this->getAssignedTasksForProc(i);
1935 
1936  for (int k = 0; k < a.size(); ++k) {
1937  int j = a[k];
1938 // std::cout << "i:" << i << " j:"
1939  std::string gnuPlotArrow2 = gnuPlotArrow;
1940  for (int z = 0; z < mindim; ++z) {
1941  if (z == mindim - 1) {
1942 
1943 // std::cout << "z:" << z << " j:" << j << " "
1944 // << proc_task_comm->task_coords[z][j] << endl;
1945  inpFile << proc_task_comm->task_coords[z][j];
1946  gnuPlotArrow2 +=
1947  Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
1948  }
1949  else {
1950  inpFile << proc_task_comm->task_coords[z][j] << " ";
1951  gnuPlotArrow2 +=
1952  Teuchos::toString<float>(proc_task_comm->
1953  task_coords[z][j]) + ",";
1954  }
1955  }
1956  ss += gnuPlotArrow2 + "\n";
1957  inpFile << std::endl;
1958  }
1959  inpFile.close();
1960  }
1961  gnuPlotCode << ss;
1962  gnuPlotCode << "\nreplot\n pause -1 \n";
1963  gnuPlotCode.close();
1964  }
1965 
1966  //write mapping to gnuPlot code to visualize.
1967  void writeMapping2(int myRank) {
1968  std::string rankStr = Teuchos::toString<int>(myRank);
1969  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
1970  std::string outF = gnuPlots + rankStr+ extentionS;
1971  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
1972 
1974  *tmpproc_task_comm =
1975  static_cast <CoordinateCommunicationModel<pcoord_t, tcoord_t,
1976  part_t> * > (
1977  proc_task_comm);
1978 
1979 // int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
1980 // tmpproc_task_comm->task_coord_dim);
1981  int mindim = tmpproc_task_comm->proc_coord_dim;
1982  if (mindim != 3) {
1983  std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
1984  return;
1985  }
1986  std::string ss = "";
1987  std::string procs = "";
1988 
1989  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
1990  for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank) {
1991  ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
1992  if (a.size() == 0) {
1993  continue;
1994  }
1995 
1996  std::string gnuPlotArrow = "set arrow from ";
1997  for (int j = 0; j < mindim; ++j) {
1998  if (j == mindim - 1) {
1999  gnuPlotArrow +=
2000  Teuchos::toString<float>(tmpproc_task_comm->
2001  proc_coords[j][origin_rank]);
2002  procs +=
2003  Teuchos::toString<float>(tmpproc_task_comm->
2004  proc_coords[j][origin_rank]);
2005 
2006  }
2007  else {
2008  gnuPlotArrow +=
2009  Teuchos::toString<float>(tmpproc_task_comm->
2010  proc_coords[j][origin_rank]) + ",";
2011  procs +=
2012  Teuchos::toString<float>(tmpproc_task_comm->
2013  proc_coords[j][origin_rank])+ " ";
2014  }
2015  }
2016  procs += "\n";
2017 
2018  gnuPlotArrow += " to ";
2019 
2020 
2021  for (int k = 0; k < a.size(); ++k) {
2022  int origin_task = a[k];
2023 
2024  for (int nind = task_communication_xadj[origin_task];
2025  nind < task_communication_xadj[origin_task + 1]; ++nind) {
2026  int neighbor_task = task_communication_adj[nind];
2027 
2028  bool differentnode = false;
2029 
2030  int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
2031 
2032  for (int j = 0; j < mindim; ++j) {
2033  if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2034  int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2035  differentnode = true; break;
2036  }
2037  }
2038  std::tuple<int,int,int, int, int, int> foo(
2039  int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2040  int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2041  int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2042  int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2043  int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2044  int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2045 
2046 
2047  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2048  my_arrows.insert(foo);
2049 
2050  std::string gnuPlotArrow2 = "";
2051  for (int j = 0; j < mindim; ++j) {
2052  if (j == mindim - 1) {
2053  gnuPlotArrow2 +=
2054  Teuchos::toString<float>(tmpproc_task_comm->
2055  proc_coords[j][neighbor_rank]);
2056  }
2057  else {
2058  gnuPlotArrow2 +=
2059  Teuchos::toString<float>(tmpproc_task_comm->
2060  proc_coords[j][neighbor_rank]) + ",";
2061  }
2062  }
2063  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2064  }
2065  }
2066  }
2067  }
2068 
2069  std::ofstream procFile("procPlot.plot", std::ofstream::out);
2070  procFile << procs << "\n";
2071  procFile.close();
2072 
2073  //gnuPlotCode << ss;
2074  if (mindim == 2) {
2075  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
2076  } else {
2077  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
2078  }
2079 
2080  gnuPlotCode << ss << "\nreplot\n pause -1 \n";
2081  gnuPlotCode.close();
2082  }
2083 
2084 
2085 // KDD Need to provide access to algorithm for getPartBoxes
2086 #ifdef gnuPlot
2087  void writeGnuPlot(
2088  const Teuchos::Comm<int> *comm_,
2090  int coordDim,
2091  tcoord_t **partCenters) {
2092 
2093  std::string file = "gggnuPlot";
2094  std::string exten = ".plot";
2095  std::ofstream mm("2d.txt");
2096  file += Teuchos::toString<int>(comm_->getRank()) + exten;
2097  std::ofstream ff(file.c_str());
2098 // ff.seekg(0, ff.end);
2099  std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2100  outPartBoxes =
2102  getPartBoxesView();
2103 
2104  for (part_t i = 0; i < this->ntasks;++i) {
2105  outPartBoxes[i].writeGnuPlot(ff, mm);
2106  }
2107  if (coordDim == 2) {
2108  ff << "plot \"2d.txt\"" << std::endl;
2109 // ff << "\n pause -1" << endl;
2110  }
2111  else {
2112  ff << "splot \"2d.txt\"" << std::endl;
2113 // ff << "\n pause -1" << endl;
2114  }
2115  mm.close();
2116 
2117  ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
2118  << std::endl;
2119 
2120  for (part_t i = 0; i < this->ntasks; ++i) {
2122  part_t pe = task_communication_xadj[i + 1];
2123 
2124  for (part_t p = pb; p < pe; ++p) {
2126 
2127 // std::cout << "i:" << i << " n:" << n << endl;
2128  std::string arrowline = "set arrow from ";
2129  for (int j = 0; j < coordDim - 1; ++j) {
2130  arrowline +=
2131  Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
2132  }
2133  arrowline +=
2134  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2135  " to ";
2136 
2137  for (int j = 0; j < coordDim - 1; ++j) {
2138  arrowline +=
2139  Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
2140  }
2141  arrowline +=
2142  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2143  " as 5\n";
2144 
2145 // std::cout << "arrow:" << arrowline << endl;
2146  ff << arrowline;
2147  }
2148  }
2149 
2150  ff << "replot\n pause -1" << std::endl;
2151  ff.close();
2152  }
2153 #endif // gnuPlot
2154 
2155 public:
2156 
2157  void getProcTask(part_t* &proc_to_task_xadj_,
2158  part_t* &proc_to_task_adj_) {
2159  proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2160  proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2161  }
2162 
2163  virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
2164 
2165  // Mapping was already computed in the constructor; we need to store it
2166  // in the solution.
2167  mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
2168 
2169  // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
2170  // KDDKDD is not needed once we use MappingSolution to answer queries
2171  // KDDKDD instead of this algorithm.
2172  // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
2173  // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
2174  }
2175 
2176 
2178  //freeArray<part_t>(proc_to_task_xadj);
2179  //freeArray<part_t>(proc_to_task_adj);
2180  //freeArray<part_t>(task_to_proc);
2181  if (this->isOwnerofModel) {
2182  delete this->proc_task_comm;
2183  }
2184  }
2185 
2187  const lno_t num_local_coords,
2188  const part_t *local_coord_parts,
2189  const ArrayRCP<part_t> task_to_proc_) {
2190  local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
2191 
2192  for (lno_t i = 0; i < num_local_coords; ++i) {
2193  part_t local_coord_part = local_coord_parts[i];
2194  part_t rank_index = task_to_proc_[local_coord_part];
2195  local_task_to_rank[i] = rank_index;
2196  }
2197  }
2198 
2213  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2214  const Teuchos::RCP <const MachineRepresentation<pcoord_t, part_t> >
2215  machine_,
2216  const Teuchos::RCP <const Adapter> input_adapter_,
2217  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> >
2218  soln_,
2219  const Teuchos::RCP <const Environment> envConst,
2220  bool is_input_adapter_distributed = true,
2221  int num_ranks_per_node = 1,
2222  bool divide_to_prime_first = false,
2223  bool reduce_best_mapping = true):
2224  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2225  soln_, envConst),
2226  proc_to_task_xadj(0),
2227  proc_to_task_adj(0),
2228  task_to_proc(0),
2229  isOwnerofModel(true),
2230  proc_task_comm(0),
2234 
2235  using namespace Teuchos;
2236  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2237 
2238  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2239  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2240 
2241  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2242  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2243  if (!is_input_adapter_distributed) {
2244  ia_comm = Teuchos::createSerialComm<int>();
2245  }
2246 
2247  RCP<const Environment> envConst_ = envConst;
2248 
2249  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2250  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2251  input_adapter_.getRawPtr()), false));
2252 
2253  modelFlag_t coordFlags_, graphFlags_;
2254 
2255  //create coordinate model
2256  //since this is coordinate task mapper,
2257  //the adapter has to have the coordinates
2258  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2259  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2260 
2261  //if the adapter has also graph model, we will use graph model
2262  //to calculate the cost mapping.
2263  BaseAdapterType inputType_ = input_adapter_->adapterType();
2264  if (inputType_ == MatrixAdapterType ||
2265  inputType_ == GraphAdapterType ||
2266  inputType_ == MeshAdapterType)
2267  {
2268  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2269  baseInputAdapter_, envConst_, ia_comm,
2270  graphFlags_));
2271  }
2272 
2273  if (!machine_->hasMachineCoordinates()) {
2274  throw std::runtime_error("Existing machine does not provide "
2275  "coordinates for coordinate task mapping");
2276  }
2277 
2278  //if mapping type is 0 then it is coordinate mapping
2279  int procDim = machine_->getMachineDim();
2280  this->nprocs = machine_->getNumRanks();
2281 
2282  //get processor coordinates.
2283  pcoord_t **procCoordinates = NULL;
2284  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2285  throw std::runtime_error("Existing machine does not implement "
2286  "getAllMachineCoordinatesView");
2287  }
2288 
2289  //get the machine extent.
2290  //if we have machine extent,
2291  //if the machine has wrap-around links, we would like to shift the
2292  //coordinates, so that the largest hap would be the wrap-around.
2293  std::vector<int> machine_extent_vec(procDim);
2294  //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2295  int *machine_extent = &(machine_extent_vec[0]);
2296  bool *machine_extent_wrap_around = new bool[procDim];
2297  for (int i = 0; i < procDim; ++i)
2298  machine_extent_wrap_around[i] = false;
2299 
2300  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2301 
2302  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2303  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2304  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2305  // MD: Yes, I ADDED BELOW:
2306  if (machine_->getMachineExtent(machine_extent) &&
2307  haveWrapArounds) {
2308 
2309  procCoordinates =
2311  procDim,
2312  machine_extent,
2313  machine_extent_wrap_around,
2314  this->nprocs,
2315  procCoordinates);
2316  }
2317 
2318  //get the tasks information, such as coordinate dimension,
2319  //number of parts.
2320  int coordDim = coordinateModel_->getCoordinateDim();
2321 
2322 // int coordDim = machine_->getMachineDim();
2323 
2324  this->ntasks = soln_->getActualGlobalNumberOfParts();
2325  if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
2326  this->ntasks = soln_->getTargetGlobalNumberOfParts();
2327  }
2328  this->solution_parts = soln_->getPartListView();
2329 
2330  //we need to calculate the center of parts.
2331  tcoord_t **partCenters = NULL;
2332  partCenters = allocMemory<tcoord_t *>(coordDim);
2333  for (int i = 0; i < coordDim; ++i) {
2334  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2335  }
2336 
2337  typedef typename Adapter::scalar_t t_scalar_t;
2338 
2339 
2340  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2341 
2342  //get centers for the parts.
2343  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2344  envConst.getRawPtr(),
2345  ia_comm.getRawPtr(),
2346  coordinateModel_.getRawPtr(),
2347  this->solution_parts,
2348 // soln_->getPartListView();
2349 // this->soln.getRawPtr(),
2350  coordDim,
2351  ntasks,
2352  partCenters);
2353 
2354  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2355 
2356  //create the part graph
2357  if (graph_model_.getRawPtr() != NULL) {
2358  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2359  envConst.getRawPtr(),
2360  ia_comm.getRawPtr(),
2361  graph_model_.getRawPtr(),
2362  this->ntasks,
2363  this->solution_parts,
2364 // soln_->getPartListView(),
2365 // this->soln.getRawPtr(),
2369  );
2370  }
2371 
2372  //create coordinate communication model.
2373  this->proc_task_comm =
2374  new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t,
2375  part_t>(
2376  procDim,
2377  procCoordinates,
2378  coordDim,
2379  partCenters,
2380  this->nprocs,
2381  this->ntasks,
2382  machine_extent,
2383  machine_extent_wrap_around,
2384  machine_.getRawPtr());
2385 
2386  int myRank = comm_->getRank();
2387  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node ;
2388  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2389 
2390  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2391  this->doMapping(myRank, comm_);
2392  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2393 
2394  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2395 
2396 /*
2397  soln_->getCommunicationGraph(task_communication_xadj,
2398  task_communication_adj);
2399 */
2400 
2401  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2402  #ifdef gnuPlot1
2403  if (comm_->getRank() == 0) {
2404 
2405  part_t taskCommCount = task_communication_xadj.size();
2406  std::cout << " TotalComm:"
2407  << task_communication_xadj[taskCommCount] << std::endl;
2408  part_t maxN = task_communication_xadj[0];
2409  for (part_t i = 1; i <= taskCommCount; ++i) {
2410  part_t nc =
2412  if (maxN < nc)
2413  maxN = nc;
2414  }
2415  std::cout << " maxNeighbor:" << maxN << std::endl;
2416  }
2417 
2418  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2419  #endif
2420 
2421  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2422 
2423  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2424  task_communication_adj.getRawPtr()) {
2425  this->proc_task_comm->calculateCommunicationCost(
2426  task_to_proc.getRawPtr(),
2427  task_communication_xadj.getRawPtr(),
2428  task_communication_adj.getRawPtr(),
2429  task_communication_edge_weight.getRawPtr()
2430  );
2431  }
2432 
2433 // std::cout << "me: " << comm_->getRank() << " cost:"
2434 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2435 
2436  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2437 
2438  //processors are divided into groups of size procDim! * coordDim!
2439  //each processor in the group obtains a mapping with a different
2440  //rotation and best one is broadcasted all processors.
2441  this->getBestMapping();
2443  coordinateModel_->getLocalNumCoordinates(),
2444  this->solution_parts,
2445  this->task_to_proc);
2446 /*
2447  {
2448  if (task_communication_xadj.getRawPtr() &&
2449  task_communication_adj.getRawPtr())
2450  this->proc_task_comm->calculateCommunicationCost(
2451  task_to_proc.getRawPtr(),
2452  task_communication_xadj.getRawPtr(),
2453  task_communication_adj.getRawPtr(),
2454  task_communication_edge_weight.getRawPtr()
2455  );
2456  std::cout << "me: " << comm_->getRank() << " cost:"
2457  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2458  }
2459 */
2460 
2461 
2462  #ifdef gnuPlot
2463  this->writeMapping2(comm_->getRank());
2464  #endif
2465 
2466  delete [] machine_extent_wrap_around;
2467 
2468  if (machine_->getMachineExtent(machine_extent) &&
2469  haveWrapArounds) {
2470  for (int i = 0; i < procDim; ++i) {
2471  delete [] procCoordinates[i];
2472  }
2473  delete [] procCoordinates;
2474  }
2475 
2476  for (int i = 0; i < coordDim; ++i) {
2477  freeArray<tcoord_t>(partCenters[i]);
2478  }
2479  freeArray<tcoord_t *>(partCenters);
2480 
2481  }
2482 
2483 
2501  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2502  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> >
2503  machine_,
2504  const Teuchos::RCP <const Adapter> input_adapter_,
2505  const part_t num_parts_,
2506  const part_t *result_parts,
2507  const Teuchos::RCP <const Environment> envConst,
2508  bool is_input_adapter_distributed = true,
2509  int num_ranks_per_node = 1,
2510  bool divide_to_prime_first = false,
2511  bool reduce_best_mapping = true):
2512  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2513  num_parts_, result_parts, envConst),
2514  proc_to_task_xadj(0),
2515  proc_to_task_adj(0),
2516  task_to_proc(0),
2517  isOwnerofModel(true),
2518  proc_task_comm(0),
2522 
2523  using namespace Teuchos;
2524  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2525 
2526  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2527  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2528 
2529  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2530  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2531  if (!is_input_adapter_distributed) {
2532  ia_comm = Teuchos::createSerialComm<int>();
2533  }
2534  RCP<const Environment> envConst_ = envConst;
2535 
2536  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2537  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2538  input_adapter_.getRawPtr()), false));
2539 
2540  modelFlag_t coordFlags_, graphFlags_;
2541 
2542  //create coordinate model
2543  //since this is coordinate task mapper,
2544  //the adapter has to have the coordinates
2545  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2546  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2547 
2548  //if the adapter has also graph model, we will use graph model
2549  //to calculate the cost mapping.
2550  BaseAdapterType inputType_ = input_adapter_->adapterType();
2551  if (inputType_ == MatrixAdapterType ||
2552  inputType_ == GraphAdapterType ||
2553  inputType_ == MeshAdapterType)
2554  {
2555  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2556  baseInputAdapter_, envConst_, ia_comm,
2557  graphFlags_));
2558  }
2559 
2560  if (!machine_->hasMachineCoordinates()) {
2561  throw std::runtime_error("Existing machine does not provide "
2562  "coordinates for coordinate task mapping.");
2563  }
2564 
2565  //if mapping type is 0 then it is coordinate mapping
2566  int procDim = machine_->getMachineDim();
2567  this->nprocs = machine_->getNumRanks();
2568 
2569  //get processor coordinates.
2570  pcoord_t **procCoordinates = NULL;
2571  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2572  throw std::runtime_error("Existing machine does not implement "
2573  "getAllMachineCoordinatesView");
2574  }
2575 
2576  //get the machine extent.
2577  //if we have machine extent,
2578  //if the machine has wrap-around links, we would like to shift the
2579  //coordinates,
2580  //so that the largest hap would be the wrap-around.
2581  std::vector<int> machine_extent_vec(procDim);
2582 // std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2583  int *machine_extent = &(machine_extent_vec[0]);
2584  bool *machine_extent_wrap_around = new bool[procDim];
2585  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2586 
2587  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2588  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2589  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2590  // MD: Yes, I ADDED BELOW:
2591  if (machine_->getMachineExtent(machine_extent) &&
2592  haveWrapArounds) {
2593  procCoordinates =
2595  procDim,
2596  machine_extent,
2597  machine_extent_wrap_around,
2598  this->nprocs,
2599  procCoordinates);
2600  }
2601 
2602  //get the tasks information, such as coordinate dimension,
2603  //number of parts.
2604  int coordDim = coordinateModel_->getCoordinateDim();
2605 
2606 // int coordDim = machine_->getMachineDim();
2607 
2608 
2609  this->ntasks = num_parts_;
2610  this->solution_parts = result_parts;
2611 
2612  //we need to calculate the center of parts.
2613  tcoord_t **partCenters = NULL;
2614  partCenters = allocMemory<tcoord_t *>(coordDim);
2615  for (int i = 0; i < coordDim; ++i) {
2616  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2617  }
2618 
2619  typedef typename Adapter::scalar_t t_scalar_t;
2620 
2621 
2622  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2623 
2624  //get centers for the parts.
2625  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2626  envConst.getRawPtr(),
2627  ia_comm.getRawPtr(),
2628  coordinateModel_.getRawPtr(),
2629  this->solution_parts,
2630 // soln_->getPartListView();
2631 // this->soln.getRawPtr(),
2632  coordDim,
2633  ntasks,
2634  partCenters);
2635 
2636  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2637 
2638  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2639  //create the part graph
2640  if (graph_model_.getRawPtr() != NULL) {
2641  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2642  envConst.getRawPtr(),
2643  ia_comm.getRawPtr(),
2644  graph_model_.getRawPtr(),
2645  this->ntasks,
2646  this->solution_parts,
2647 // soln_->getPartListView(),
2648 // this->soln.getRawPtr(),
2652  );
2653  }
2654  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2655 
2656  envConst->timerStart(MACRO_TIMERS,
2657  "CoordinateCommunicationModel Create");
2658  //create coordinate communication model.
2659  this->proc_task_comm =
2660  new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t,
2661  part_t>(
2662  procDim,
2663  procCoordinates,
2664  coordDim,
2665  partCenters,
2666  this->nprocs,
2667  this->ntasks,
2668  machine_extent,
2669  machine_extent_wrap_around,
2670  machine_.getRawPtr());
2671 
2672  envConst->timerStop(MACRO_TIMERS,
2673  "CoordinateCommunicationModel Create");
2674 
2675 
2676  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2677  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2678 
2679  int myRank = comm_->getRank();
2680 
2681 
2682  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2683  this->doMapping(myRank, comm_);
2684  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2685 
2686 
2687  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2688 
2689 /*
2690  soln_->getCommunicationGraph(task_communication_xadj,
2691  task_communication_adj);
2692 */
2693 
2694  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2695  #ifdef gnuPlot1
2696  if (comm_->getRank() == 0) {
2697 
2698  part_t taskCommCount = task_communication_xadj.size();
2699  std::cout << " TotalComm:"
2700  << task_communication_xadj[taskCommCount] << std::endl;
2701  part_t maxN = task_communication_xadj[0];
2702  for (part_t i = 1; i <= taskCommCount; ++i) {
2703  part_t nc =
2705  if (maxN < nc)
2706  maxN = nc;
2707  }
2708  std::cout << " maxNeighbor:" << maxN << std::endl;
2709  }
2710 
2711  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2712  #endif
2713 
2714  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2715 
2716  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2717  task_communication_adj.getRawPtr()) {
2718  this->proc_task_comm->calculateCommunicationCost(
2719  task_to_proc.getRawPtr(),
2720  task_communication_xadj.getRawPtr(),
2721  task_communication_adj.getRawPtr(),
2722  task_communication_edge_weight.getRawPtr()
2723  );
2724  }
2725 
2726 // std::cout << "me: " << comm_->getRank() << " cost:"
2727 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2728 
2729  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2730 
2731  //processors are divided into groups of size procDim! * coordDim!
2732  //each processor in the group obtains a mapping with a different rotation
2733  //and best one is broadcasted all processors.
2734  this->getBestMapping();
2735 
2737  coordinateModel_->getLocalNumCoordinates(),
2738  this->solution_parts,
2739  this->task_to_proc);
2740 /*
2741  {
2742  if (task_communication_xadj.getRawPtr() &&
2743  task_communication_adj.getRawPtr())
2744  this->proc_task_comm->calculateCommunicationCost(
2745  task_to_proc.getRawPtr(),
2746  task_communication_xadj.getRawPtr(),
2747  task_communication_adj.getRawPtr(),
2748  task_communication_edge_weight.getRawPtr()
2749  );
2750  std::cout << "me: " << comm_->getRank() << " cost:"
2751  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2752  }
2753 */
2754 
2755 
2756 
2757  #ifdef gnuPlot
2758  this->writeMapping2(comm_->getRank());
2759  #endif
2760 
2761  delete [] machine_extent_wrap_around;
2762 
2763  if (machine_->getMachineExtent(machine_extent) &&
2764  haveWrapArounds) {
2765  for (int i = 0; i < procDim; ++i) {
2766  delete [] procCoordinates[i];
2767  }
2768  delete [] procCoordinates;
2769  }
2770 
2771  for (int i = 0; i < coordDim; ++i) {
2772  freeArray<tcoord_t>(partCenters[i]);
2773  }
2774 
2775  freeArray<tcoord_t *>(partCenters);
2776  }
2777 
2844  const Environment *env_const_,
2845  const Teuchos::Comm<int> *problemComm,
2846  int proc_dim,
2847  int num_processors,
2848  pcoord_t **machine_coords,
2849  int task_dim,
2850  part_t num_tasks,
2851  tcoord_t **task_coords,
2852  ArrayRCP<part_t>task_comm_xadj,
2853  ArrayRCP<part_t>task_comm_adj,
2854  pcoord_t *task_communication_edge_weight_,
2855  int recursion_depth,
2856  part_t *part_no_array,
2857  const part_t *machine_dimensions,
2858  int num_ranks_per_node = 1,
2859  bool divide_to_prime_first = false,
2860  bool reduce_best_mapping = true):
2861  PartitionMapping<Adapter>(
2862  Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2863  Teuchos::rcpFromRef<const Environment>(*env_const_)),
2864  proc_to_task_xadj(0),
2865  proc_to_task_adj(0),
2866  task_to_proc(0),
2867  isOwnerofModel(true),
2868  proc_task_comm(0),
2869  task_communication_xadj(task_comm_xadj),
2870  task_communication_adj(task_comm_adj) {
2871 
2872  //if mapping type is 0 then it is coordinate mapping
2873  pcoord_t ** virtual_machine_coordinates = machine_coords;
2874  bool *wrap_arounds = new bool [proc_dim];
2875  for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2876 
2877  if (machine_dimensions) {
2878  virtual_machine_coordinates =
2880  proc_dim,
2881  machine_dimensions,
2882  wrap_arounds,
2883  num_processors,
2884  machine_coords);
2885  }
2886 
2887  this->nprocs = num_processors;
2888 
2889  int coordDim = task_dim;
2890  this->ntasks = num_tasks;
2891 
2892  //alloc memory for part centers.
2893  tcoord_t **partCenters = task_coords;
2894 
2895  //create coordinate communication model.
2896  this->proc_task_comm =
2898  proc_dim,
2899  virtual_machine_coordinates,
2900  coordDim,
2901  partCenters,
2902  this->nprocs,
2903  this->ntasks, NULL, NULL
2904  );
2905 
2906  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2907  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2908 
2909  this->proc_task_comm->setPartArraySize(recursion_depth);
2910  this->proc_task_comm->setPartArray(part_no_array);
2911 
2912  int myRank = problemComm->getRank();
2913 
2914  this->doMapping(myRank, this->comm);
2915 #ifdef gnuPlot
2916  this->writeMapping2(myRank);
2917 #endif
2918 
2919  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2920  task_communication_adj.getRawPtr()) {
2921  this->proc_task_comm->calculateCommunicationCost(
2922  task_to_proc.getRawPtr(),
2923  task_communication_xadj.getRawPtr(),
2924  task_communication_adj.getRawPtr(),
2925  task_communication_edge_weight_
2926  );
2927 
2928 
2929  this->getBestMapping();
2930 
2931 /*
2932  if (myRank == 0) {
2933  this->proc_task_comm->calculateCommunicationCost(
2934  task_to_proc.getRawPtr(),
2935  task_communication_xadj.getRawPtr(),
2936  task_communication_adj.getRawPtr(),
2937  task_communication_edge_weight_
2938  );
2939  cout << "me: " << problemComm->getRank() << " cost:"
2940  << this->proc_task_comm->getCommunicationCostMetric() << endl;
2941  }
2942 */
2943 
2944  }
2945  delete [] wrap_arounds;
2946 
2947  if (machine_dimensions) {
2948  for (int i = 0; i < proc_dim; ++i) {
2949  delete [] virtual_machine_coordinates[i];
2950  }
2951  delete [] virtual_machine_coordinates;
2952  }
2953 #ifdef gnuPlot
2954  if (problemComm->getRank() == 0)
2955  this->writeMapping2(-1);
2956 #endif
2957  }
2958 
2959 
2960  /*
2961  double getCommunicationCostMetric() {
2962  return this->proc_task_comm->getCommCost();
2963  }
2964  */
2965 
2968  virtual size_t getLocalNumberOfParts() const{
2969  return 0;
2970  }
2971 
2983  int machine_dim,
2984  const part_t *machine_dimensions,
2985  bool *machine_extent_wrap_around,
2986  part_t numProcs,
2987  pcoord_t **mCoords) {
2988 
2989  pcoord_t **result_machine_coords = NULL;
2990  result_machine_coords = new pcoord_t*[machine_dim];
2991 
2992  for (int i = 0; i < machine_dim; ++i) {
2993  result_machine_coords[i] = new pcoord_t [numProcs];
2994  }
2995 
2996  for (int i = 0; i < machine_dim; ++i) {
2997  part_t numMachinesAlongDim = machine_dimensions[i];
2998 
2999  part_t *machineCounts = new part_t[numMachinesAlongDim];
3000  memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);
3001 
3002  int *filledCoordinates = new int[numMachinesAlongDim];
3003 
3004  pcoord_t *coords = mCoords[i];
3005 
3006  for (part_t j = 0; j < numProcs; ++j) {
3007  part_t mc = (part_t) coords[j];
3008  ++machineCounts[mc];
3009  }
3010 
3011  part_t filledCoordinateCount = 0;
3012  for (part_t j = 0; j < numMachinesAlongDim; ++j) {
3013  if (machineCounts[j] > 0) {
3014  filledCoordinates[filledCoordinateCount++] = j;
3015  }
3016  }
3017 
3018  part_t firstProcCoord = filledCoordinates[0];
3019  part_t firstProcCount = machineCounts[firstProcCoord];
3020 
3021  part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3022  part_t lastProcCount = machineCounts[lastProcCoord];
3023 
3024  part_t firstLastGap =
3025  numMachinesAlongDim - lastProcCoord + firstProcCoord;
3026  part_t firstLastGapProc = lastProcCount + firstProcCount;
3027 
3028  part_t leftSideProcCoord = firstProcCoord;
3029  part_t leftSideProcCount = firstProcCount;
3030  part_t biggestGap = 0;
3031  part_t biggestGapProc = numProcs;
3032 
3033  part_t shiftBorderCoordinate = -1;
3034  for (part_t j = 1; j < filledCoordinateCount; ++j) {
3035  part_t rightSideProcCoord= filledCoordinates[j];
3036  part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3037 
3038  part_t gap = rightSideProcCoord - leftSideProcCoord;
3039  part_t gapProc = rightSideProcCount + leftSideProcCount;
3040 
3041  // Pick the largest gap in this dimension. Use fewer process on
3042  // either side of the largest gap to break the tie. An easy
3043  // addition to this would be to weight the gap by the number of
3044  // processes.
3045  if (gap > biggestGap ||
3046  (gap == biggestGap && biggestGapProc > gapProc)) {
3047  shiftBorderCoordinate = rightSideProcCoord;
3048  biggestGapProc = gapProc;
3049  biggestGap = gap;
3050  }
3051  leftSideProcCoord = rightSideProcCoord;
3052  leftSideProcCount = rightSideProcCount;
3053  }
3054 
3055 
3056  if (!(biggestGap > firstLastGap ||
3057  (biggestGap == firstLastGap &&
3058  biggestGapProc < firstLastGapProc))) {
3059  shiftBorderCoordinate = -1;
3060  }
3061 
3062  for (part_t j = 0; j < numProcs; ++j) {
3063 
3064  if (machine_extent_wrap_around[i] &&
3065  coords[j] < shiftBorderCoordinate) {
3066  result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3067 
3068  }
3069  else {
3070  result_machine_coords[i][j] = coords[j];
3071  }
3072  }
3073  delete [] machineCounts;
3074  delete [] filledCoordinates;
3075  }
3076 
3077  return result_machine_coords;
3078 
3079  }
3080 
3088  virtual void getProcsForPart(part_t taskId, part_t &numProcs,
3089  part_t *&procs) const {
3090  numProcs = 1;
3091  procs = this->task_to_proc.getRawPtr() + taskId;
3092  }
3093 
3099  return this->task_to_proc[taskId];
3100  }
3101 
3109  virtual void getPartsForProc(int procId, part_t &numParts,
3110  part_t *&parts) const {
3111 
3112  part_t task_begin = this->proc_to_task_xadj[procId];
3113  part_t taskend = this->proc_to_task_xadj[procId + 1];
3114 
3115  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3116  numParts = taskend - task_begin;
3117  }
3118 
3119  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
3120  part_t task_begin = this->proc_to_task_xadj[procId];
3121  part_t taskend = this->proc_to_task_xadj[procId + 1];
3122 
3123 /*
3124  std::cout << "part_t:" << procId << " taskCount:"
3125  << taskend - task_begin << std::endl;
3126 
3127  for (part_t i = task_begin; i < taskend; ++i) {
3128  std::cout << "part_t:" << procId << " task:"
3129  << proc_to_task_adj[i] << endl;
3130  }
3131 */
3132  if (taskend - task_begin > 0) {
3133  ArrayView <part_t> assignedParts(
3134  this->proc_to_task_adj.getRawPtr() + task_begin,
3135  taskend - task_begin);
3136 
3137  return assignedParts;
3138  }
3139  else {
3140  ArrayView <part_t> assignedParts;
3141 
3142  return assignedParts;
3143  }
3144  }
3145 
3146 };
3147 
3226 template <typename part_t, typename pcoord_t, typename tcoord_t>
3228  RCP<const Teuchos::Comm<int> > problemComm,
3229  int proc_dim,
3230  int num_processors,
3231  pcoord_t **machine_coords,
3232  int task_dim,
3233  part_t num_tasks,
3234  tcoord_t **task_coords,
3235  part_t *task_comm_xadj,
3236  part_t *task_comm_adj,
3237  // float-like, same size with task_communication_adj_ weight of the
3238  // corresponding edge.
3239  pcoord_t *task_communication_edge_weight_,
3240  part_t *proc_to_task_xadj, /*output*/
3241  part_t *proc_to_task_adj, /*output*/
3242  int recursion_depth,
3243  part_t *part_no_array,
3244  const part_t *machine_dimensions,
3245  int num_ranks_per_node = 1,
3246  bool divide_to_prime_first = false) {
3247 
3248  const Environment *envConst_ = new Environment(problemComm);
3249 
3250  // mfh 03 Mar 2015: It's OK to omit the Node template
3251  // parameter in Tpetra, if you're just going to use the
3252  // default Node.
3253  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
3254 
3255  Teuchos::ArrayRCP<part_t> task_communication_xadj(
3256  task_comm_xadj, 0, num_tasks + 1, false);
3257 
3258  Teuchos::ArrayRCP<part_t> task_communication_adj;
3259  if (task_comm_xadj) {
3260  Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3261  task_comm_adj, 0, task_comm_xadj[num_tasks], false);
3262  task_communication_adj = tmp_task_communication_adj;
3263  }
3264 
3265 
3268  part_t>(
3269  envConst_,
3270  problemComm.getRawPtr(),
3271  proc_dim,
3272  num_processors,
3273  machine_coords,
3274 // machine_coords_,
3275 
3276  task_dim,
3277  num_tasks,
3278  task_coords,
3279 
3282  task_communication_edge_weight_,
3283  recursion_depth,
3284  part_no_array,
3285  machine_dimensions,
3286  num_ranks_per_node,
3287  divide_to_prime_first);
3288 
3289 
3290  part_t* proc_to_task_xadj_;
3291  part_t* proc_to_task_adj_;
3292 
3293  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3294 
3295  for (part_t i = 0; i <= num_processors; ++i) {
3296  proc_to_task_xadj[i] = proc_to_task_xadj_[i];
3297  }
3298 
3299  for (part_t i = 0; i < num_tasks; ++i) {
3300  proc_to_task_adj[i] = proc_to_task_adj_[i];
3301  }
3302 
3303  delete ctm;
3304  delete envConst_;
3305 }
3306 
3307 template <typename proc_coord_t, typename v_lno_t>
3308 inline void visualize_mapping(int myRank,
3309  const int machine_coord_dim,
3310  const int num_ranks,
3311  proc_coord_t **machine_coords,
3312  const v_lno_t num_tasks,
3313  const v_lno_t *task_communication_xadj,
3314  const v_lno_t *task_communication_adj,
3315  const int *task_to_rank) {
3316 
3317  std::string rankStr = Teuchos::toString<int>(myRank);
3318  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
3319  std::string outF = gnuPlots + rankStr+ extentionS;
3320  std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
3321 
3322  if (machine_coord_dim != 3) {
3323  std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
3324  return;
3325  }
3326  std::string ss = "";
3327  std::string procs = "";
3328 
3329  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3330 
3331  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3332  int origin_rank = task_to_rank[origin_task];
3333  std::string gnuPlotArrow = "set arrow from ";
3334 
3335  for (int j = 0; j < machine_coord_dim; ++j) {
3336  if (j == machine_coord_dim - 1) {
3337  gnuPlotArrow +=
3338  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3339  procs +=
3340  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3341 
3342  }
3343  else {
3344  gnuPlotArrow +=
3345  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3346  + ",";
3347  procs +=
3348  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3349  + " ";
3350  }
3351  }
3352  procs += "\n";
3353 
3354  gnuPlotArrow += " to ";
3355 
3356 
3357  for (int nind = task_communication_xadj[origin_task];
3358  nind < task_communication_xadj[origin_task + 1]; ++nind) {
3359 
3360  int neighbor_task = task_communication_adj[nind];
3361 
3362  bool differentnode = false;
3363  int neighbor_rank = task_to_rank[neighbor_task];
3364 
3365  for (int j = 0; j < machine_coord_dim; ++j) {
3366  if (int(machine_coords[j][origin_rank]) !=
3367  int(machine_coords[j][neighbor_rank])) {
3368  differentnode = true; break;
3369  }
3370  }
3371 
3372  std::tuple<int,int,int, int, int, int> foo(
3373  (int)(machine_coords[0][origin_rank]),
3374  (int)(machine_coords[1][origin_rank]),
3375  (int)(machine_coords[2][origin_rank]),
3376  (int)(machine_coords[0][neighbor_rank]),
3377  (int)(machine_coords[1][neighbor_rank]),
3378  (int)(machine_coords[2][neighbor_rank]));
3379 
3380  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3381  my_arrows.insert(foo);
3382 
3383  std::string gnuPlotArrow2 = "";
3384  for (int j = 0; j < machine_coord_dim; ++j) {
3385  if (j == machine_coord_dim - 1) {
3386  gnuPlotArrow2 +=
3387  Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3388  }
3389  else {
3390  gnuPlotArrow2 +=
3391  Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3392  + ",";
3393  }
3394  }
3395  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3396  }
3397  }
3398  }
3399 
3400  std::ofstream procFile("procPlot.plot", std::ofstream::out);
3401  procFile << procs << "\n";
3402  procFile.close();
3403 
3404  //gnuPlotCode << ss;
3405  if (machine_coord_dim == 2) {
3406  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3407  }
3408  else {
3409  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3410  }
3411 
3412  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
3413  gnuPlotCode.close();
3414 }
3415 
3416 } // namespace Zoltan2
3417 
3418 #endif
MINOF
#define MINOF(a, b)
Definition: Zoltan2_TaskMapping.hpp:954
ZOLTAN2_ALGMULTIJAGGED_SWAP
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
Definition: Zoltan2_AlgMultiJagged.hpp:107
Zoltan2::CoordinateTaskMapper::task_communication_adj
ArrayRCP< part_t > task_communication_adj
Definition: Zoltan2_TaskMapping.hpp:1768
Zoltan2::Zoltan2_Directory_Simple
Definition: Zoltan2_Directory.hpp:313
Zoltan2::KMeansCluster::setParams
void setParams(int dimension_, int heapsize)
Definition: Zoltan2_TaskMapping.hpp:747
Zoltan2::MeshAdapterType
@ MeshAdapterType
mesh data
Definition: Zoltan2_Adapter.hpp:69
Zoltan2::CommunicationModel::commCost
double commCost
Definition: Zoltan2_TaskMapping.hpp:994
Zoltan2::KmeansHeap::getTotalDistance
WT getTotalDistance()
Definition: Zoltan2_TaskMapping.hpp:687
Zoltan2::CoordinateModel
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
Definition: Zoltan2_CoordinateModel.hpp:72
Zoltan2::CoordinateCommunicationModel
CoordinateModelInput Class that performs mapping between the coordinate partitioning result and mpi r...
Definition: Zoltan2_TaskMapping.hpp:1087
Zoltan2::CoordinateModel::getCoordinates
size_t getCoordinates(ArrayView< const gno_t > &Ids, ArrayView< input_t > &xyz, ArrayView< input_t > &wgts) const
Returns the coordinate ids, values and optional weights.
Definition: Zoltan2_CoordinateModel.hpp:202
Zoltan2_PartitionMapping.hpp
Zoltan2::CoordinateTaskMapper::task_communication_xadj
ArrayRCP< part_t > task_communication_xadj
Definition: Zoltan2_TaskMapping.hpp:1767
Zoltan2::Algorithm::getPartBoxesView
virtual std::vector< coordinateModelPartBox > & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
Definition: Zoltan2_Algorithm.hpp:154
Zoltan2::CoordinateTaskMapper::writeMapping2
void writeMapping2(int myRank)
Definition: Zoltan2_TaskMapping.hpp:1967
Zoltan2::CoordinateCommunicationModel::getClosestSubset
void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
Definition: Zoltan2_TaskMapping.hpp:1178
Zoltan2::coordinateTaskMapperInterface
void coordinateTaskMapperInterface(RCP< const Teuchos::Comm< int > > problemComm, int proc_dim, int num_processors, pcoord_t **machine_coords, int task_dim, part_t num_tasks, tcoord_t **task_coords, part_t *task_comm_xadj, part_t *task_comm_adj, pcoord_t *task_communication_edge_weight_, part_t *proc_to_task_xadj, part_t *proc_to_task_adj, int recursion_depth, part_t *part_no_array, const part_t *machine_dimensions, int num_ranks_per_node=1, bool divide_to_prime_first=false)
Constructor The interface function that calls CoordinateTaskMapper which will also perform the mappin...
Definition: Zoltan2_TaskMapping.hpp:3227
Zoltan2::PartitioningSolution
A PartitioningSolution is a solution to a partitioning problem.
Definition: Zoltan2_PartitioningSolution.hpp:92
Zoltan2::CoordinateTaskMapper::map
virtual void map(const RCP< MappingSolution< Adapter > > &mappingsoln)
Mapping method.
Definition: Zoltan2_TaskMapping.hpp:2163
Zoltan2::PartitionMapping::env
const Teuchos::RCP< const Environment > env
Definition: Zoltan2_PartitionMapping.hpp:82
Zoltan2::PartitioningSolution::getPartListView
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
Definition: Zoltan2_PartitioningSolution.hpp:372
Zoltan2::z2Fact
it z2Fact(it x)
Definition: Zoltan2_TaskMapping.hpp:76
Zoltan2::CoordinateTaskMapper::getProcTask
void getProcTask(part_t *&proc_to_task_xadj_, part_t *&proc_to_task_adj_)
Definition: Zoltan2_TaskMapping.hpp:2157
Zoltan2_TestingFramework::base_adapter_t
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Definition: Zoltan2_Typedefs.hpp:168
Zoltan2::CoordinateCommunicationModel::~CoordinateCommunicationModel
virtual ~CoordinateCommunicationModel()
Definition: Zoltan2_TaskMapping.hpp:1125
Zoltan2::MACRO_TIMERS
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
Definition: Zoltan2_Parameters.hpp:120
Zoltan2::MachineRepresentation::getHopCount
bool getHopCount(int rank1, int rank2, pcoord_t &hops) const
return the hop count between rank1 and rank2
Definition: Zoltan2_MachineRepresentation.hpp:162
Zoltan2::GraphModel::getVertexList
size_t getVertexList(ArrayView< const gno_t > &Ids, ArrayView< input_t > &wgts) const
Sets pointers to this process' vertex Ids and their weights.
Definition: Zoltan2_GraphModel.hpp:177
ZOLTAN2_ABS
#define ZOLTAN2_ABS(x)
Definition: Zoltan2_AlgMultiJagged.hpp:99
Zoltan2::CoordinateCommunicationModel::getProcDistance
virtual double getProcDistance(int procId1, int procId2) const
Definition: Zoltan2_TaskMapping.hpp:1233
Zoltan2::CoordinateCommunicationModel::divide_to_prime_first
bool divide_to_prime_first
Definition: Zoltan2_TaskMapping.hpp:1108
Zoltan2::CoordinateCommunicationModel::machine
const MachineRepresentation< pcoord_t, part_t > * machine
Definition: Zoltan2_TaskMapping.hpp:1105
Zoltan2::CoordinateTaskMapper::shiftMachineCoordinates
pcoord_t ** shiftMachineCoordinates(int machine_dim, const part_t *machine_dimensions, bool *machine_extent_wrap_around, part_t numProcs, pcoord_t **mCoords)
Using the machine dimensions provided, create virtual machine coordinates by assigning the largest ga...
Definition: Zoltan2_TaskMapping.hpp:2982
Zoltan2::CommunicationModel::~CommunicationModel
virtual ~CommunicationModel()
Definition: Zoltan2_TaskMapping.hpp:1009
Zoltan2_MappingSolution.hpp
Defines the MappingSolution class.
Zoltan2::PartitionMapping
PartitionMapping maps a solution or an input distribution to ranks.
Definition: Zoltan2_PartitionMapping.hpp:66
Zoltan2::CoordinateCommunicationModel::CoordinateCommunicationModel
CoordinateCommunicationModel()
Definition: Zoltan2_TaskMapping.hpp:1111
Zoltan2::CommunicationModel::CommunicationModel
CommunicationModel(part_t no_procs_, part_t no_tasks_)
Definition: Zoltan2_TaskMapping.hpp:1004
Zoltan2::Environment::timerStop
void timerStop(TimerType tt, const char *timerName) const
Stop a named timer.
Definition: Zoltan2_Environment.hpp:469
Zoltan2::CommunicationModel::CommunicationModel
CommunicationModel()
Definition: Zoltan2_TaskMapping.hpp:1003
Zoltan2::PartitioningSolution::getActualGlobalNumberOfParts
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
Definition: Zoltan2_PartitioningSolution.hpp:169
Zoltan2::CoordinateCommunicationModel::setPartArray
void setPartArray(part_t *pNo)
Definition: Zoltan2_TaskMapping.hpp:1165
Zoltan2::CommunicationModel::getProcDistance
virtual double getProcDistance(int procId1, int procId2) const =0
Zoltan2::KmeansHeap::addPoint
void addPoint(IT index, WT distance)
Definition: Zoltan2_TaskMapping.hpp:626
Zoltan2::KMeansAlgorithm::~KMeansAlgorithm
~KMeansAlgorithm()
Definition: Zoltan2_TaskMapping.hpp:806
part_t
SparseMatrixAdapter_t::part_t part_t
Definition: partitioningTree.cpp:74
Zoltan2::KmeansHeap::push_down
void push_down(IT index_on_heap)
Definition: Zoltan2_TaskMapping.hpp:643
Zoltan2::CoordinateTaskMapper::getProcsForPart
virtual void getProcsForPart(part_t taskId, part_t &numProcs, part_t *&procs) const
getAssignedProcForTask function, returns the assigned tasks with the number of tasks.
Definition: Zoltan2_TaskMapping.hpp:3088
Zoltan2::Zoltan2_Directory_Vector
Definition: Zoltan2_Directory.hpp:406
Zoltan2::CoordinateTaskMapper::create_subCommunicator
RCP< Comm< int > > create_subCommunicator()
creates and returns the subcommunicator for the processor group.
Definition: Zoltan2_TaskMapping.hpp:1802
Zoltan2::CoordinateCommunicationModel::getMapping
virtual void getMapping(int myRank, const RCP< const Environment > &env, ArrayRCP< part_t > &rcp_proc_to_task_xadj, ArrayRCP< part_t > &rcp_proc_to_task_adj, ArrayRCP< part_t > &rcp_task_to_proc, const Teuchos::RCP< const Teuchos::Comm< int > > comm_) const
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
Definition: Zoltan2_TaskMapping.hpp:1304
Zoltan2::GraphModel::getLocalNumVertices
size_t getLocalNumVertices() const
Returns the number vertices on this process.
Definition: Zoltan2_GraphModel.hpp:141
Zoltan2::PartitionMapping::solution_parts
const part_t * solution_parts
Definition: Zoltan2_PartitionMapping.hpp:84
Zoltan2::GraphModel::getLocalNumEdges
size_t getLocalNumEdges() const
Returns the number of edges on this process. In global or subset graphs, includes off-process edges.
Definition: Zoltan2_GraphModel.hpp:150
Zoltan2::Environment
The user parameters, debug, timing and memory profiling output objects, and error checking methods.
Definition: Zoltan2_Environment.hpp:83
Zoltan2::CoordinateTaskMapper::CoordinateTaskMapper
CoordinateTaskMapper(const Teuchos::RCP< const Teuchos::Comm< int > > comm_, const Teuchos::RCP< const MachineRepresentation< pcoord_t, part_t > > machine_, const Teuchos::RCP< const Adapter > input_adapter_, const part_t num_parts_, const part_t *result_parts, const Teuchos::RCP< const Environment > envConst, bool is_input_adapter_distributed=true, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor. Instead of Solution we have two parameters, numparts.
Definition: Zoltan2_TaskMapping.hpp:2500
Zoltan2::AlgMJ
Multi Jagged coordinate partitioning algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:533
Zoltan2::KMeansAlgorithm
KMeansAlgorithm Class that performs clustering of the coordinates, and returns the closest set of coo...
Definition: Zoltan2_TaskMapping.hpp:793
Zoltan2::CoordinateCommunicationModel::update_visit_order
void update_visit_order(part_t *visitOrder, part_t n, int &_u_umpa_seed, part_t rndm)
Definition: Zoltan2_TaskMapping.hpp:1256
Zoltan2::GraphModel::getNumWeightsPerEdge
int getNumWeightsPerEdge() const
Returns the number (0 or greater) of weights per edge.
Definition: Zoltan2_GraphModel.hpp:163
Zoltan2::KmeansHeap::~KmeansHeap
~KmeansHeap()
Definition: Zoltan2_TaskMapping.hpp:613
Zoltan2::KMeansAlgorithm::KMeansAlgorithm
KMeansAlgorithm(int dim_, IT numElements_, WT **elementCoords_, IT required_elements_)
KMeansAlgorithm Constructor.
Definition: Zoltan2_TaskMapping.hpp:814
Zoltan2::GNO_LNO_PAIR::part
part_t part
Definition: Zoltan2_TaskMapping.hpp:85
Zoltan2::CommunicationModel::calculateCommunicationCost
void calculateCommunicationCost(part_t *task_to_proc, part_t *task_communication_xadj, part_t *task_communication_adj, pcoord_t *task_communication_edge_weight)
Definition: Zoltan2_TaskMapping.hpp:1019
Zoltan2::KMeansCluster::getDistance
WT getDistance(IT index, WT **elementCoords)
Definition: Zoltan2_TaskMapping.hpp:763
epsilon
#define epsilon
Definition: partition2DMatrix.cpp:97
Zoltan2::CommunicationModel::no_tasks
part_t no_tasks
Definition: Zoltan2_TaskMapping.hpp:1000
Zoltan2::GNO_LNO_PAIR::gno
gno_t gno
Definition: Zoltan2_TaskMapping.hpp:84
Zoltan2::Algorithm::part_t
Adapter::part_t part_t
Definition: Zoltan2_Algorithm.hpp:85
Zoltan2_Standards.hpp
Gathering definitions used in software development.
Zoltan2::KMeansCluster::center
WT * center
Definition: Zoltan2_TaskMapping.hpp:741
Zoltan2::GNO_LNO_PAIR
Definition: Zoltan2_TaskMapping.hpp:81
Zoltan2::CommunicationModel::getNTasks
part_t getNTasks() const
Definition: Zoltan2_TaskMapping.hpp:1015
Zoltan2::KmeansHeap::copyCoordinates
void copyCoordinates(IT *permutation)
Definition: Zoltan2_TaskMapping.hpp:724
Zoltan2::CoordinateCommunicationModel::task_coord_dim
int task_coord_dim
Definition: Zoltan2_TaskMapping.hpp:1096
Zoltan2::MachineRepresentation
MachineRepresentation Class Base class for representing machine coordinates, networks,...
Definition: Zoltan2_MachineRepresentation.hpp:25
Zoltan2::CoordinateTaskMapper::doMapping
void doMapping(int myRank, const Teuchos::RCP< const Teuchos::Comm< int > > comm_)
doMapping function, calls getMapping function of communicationModel object.
Definition: Zoltan2_TaskMapping.hpp:1775
Zoltan2::visualize_mapping
void visualize_mapping(int myRank, const int machine_coord_dim, const int num_ranks, proc_coord_t **machine_coords, const v_lno_t num_tasks, const v_lno_t *task_communication_xadj, const v_lno_t *task_communication_adj, const int *task_to_rank)
Definition: Zoltan2_TaskMapping.hpp:3308
Zoltan2::getSolutionCenterCoordinates
void getSolutionCenterCoordinates(const Environment *envConst, const Teuchos::Comm< int > *comm, const Zoltan2::CoordinateModel< typename Adapter::base_adapter_t > *coords, const part_t *parts, int coordDim, part_t ntasks, scalar_t **partCenters)
Definition: Zoltan2_TaskMapping.hpp:159
Zoltan2::CoordinateTaskMapper::getBestMapping
void getBestMapping()
finds the lowest cost mapping and broadcasts solution to everyone.
Definition: Zoltan2_TaskMapping.hpp:1855
Zoltan2::ithPermutation
void ithPermutation(const IT n, IT i, IT *perm)
Definition: Zoltan2_TaskMapping.hpp:91
Zoltan2::CoordinateTaskMapper::getAssignedProcForTask
part_t getAssignedProcForTask(part_t taskId)
getAssignedProcForTask function, returns the assigned processor id for the given task
Definition: Zoltan2_TaskMapping.hpp:3098
Zoltan2::CommunicationModel::getNProcs
part_t getNProcs() const
Definition: Zoltan2_TaskMapping.hpp:1011
Zoltan2::CoordinateCommunicationModel::proc_coord_dim
int proc_coord_dim
Definition: Zoltan2_TaskMapping.hpp:1092
Zoltan2::CommunicationModel::getMapping
virtual void getMapping(int myRank, const RCP< const Environment > &env, ArrayRCP< part_t > &proc_to_task_xadj, ArrayRCP< part_t > &proc_to_task_adj, ArrayRCP< part_t > &task_to_proc, const Teuchos::RCP< const Teuchos::Comm< int > > comm_) const =0
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
Zoltan2::CoordinateCommunicationModel::proc_coords
pcoord_t ** proc_coords
Definition: Zoltan2_TaskMapping.hpp:1094
Zoltan2::CoordinateTaskMapper::create_local_task_to_rank
void create_local_task_to_rank(const lno_t num_local_coords, const part_t *local_coord_parts, const ArrayRCP< part_t > task_to_proc_)
Definition: Zoltan2_TaskMapping.hpp:2186
Zoltan2::CoordinateCommunicationModel::machine_extent_wrap_around
bool * machine_extent_wrap_around
Definition: Zoltan2_TaskMapping.hpp:1104
Zoltan2::KMeansCluster
KMeansCluster Class.
Definition: Zoltan2_TaskMapping.hpp:734
Zoltan2::KMeansCluster::getNewCenters
bool getNewCenters(WT **coords)
Definition: Zoltan2_TaskMapping.hpp:757
Zoltan2::CoordinateTaskMapper::proc_to_task_xadj
ArrayRCP< part_t > proc_to_task_xadj
Definition: Zoltan2_TaskMapping.hpp:1748
Zoltan2::CoordinateTaskMapper
Definition: Zoltan2_TaskMapping.hpp:1733
Teuchos::Zoltan2_ReduceBestMapping::reduce
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Definition: Zoltan2_TaskMapping.hpp:52
Zoltan2_MachineRepresentation.hpp
Zoltan2::PartitionMapping::comm
const Teuchos::RCP< const Teuchos::Comm< int > > comm
Definition: Zoltan2_PartitionMapping.hpp:78
Zoltan2::KmeansHeap::initValues
void initValues()
Definition: Zoltan2_TaskMapping.hpp:677
Zoltan2::CommunicationModel::no_procs
part_t no_procs
Definition: Zoltan2_TaskMapping.hpp:999
Zoltan2::CoordinateTaskMapper::local_task_to_rank
ArrayRCP< part_t > local_task_to_rank
Definition: Zoltan2_TaskMapping.hpp:1760
Zoltan2::BaseAdapterType
BaseAdapterType
An enum to identify general types of adapters.
Definition: Zoltan2_Adapter.hpp:63
Zoltan2::CoordinateTaskMapper::isOwnerofModel
bool isOwnerofModel
Definition: Zoltan2_TaskMapping.hpp:1763
Zoltan2::CoordinateTaskMapper::CoordinateTaskMapper
CoordinateTaskMapper(const Environment *env_const_, const Teuchos::Comm< int > *problemComm, int proc_dim, int num_processors, pcoord_t **machine_coords, int task_dim, part_t num_tasks, tcoord_t **task_coords, ArrayRCP< part_t >task_comm_xadj, ArrayRCP< part_t >task_comm_adj, pcoord_t *task_communication_edge_weight_, int recursion_depth, part_t *part_no_array, const part_t *machine_dimensions, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor The mapping constructor which will also perform the mapping operation....
Definition: Zoltan2_TaskMapping.hpp:2843
Zoltan2::KMeansCluster::~KMeansCluster
~KMeansCluster()
Definition: Zoltan2_TaskMapping.hpp:743
Zoltan2::CoordinateTaskMapper::proc_to_task_adj
ArrayRCP< part_t > proc_to_task_adj
Definition: Zoltan2_TaskMapping.hpp:1752
Zoltan2::KMeansAlgorithm::kmeans
void kmeans()
Definition: Zoltan2_TaskMapping.hpp:893
Zoltan2::CoordinateCommunicationModel::num_ranks_per_node
int num_ranks_per_node
Definition: Zoltan2_TaskMapping.hpp:1107
Zoltan2::AlgMJ::sequential_task_partitioning
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const mj_part_t *first_level_distribution_=NULL)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
Definition: Zoltan2_AlgMultiJagged.hpp:1460
Zoltan2::CoordinateCommunicationModel::umpa_uRandom
static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
Definition: Zoltan2_TaskMapping.hpp:1210
Teuchos::Zoltan2_ReduceBestMapping
Zoltan2_ReduceBestMapping Class, reduces the minimum cost mapping, ties breaks with minimum proc id.
Definition: Zoltan2_TaskMapping.hpp:40
Zoltan2::GraphModel::getEdgeList
size_t getEdgeList(ArrayView< const gno_t > &edgeIds, ArrayView< const offset_t > &offsets, ArrayView< input_t > &wgts) const
Sets pointers to this process' edge (neighbor) global Ids, including off-process edges.
Definition: Zoltan2_GraphModel.hpp:213
Zoltan2::KmeansHeap::setHeapsize
void setHeapsize(IT heapsize_)
Definition: Zoltan2_TaskMapping.hpp:618
Zoltan2::CoordinateTaskMapper::task_communication_edge_weight
ArrayRCP< scalar_t > task_communication_edge_weight
Definition: Zoltan2_TaskMapping.hpp:1769
Zoltan2::CoordinateTaskMapper::getAssignedTasksForProc
ArrayView< part_t > getAssignedTasksForProc(part_t procId)
Definition: Zoltan2_TaskMapping.hpp:3119
Zoltan2::GraphAdapterType
@ GraphAdapterType
graph data
Definition: Zoltan2_Adapter.hpp:68
Zoltan2::CommunicationModel
CommunicationModel Base Class that performs mapping between the coordinate partitioning result.
Definition: Zoltan2_TaskMapping.hpp:991
Zoltan2::CoordinateCommunicationModel::partNoArray
part_t * partNoArray
Definition: Zoltan2_TaskMapping.hpp:1101
Zoltan2::PartitioningSolution::getTargetGlobalNumberOfParts
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
Definition: Zoltan2_PartitioningSolution.hpp:165
Zoltan2::CoordinateTaskMapper::nprocs
part_t nprocs
Definition: Zoltan2_TaskMapping.hpp:1765
Zoltan2::CoordinateTaskMapper::~CoordinateTaskMapper
virtual ~CoordinateTaskMapper()
Definition: Zoltan2_TaskMapping.hpp:2177
Zoltan2::KMeansCluster::copyCoordinates
void copyCoordinates(IT *permutation)
Definition: Zoltan2_TaskMapping.hpp:781
Zoltan2::CoordinateCommunicationModel::task_coords
tcoord_t ** task_coords
Definition: Zoltan2_TaskMapping.hpp:1098
Zoltan2::KMeansCluster::getDistanceToCenter
WT getDistanceToCenter()
Definition: Zoltan2_TaskMapping.hpp:777
Zoltan2::MappingSolution
PartitionMapping maps a solution or an input distribution to ranks.
Definition: Zoltan2_MappingSolution.hpp:65
Zoltan2::KMeansAlgorithm::getMinDistanceCluster
void getMinDistanceCluster(IT *procPermutation)
Definition: Zoltan2_TaskMapping.hpp:926
Zoltan2::CoordinateModel::getLocalNumCoordinates
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
Definition: Zoltan2_CoordinateModel.hpp:170
Zoltan2::CoordinateTaskMapper::getPartsForProc
virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const
getAssignedProcForTask function, returns the assigned tasks with the number of tasks.
Definition: Zoltan2_TaskMapping.hpp:3109
Zoltan2
Definition: Zoltan2_AlgSerialGreedy.hpp:56
Zoltan2::CoordinateCommunicationModel::machine_extent
int * machine_extent
Definition: Zoltan2_TaskMapping.hpp:1103
Zoltan2_Directory_Impl.hpp
Zoltan2_TPLTraits.hpp
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
Zoltan2::MatrixAdapterType
@ MatrixAdapterType
matrix data
Definition: Zoltan2_Adapter.hpp:67
Zoltan2::CoordinateCommunicationModel::setPartArraySize
void setPartArraySize(int psize)
Definition: Zoltan2_TaskMapping.hpp:1161
Zoltan2::KmeansHeap::getNewCenters
bool getNewCenters(WT *center, WT **coords, int dimension)
Definition: Zoltan2_TaskMapping.hpp:701
Zoltan2::getCoarsenedPartGraph
void getCoarsenedPartGraph(const Environment *envConst, const Teuchos::Comm< int > *comm, const Zoltan2::GraphModel< typename Adapter::base_adapter_t > *graph, part_t np, const part_t *parts, ArrayRCP< part_t > &g_part_xadj, ArrayRCP< part_t > &g_part_adj, ArrayRCP< scalar_t > &g_part_ew)
Definition: Zoltan2_TaskMapping.hpp:248
Zoltan2::KMeansCluster::clearHeap
void clearHeap()
Definition: Zoltan2_TaskMapping.hpp:753
Zoltan2::Algorithm::scalar_t
Adapter::scalar_t scalar_t
Definition: Zoltan2_Algorithm.hpp:84
Zoltan2::Algorithm::lno_t
Adapter::lno_t lno_t
Definition: Zoltan2_Algorithm.hpp:82
Teuchos
Definition: Zoltan2_AlgMultiJagged.hpp:110
Zoltan2_GraphModel.hpp
Defines the GraphModel interface.
Zoltan2::CommunicationModel::getCommunicationCostMetric
double getCommunicationCostMetric()
Definition: Zoltan2_TaskMapping.hpp:1055
tMVector_t
Tpetra::MultiVector< double, int, int > tMVector_t
Definition: APFMeshInput.cpp:68
Teuchos::Zoltan2_ReduceBestMapping::Zoltan2_ReduceBestMapping
Zoltan2_ReduceBestMapping()
Default Constructor.
Definition: Zoltan2_TaskMapping.hpp:48
Zoltan2::CoordinateTaskMapper::CoordinateTaskMapper
CoordinateTaskMapper(const Teuchos::RCP< const Teuchos::Comm< int > > comm_, const Teuchos::RCP< const MachineRepresentation< pcoord_t, part_t > > machine_, const Teuchos::RCP< const Adapter > input_adapter_, const Teuchos::RCP< const Zoltan2::PartitioningSolution< Adapter > > soln_, const Teuchos::RCP< const Environment > envConst, bool is_input_adapter_distributed=true, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor. When this constructor is called, in order to calculate the communication metric,...
Definition: Zoltan2_TaskMapping.hpp:2212
Zoltan2::GraphModel
GraphModel defines the interface required for graph models.
Definition: Zoltan2_GraphModel.hpp:81
Zoltan2::CoordinateTaskMapper::ntasks
part_t ntasks
Definition: Zoltan2_TaskMapping.hpp:1766
Zoltan2::CoordinateTaskMapper::getLocalNumberOfParts
virtual size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
Definition: Zoltan2_TaskMapping.hpp:2968
Zoltan2::modelFlag_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
Definition: Zoltan2_Model.hpp:93
Zoltan2::CoordinateTaskMapper::task_to_proc
ArrayRCP< part_t > task_to_proc
Definition: Zoltan2_TaskMapping.hpp:1756
Zoltan2_XpetraMultiVectorAdapter.hpp
Defines the XpetraMultiVectorAdapter.
Zoltan2::fillContinousArray
void fillContinousArray(T *arr, size_t arrSize, T *val)
fillContinousArray function
Definition: Zoltan2_TaskMapping.hpp:964
Zoltan2_AlgMultiJagged.hpp
Contains the Multi-jagged algorthm.
Zoltan2::Environment::timerStart
void timerStart(TimerType tt, const char *timerName) const
Start a named timer.
Definition: Zoltan2_Environment.hpp:430
Zoltan2::getGridCommunicationGraph
void getGridCommunicationGraph(part_t taskCount, part_t *&task_comm_xadj, part_t *&task_comm_adj, std::vector< int > grid_dims)
Definition: Zoltan2_TaskMapping.hpp:119
Zoltan2::CoordinateTaskMapper::proc_task_comm
CoordinateCommunicationModel< pcoord_t, tcoord_t, part_t > * proc_task_comm
Definition: Zoltan2_TaskMapping.hpp:1764
Zoltan2::StridedData
The StridedData class manages lists of weights or coordinates.
Definition: Zoltan2_StridedData.hpp:76
Zoltan2::KmeansHeap
KmeansHeap Class, max heap, but holds the minimum values.
Definition: Zoltan2_TaskMapping.hpp:604
Zoltan2::CoordinateCommunicationModel::partArraySize
int partArraySize
Definition: Zoltan2_TaskMapping.hpp:1100
Zoltan2::CoordinateCommunicationModel::CoordinateCommunicationModel
CoordinateCommunicationModel(int pcoord_dim_, pcoord_t **pcoords_, int tcoord_dim_, tcoord_t **tcoords_, part_t no_procs_, part_t no_tasks_, int *machine_extent_, bool *machine_extent_wrap_around_, const MachineRepresentation< pcoord_t, part_t > *machine_=NULL)
Class Constructor:
Definition: Zoltan2_TaskMapping.hpp:1137
Zoltan2::CoordinateTaskMapper::writeMapping
void writeMapping()
Definition: Zoltan2_TaskMapping.hpp:1898