Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
53 #include "Tpetra_CrsGraph_decl.hpp"
55 #include "Kokkos_Core.hpp"
56 #include <memory>
57 #include <string>
58 
77 
78 namespace Tpetra {
79 
80 #ifndef DOXYGEN_SHOULD_SKIP_THIS
81 // Forward declaration of Distributor
82 class Distributor;
83 #endif // DOXYGEN_SHOULD_SKIP_THIS
84 
85 //
86 // Users must never rely on anything in the Details namespace.
87 //
88 namespace Details {
89 
90 namespace UnpackAndCombineCrsGraphImpl {
91 
101 template<class Packet, class GO, class Device, class BufferDevice>
102 KOKKOS_FUNCTION int
103 unpackRow(typename Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
104  typename Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
105  const Kokkos::View<const Packet*,BufferDevice>& imports,
106  const size_t offset,
107  const size_t num_ent)
108 {
109  typedef typename Kokkos::View<GO*,Device>::size_type size_type;
110 
111  if (num_ent == 0) {
112  // Empty rows always take zero bytes, to ensure sparsity.
113  return 0;
114  }
115 
116  // Unpack GIDs
117  for (size_type k=0; k<num_ent; k++)
118  gids_out(k) = imports(offset+k);
119 
120  // Unpack PIDs
121  if (pids_out.size() > 0) {
122  for (size_type k=0; k<num_ent; k++)
123  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
124  }
125 
126  return 0;
127 }
128 
139 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
141 
142  typedef Packet packet_type;
143  typedef LocalMap local_map_type;
144  typedef LocalGraph local_graph_type;
145  typedef BufferDevice buffer_device_type;
146 
147  typedef typename local_map_type::local_ordinal_type LO;
148  typedef typename local_map_type::global_ordinal_type GO;
149  // Kokkos::parallel_reduce fails to compile if named device_type and typedef
150  // is public
151  typedef typename local_map_type::device_type device_type;
152  typedef typename device_type::execution_space execution_space;
153 
154  typedef Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_type;
155  typedef Kokkos::View<const size_t*, device_type> offsets_type;
156  typedef Kokkos::View<const packet_type*, buffer_device_type> input_buffer_type;
157  typedef Kokkos::View<const LO*, device_type> import_lids_type;
158 
159  typedef Kokkos::View<LO*, device_type> lids_scratch_type;
160  typedef Kokkos::View<GO*, device_type> gids_scratch_type;
161  typedef Kokkos::View<int*,device_type> pids_scratch_type;
162 
163  static_assert(std::is_same<LO, typename local_graph_type::data_type>::value,
164  "LocalMap::local_ordinal_type and "
165  "LocalGraph::data_type must be the same.");
166 
167  local_graph_type local_graph;
168  local_map_type local_col_map;
169  input_buffer_type imports;
170  num_packets_per_lid_type num_packets_per_lid;
171  import_lids_type import_lids;
172  offsets_type offsets;
173  Tpetra::CombineMode combine_mode;
174  size_t max_num_ent;
175  bool unpack_pids;
176  bool atomic;
177  Kokkos::Experimental::UniqueToken<execution_space,
178  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
179  lids_scratch_type lids_scratch;
180  gids_scratch_type gids_scratch;
181  pids_scratch_type pids_scratch;
182 
183  public:
184  typedef Kokkos::pair<int, LO> value_type;
185 
187  const local_graph_type& local_graph_in,
188  const local_map_type& local_col_map_in,
189  const input_buffer_type& imports_in,
190  const num_packets_per_lid_type& num_packets_per_lid_in,
191  const import_lids_type& import_lids_in,
192  const offsets_type& offsets_in,
193  const Tpetra::CombineMode combine_mode_in,
194  const size_t max_num_ent_in,
195  const bool unpack_pids_in,
196  const bool atomic_in) :
197  local_graph(local_graph_in),
198  local_col_map(local_col_map_in),
199  imports(imports_in),
200  num_packets_per_lid(num_packets_per_lid_in),
201  import_lids(import_lids_in),
202  offsets(offsets_in),
203  combine_mode(combine_mode_in),
204  max_num_ent(max_num_ent_in),
205  unpack_pids(unpack_pids_in),
206  atomic(atomic_in),
207  tokens(execution_space()),
208  lids_scratch("pids_scratch", tokens.size() * max_num_ent),
209  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
210  pids_scratch("lids_scratch", tokens.size() * max_num_ent)
211  {}
212 
213  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
214  {
215  using Tpetra::Details::OrdinalTraits;
216  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
217  }
218 
219  KOKKOS_INLINE_FUNCTION void
220  join(volatile value_type& dst, const volatile value_type& src) const
221  {
222  // `dst` should reflect the first (least) bad index and
223  // all other associated error codes and data. Thus, we need only
224  // check if the `src` object shows an error and if its associated
225  // bad index is less than `dst`'s bad index.
226  using Tpetra::Details::OrdinalTraits;
227  if (src.second != OrdinalTraits<LO>::invalid()) {
228  // An error in the src; check if
229  // 1. `dst` shows errors
230  // 2. If `dst` does show errors, if src's bad index is less than
231  // *this' bad index
232  if (dst.second == OrdinalTraits<LO>::invalid() ||
233  src.second < dst.second) {
234  dst = src;
235  }
236  }
237  }
238 
239  KOKKOS_INLINE_FUNCTION
240  void operator()(const LO i, value_type& dst) const
241  {
242  using Kokkos::View;
243  using Kokkos::subview;
244  using Kokkos::MemoryUnmanaged;
245  typedef typename execution_space::size_type size_type;
246  typedef typename Kokkos::pair<size_type, size_type> slice;
247 
248  typedef View<LO*, device_type, MemoryUnmanaged> lids_out_type;
249  typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
250  typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
251 
252  const size_t num_packets_this_lid = num_packets_per_lid(i);
253  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
254  : num_packets_this_lid;
255  if (unpack_pids && num_packets_this_lid%2 != 0) {
256  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
257  // should never
258  dst = Kokkos::make_pair(1, i);
259  return;
260  }
261 
262  // Only unpack data if there is a nonzero number to unpack
263  if (num_ent == 0) {
264  return;
265  }
266 
267  // there is actually something in the row
268  const size_t buf_size = imports.size();
269  const size_t offset = offsets(i);
270 
271  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
272  dst = Kokkos::make_pair(2, i); // out of bounds
273  return;
274  }
275 
276  // Get subviews in to the scratch arrays. The token returned from acquire
277  // is an integer in [0, tokens.size()). It is used to grab a unique (to
278  // this thread) subview of the scratch arrays.
279  const size_type token = tokens.acquire();
280  const size_t a = static_cast<size_t>(token) * max_num_ent;
281  const size_t b = a + num_ent;
282  lids_out_type lids_out = subview(lids_scratch, slice(a, b));
283  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
284  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
285 
286  // Unpack this row!
287  int err = unpackRow<packet_type,GO,device_type,buffer_device_type>(
288  gids_out, pids_out, imports, offset, num_ent);
289 
290  if (err != 0) {
291  dst = Kokkos::make_pair(3, i);
292  return;
293  }
294 
295  // Column indices come in as global indices, in case the
296  // source object's column Map differs from the target object's
297  // (this's) column Map, and must be converted local index values
298  for (size_t k = 0; k < num_ent; ++k) {
299  lids_out(k) = local_col_map.getLocalElement(gids_out(k));
300  }
301 
302  tokens.release(token);
303  }
304 };
305 
// Unpack received graph rows with a parallel reduction and throw if the
// reduction reports an error.
//
// NOTE(review): the first statement of the body is an unconditional
// TEUCHOS_TEST_FOR_EXCEPTION(true, ...), so as written every call raises
// std::invalid_argument before any of the code below it executes.
//
// local_graph, local_map, imports, num_packets_per_lid, import_lids,
// combine_mode, unpack_pids and atomic are all forwarded to the unpack
// functor; unpack_pids additionally decides whether a row's entry count is
// its packet count or half of it.
313 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
314 void
316  const LocalGraph& local_graph,
317  const LocalMap& local_map,
318  const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
319  const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
320  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
321  typename LocalMap::device_type,
322  Kokkos::MemoryUnmanaged>& import_lids,
323  const Tpetra::CombineMode combine_mode,
324  const bool unpack_pids,
325  const bool atomic)
326 {
327 
// Deliberate guard: the message states the method is incomplete because
// the local graph does not support insertion of indices.
328  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
329  "unpackAndCombine[New] should not (yet) be called, the method is "
330  "incomplete. To complete, indices need to be inserted (unpacked) in to "
331  "the destination graph. The local graph, a Kokkos::StaticCrsGraph, does "
332  "not support insertion of indices");
333 
334  typedef typename LocalMap::local_ordinal_type LO;
335  typedef typename LocalMap::device_type device_type;
336  typedef typename device_type::execution_space execution_space;
337  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
// NOTE(review): unpack_functor_type is used further down, but no typedef
// for it is visible in this block -- confirm where it is declared.
339 
340  const char prefix[] =
341  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
342 
343  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
344  if (num_import_lids == 0) {
345  // Nothing to unpack
346  return;
347  }
348 
349  {
// Taken together, the two checks below accept REPLACE only: INSERT gets
// its own message, and every other mode is reported as invalid.
350  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
351  std::invalid_argument,
352  prefix << "INSERT combine mode is not allowed if the graph has a static graph "
353  "(i.e., was constructed with the CrsGraph constructor that takes a "
354  "const CrsGraph pointer).");
355 
356  // Unknown combine mode!
357  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode != REPLACE,
358  std::invalid_argument,
359  prefix << "Invalid combine mode; should never get "
360  "here! Please report this bug to the Tpetra developers.");
361 
362  // Check that sizes of input objects are consistent.
363  bool bad_num_import_lids =
364  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
365  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
366  std::invalid_argument,
367  prefix << "importLIDs.size() (" << num_import_lids << ") != "
368  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
369  } // end QA error checking
370 
371  // Get the offsets
// One more offset than there are rows; presumably computeOffsetsFromCounts
// fills it with the running sum of the packet counts -- helper not shown
// here, confirm.
372  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
373  computeOffsetsFromCounts(offsets, num_packets_per_lid);
374 
375  // Determine the maximum number of entries in any row in the graph. The
376  // maximum number of entries is needed to allocate unpack buffers on the
377  // device.
// A row's entry count is half its packet count when PIDs are packed too.
378  size_t max_num_ent;
379  Kokkos::parallel_reduce("MaxReduce",
380  num_packets_per_lid.size(),
381  KOKKOS_LAMBDA(const int& i, size_t& running_max_num_ent) {
382  size_t num_packets_this_lid = num_packets_per_lid(i);
383  size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
384  : num_packets_this_lid;
385  if (num_ent > running_max_num_ent) running_max_num_ent = num_ent;
386  }, Kokkos::Max<size_t>(max_num_ent));
387 
388  // Now do the actual unpack!
389  unpack_functor_type f(local_graph, local_map,
390  imports, num_packets_per_lid, import_lids, offsets, combine_mode,
391  max_num_ent, unpack_pids, atomic);
392 
// x receives the reduction result: x.first is an error code (nonzero means
// failure) and x.second is reported below as the first bad row.
393  typename unpack_functor_type::value_type x;
394  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
395  auto x_h = x.to_std_pair();
396  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
397  prefix << "UnpackAndCombineFunctor reported error code "
398  << x_h.first << " for the first bad row " << x_h.second);
399 
400  return;
401 }
402 
// Count graph entries from three sources and return the total:
//   1. rows [0, num_same_ids) of local_graph,
//   2. the rows of local_graph listed in permute_from_lids,
//   3. half of each value in num_packets_per_lid (each received row's
//      packet count is halved -- presumably because every entry is packed
//      as a global index plus a process ID; confirm against the packer).
// Row lengths for (1) and (2) are differences of consecutive row_map values.
//
// NOTE(review): `imports` is not read anywhere in this body.
403 template<class Packet, class LocalGraph, class BufferDevice>
404 size_t
406  const LocalGraph& local_graph,
407  const Kokkos::View<const typename LocalGraph::data_type*,
408  typename LocalGraph::device_type,
409  Kokkos::MemoryUnmanaged> permute_from_lids,
410  const Kokkos::View<const Packet*, BufferDevice>& imports,
411  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
412  const size_t num_same_ids)
413 {
414  using Kokkos::parallel_reduce;
415  typedef LocalGraph local_graph_type;
416  typedef typename local_graph_type::data_type LO;
417  typedef typename local_graph_type::device_type device_type;
418  typedef typename device_type::execution_space execution_space;
419  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
420 
// Running total returned to the caller; num_items is reused as the loop
// bound for the first two reductions.
421  size_t count = 0;
422  LO num_items;
423 
424  // Number of graph entries to unpack (returned by this function).
// First contribution: rows that keep the same local index.
425  num_items = static_cast<LO>(num_same_ids);
426  if (num_items) {
427  size_t kcnt = 0;
428  parallel_reduce(
429  range_policy(0, num_items),
430  KOKKOS_LAMBDA(const LO lid, size_t& update) {
431  update += static_cast<size_t>(local_graph.row_map[lid+1]
432  -local_graph.row_map[lid]);
433  }, kcnt);
434  count += kcnt;
435  }
436 
437  // Count entries copied directly from the source graph with permuting.
438  num_items = static_cast<LO>(permute_from_lids.extent(0));
439  if (num_items) {
440  size_t kcnt = 0;
441  parallel_reduce(
442  range_policy(0, num_items),
443  KOKKOS_LAMBDA(const LO i, size_t& update) {
444  const LO lid = permute_from_lids(i);
445  update += static_cast<size_t>(local_graph.row_map[lid+1]
446  - local_graph.row_map[lid]);
447  }, kcnt);
448  count += kcnt;
449  }
450 
451  {
452  // Count entries received from other MPI processes.
453  size_t tot_num_ent = 0;
454  Kokkos::parallel_reduce("SumReduce",
455  num_packets_per_lid.size(),
456  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
457  lsum += num_packets_per_lid(i) / 2;
458  }, Kokkos::Sum<size_t>(tot_num_ent));
459  count += tot_num_ent;
460  }
461 
462  return count;
463 }
464 
// For every index i in [0, num_packets_per_lid.extent(0)), atomically add
// num_packets_per_lid(i) / 2 to tgt_rowptr(import_lids(i)).  The add is
// atomic, so repeated values in import_lids accumulate into the same slot.
//
// NOTE(review): `imports` is not read anywhere in this body.
466 template<class Packet, class LO, class Device, class BufferDevice>
467 void
469  const Kokkos::View<size_t*, Device>& tgt_rowptr,
470  const Kokkos::View<const LO*, Device>& import_lids,
471  const Kokkos::View<const Packet*, BufferDevice>& imports,
472  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
473 {
// NOTE(review): this using-declaration names parallel_reduce, but the only
// Kokkos dispatch below is an unqualified parallel_for -- confirm that
// lookup still finds Kokkos::parallel_for, or switch the declaration.
474  using Kokkos::parallel_reduce;
475  typedef Device device_type;
476  typedef typename device_type::execution_space execution_space;
477  typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
478  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
479 
480  const size_type N = num_packets_per_lid.extent(0);
481  parallel_for("Setup row pointers for remotes",
482  range_policy(0, N),
483  KOKKOS_LAMBDA(const size_t i){
484  typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
485  const size_t num_packets_this_lid = num_packets_per_lid(i);
486  const size_t num_ent = num_packets_this_lid / 2;
487  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
488  });
489 }
490 
491 // Convert array of row lengths to a CRS pointer array
492 template<class Device>
493 void
494 makeCrsRowPtrFromLengths(
495  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
496  const Kokkos::View<size_t*,Device>& new_start_row)
497 {
498  using Kokkos::parallel_scan;
499  typedef Device device_type;
500  typedef typename device_type::execution_space execution_space;
501  typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
502  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
503  const size_type N = new_start_row.extent(0);
504  parallel_scan(
505  range_policy(0, N),
506  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
507  auto cur_val = tgt_rowptr(i);
508  if (final) {
509  tgt_rowptr(i) = update;
510  new_start_row(i) = tgt_rowptr(i);
511  }
512  update += cur_val;
513  }
514  );
515 }
516 
517 template<class LocalGraph, class LocalMap>
518 void
519 copyDataFromSameIDs(
520  const Kokkos::View<typename LocalMap::global_ordinal_type*,
521  typename LocalMap::device_type>& tgt_colind,
522  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
523  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
524  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
525  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
526  const LocalGraph& local_graph,
527  const LocalMap& local_col_map,
528  const size_t num_same_ids,
529  const int my_pid)
530 {
531  using Kokkos::parallel_for;
532  typedef typename LocalMap::device_type device_type;
533  typedef typename LocalMap::local_ordinal_type LO;
534  typedef typename device_type::execution_space execution_space;
535  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
536 
537  parallel_for(
538  range_policy(0, num_same_ids),
539  KOKKOS_LAMBDA(const size_t i) {
540  typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
541 
542  const LO src_lid = static_cast<LO>(i);
543  size_t src_row = local_graph.row_map(src_lid);
544 
545  const LO tgt_lid = static_cast<LO>(i);
546  const size_t tgt_row = tgt_rowptr(tgt_lid);
547 
548  const size_t nsr = local_graph.row_map(src_lid+1)
549  - local_graph.row_map(src_lid);
550  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
551 
552  for (size_t j=local_graph.row_map(src_lid);
553  j<local_graph.row_map(src_lid+1); ++j) {
554  LO src_col = local_graph.entries(j);
555  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
556  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
557  }
558  }
559  );
560 }
561 
562 template<class LocalGraph, class LocalMap>
563 void
564 copyDataFromPermuteIDs(
565  const Kokkos::View<typename LocalMap::global_ordinal_type*,
566  typename LocalMap::device_type>& tgt_colind,
567  const Kokkos::View<int*,
568  typename LocalMap::device_type>& tgt_pids,
569  const Kokkos::View<size_t*,
570  typename LocalMap::device_type>& new_start_row,
571  const Kokkos::View<size_t*,
572  typename LocalMap::device_type>& tgt_rowptr,
573  const Kokkos::View<const int*,
574  typename LocalMap::device_type>& src_pids,
575  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
576  typename LocalMap::device_type>& permute_to_lids,
577  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
578  typename LocalMap::device_type>& permute_from_lids,
579  const LocalGraph& local_graph,
580  const LocalMap& local_col_map,
581  const int my_pid)
582 {
583  using Kokkos::parallel_for;
584  typedef typename LocalMap::device_type device_type;
585  typedef typename LocalMap::local_ordinal_type LO;
586  typedef typename device_type::execution_space execution_space;
587  typedef typename Kokkos::View<LO*,device_type>::size_type size_type;
588  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
589 
590  const size_type num_permute_to_lids = permute_to_lids.extent(0);
591 
592  parallel_for(
593  range_policy(0, num_permute_to_lids),
594  KOKKOS_LAMBDA(const size_t i) {
595  typedef typename std::remove_reference<decltype(new_start_row(0)) >::type atomic_incr_type;
596 
597  const LO src_lid = permute_from_lids(i);
598  const size_t src_row = local_graph.row_map(src_lid);
599 
600  const LO tgt_lid = permute_to_lids(i);
601  const size_t tgt_row = tgt_rowptr(tgt_lid);
602 
603  size_t nsr = local_graph.row_map(src_lid+1)
604  - local_graph.row_map(src_lid);
605  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
606 
607  for (size_t j=local_graph.row_map(src_lid);
608  j<local_graph.row_map(src_lid+1); ++j) {
609  LO src_col = local_graph.entries(j);
610  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
611  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
612  }
613  }
614  );
615 }
616 
617 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
618 void
619 unpackAndCombineIntoCrsArrays2(
620  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
621  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
622  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
623  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
624  const Kokkos::View<const typename LocalMap::local_ordinal_type*, typename LocalMap::device_type>& import_lids,
625  const Kokkos::View<const Packet*, BufferDevice>& imports,
626  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
627  const LocalGraph& local_graph,
628  const LocalMap /*& local_col_map*/,
629  const int my_pid)
630 {
631  using Kokkos::View;
632  using Kokkos::subview;
633  using Kokkos::MemoryUnmanaged;
634  using Kokkos::parallel_reduce;
635  using Kokkos::atomic_fetch_add;
636 
637  typedef Packet packet_type;
638  typedef BufferDevice buffer_device_type;
639  typedef typename LocalMap::device_type device_type;
640  typedef typename LocalMap::local_ordinal_type LO;
641  typedef typename LocalMap::global_ordinal_type GO;
642  typedef typename device_type::execution_space execution_space;
643  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
644  typedef typename Kokkos::pair<size_type, size_type> slice;
645  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
646 
647  typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
648  typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
649 
650  const size_type num_import_lids = import_lids.size();
651  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
652 
653  // RemoteIDs: Loop structure following UnpackAndCombine
654  int gbl_err_count;
655  parallel_reduce("Unpack and combine into CRS",
656  range_policy(0, num_import_lids),
657  KOKKOS_LAMBDA(const size_t i, int& err) {
658  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
659  const size_t num_packets_this_lid = num_packets_per_lid(i);
660  const size_t num_ent = num_packets_this_lid / 2;
661  const size_t offset = offsets(i);
662  const LO lcl_row = import_lids(i);
663  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
664  const size_t end_row = start_row + num_ent;
665 
666  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
667  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
668 
669  err += unpackRow<packet_type,GO,device_type,buffer_device_type>(
670  gids_out, pids_out, imports, offset, num_ent);
671 
672  // Correct target PIDs.
673  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
674  const int pid = pids_out(j);
675  pids_out(j) = (pid != my_pid) ? pid : -1;
676  }
677  }, gbl_err_count);
678 
679  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
680  std::invalid_argument, prefix <<
681  "Attempting to unpack PIDs, but num_ent is not even; this should never "
682  "happen! Please report this bug to the Tpetra developers.");
683 
684  return;
685 }
686 
// Build the target CRS arrays (tgt_rowptr, tgt_colind, tgt_pids) from three
// sources: rows of local_graph that keep their local index ("same IDs"),
// rows of local_graph that are permuted, and rows received in `imports`.
//
// Steps, in order:
//   1. zero tgt_rowptr and store each row's length in it (same, permuted,
//      then remote rows);
//   2. convert the lengths to row offsets and check that the last offset
//      equals tgt_num_nonzeros;
//   3. copy the same-ID and permuted rows, then unpack the received rows.
//
// Throws std::invalid_argument if the final row offset differs from
// tgt_num_nonzeros.  In HAVE_TPETRA_DEBUG builds, throws std::logic_error
// if the last computed packet offset differs from imports.extent(0).
687 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
688 void
690  const LocalGraph & local_graph,
691  const LocalMap & local_col_map,
692  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
693  typename LocalMap::device_type,
694  Kokkos::MemoryUnmanaged>& import_lids,
695  const Kokkos::View<const Packet*, BufferDevice>& imports,
696  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
697  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
698  typename LocalMap::device_type,
699  Kokkos::MemoryUnmanaged>& permute_to_lids,
700  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
701  typename LocalMap::device_type,
702  Kokkos::MemoryUnmanaged>& permute_from_lids,
703  const Kokkos::View<size_t*,
704  typename LocalMap::device_type,
705  Kokkos::MemoryUnmanaged>& tgt_rowptr,
706  const Kokkos::View<typename LocalMap::global_ordinal_type*,
707  typename LocalMap::device_type,
708  Kokkos::MemoryUnmanaged>& tgt_colind,
709  const Kokkos::View<const int*,
710  typename LocalMap::device_type,
711  Kokkos::MemoryUnmanaged>& src_pids,
712  const Kokkos::View<int*,
713  typename LocalMap::device_type,
714  Kokkos::MemoryUnmanaged>& tgt_pids,
715  const size_t num_same_ids,
716  const size_t tgt_num_rows,
717  const size_t tgt_num_nonzeros,
718  const int my_tgt_pid)
719 {
720  using Kokkos::View;
721  using Kokkos::subview;
722  using Kokkos::parallel_for;
723  using Kokkos::MemoryUnmanaged;
724  typedef Packet packet_type;
725  typedef LocalMap local_map_type;
726  typedef LocalGraph local_graph_type;
727  typedef BufferDevice buffer_device_type;
728  typedef typename LocalMap::device_type device_type;
729  typedef typename LocalMap::local_ordinal_type LO;
730  typedef typename device_type::execution_space execution_space;
731  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
732  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
733 
734  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
735 
// N is the number of target rows; tgt_rowptr is indexed over [0, N].
736  const size_t N = tgt_num_rows;
737  const size_t mynnz = tgt_num_nonzeros;
738 
739  // In the case of reduced communicators, the sourceGraph won't have
740  // the right "my_pid", so thus we have to supply it.
741  const int my_pid = my_tgt_pid;
742 
743  // Zero the rowptr
744  parallel_for(
745  range_policy(0, N+1),
746  KOKKOS_LAMBDA(const size_t i) {
747  tgt_rowptr(i) = 0;
748  }
749  );
750 
751  // same IDs: Always first, always in the same place
// At this stage tgt_rowptr holds row lengths, not offsets.
752  parallel_for(
753  range_policy(0, num_same_ids),
754  KOKKOS_LAMBDA(const size_t i) {
755  const LO tgt_lid = static_cast<LO>(i);
756  const LO src_lid = static_cast<LO>(i);
757  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
758  - local_graph.row_map(src_lid);
759  }
760  );
761 
762  // Permute IDs: Still local, but reordered
763  const size_type num_permute_to_lids = permute_to_lids.extent(0);
764  parallel_for(
765  range_policy(0, num_permute_to_lids),
766  KOKKOS_LAMBDA(const size_t i) {
767  const LO tgt_lid = permute_to_lids(i);
768  const LO src_lid = permute_from_lids(i);
769  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
770  - local_graph.row_map(src_lid);
771  }
772  );
773 
774  // Get the offsets from the number of packets per LID
// computeOffsetsFromCounts is defined elsewhere; presumably it fills
// `offsets` with the running sum of num_packets_per_lid -- confirm.
775  const size_type num_import_lids = import_lids.extent(0);
776  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
777  computeOffsetsFromCounts(offsets, num_packets_per_lid);
778 
779 #ifdef HAVE_TPETRA_DEBUG
780  {
// Debug-only consistency check: the last offset must match the size of
// the import buffer.
781  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
782  const bool condition =
783  nth_offset_h != static_cast<size_t>(imports.extent(0));
784  TEUCHOS_TEST_FOR_EXCEPTION
785  (condition, std::logic_error, prefix
786  << "The final offset in bytes " << nth_offset_h
787  << " != imports.size() = " << imports.extent(0)
788  << ". Please report this bug to the Tpetra developers.");
789  }
790 #endif // HAVE_TPETRA_DEBUG
791 
792  // Setup row pointers for remotes
793  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
794  tgt_rowptr, import_lids, imports, num_packets_per_lid);
795 
796  // If multiple processes contribute to the same row, we may need to
797  // update row offsets. This tracks that.
798  View<size_t*, device_type> new_start_row("new_start_row", N+1);
799 
800  // Turn row length into a real CRS row pointer
801  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
802  {
// The last row offset must equal the number of nonzeros the caller
// announced; this check runs in all builds.
803  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
804  bool condition = nth_tgt_rowptr_h != mynnz;
805  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
806  prefix << "CRS_rowptr[last] = " <<
807  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
808  }
809 
810  // SameIDs: Copy the data over
811  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
812  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
813 
// Permuted rows: same copy, but with separate source and target rows.
814  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
815  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
816  local_graph, local_col_map, my_pid);
817 
// Nothing was received, so there are no remote rows to unpack.
818  if (imports.extent(0) <= 0) {
819  return;
820  }
821 
822  unpackAndCombineIntoCrsArrays2<
823  packet_type,local_graph_type,local_map_type,buffer_device_type>(
824  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
825  num_packets_per_lid, local_graph, local_col_map, my_pid);
826 
827  return;
828 }
829 
830 } // namespace UnpackAndCombineCrsGraphImpl
831 
// Entry point taking host arrays (Teuchos::ArrayView): wraps imports,
// numPacketsPerLID and importLIDs for device use, fetches the local graph
// and local column map of sourceGraph, and forwards everything to
// UnpackAndCombineCrsGraphImpl::unpackAndCombine with unpack_pids = false.
//
// NOTE(review): constantNumPackets and distor are not read anywhere in
// this body.
868 template<class LO, class GO, class Node>
869 void
871  const CrsGraph<LO, GO, Node>& sourceGraph,
872  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,Node>::packet_type>& imports,
873  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
874  const Teuchos::ArrayView<const LO>& importLIDs,
875  size_t constantNumPackets,
876  Distributor & distor,
877  CombineMode combineMode,
878  const bool atomic)
879 {
880  using Kokkos::View;
881  typedef typename Node::device_type device_type;
882  typedef typename CrsGraph<LO,GO,Node>::packet_type packet_type;
883  typedef typename CrsGraph<LO, GO, Node>::local_graph_type local_graph_type;
884  typedef typename CrsGraph<LO, GO, Node>::buffer_device_type buffer_device_type;
885  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
886  "Node::device_type and LocalGraph::device_type must be the same.");
887 
// Default-constructed device objects; bufferOutputDevice is passed to the
// create_mirror_view_from_raw_host_array calls below.
888  typedef typename device_type::execution_space execution_space;
889  typename execution_space::device_type outputDevice;
890 
891  typedef typename buffer_device_type::execution_space buffer_execution_space;
892  typename buffer_execution_space::device_type bufferOutputDevice;
893 
894  // Convert all Teuchos::Array to Kokkos::View.
895 
896  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
897  // them to device. Since unpacking is done directly in to the local graph
898  // (lclGraph), no copying needs to be performed after unpacking.
899  auto imports_d =
900  create_mirror_view_from_raw_host_array(bufferOutputDevice,
901  imports.getRawPtr(), imports.size(),
902  true, "imports");
903 
904  auto num_packets_per_lid_d =
905  create_mirror_view_from_raw_host_array(bufferOutputDevice,
906  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
907  true, "num_packets_per_lid");
908 
// NOTE(review): the call that these arguments belong to is not visible
// here; by analogy with the two statements above it is presumably another
// create_mirror_view_from_raw_host_array call -- confirm which device
// object it is given.
909  auto import_lids_d =
911  importLIDs.getRawPtr(), importLIDs.size(),
912  true, "import_lids");
913 
914  auto local_graph = sourceGraph.getLocalGraph();
915  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
916 
917  // Now do the actual unpack!
// The literal `false` is the unpack_pids argument.
918  typedef decltype(local_col_map) local_map_type;
919  UnpackAndCombineCrsGraphImpl::unpackAndCombine<
920  packet_type,local_graph_type,local_map_type,buffer_device_type>(
921  local_graph, local_col_map, imports_d, num_packets_per_lid_d,
922  import_lids_d, combineMode, false, atomic);
923 
924  return;
925 }
926 
927 template<class LO, class GO, class Node>
928 void
929 unpackCrsGraphAndCombineNew(
930  const CrsGraph<LO, GO, Node>& sourceGraph,
931  const Kokkos::DualView<const typename CrsGraph<LO,GO,Node>::packet_type*,
932  typename CrsGraph<LO,GO,Node>::buffer_device_type>& imports,
933  const Kokkos::DualView<const size_t*,
934  typename CrsGraph<LO,GO,Node>::buffer_device_type>& numPacketsPerLID,
935  const Kokkos::DualView<const LO*, typename Node::device_type>& importLIDs,
936  const size_t constantNumPackets,
937  Distributor& distor,
938  const CombineMode combineMode,
939  const bool atomic)
940 {
942  using Kokkos::View;
943  typedef typename Node::device_type device_type;
944  typedef CrsGraph<LO, GO, Node> crs_graph_type;
945  typedef typename crs_graph_type::packet_type packet_type;
946  typedef typename crs_graph_type::local_graph_type local_graph_type;
947  typedef typename crs_graph_type::buffer_device_type buffer_device_type;
948  typedef typename buffer_device_type::memory_space buffer_memory_space;
949  typedef typename device_type::memory_space memory_space;
950 
951  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
952  "Node::device_type and LocalGraph::device_type must be "
953  "the same.");
954 
955  {
956  auto numPacketsPerLID_nc = castAwayConstDualView(numPacketsPerLID);
957  numPacketsPerLID_nc.template sync<buffer_memory_space>();
958  }
959  auto num_packets_per_lid_d = numPacketsPerLID.template view<buffer_memory_space>();
960 
961  {
962  auto importLIDs_nc = castAwayConstDualView(importLIDs);
963  importLIDs_nc.template sync<memory_space>();
964  }
965  auto import_lids_d = importLIDs.template view<memory_space>();
966 
967  {
968  auto imports_nc = castAwayConstDualView(imports);
969  imports_nc.template sync<buffer_memory_space>();
970  }
971  auto imports_d = imports.template view<buffer_memory_space>();
972 
973  auto local_graph = sourceGraph.getLocalGraph();
974  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
975  typedef decltype(local_col_map) local_map_type;
976 
977  // Now do the actual unpack!
978  UnpackAndCombineCrsGraphImpl::unpackAndCombine<
979  packet_type,local_graph_type,local_map_type,buffer_device_type>(
980  local_graph, local_col_map, imports_d, num_packets_per_lid_d,
981  import_lids_d, combineMode, false, atomic);
982 }
983 
1027 //
1036 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1037 size_t
1039  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1040  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1041  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
1042  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1043  size_t constantNumPackets,
1044  Distributor &distor,
1045  CombineMode combineMode,
1046  size_t numSameIDs,
1047  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1048  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1049 {
1050  using Kokkos::MemoryUnmanaged;
1051  using Kokkos::View;
1052  typedef typename Node::device_type device_type;
1053  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type packet_type;
1054  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type local_graph_type;
1055  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type buffer_device_type;
1056  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1057 
1058  TEUCHOS_TEST_FOR_EXCEPTION
1059  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1060  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1061  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1062  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1063  // process, then the graph is neither locally nor globally indexed.
1064  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
1065  TEUCHOS_TEST_FOR_EXCEPTION
1066  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1067  "CrsGraph 'sourceGraph' must be locally indexed.");
1068  TEUCHOS_TEST_FOR_EXCEPTION
1069  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1070  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1071  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1072 
1073  auto local_graph = sourceGraph.getLocalGraph();
1074  auto permute_from_lids_d =
1076  permuteFromLIDs.getRawPtr(),
1077  permuteFromLIDs.size(), true,
1078  "permute_from_lids");
1079  auto imports_d =
1080  create_mirror_view_from_raw_host_array(buffer_device_type(),
1081  imports.getRawPtr(),
1082  imports.size(), true,
1083  "imports");
1084  auto num_packets_per_lid_d =
1085  create_mirror_view_from_raw_host_array(buffer_device_type(),
1086  numPacketsPerLID.getRawPtr(),
1087  numPacketsPerLID.size(), true,
1088  "num_packets_per_lid");
1089 
1091  packet_type,local_graph_type,buffer_device_type>(
1092  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
1093 }
1094 
1108 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1109 void
1111  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1112  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1113  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
1114  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1115  const size_t constantNumPackets,
1116  Distributor& distor,
1117  const CombineMode combineMode,
1118  const size_t numSameIDs,
1119  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1120  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1121  size_t TargetNumRows,
1122  size_t TargetNumNonzeros,
1123  const int MyTargetPID,
1124  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1125  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1126  const Teuchos::ArrayView<const int>& SourcePids,
1127  Teuchos::Array<int>& TargetPids)
1128 {
1129  using Kokkos::View;
1130  using Kokkos::deep_copy;
1131  using Teuchos::ArrayView;
1132  using Teuchos::outArg;
1133  using Teuchos::REDUCE_MAX;
1134  using Teuchos::reduceAll;
1135  typedef LocalOrdinal LO;
1136  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type packet_type;
1137  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type local_graph_type;
1138  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type buffer_device_type;
1139  typedef typename Node::device_type device_type;
1140  typedef typename device_type::execution_space execution_space;
1141  typedef typename buffer_device_type::execution_space buffer_execution_space;
1142  typedef typename ArrayView<const LO>::size_type size_type;
1143 
1144  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1145 
1146  TEUCHOS_TEST_FOR_EXCEPTION(
1147  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1148  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1149  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1150 
1151  TEUCHOS_TEST_FOR_EXCEPTION(
1152  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1153  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
1154  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1155  const size_type numImportLIDs = importLIDs.size();
1156 
1157  TEUCHOS_TEST_FOR_EXCEPTION(
1158  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1159  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1160  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1161 
1162  // Preseed TargetPids with -1 for local
1163  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1164  TargetPids.resize(TargetNumNonzeros);
1165  }
1166  TargetPids.assign(TargetNumNonzeros, -1);
1167 
1168  // Grab pointers for sourceGraph
1169  auto local_graph = sourceGraph.getLocalGraph();
1170  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
1171 
1172  // Convert input arrays to Kokkos::View
1173  typename execution_space::device_type outputDevice;
1174  typename buffer_execution_space::device_type bufferOutputDevice;
1175 
1176  auto import_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1177  importLIDs.getRawPtr(), importLIDs.size(),
1178  true, "import_lids");
1179 
1180  auto imports_d = create_mirror_view_from_raw_host_array(bufferOutputDevice,
1181  imports.getRawPtr(), imports.size(),
1182  true, "imports");
1183 
1184  auto num_packets_per_lid_d = create_mirror_view_from_raw_host_array(bufferOutputDevice,
1185  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1186  true, "num_packets_per_lid");
1187 
1188  auto permute_from_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1189  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1190  true, "permute_from_lids");
1191 
1192  auto permute_to_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1193  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1194  true, "permute_to_lids");
1195 
1196  auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice,
1197  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1198  true, "crs_rowptr");
1199 
1200  auto crs_colind_d = create_mirror_view_from_raw_host_array(outputDevice,
1201  CRS_colind.getRawPtr(), CRS_colind.size(),
1202  true, "crs_colidx");
1203 
1204  auto src_pids_d = create_mirror_view_from_raw_host_array(outputDevice,
1205  SourcePids.getRawPtr(), SourcePids.size(),
1206  true, "src_pids");
1207 
1208  auto tgt_pids_d = create_mirror_view_from_raw_host_array(outputDevice,
1209  TargetPids.getRawPtr(), TargetPids.size(),
1210  true, "tgt_pids");
1211 
1212  typedef decltype(local_col_map) local_map_type;
1214  packet_type,local_graph_type,local_map_type,buffer_device_type>(
1215  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1216  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1217  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1218 
1219  // Copy outputs back to host
1220  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1221  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1222  deep_copy(crs_rowptr_h, crs_rowptr_d);
1223 
1224  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1225  CRS_colind.getRawPtr(), CRS_colind.size());
1226  deep_copy(crs_colind_h, crs_colind_d);
1227 
1228  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1229  TargetPids.getRawPtr(), TargetPids.size());
1230  deep_copy(tgt_pids_h, tgt_pids_d);
1231 
1232 }
1233 
1234 } // namespace Details
1235 } // namespace Tpetra
1236 
// Explicit instantiation macro for the four Tpetra::Details graph
// unpack functions, for one (LO, GO, NT) combination.  It names
// Details::, CrsGraph, Distributor and CombineMode without further
// qualification, so it has to be expanded in a scope where those names
// resolve (presumably inside namespace Tpetra).
#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
  template void Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
    const CrsGraph<LO, GO, NT>&, \
    const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
    const Teuchos::ArrayView<const size_t>&, \
    const Teuchos::ArrayView<const LO>&, \
    size_t, Distributor&, CombineMode, const bool); \
  template void Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
    const CrsGraph<LO, GO, NT>&, \
    const Kokkos::DualView<const typename CrsGraph<LO,GO,NT>::packet_type*, \
                           typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
    const Kokkos::DualView<const size_t*, \
                           typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
    const Kokkos::DualView<const LO*, NT::device_type>&, \
    const size_t, Distributor&, const CombineMode, const bool); \
  template void Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
    const CrsGraph<LO, GO, NT> &, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
    const Teuchos::ArrayView<const size_t>&, \
    const size_t, Distributor&, const CombineMode, const size_t, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const LO>&, \
    size_t, size_t, const int, \
    const Teuchos::ArrayView<size_t>&, \
    const Teuchos::ArrayView<GO>&, \
    const Teuchos::ArrayView<const int>&, \
    Teuchos::Array<int>&); \
  template size_t Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
    const CrsGraph<LO, GO, NT> &, \
    const Teuchos::ArrayView<const LO> &, \
    const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
    const Teuchos::ArrayView<const size_t>&, \
    size_t, Distributor &, CombineMode, size_t, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const LO>&);
1290 
1291 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
KOKKOS_FUNCTION int unpackRow(typename Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, typename Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
local_graph_type getLocalGraph() const
Get the local graph.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, Device > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &imports, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Setup row pointers for remotes.
Declaration of the Tpetra::CrsGraph class.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GlobalOrdinal > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
KOKKOS_INLINE_FUNCTION GlobalOrdinal getGlobalElement(const LocalOrdinal localIndex) const
Get the global index corresponding to the given local index.
void unpackCrsGraphAndCombine(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, const bool atomic)
Unpack the imported column indices and combine into graph.
Implementation details of Tpetra.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Replace existing values with new values.
"Local" part of Map suitable for Kokkos kernels.
::Kokkos::Details::ArithTraits< GlobalOrdinal >::val_type packet_type
The type of each datum being sent or received in an Import or Export.
Kokkos::StaticCrsGraph< LocalOrdinal, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
void unpackAndCombine(const LocalGraph &local_graph, const LocalMap &local_map, const Kokkos::View< const Packet *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const typename LocalMap::local_ordinal_type *, typename LocalMap::device_type, Kokkos::MemoryUnmanaged > &import_lids, const Tpetra::CombineMode combine_mode, const bool unpack_pids, const bool atomic)
Perform the unpack operation for the graph.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
bool isLocallyIndexed() const override
If graph indices are in the local range, this function returns true. Otherwise, this function returns...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.