42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP 43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP 45 #include "TpetraCore_config.h" 46 #include "Teuchos_Array.hpp" 47 #include "Teuchos_ArrayView.hpp" 55 #include "Kokkos_Core.hpp" 80 #ifndef DOXYGEN_SHOULD_SKIP_THIS 83 #endif // DOXYGEN_SHOULD_SKIP_THIS 90 namespace UnpackAndCombineCrsGraphImpl {
// Unpack the packed entries of one row out of the receive buffer `imports`.
//
// gids_out  [out] receives `num_ent` values read from imports(offset + k).
// pids_out  [out] receives `num_ent` values read from
//           imports(offset + num_ent + k), cast to int; skipped entirely when
//           pids_out has size zero.
// imports   [in]  packed receive buffer.
// num_ent   [in]  number of entries to read for this row.
//
// NOTE(review): the return type, the `offset` parameter, the opening brace and
// the return statement(s) are on lines missing from this listing; callers in
// this file store the result in an int error code -- confirm against the
// real header.
101 template<
class Packet,
class GO,
class Device,
class BufferDevice>
103 unpackRow(
typename Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
104 typename Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
105 const Kokkos::View<const Packet*,BufferDevice>& imports,
107 const size_t num_ent)
109 typedef typename Kokkos::View<GO*,Device>::size_type size_type;
// First run of num_ent packets: copied into gids_out.
117 for (size_type k=0; k<num_ent; k++)
118 gids_out(k) = imports(offset+k);
// Second run of num_ent packets: only read when the caller supplied a
// non-empty pids_out view.
121 if (pids_out.size() > 0) {
122 for (size_type k=0; k<num_ent; k++)
123 pids_out(k) =
static_cast<int>(imports(offset+num_ent+k));
// Reduction functor that unpacks one imported row per call.
//
// The reduction value is a (error code, row index) pair; init() sets it to
// (0, invalid LO).  operator() writes these codes, each paired with the row
// index i:
//   1 -- unpack_pids is set but the packet count for the row is odd,
//   2 -- the row's [offset, offset + num_packets) range exceeds imports.size(),
//   3 -- set after the unpackRow call (the guarding condition is on a line
//        missing from this listing; presumably a nonzero return -- TODO confirm).
//
// Each call acquires a UniqueToken and uses it to pick a max_num_ent-sized
// slice of the lids/gids/pids scratch views, releasing the token at the end.
//
// NOTE(review): the struct header, the `local_map_type` typedef, several
// members (combine_mode, max_num_ent, unpack_pids, ...) and the constructor
// name line are missing from this listing.
//
// Fix in this revision: the Kokkos labels passed to lids_scratch and
// pids_scratch were swapped ("pids_scratch" on the LO view, "lids_scratch" on
// the int view); each view is now labelled with its own name.
139 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
142 typedef Packet packet_type;
144 typedef LocalGraph local_graph_type;
145 typedef BufferDevice buffer_device_type;
147 typedef typename local_map_type::local_ordinal_type LO;
148 typedef typename local_map_type::global_ordinal_type GO;
151 typedef typename local_map_type::device_type device_type;
152 typedef typename device_type::execution_space execution_space;
// Packet counts and the packed buffer use the buffer device; offsets, LIDs
// and scratch space use the map's device.
154 typedef Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_type;
155 typedef Kokkos::View<const size_t*, device_type> offsets_type;
156 typedef Kokkos::View<const packet_type*, buffer_device_type> input_buffer_type;
157 typedef Kokkos::View<const LO*, device_type> import_lids_type;
159 typedef Kokkos::View<LO*, device_type> lids_scratch_type;
160 typedef Kokkos::View<GO*, device_type> gids_scratch_type;
161 typedef Kokkos::View<int*,device_type> pids_scratch_type;
163 static_assert(std::is_same<LO, typename local_graph_type::data_type>::value,
164 "LocalMap::local_ordinal_type and " 165 "LocalGraph::data_type must be the same.");
167 local_graph_type local_graph;
168 local_map_type local_col_map;
169 input_buffer_type imports;
170 num_packets_per_lid_type num_packets_per_lid;
171 import_lids_type import_lids;
172 offsets_type offsets;
177 Kokkos::Experimental::UniqueToken<execution_space,
178 Kokkos::Experimental::UniqueTokenScope::Global> tokens;
179 lids_scratch_type lids_scratch;
180 gids_scratch_type gids_scratch;
181 pids_scratch_type pids_scratch;
// Reduction result: (error code, row index).
184 typedef Kokkos::pair<int, LO> value_type;
187 const local_graph_type& local_graph_in,
188 const local_map_type& local_col_map_in,
189 const input_buffer_type& imports_in,
190 const num_packets_per_lid_type& num_packets_per_lid_in,
191 const import_lids_type& import_lids_in,
192 const offsets_type& offsets_in,
194 const size_t max_num_ent_in,
195 const bool unpack_pids_in,
196 const bool atomic_in) :
197 local_graph(local_graph_in),
198 local_col_map(local_col_map_in),
200 num_packets_per_lid(num_packets_per_lid_in),
201 import_lids(import_lids_in),
203 combine_mode(combine_mode_in),
204 max_num_ent(max_num_ent_in),
205 unpack_pids(unpack_pids_in),
// Scratch views hold tokens.size() slices of max_num_ent entries each.
207 tokens(execution_space()),
208 lids_scratch(
"lids_scratch", tokens.size() * max_num_ent),
209 gids_scratch(
"gids_scratch", tokens.size() * max_num_ent),
210 pids_scratch(
"pids_scratch", tokens.size() * max_num_ent)
// Identity of the reduction: error code 0 and an invalid row index.
213 KOKKOS_INLINE_FUNCTION
void init(value_type& dst)
const 215 using Tpetra::Details::OrdinalTraits;
216 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
// Combine two partial results.  src is only considered when it carries a
// valid row index, and is preferred when dst has none or src's row index is
// smaller.  NOTE(review): the assignment itself is on a missing line.
219 KOKKOS_INLINE_FUNCTION
void 220 join(
volatile value_type& dst,
const volatile value_type& src)
const 226 using Tpetra::Details::OrdinalTraits;
227 if (src.second != OrdinalTraits<LO>::invalid()) {
232 if (dst.second == OrdinalTraits<LO>::invalid() ||
233 src.second < dst.second) {
// Unpack import row i; on a detected problem, record an error code in dst.
239 KOKKOS_INLINE_FUNCTION
240 void operator()(
const LO i, value_type& dst)
const 243 using Kokkos::subview;
244 using Kokkos::MemoryUnmanaged;
245 typedef typename execution_space::size_type size_type;
246 typedef typename Kokkos::pair<size_type, size_type> slice;
248 typedef View<LO*, device_type, MemoryUnmanaged> lids_out_type;
249 typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
250 typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
// With PIDs packed, a row carries two packets per entry.
252 const size_t num_packets_this_lid = num_packets_per_lid(i);
253 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
254 : num_packets_this_lid;
255 if (unpack_pids && num_packets_this_lid%2 != 0) {
258 dst = Kokkos::make_pair(1, i);
268 const size_t buf_size = imports.size();
269 const size_t offset = offsets(i);
// Reject rows whose packet range would run past the end of the buffer.
271 if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
272 dst = Kokkos::make_pair(2, i);
// The token selects this call's slice [a, b) of the scratch views; the
// pids slice is empty when PIDs are not being unpacked.
279 const size_type token = tokens.acquire();
280 const size_t a =
static_cast<size_t>(token) * max_num_ent;
281 const size_t b = a + num_ent;
282 lids_out_type lids_out = subview(lids_scratch, slice(a, b));
283 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
284 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
287 int err = unpackRow<packet_type,GO,device_type,buffer_device_type>(
288 gids_out, pids_out, imports, offset, num_ent);
291 dst = Kokkos::make_pair(3, i);
298 for (
size_t k = 0; k < num_ent; ++k) {
302 tokens.release(token);
// Unpack packed graph rows and combine them into the local graph.
//
// As listed, the first statement is a TEUCHOS_TEST_FOR_EXCEPTION whose
// condition is the literal `true`, so the function always raises
// std::invalid_argument with the "incomplete" message below; the remaining
// statements are therefore not reached.
//
// The remaining body, in order:
//   * does something when num_import_lids == 0 (body on a missing line),
//   * rejects combine mode INSERT, and any mode other than REPLACE, with
//     std::invalid_argument (the conditions guarding these two checks are on
//     missing lines),
//   * requires import_lids and num_packets_per_lid to have equal extents,
//   * allocates an `offsets` view of num_import_lids+1 entries,
//   * computes the maximum per-row entry count with a Max reduction
//     (packets/2 per row when unpack_pids is set),
//   * runs the unpack functor over [0, num_import_lids) and raises
//     std::runtime_error if it reports a nonzero error code.
//
// NOTE(review): the return type, function name line, and the local_map /
// combine_mode / atomic parameters are on lines missing from this listing.
313 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
316 const LocalGraph& local_graph,
318 const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
319 const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
320 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
321 typename LocalMap::device_type,
322 Kokkos::MemoryUnmanaged>& import_lids,
324 const bool unpack_pids,
// Unconditional guard: the condition is hard-coded to true.
328 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
329 "unpackAndCombine[New] should not (yet) be called, the method is " 330 "incomplete. To complete, indices need to be inserted (unpacked) in to " 331 "the destination graph. The local graph, a Kokkos::StaticCrsGraph, does " 332 "not support insertion of indices");
334 typedef typename LocalMap::local_ordinal_type LO;
335 typedef typename LocalMap::device_type device_type;
336 typedef typename device_type::execution_space execution_space;
337 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
340 const char prefix[] =
341 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
343 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
344 if (num_import_lids == 0) {
350 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
351 std::invalid_argument,
352 prefix <<
"INSERT combine mode is not allowed if the graph has a static graph " 353 "(i.e., was constructed with the CrsGraph constructor that takes a " 354 "const CrsGraph pointer).");
357 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode !=
REPLACE,
358 std::invalid_argument,
359 prefix <<
"Invalid combine mode; should never get " 360 "here! Please report this bug to the Tpetra developers.");
// One packet count is required per import LID.
363 bool bad_num_import_lids =
364 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
365 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
366 std::invalid_argument,
367 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != " 368 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
372 Kokkos::View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
// Largest number of entries in any imported row; sizes the functor's
// per-token scratch slices.
379 Kokkos::parallel_reduce(
"MaxReduce",
380 num_packets_per_lid.size(),
381 KOKKOS_LAMBDA(
const int& i,
size_t& running_max_num_ent) {
382 size_t num_packets_this_lid = num_packets_per_lid(i);
383 size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
384 : num_packets_this_lid;
385 if (num_ent > running_max_num_ent) running_max_num_ent = num_ent;
386 }, Kokkos::Max<size_t>(max_num_ent));
389 unpack_functor_type f(local_graph, local_map,
390 imports, num_packets_per_lid, import_lids, offsets, combine_mode,
391 max_num_ent, unpack_pids, atomic);
// x receives the functor's (error code, row index) reduction result.
393 typename unpack_functor_type::value_type x;
394 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
395 auto x_h = x.to_std_pair();
396 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
397 prefix <<
"UnpackAndCombineFunctor reported error code " 398 << x_h.first <<
" for the first bad row " << x_h.second);
// Entry-counting helper over three row sets.
//
// Visible reductions:
//   * over lid in [0, num_same_ids): row_map[lid+1] - row_map[lid],
//   * over i in [0, permute_from_lids.extent(0)): the same row length for
//     lid = permute_from_lids(i),
//   * over every entry of num_packets_per_lid: num_packets_per_lid(i) / 2,
//     whose sum is added to `count`.
//
// NOTE(review): the return type, function name line, the declarations of
// `count` / `num_items`, the parallel_reduce call heads for the first two
// reductions and the return statement are on lines missing from this listing.
// The name used for this block is inferred from the public wrapper that passes
// the same argument list -- confirm.
403 template<
class Packet,
class LocalGraph,
class BufferDevice>
406 const LocalGraph& local_graph,
407 const Kokkos::View<
const typename LocalGraph::data_type*,
408 typename LocalGraph::device_type,
409 Kokkos::MemoryUnmanaged> permute_from_lids,
410 const Kokkos::View<const Packet*, BufferDevice>& imports,
411 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
412 const size_t num_same_ids)
414 using Kokkos::parallel_reduce;
415 typedef LocalGraph local_graph_type;
416 typedef typename local_graph_type::data_type LO;
417 typedef typename local_graph_type::device_type device_type;
418 typedef typename device_type::execution_space execution_space;
419 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
// Rows that are identical in source and target: local ids 0..num_same_ids-1.
425 num_items =
static_cast<LO
>(num_same_ids);
429 range_policy(0, num_items),
430 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
431 update +=
static_cast<size_t>(local_graph.row_map[lid+1]
432 -local_graph.row_map[lid]);
// Permuted rows: lengths are looked up through permute_from_lids.
438 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
442 range_policy(0, num_items),
443 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
444 const LO lid = permute_from_lids(i);
445 update +=
static_cast<size_t>(local_graph.row_map[lid+1]
446 - local_graph.row_map[lid]);
// Imported rows: each row contributes half of its packet count.
453 size_t tot_num_ent = 0;
454 Kokkos::parallel_reduce(
"SumReduce",
455 num_packets_per_lid.size(),
456 KOKKOS_LAMBDA(
const int& i,
size_t& lsum) {
457 lsum += num_packets_per_lid(i) / 2;
458 }, Kokkos::Sum<size_t>(tot_num_ent));
459 count += tot_num_ent;
// Add the entry count of every imported row to the target row-pointer array.
//
// For each i, half of num_packets_per_lid(i) is atomically added to
// tgt_rowptr(import_lids(i)).  `imports` is not read in the lines shown.
//
// NOTE(review): the return type / name line and the range-policy argument of
// the parallel_for (N is computed just above it) are on lines missing from
// this listing.
466 template<
class Packet,
class LO,
class Device,
class BufferDevice>
469 const Kokkos::View<size_t*, Device>& tgt_rowptr,
470 const Kokkos::View<const LO*, Device>& import_lids,
471 const Kokkos::View<const Packet*, BufferDevice>& imports,
472 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
474 using Kokkos::parallel_reduce;
475 typedef Device device_type;
476 typedef typename device_type::execution_space execution_space;
477 typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
478 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
480 const size_type N = num_packets_per_lid.extent(0);
481 parallel_for(
"Setup row pointers for remotes",
483 KOKKOS_LAMBDA(
const size_t i){
// The increment is converted to tgt_rowptr's element type before the add.
484 typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
485 const size_t num_packets_this_lid = num_packets_per_lid(i);
486 const size_t num_ent = num_packets_this_lid / 2;
// Atomic add: the update goes through Kokkos::atomic_fetch_add.
487 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
// Scan-style conversion of tgt_rowptr in place, mirrored into new_start_row.
//
// The lambda takes the (index, running update, final flag) arguments of a
// Kokkos scan functor.  In the lines shown it reads tgt_rowptr(i) into
// cur_val, stores `update` into tgt_rowptr(i), and copies that value into
// new_start_row(i).
//
// NOTE(review): the return type, the parallel_scan call head, the use of
// `final` and the accumulation of cur_val into `update` are on lines missing
// from this listing -- presumably an exclusive prefix sum; confirm.
492 template<
class Device>
494 makeCrsRowPtrFromLengths(
495 const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
496 const Kokkos::View<size_t*,Device>& new_start_row)
498 using Kokkos::parallel_scan;
499 typedef Device device_type;
500 typedef typename device_type::execution_space execution_space;
501 typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
502 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
// The iteration count comes from new_start_row, not tgt_rowptr.
503 const size_type N = new_start_row.extent(0);
506 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Save the old value before it is overwritten with the running total.
507 auto cur_val = tgt_rowptr(i);
509 tgt_rowptr(i) = update;
510 new_start_row(i) = tgt_rowptr(i);
// Copy the rows shared by source and target (local ids 0..num_same_ids-1)
// from the local graph into the target column-index and PID arrays.
//
// For each such row i (source and target local id are both i):
//   * new_start_row(i) is atomically advanced by the row's length,
//   * each local column index is converted with
//     local_col_map.getGlobalElement and written at
//     tgt_rowptr(i) + (j - row start),
//   * the matching tgt_pids entry is src_pids(src_col), or -1 when that value
//     equals my_pid.
//
// NOTE(review): the return type / name line, the local_col_map and my_pid
// parameters, and the parallel_for call head are on lines missing from this
// listing.
517 template<
class LocalGraph,
class LocalMap>
520 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
521 typename LocalMap::device_type>& tgt_colind,
522 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
523 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
524 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
525 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
526 const LocalGraph& local_graph,
528 const size_t num_same_ids,
531 using Kokkos::parallel_for;
532 typedef typename LocalMap::device_type device_type;
533 typedef typename LocalMap::local_ordinal_type LO;
534 typedef typename device_type::execution_space execution_space;
535 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
538 range_policy(0, num_same_ids),
539 KOKKOS_LAMBDA(
const size_t i) {
540 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
542 const LO src_lid =
static_cast<LO
>(i);
543 size_t src_row = local_graph.row_map(src_lid);
545 const LO tgt_lid =
static_cast<LO
>(i);
546 const size_t tgt_row = tgt_rowptr(tgt_lid);
// nsr: number of entries in the source row.
548 const size_t nsr = local_graph.row_map(src_lid+1)
549 - local_graph.row_map(src_lid);
550 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// j walks the source row; (j - src_row) is the position within the row.
552 for (
size_t j=local_graph.row_map(src_lid);
553 j<local_graph.row_map(src_lid+1); ++j) {
554 LO src_col = local_graph.entries(j);
555 tgt_colind(tgt_row + j - src_row) = local_col_map.
getGlobalElement(src_col);
// Entries whose source PID equals my_pid are recorded as -1.
556 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copy permuted rows from the local graph into the target column-index and
// PID arrays.
//
// For each i in [0, permute_to_lids.extent(0)), the source row is
// permute_from_lids(i) and the target row is permute_to_lids(i):
//   * new_start_row(target) is atomically advanced by the source row length,
//   * each local column index is converted with
//     local_col_map.getGlobalElement and written at
//     tgt_rowptr(target) + (j - source row start),
//   * the matching tgt_pids entry is src_pids(src_col), or -1 when that value
//     equals my_pid.
//
// NOTE(review): the return type, the local_col_map and my_pid parameters, and
// the parallel_for call head are on lines missing from this listing.
562 template<
class LocalGraph,
class LocalMap>
564 copyDataFromPermuteIDs(
565 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
566 typename LocalMap::device_type>& tgt_colind,
567 const Kokkos::View<
int*,
568 typename LocalMap::device_type>& tgt_pids,
569 const Kokkos::View<
size_t*,
570 typename LocalMap::device_type>& new_start_row,
571 const Kokkos::View<
size_t*,
572 typename LocalMap::device_type>& tgt_rowptr,
573 const Kokkos::View<
const int*,
574 typename LocalMap::device_type>& src_pids,
575 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
576 typename LocalMap::device_type>& permute_to_lids,
577 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
578 typename LocalMap::device_type>& permute_from_lids,
579 const LocalGraph& local_graph,
583 using Kokkos::parallel_for;
584 typedef typename LocalMap::device_type device_type;
585 typedef typename LocalMap::local_ordinal_type LO;
586 typedef typename device_type::execution_space execution_space;
587 typedef typename Kokkos::View<LO*,device_type>::size_type size_type;
588 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
// The loop bound comes from permute_to_lids; permute_from_lids is indexed
// with the same i.
590 const size_type num_permute_to_lids = permute_to_lids.extent(0);
593 range_policy(0, num_permute_to_lids),
594 KOKKOS_LAMBDA(
const size_t i) {
595 typedef typename std::remove_reference<decltype(new_start_row(0)) >::type atomic_incr_type;
597 const LO src_lid = permute_from_lids(i);
598 const size_t src_row = local_graph.row_map(src_lid);
600 const LO tgt_lid = permute_to_lids(i);
601 const size_t tgt_row = tgt_rowptr(tgt_lid);
// nsr: number of entries in the source row.
603 size_t nsr = local_graph.row_map(src_lid+1)
604 - local_graph.row_map(src_lid);
605 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
607 for (
size_t j=local_graph.row_map(src_lid);
608 j<local_graph.row_map(src_lid+1); ++j) {
609 LO src_col = local_graph.entries(j);
610 tgt_colind(tgt_row + j - src_row) = local_col_map.
getGlobalElement(src_col);
// Entries whose source PID equals my_pid are recorded as -1.
611 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpack every imported row directly into the target CRS arrays.
//
// For each i in [0, import_lids.size()):
//   * num_ent is half the row's packet count,
//   * a range [start_row, start_row + num_ent) is reserved in the target row
//     import_lids(i) by atomically advancing new_start_row for that row,
//   * unpackRow writes into subviews of tgt_colind and tgt_pids over that
//     range, and its return value is summed into the reduction variable,
//   * every unpacked PID equal to my_pid is replaced by -1.
// Afterwards std::invalid_argument is raised when gbl_err_count is nonzero.
//
// NOTE(review): the return type, the local_col_map / my_pid parameters, the
// declaration of gbl_err_count and the tail of the parallel_reduce call are on
// lines missing from this listing.
617 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
619 unpackAndCombineIntoCrsArrays2(
620 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
621 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
622 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
623 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
624 const Kokkos::View<const typename LocalMap::local_ordinal_type*, typename LocalMap::device_type>& import_lids,
625 const Kokkos::View<const Packet*, BufferDevice>& imports,
626 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
627 const LocalGraph& local_graph,
632 using Kokkos::subview;
633 using Kokkos::MemoryUnmanaged;
634 using Kokkos::parallel_reduce;
635 using Kokkos::atomic_fetch_add;
637 typedef Packet packet_type;
638 typedef BufferDevice buffer_device_type;
639 typedef typename LocalMap::device_type device_type;
640 typedef typename LocalMap::local_ordinal_type LO;
641 typedef typename LocalMap::global_ordinal_type GO;
642 typedef typename device_type::execution_space execution_space;
643 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
644 typedef typename Kokkos::pair<size_type, size_type> slice;
645 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
647 typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
648 typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
650 const size_type num_import_lids = import_lids.size();
651 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
655 parallel_reduce(
"Unpack and combine into CRS",
656 range_policy(0, num_import_lids),
657 KOKKOS_LAMBDA(
const size_t i,
int& err) {
658 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
659 const size_t num_packets_this_lid = num_packets_per_lid(i);
660 const size_t num_ent = num_packets_this_lid / 2;
661 const size_t offset = offsets(i);
662 const LO lcl_row = import_lids(i);
// atomic_fetch_add returns the value before the add, which becomes the
// first write position of this row's block.
663 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
664 const size_t end_row = start_row + num_ent;
666 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
667 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
669 err += unpackRow<packet_type,GO,device_type,buffer_device_type>(
670 gids_out, pids_out, imports, offset, num_ent);
// Replace PIDs equal to my_pid with -1.
673 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
674 const int pid = pids_out(j);
675 pids_out(j) = (pid != my_pid) ? pid : -1;
679 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
680 std::invalid_argument, prefix <<
681 "Attempting to unpack PIDs, but num_ent is not even; this should never " 682 "happen! Please report this bug to the Tpetra developers.");
// Build the target CRS arrays (row pointers, global column indices, owning
// PIDs) from the source graph's same/permuted rows plus the imported rows.
//
// Sequence visible in this listing:
//   1. a parallel loop over [0, N+1) (body on a missing line -- presumably
//      zero-initialising tgt_rowptr; TODO confirm),
//   2. tgt_rowptr(i) = source row length for the num_same_ids shared rows,
//   3. the same for each (permute_to_lids(i), permute_from_lids(i)) pair,
//   4. allocation of an `offsets` view of num_import_lids+1 entries; in
//      HAVE_TPETRA_DEBUG builds its last entry must equal imports.extent(0),
//      else std::logic_error,
//   5. setupRowPointersForRemotes adds the imported rows' entry counts,
//   6. makeCrsRowPtrFromLengths turns tgt_rowptr into row pointers and fills
//      new_start_row,
//   7. tgt_rowptr(N) must equal tgt_num_nonzeros, else std::invalid_argument,
//   8. copyDataFromSameIDs and copyDataFromPermuteIDs copy local rows,
//   9. a check on imports.extent(0) <= 0 (body on a missing line), then
//      unpackAndCombineIntoCrsArrays2 unpacks the imported rows.
//
// NOTE(review): the return type, function name line, the local_col_map
// parameter, the local_map_type typedef and the parallel_for call heads are on
// lines missing from this listing.
//
// Fix in this revision: the "CRS_rowptr[last]" error message lacked a space
// before "!=", so the value and the operator ran together in the output.
687 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
690 const LocalGraph & local_graph,
692 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
693 typename LocalMap::device_type,
694 Kokkos::MemoryUnmanaged>& import_lids,
695 const Kokkos::View<const Packet*, BufferDevice>& imports,
696 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
697 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
698 typename LocalMap::device_type,
699 Kokkos::MemoryUnmanaged>& permute_to_lids,
700 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
701 typename LocalMap::device_type,
702 Kokkos::MemoryUnmanaged>& permute_from_lids,
703 const Kokkos::View<
size_t*,
704 typename LocalMap::device_type,
705 Kokkos::MemoryUnmanaged>& tgt_rowptr,
706 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
707 typename LocalMap::device_type,
708 Kokkos::MemoryUnmanaged>& tgt_colind,
709 const Kokkos::View<
const int*,
710 typename LocalMap::device_type,
711 Kokkos::MemoryUnmanaged>& src_pids,
712 const Kokkos::View<
int*,
713 typename LocalMap::device_type,
714 Kokkos::MemoryUnmanaged>& tgt_pids,
715 const size_t num_same_ids,
716 const size_t tgt_num_rows,
717 const size_t tgt_num_nonzeros,
718 const int my_tgt_pid)
721 using Kokkos::subview;
722 using Kokkos::parallel_for;
723 using Kokkos::MemoryUnmanaged;
724 typedef Packet packet_type;
726 typedef LocalGraph local_graph_type;
727 typedef BufferDevice buffer_device_type;
728 typedef typename LocalMap::device_type device_type;
729 typedef typename LocalMap::local_ordinal_type LO;
730 typedef typename device_type::execution_space execution_space;
731 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
732 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
734 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
736 const size_t N = tgt_num_rows;
737 const size_t mynnz = tgt_num_nonzeros;
741 const int my_pid = my_tgt_pid;
// Step 1: loop over all N+1 row-pointer slots.
745 range_policy(0, N+1),
746 KOKKOS_LAMBDA(
const size_t i) {
// Step 2: shared rows -- store each row's length in tgt_rowptr.
753 range_policy(0, num_same_ids),
754 KOKKOS_LAMBDA(
const size_t i) {
755 const LO tgt_lid =
static_cast<LO
>(i);
756 const LO src_lid =
static_cast<LO
>(i);
757 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
758 - local_graph.row_map(src_lid);
// Step 3: permuted rows -- same, with source/target ids taken from the
// permute lists.
763 const size_type num_permute_to_lids = permute_to_lids.extent(0);
765 range_policy(0, num_permute_to_lids),
766 KOKKOS_LAMBDA(
const size_t i) {
767 const LO tgt_lid = permute_to_lids(i);
768 const LO src_lid = permute_from_lids(i);
769 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
770 - local_graph.row_map(src_lid);
// Step 4: per-import-row offsets into `imports`.
775 const size_type num_import_lids = import_lids.extent(0);
776 View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
779 #ifdef HAVE_TPETRA_DEBUG 781 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
782 const bool condition =
783 nth_offset_h !=
static_cast<size_t>(imports.extent(0));
784 TEUCHOS_TEST_FOR_EXCEPTION
785 (condition, std::logic_error, prefix
786 <<
"The final offset in bytes " << nth_offset_h
787 <<
" != imports.size() = " << imports.extent(0)
788 <<
". Please report this bug to the Tpetra developers.");
790 #endif // HAVE_TPETRA_DEBUG 793 setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
794 tgt_rowptr, import_lids, imports, num_packets_per_lid);
// Step 6: new_start_row tracks the next free slot in each target row.
798 View<size_t*, device_type> new_start_row(
"new_start_row", N+1);
801 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Step 7: the last row pointer must match the expected nonzero count.
803 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
804 bool condition = nth_tgt_rowptr_h != mynnz;
805 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
806 prefix <<
"CRS_rowptr[last] = " <<
807 nth_tgt_rowptr_h <<
" != mynnz = " << mynnz <<
".");
// Step 8: copy locally available rows.
811 copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
812 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
814 copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
815 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
816 local_graph, local_col_map, my_pid);
// Step 9: imported rows.
818 if (imports.extent(0) <= 0) {
822 unpackAndCombineIntoCrsArrays2<
823 packet_type,local_graph_type,local_map_type,buffer_device_type>(
824 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
825 num_packets_per_lid, local_graph, local_col_map, my_pid);
// Host-array (Teuchos::ArrayView) entry point for unpack-and-combine.
//
// In the lines shown, the raw pointers and sizes of imports,
// numPacketsPerLID and importLIDs are passed to view-construction calls whose
// heads are on missing lines (the `true` argument and the label strings are
// visible).  The source graph's column map supplies the local map, and the
// work is forwarded to UnpackAndCombineCrsGraphImpl::unpackAndCombine with
// the unpack-PIDs argument fixed to false.
//
// NOTE(review): the return type, function name line, several parameters
// (sourceGraph, imports, combineMode, atomic, ...), and the typedefs for
// packet_type / local_graph_type / buffer_device_type are on lines missing
// from this listing.
868 template<
class LO,
class GO,
class Node>
873 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
874 const Teuchos::ArrayView<const LO>& importLIDs,
875 size_t constantNumPackets,
881 typedef typename Node::device_type device_type;
885 static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
886 "Node::device_type and LocalGraph::device_type must be the same.");
// Default-constructed device objects for the graph device and buffer device.
888 typedef typename device_type::execution_space execution_space;
889 typename execution_space::device_type outputDevice;
891 typedef typename buffer_device_type::execution_space buffer_execution_space;
892 typename buffer_execution_space::device_type bufferOutputDevice;
901 imports.getRawPtr(), imports.size(),
904 auto num_packets_per_lid_d =
906 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
907 true,
"num_packets_per_lid");
911 importLIDs.getRawPtr(), importLIDs.size(),
912 true,
"import_lids");
915 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
918 typedef decltype(local_col_map) local_map_type;
// unpack_pids is passed as false here.
919 UnpackAndCombineCrsGraphImpl::unpackAndCombine<
920 packet_type,local_graph_type,local_map_type,buffer_device_type>(
921 local_graph, local_col_map, imports_d, num_packets_per_lid_d,
922 import_lids_d, combineMode,
false, atomic);
// Kokkos::DualView entry point for unpack-and-combine.
//
// In the lines shown, numPacketsPerLID and imports are synced to and viewed
// in the buffer memory space, importLIDs in the graph device's memory space;
// the sync calls go through `_nc` objects declared on missing lines.  The
// source graph's column map supplies the local map, and the work is forwarded
// to UnpackAndCombineCrsGraphImpl::unpackAndCombine with the unpack-PIDs
// argument fixed to false.
//
// NOTE(review): the return type, several parameters (sourceGraph, imports,
// combineMode, atomic, ...), the crs_graph_type typedef, the `_nc`
// declarations, the tail of the static_assert and local_graph are on lines
// missing from this listing.
927 template<
class LO,
class GO,
class Node>
929 unpackCrsGraphAndCombineNew(
933 const Kokkos::DualView<
const size_t*,
935 const Kokkos::DualView<const LO*, typename Node::device_type>& importLIDs,
936 const size_t constantNumPackets,
943 typedef typename Node::device_type device_type;
945 typedef typename crs_graph_type::packet_type packet_type;
946 typedef typename crs_graph_type::local_graph_type local_graph_type;
947 typedef typename crs_graph_type::buffer_device_type buffer_device_type;
948 typedef typename buffer_device_type::memory_space buffer_memory_space;
949 typedef typename device_type::memory_space memory_space;
951 static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
952 "Node::device_type and LocalGraph::device_type must be " 957 numPacketsPerLID_nc.template sync<buffer_memory_space>();
959 auto num_packets_per_lid_d = numPacketsPerLID.template view<buffer_memory_space>();
963 importLIDs_nc.template sync<memory_space>();
965 auto import_lids_d = importLIDs.template view<memory_space>();
969 imports_nc.template sync<buffer_memory_space>();
971 auto imports_d = imports.template view<buffer_memory_space>();
974 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
975 typedef decltype(local_col_map) local_map_type;
// unpack_pids is passed as false here.
978 UnpackAndCombineCrsGraphImpl::unpackAndCombine<
979 packet_type,local_graph_type,local_map_type,buffer_device_type>(
980 local_graph, local_col_map, imports_d, num_packets_per_lid_d,
981 import_lids_d, combineMode,
false, atomic);
// Host-array entry point that validates its inputs and forwards to a helper
// taking (local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d,
// numSameIDs).
//
// Raises std::invalid_argument when:
//   * permuteToLIDs and permuteFromLIDs differ in size,
//   * `locallyIndexed` is false (its definition is on a missing line),
//   * importLIDs and numPacketsPerLID differ in size.
//
// NOTE(review): the return type, function name line, several parameters
// (sourceGraph, imports, distor, combineMode, numSameIDs), the view
// construction call heads, the callee name and the return statement are on
// lines missing from this listing.
1036 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1040 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1042 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1043 size_t constantNumPackets,
1047 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1048 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1050 using Kokkos::MemoryUnmanaged;
1052 typedef typename Node::device_type device_type;
1056 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The two permute lists are indexed in lockstep, so they must match in size.
1058 TEUCHOS_TEST_FOR_EXCEPTION
1059 (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1060 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size() <<
" != " 1061 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1065 TEUCHOS_TEST_FOR_EXCEPTION
1066 (! locallyIndexed, std::invalid_argument, prefix <<
"The input " 1067 "CrsGraph 'sourceGraph' must be locally indexed.");
1068 TEUCHOS_TEST_FOR_EXCEPTION
1069 (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1070 prefix <<
"importLIDs.size() = " << importLIDs.size() <<
" != " 1071 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Raw host arrays are handed to view-construction calls (heads missing).
1074 auto permute_from_lids_d =
1076 permuteFromLIDs.getRawPtr(),
1077 permuteFromLIDs.size(),
true,
1078 "permute_from_lids");
1081 imports.getRawPtr(),
1082 imports.size(),
true,
1084 auto num_packets_per_lid_d =
1086 numPacketsPerLID.getRawPtr(),
1087 numPacketsPerLID.size(),
true,
1088 "num_packets_per_lid");
1091 packet_type,local_graph_type,buffer_device_type>(
1092 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
// Host-array entry point that fills caller-provided CRS arrays.
//
// Raises std::invalid_argument when:
//   * CRS_rowptr.size() != TargetNumRows + 1,
//   * permuteToLIDs and permuteFromLIDs differ in size,
//   * importLIDs and numPacketsPerLID differ in size.
//
// TargetPids is resized to TargetNumNonzeros if needed and filled with -1.
// The host arrays are then handed to view-construction calls (heads on
// missing lines), the implementation routine is invoked with those views,
// and HostMirror views are created over the CRS_rowptr, CRS_colind and
// TargetPids storage (the statements that use them -- presumably copies back
// to the host -- are on missing lines; TODO confirm).
//
// NOTE(review): the return type, function name line, several parameters
// (sourceGraph, imports, distor, combineMode), and the typedefs for
// packet_type / local_graph_type / buffer_device_type are on lines missing
// from this listing.
//
// Fix in this revision: two error messages lacked a space before "!=", so the
// printed size and the operator ran together; they now match the " != "
// spacing already used by the third message in this function.
1108 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1112 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1114 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1115 const size_t constantNumPackets,
1118 const size_t numSameIDs,
1119 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1120 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1121 size_t TargetNumRows,
1122 size_t TargetNumNonzeros,
1123 const int MyTargetPID,
1124 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1125 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1126 const Teuchos::ArrayView<const int>& SourcePids,
1127 Teuchos::Array<int>& TargetPids)
1131 using Teuchos::ArrayView;
1132 using Teuchos::outArg;
1133 using Teuchos::REDUCE_MAX;
1134 using Teuchos::reduceAll;
1135 typedef LocalOrdinal LO;
1139 typedef typename Node::device_type device_type;
1140 typedef typename device_type::execution_space execution_space;
1141 typedef typename buffer_device_type::execution_space buffer_execution_space;
1142 typedef typename ArrayView<const LO>::size_type size_type;
1144 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
// The row-pointer array needs one slot per row plus a terminating entry.
1146 TEUCHOS_TEST_FOR_EXCEPTION(
1147 TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1148 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1149 CRS_rowptr.size() <<
" != TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1151 TEUCHOS_TEST_FOR_EXCEPTION(
1152 permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1153 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size()
1154 <<
" != permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1155 const size_type numImportLIDs = importLIDs.size();
1157 TEUCHOS_TEST_FOR_EXCEPTION(
1158 numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1159 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != " 1160 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Size the output PID array to the nonzero count and fill it with -1.
1163 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1164 TargetPids.resize(TargetNumNonzeros);
1166 TargetPids.assign(TargetNumNonzeros, -1);
1170 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
1173 typename execution_space::device_type outputDevice;
1174 typename buffer_execution_space::device_type bufferOutputDevice;
// Arguments of the view-construction calls, one per host array.
1177 importLIDs.getRawPtr(), importLIDs.size(),
1178 true,
"import_lids");
1181 imports.getRawPtr(), imports.size(),
1185 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1186 true,
"num_packets_per_lid");
1189 permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1190 true,
"permute_from_lids");
1193 permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1194 true,
"permute_to_lids");
1197 CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1198 true,
"crs_rowptr");
1201 CRS_colind.getRawPtr(), CRS_colind.size(),
1202 true,
"crs_colidx");
1205 SourcePids.getRawPtr(), SourcePids.size(),
1209 TargetPids.getRawPtr(), TargetPids.size(),
1212 typedef decltype(local_col_map) local_map_type;
1214 packet_type,local_graph_type,local_map_type,buffer_device_type>(
1215 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1216 permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1217 tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
// Host views wrapping the caller's output arrays.
1220 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1221 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1224 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1225 CRS_colind.getRawPtr(), CRS_colind.size());
1228 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1229 TargetPids.getRawPtr(), TargetPids.size());
1237 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \ 1239 Details::unpackCrsGraphAndCombine<LO, GO, NT>( \ 1240 const CrsGraph<LO, GO, NT>&, \ 1241 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \ 1242 const Teuchos::ArrayView<const size_t>&, \ 1243 const Teuchos::ArrayView<const LO>&, \ 1249 Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \ 1250 const CrsGraph<LO, GO, NT>&, \ 1251 const Kokkos::DualView<const typename CrsGraph<LO,GO,NT>::packet_type*, \ 1252 typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \ 1253 const Kokkos::DualView<const size_t*, typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \ 1254 const Kokkos::DualView<const LO*, NT::device_type>&, \ 1257 const CombineMode, \ 1260 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \ 1261 const CrsGraph<LO, GO, NT> &, \ 1262 const Teuchos::ArrayView<const LO>&, \ 1263 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \ 1264 const Teuchos::ArrayView<const size_t>&, \ 1267 const CombineMode, \ 1269 const Teuchos::ArrayView<const LO>&, \ 1270 const Teuchos::ArrayView<const LO>&, \ 1274 const Teuchos::ArrayView<size_t>&, \ 1275 const Teuchos::ArrayView<GO>&, \ 1276 const Teuchos::ArrayView<const int>&, \ 1277 Teuchos::Array<int>&); \ 1279 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \ 1280 const CrsGraph<LO, GO, NT> &, \ 1281 const Teuchos::ArrayView<const LO> &, \ 1282 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \ 1283 const Teuchos::ArrayView<const size_t>&, \ 1288 const Teuchos::ArrayView<const LO>&, \ 1289 const Teuchos::ArrayView<const LO>&); 1291 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP Namespace Tpetra contains the class and methods constituting the Tpetra library.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
KOKKOS_FUNCTION int unpackRow(typename Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, typename Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
local_graph_type getLocalGraph() const
Get the local graph.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, Device > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &imports, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Set up row pointers for remotes.
Declaration of the Tpetra::CrsGraph class.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GlobalOrdinal > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
KOKKOS_INLINE_FUNCTION GlobalOrdinal getGlobalElement(const LocalOrdinal localIndex) const
Get the global index corresponding to the given local index.
void unpackCrsGraphAndCombine(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, const bool atomic)
Unpack the imported column indices and combine into graph.
Implementation details of Tpetra.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail of Tpetra.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Unpacks and combines a single row of the CrsGraph.
Replace existing values with new values.
"Local" part of Map suitable for Kokkos kernels.
::Kokkos::Details::ArithTraits< GlobalOrdinal >::val_type packet_type
The type of each datum being sent or received in an Import or Export.
Kokkos::StaticCrsGraph< LocalOrdinal, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
void unpackAndCombine(const LocalGraph &local_graph, const LocalMap &local_map, const Kokkos::View< const Packet *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const typename LocalMap::local_ordinal_type *, typename LocalMap::device_type, Kokkos::MemoryUnmanaged > &import_lids, const Tpetra::CombineMode combine_mode, const bool unpack_pids, const bool atomic)
Perform the unpack operation for the graph.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
bool isLocallyIndexed() const override
If graph indices are in the local range, this function returns true. Otherwise, this function returns false.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.