Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
55 #include "Kokkos_Core.hpp"
56 #include <memory>
57 #include <string>
58 
79 
80 namespace Tpetra {
81 
82 #ifndef DOXYGEN_SHOULD_SKIP_THIS
83 // Forward declaration of Distributor
84 class Distributor;
85 #endif // DOXYGEN_SHOULD_SKIP_THIS
86 
87 //
88 // Users must never rely on anything in the Details namespace.
89 //
90 namespace Details {
91 
92 namespace UnpackAndCombineCrsMatrixImpl {
93 
106 template<class ST, class LO, class GO, class DT, class BDT>
107 KOKKOS_FUNCTION int
108 unpackRow(typename PackTraits<GO, DT>::output_array_type& gids_out,
109  typename PackTraits<int, DT>::output_array_type& pids_out,
110  typename PackTraits<ST, DT>::output_array_type& vals_out,
111  const Kokkos::View<const char*, BDT>& imports,
112  const size_t offset,
113  const size_t num_bytes,
114  const size_t num_ent,
115  const size_t num_bytes_per_value)
116 {
117  if (num_ent == 0) {
118  // Empty rows always take zero bytes, to ensure sparsity.
119  return 0;
120  }
121  bool unpack_pids = pids_out.size() > 0;
122 
123  const size_t num_ent_beg = offset;
124  const size_t num_ent_len = PackTraits<LO, BDT>::packValueCount (LO (0));
125 
126  const size_t gids_beg = num_ent_beg + num_ent_len;
127  const size_t gids_len =
128  num_ent * PackTraits<GO, BDT>::packValueCount (GO (0));
129 
130  const size_t pids_beg = gids_beg + gids_len;
131  const size_t pids_len = unpack_pids ?
132  size_t (num_ent * PackTraits<int, BDT>::packValueCount (int (0))) :
133  size_t (0);
134 
135  const size_t vals_beg = gids_beg + gids_len + pids_len;
136  const size_t vals_len = num_ent * num_bytes_per_value;
137 
138  const char* const num_ent_in = imports.data () + num_ent_beg;
139  const char* const gids_in = imports.data () + gids_beg;
140  const char* const pids_in = unpack_pids ? imports.data () + pids_beg : NULL;
141  const char* const vals_in = imports.data () + vals_beg;
142 
143  size_t num_bytes_out = 0;
144  LO num_ent_out;
145  num_bytes_out += PackTraits<LO, BDT>::unpackValue (num_ent_out, num_ent_in);
146  if (static_cast<size_t> (num_ent_out) != num_ent) {
147  return 20; // error code
148  }
149 
150  {
151  Kokkos::pair<int, size_t> p;
152  p = PackTraits<GO, BDT>::unpackArray (gids_out.data (), gids_in, num_ent);
153  if (p.first != 0) {
154  return 21; // error code
155  }
156  num_bytes_out += p.second;
157 
158  if (unpack_pids) {
159  p = PackTraits<int, BDT>::unpackArray (pids_out.data (), pids_in, num_ent);
160  if (p.first != 0) {
161  return 22; // error code
162  }
163  num_bytes_out += p.second;
164  }
165 
166  p = PackTraits<ST, BDT>::unpackArray (vals_out.data (), vals_in, num_ent);
167  if (p.first != 0) {
168  return 23; // error code
169  }
170  num_bytes_out += p.second;
171  }
172 
173  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
174  if (num_bytes_out != expected_num_bytes) {
175  return 24; // error code
176  }
177  return 0; // no errors
178 }
179 
190 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
192  typedef LocalMatrix local_matrix_type;
193  typedef LocalMap local_map_type;
194 
195  typedef typename local_matrix_type::value_type ST;
196  typedef typename local_map_type::local_ordinal_type LO;
197  typedef typename local_map_type::global_ordinal_type GO;
198  typedef typename local_map_type::device_type DT;
199  typedef typename DT::execution_space XS;
200 
201  typedef Kokkos::View<const size_t*, BufferDeviceType>
202  num_packets_per_lid_type;
203  typedef Kokkos::View<const size_t*, DT> offsets_type;
204  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
205  typedef Kokkos::View<const LO*, DT> import_lids_type;
206 
207  typedef Kokkos::View<LO*, DT> lids_scratch_type;
208  typedef Kokkos::View<GO*, DT> gids_scratch_type;
209  typedef Kokkos::View<int*,DT> pids_scratch_type;
210  typedef Kokkos::View<ST*, DT> vals_scratch_type;
211 
212  typedef Kokkos::pair<int, LO> value_type;
213 
214  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
215  "LocalMap::local_ordinal_type and "
216  "LocalMatrix::ordinal_type must be the same.");
217 
218  local_matrix_type local_matrix;
219  local_map_type local_col_map;
220  input_buffer_type imports;
221  num_packets_per_lid_type num_packets_per_lid;
222  import_lids_type import_lids;
223  offsets_type offsets;
224  Tpetra::CombineMode combine_mode;
225  size_t max_num_ent;
226  bool unpack_pids;
227  size_t num_bytes_per_value;
228  bool atomic;
229  Kokkos::Experimental::UniqueToken<XS, Kokkos::Experimental::UniqueTokenScope::Global> tokens;
230  lids_scratch_type lids_scratch;
231  gids_scratch_type gids_scratch;
232  pids_scratch_type pids_scratch;
233  vals_scratch_type vals_scratch;
234 
236  const local_matrix_type& local_matrix_in,
237  const local_map_type& local_col_map_in,
238  const input_buffer_type& imports_in,
239  const num_packets_per_lid_type& num_packets_per_lid_in,
240  const import_lids_type& import_lids_in,
241  const offsets_type& offsets_in,
242  const Tpetra::CombineMode combine_mode_in,
243  const size_t max_num_ent_in,
244  const bool unpack_pids_in,
245  const size_t num_bytes_per_value_in,
246  const bool atomic_in) :
247  local_matrix (local_matrix_in),
248  local_col_map (local_col_map_in),
249  imports (imports_in),
250  num_packets_per_lid (num_packets_per_lid_in),
251  import_lids (import_lids_in),
252  offsets (offsets_in),
253  combine_mode (combine_mode_in),
254  max_num_ent (max_num_ent_in),
255  unpack_pids (unpack_pids_in),
256  num_bytes_per_value (num_bytes_per_value_in),
257  atomic (atomic_in),
258  tokens (XS()),
259  lids_scratch ("pids_scratch", tokens.size() * max_num_ent),
260  gids_scratch ("gids_scratch", tokens.size() * max_num_ent),
261  pids_scratch ("lids_scratch", tokens.size() * max_num_ent),
262  vals_scratch ("vals_scratch", tokens.size() * max_num_ent)
263  {}
264 
265  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
266  {
267  using Tpetra::Details::OrdinalTraits;
268  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
269  }
270 
271  KOKKOS_INLINE_FUNCTION void
272  join (volatile value_type& dst, const volatile value_type& src) const
273  {
274  // `dst` should reflect the first (least) bad index and
275  // all other associated error codes and data. Thus, we need only
276  // check if the `src` object shows an error and if its associated
277  // bad index is less than `dst`'s bad index.
278  using Tpetra::Details::OrdinalTraits;
279  if (src.second != OrdinalTraits<LO>::invalid ()) {
280  // An error in the src; check if
281  // 1. `dst` shows errors
282  // 2. If `dst` does show errors, if src's bad index is less than
283  // *this' bad index
284  if (dst.second == OrdinalTraits<LO>::invalid () ||
285  src.second < dst.second) {
286  dst = src;
287  }
288  }
289  }
290 
291  KOKKOS_INLINE_FUNCTION
292  void operator()(const LO i, value_type& dst) const
293  {
294  using Kokkos::View;
295  using Kokkos::subview;
296  using Kokkos::MemoryUnmanaged;
297  typedef typename XS::size_type size_type;
298  typedef typename Kokkos::pair<size_type, size_type> slice;
299  typedef BufferDeviceType BDT;
300 
301  typedef View<LO*, DT, MemoryUnmanaged> lids_out_type;
302  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
303  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
304  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
305 
306  const size_t num_bytes = num_packets_per_lid(i);
307 
308  // Only unpack data if there is a nonzero number of bytes.
309  if (num_bytes == 0) {
310  return;
311  }
312 
313  // there is actually something in the row
314  const LO import_lid = import_lids[i];
315  const size_t buf_size = imports.size();
316  const size_t offset = offsets(i);
317 
318  // Get the number of entries to expect in the received data for this row.
319  LO num_ent_LO = 0;
320  const char* const in_buf = imports.data () + offset;
321  (void) PackTraits<LO, BDT>::unpackValue (num_ent_LO, in_buf);
322  const size_t num_ent = static_cast<size_t> (num_ent_LO);
323 
324  // Count the number of bytes expected to unpack
325  size_t expected_num_bytes = 0;
326  {
327  expected_num_bytes += PackTraits<LO, BDT>::packValueCount (LO (0));
328  expected_num_bytes += num_ent * PackTraits<GO, BDT>::packValueCount (GO (0));
329  if (unpack_pids) {
330  expected_num_bytes += num_ent * PackTraits<int, BDT>::packValueCount (int (0));
331  }
332  expected_num_bytes += num_ent * PackTraits<ST, BDT>::packValueCount (ST ());
333  }
334 
335  if (expected_num_bytes > num_bytes) {
336  dst = Kokkos::make_pair (1, i); // wrong number of bytes
337  return;
338  }
339 
340  if (offset > buf_size || offset + num_bytes > buf_size) {
341  dst = Kokkos::make_pair (2, i); // out of bounds
342  return;
343  }
344 
345  // Get subviews in to the scratch arrays. The token returned from acquire
346  // is an integer in [0, tokens.size()). It is used to grab a unique (to
347  // this thread) subview of the scratch arrays.
348  const size_type token = tokens.acquire();
349  const size_t a = static_cast<size_t>(token) * max_num_ent;
350  const size_t b = a + num_ent;
351  lids_out_type lids_out = subview(lids_scratch, slice(a, b));
352  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
353  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
354  vals_out_type vals_out = subview(vals_scratch, slice(a, b));
355 
356  // Unpack this row!
357  int unpack_err =
358  unpackRow<ST,LO,GO,DT,BDT>(gids_out, pids_out, vals_out,
359  imports, offset, num_bytes,
360  num_ent, num_bytes_per_value);
361  if (unpack_err != 0) {
362  dst = Kokkos::make_pair (unpack_err, i); // unpack error
363  tokens.release (token);
364  return;
365  }
366 
367  // Column indices come in as global indices, in case the
368  // source object's column Map differs from the target object's
369  // (this's) column Map, and must be converted local index values
370  for (size_t k = 0; k < num_ent; ++k) {
371  lids_out(k) = local_col_map.getLocalElement (gids_out(k));
372  }
373 
374  // Combine the values according to the combine_mode
375  const LO* const lids_raw = const_cast<const LO*> (lids_out.data ());
376  const ST* const vals_raw = const_cast<const ST*> (vals_out.data ());
377  LO num_modified = 0;
378  if (combine_mode == ADD) {
379  num_modified +=
380  local_matrix.sumIntoValues (import_lid, lids_raw, num_ent,
381  vals_raw, false, atomic);
382  }
383  else if (combine_mode == REPLACE) {
384  num_modified +=
385  local_matrix.replaceValues (import_lid, lids_raw, num_ent,
386  vals_raw, false, atomic);
387  }
388  else {
389  dst = Kokkos::make_pair (4, i); // invalid combine mode
390  tokens.release (token);
391  return;
392  }
393 
394  tokens.release (token);
395  }
396 };
397 
398 struct MaxNumEntTag {};
399 struct TotNumEntTag {};
400 
409 template<class LO, class DT, class BDT>
411 public:
412  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
413  typedef Kokkos::View<const size_t*, DT> offsets_type;
414  typedef Kokkos::View<const char*, BDT> input_buffer_type;
415  // This needs to be public, since it appears in the argument list of
416  // public methods (see below). Otherwise, build errors may happen.
417  typedef size_t value_type;
418 
419 private:
420  typedef Kokkos::pair<size_t,size_t> slice;
421 
422  num_packets_per_lid_type num_packets_per_lid;
423  offsets_type offsets;
424  input_buffer_type imports;
425 
426 public:
427  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
428  const offsets_type& offsets_in,
429  const input_buffer_type& imports_in) :
430  num_packets_per_lid (num_packets_per_lid_in),
431  offsets (offsets_in),
432  imports (imports_in)
433  {}
434 
435  KOKKOS_INLINE_FUNCTION void
436  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
437  // Get how many entries to expect in the received data for this row.
438  const size_t num_bytes = num_packets_per_lid(i);
439  if (num_bytes > 0) {
440  LO num_ent_LO = 0; // output argument of unpackValue
441  const char* const in_buf = imports.data () + offsets(i);
442  (void) PackTraits<LO, BDT>::unpackValue (num_ent_LO, in_buf);
443  const size_t num_ent = static_cast<size_t> (num_ent_LO);
444 
445  update = (update < num_ent) ? num_ent : update;
446  }
447  }
448 
449  KOKKOS_INLINE_FUNCTION void
450  join (const MaxNumEntTag,
451  volatile value_type& dst,
452  const volatile value_type& src) const
453  {
454  if (dst < src) dst = src;
455  }
456 
457  KOKKOS_INLINE_FUNCTION void
458  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
459  // Get how many entries to expect in the received data for this row.
460  const size_t num_bytes = num_packets_per_lid(i);
461  if (num_bytes > 0) {
462  LO num_ent_LO = 0; // output argument of unpackValue
463  const char* const in_buf = imports.data () + offsets(i);
464  (void) PackTraits<LO, BDT>::unpackValue (num_ent_LO, in_buf);
465  tot_num_ent += static_cast<size_t> (num_ent_LO);
466  }
467  }
468 };
469 
477 template<class LO, class DT, class BDT>
478 size_t
480  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
481  const Kokkos::View<const size_t*, DT>& offsets,
482  const Kokkos::View<const char*, BDT>& imports)
483 {
484  typedef typename DT::execution_space XS;
485  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
486  MaxNumEntTag> range_policy;
487 
488  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
489  imports);
490  const LO numRowsToUnpack =
491  static_cast<LO> (num_packets_per_lid.extent (0));
492  size_t max_num_ent = 0;
493  Kokkos::parallel_reduce ("Max num entries in CRS",
494  range_policy (0, numRowsToUnpack),
495  functor, max_num_ent);
496  return max_num_ent;
497 }
498 
506 template<class LO, class DT, class BDT>
507 size_t
509  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
510  const Kokkos::View<const size_t*, DT>& offsets,
511  const Kokkos::View<const char*, BDT>& imports)
512 {
513  typedef typename DT::execution_space XS;
514  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
515  size_t tot_num_ent = 0;
516  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
517  imports);
518  const LO numRowsToUnpack =
519  static_cast<LO> (num_packets_per_lid.extent (0));
520  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
521  range_policy (0, numRowsToUnpack),
522  functor, tot_num_ent);
523  return tot_num_ent;
524 }
525 
533 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
534 void
536  const LocalMatrix& local_matrix,
537  const LocalMap& local_map,
538  const Kokkos::View<const char*, BufferDeviceType>& imports,
539  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
541  const Tpetra::CombineMode combine_mode,
542  const bool unpack_pids,
543  const bool atomic)
544 {
545  typedef typename LocalMatrix::value_type ST;
546  typedef typename LocalMap::local_ordinal_type LO;
547  typedef typename LocalMap::device_type DT;
548  typedef typename DT::execution_space XS;
549  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
551 
552  const char prefix[] =
553  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix: ";
554 
555  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
556  if (num_import_lids == 0) {
557  // Nothing to unpack
558  return;
559  }
560 
561  {
562  // Check for correct input
563  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
564  std::invalid_argument,
565  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
566  "static graph (i.e., was constructed with the CrsMatrix constructor "
567  "that takes a const CrsGraph pointer).");
568 
569  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
570  std::invalid_argument,
571  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
572  "(i.e., was constructed with the CrsMatrix constructor that takes a "
573  "const CrsGraph pointer).");
574 
575  // Unknown combine mode!
576  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
577  std::invalid_argument,
578  prefix << "Invalid combine mode; should never get "
579  "here! Please report this bug to the Tpetra developers.");
580 
581  // Check that sizes of input objects are consistent.
582  bool bad_num_import_lids =
583  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
584  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
585  std::invalid_argument,
586  prefix << "importLIDs.size() (" << num_import_lids << ") != "
587  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
588  } // end QA error checking
589 
590  // Get the offsets
591  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
592  computeOffsetsFromCounts(offsets, num_packets_per_lid);
593 
594  // Determine the maximum number of entries in any row in the matrix. The
595  // maximum number of entries is needed to allocate unpack buffers on the
596  // device.
597  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(
598  num_packets_per_lid, offsets, imports);
599 
600  // FIXME (TJF SEP 2017)
601  // The scalar type is not necessarily default constructible
602  size_t num_bytes_per_value = PackTraits<ST, DT>::packValueCount(ST());
603 
604  // Now do the actual unpack!
605  unpack_functor_type f(local_matrix, local_map,
606  imports, num_packets_per_lid, import_lids, offsets, combine_mode,
607  max_num_ent, unpack_pids, num_bytes_per_value, atomic);
608 
609  typename unpack_functor_type::value_type x;
610  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
611  auto x_h = x.to_std_pair();
612  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
613  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code "
614  << x_h.first << " for the first bad row " << x_h.second);
615 
616  return;
617 }
618 
619 template<class LocalMatrix, class BufferDeviceType>
620 size_t
622  const LocalMatrix& local_matrix,
624  const Kokkos::View<const char*, BufferDeviceType>& imports,
625  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
626  const size_t num_same_ids)
627 {
628  using Kokkos::parallel_reduce;
629  typedef typename LocalMatrix::ordinal_type LO;
630  typedef typename LocalMatrix::device_type device_type;
631  typedef typename device_type::execution_space XS;
632  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
633  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
634  typedef BufferDeviceType BDT;
635 
636  size_t count = 0;
637  LO num_items;
638 
639  // Number of matrix entries to unpack (returned by this function).
640  num_items = static_cast<LO>(num_same_ids);
641  if (num_items) {
642  size_t kcnt = 0;
643  parallel_reduce(range_policy(0, num_items),
644  KOKKOS_LAMBDA(const LO lid, size_t& update) {
645  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
646  -local_matrix.graph.row_map[lid]);
647  }, kcnt);
648  count += kcnt;
649  }
650 
651  // Count entries copied directly from the source matrix with permuting.
652  num_items = static_cast<LO>(permute_from_lids.extent(0));
653  if (num_items) {
654  size_t kcnt = 0;
655  parallel_reduce(range_policy(0, num_items),
656  KOKKOS_LAMBDA(const LO i, size_t& update) {
657  const LO lid = permute_from_lids(i);
658  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
659  - local_matrix.graph.row_map[lid]);
660  }, kcnt);
661  count += kcnt;
662  }
663 
664  {
665  // Count entries received from other MPI processes.
666  const size_type np = num_packets_per_lid.extent(0);
667  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
668  computeOffsetsFromCounts(offsets, num_packets_per_lid);
669  count +=
670  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
671  offsets, imports);
672  }
673 
674  return count;
675 }
676 
677 template<class LO, class DT, class BDT>
678 KOKKOS_INLINE_FUNCTION
679 size_t
680 unpackRowCount(const Kokkos::View<const char*, BDT>& imports,
681  const size_t offset,
682  const size_t num_bytes)
683 {
684  LO num_ent_LO = 0;
685  if (num_bytes > 0) {
686  const size_t p_num_bytes = PackTraits<LO, DT>::packValueCount(num_ent_LO);
687  if (p_num_bytes > num_bytes) {
688  return OrdinalTraits<size_t>::invalid();
689  }
690  const char* const in_buf = imports.data () + offset;
691  (void) PackTraits<LO,DT>::unpackValue(num_ent_LO, in_buf);
692  }
693  return static_cast<size_t>(num_ent_LO);
694 }
695 
697 template<class LO, class DT, class BDT>
698 int
699 setupRowPointersForRemotes(
700  const typename PackTraits<size_t, DT>::output_array_type& tgt_rowptr,
701  const typename PackTraits<LO, DT>::input_array_type& import_lids,
702  const Kokkos::View<const char*, BDT>& imports,
703  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
704  const typename PackTraits<size_t, DT>::input_array_type& offsets)
705 {
706  using Kokkos::parallel_reduce;
707  typedef typename DT::execution_space XS;
708  typedef typename PackTraits<size_t,DT>::input_array_type::size_type size_type;
709  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
710 
711  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
712  const size_type N = num_packets_per_lid.extent(0);
713 
714  int errors = 0;
715  parallel_reduce ("Setup row pointers for remotes",
716  range_policy (0, N),
717  KOKKOS_LAMBDA (const size_t i, int& k_error) {
718  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
719  const size_t num_bytes = num_packets_per_lid(i);
720  const size_t offset = offsets(i);
721  const size_t num_ent = unpackRowCount<LO, DT, BDT> (imports, offset, num_bytes);
722  if (num_ent == InvalidNum) {
723  k_error += 1;
724  }
725  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
726  }, errors);
727  return errors;
728 }
729 
730 // Convert array of row lengths to a CRS pointer array
731 template<class DT>
732 void
733 makeCrsRowPtrFromLengths(
734  const typename PackTraits<size_t,DT>::output_array_type& tgt_rowptr,
735  const Kokkos::View<size_t*,DT>& new_start_row)
736 {
737  using Kokkos::parallel_scan;
738  typedef typename DT::execution_space XS;
739  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
740  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
741  const size_type N = new_start_row.extent(0);
742  parallel_scan(range_policy(0, N),
743  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
744  auto cur_val = tgt_rowptr(i);
745  if (final) {
746  tgt_rowptr(i) = update;
747  new_start_row(i) = tgt_rowptr(i);
748  }
749  update += cur_val;
750  }
751  );
752 }
753 
754 template<class LocalMatrix, class LocalMap>
755 void
756 copyDataFromSameIDs(
760  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
763  const LocalMatrix& local_matrix,
764  const LocalMap& local_col_map,
765  const size_t num_same_ids,
766  const int my_pid)
767 {
768  using Kokkos::parallel_for;
769  typedef typename LocalMap::device_type DT;
770  typedef typename LocalMap::local_ordinal_type LO;
771  typedef typename DT::execution_space XS;
772  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
773 
774  parallel_for(range_policy(0, num_same_ids),
775  KOKKOS_LAMBDA(const size_t i) {
776  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
777 
778  const LO src_lid = static_cast<LO>(i);
779  size_t src_row = local_matrix.graph.row_map(src_lid);
780 
781  const LO tgt_lid = static_cast<LO>(i);
782  const size_t tgt_row = tgt_rowptr(tgt_lid);
783 
784  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
785  - local_matrix.graph.row_map(src_lid);
786  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
787 
788  for (size_t j=local_matrix.graph.row_map(src_lid);
789  j<local_matrix.graph.row_map(src_lid+1); ++j) {
790  LO src_col = local_matrix.graph.entries(j);
791  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
792  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
793  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
794  }
795  }
796  );
797 }
798 
799 template<class LocalMatrix, class LocalMap>
800 void
801 copyDataFromPermuteIDs(
805  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
810  const LocalMatrix& local_matrix,
811  const LocalMap& local_col_map,
812  const int my_pid)
813 {
814  using Kokkos::parallel_for;
815  typedef typename LocalMap::device_type DT;
816  typedef typename LocalMap::local_ordinal_type LO;
817  typedef typename DT::execution_space XS;
818  typedef typename PackTraits<LO,DT>::input_array_type::size_type size_type;
819  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
820 
821  const size_type num_permute_to_lids = permute_to_lids.extent(0);
822 
823  parallel_for(range_policy(0, num_permute_to_lids),
824  KOKKOS_LAMBDA(const size_t i) {
825  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
826 
827  const LO src_lid = permute_from_lids(i);
828  const size_t src_row = local_matrix.graph.row_map(src_lid);
829 
830  const LO tgt_lid = permute_to_lids(i);
831  const size_t tgt_row = tgt_rowptr(tgt_lid);
832 
833  size_t nsr = local_matrix.graph.row_map(src_lid+1)
834  - local_matrix.graph.row_map(src_lid);
835  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
836 
837  for (size_t j=local_matrix.graph.row_map(src_lid);
838  j<local_matrix.graph.row_map(src_lid+1); ++j) {
839  LO src_col = local_matrix.graph.entries(j);
840  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
841  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
842  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
843  }
844  }
845  );
846 }
847 
848 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
849 int
850 unpackAndCombineIntoCrsArrays2(
854  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
857  const Kokkos::View<const char*, BufferDeviceType>& imports,
858  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
859  const LocalMatrix& local_matrix,
860  const LocalMap /*& local_col_map*/,
861  const int my_pid,
862  const size_t num_bytes_per_value)
863 {
864  using Kokkos::View;
865  using Kokkos::subview;
866  using Kokkos::MemoryUnmanaged;
867  using Kokkos::parallel_reduce;
868  using Kokkos::atomic_fetch_add;
870  typedef typename LocalMap::device_type DT;
871  typedef typename LocalMap::local_ordinal_type LO;
872  typedef typename LocalMap::global_ordinal_type GO;
873  typedef typename LocalMatrix::value_type ST;
874  typedef typename DT::execution_space XS;
875  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
876  typedef typename Kokkos::pair<size_type, size_type> slice;
877  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
878  typedef BufferDeviceType BDT;
879 
880  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
881  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
882  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
883 
884  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
885 
886  int errors = 0;
887  const size_type num_import_lids = import_lids.size();
888 
889  // RemoteIDs: Loop structure following UnpackAndCombine
890  parallel_reduce ("Unpack and combine into CRS",
891  range_policy (0, num_import_lids),
892  KOKKOS_LAMBDA (const size_t i, int& k_error) {
893  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
894  const size_t num_bytes = num_packets_per_lid(i);
895  const size_t offset = offsets(i);
896  if (num_bytes == 0) {
897  // Empty buffer means that the row is empty.
898  return;
899  }
900  size_t num_ent = unpackRowCount<LO,DT,BDT>(imports, offset, num_bytes);
901  if (num_ent == InvalidNum) {
902  k_error += 1;
903  return;
904  }
905  const LO lcl_row = import_lids(i);
906  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
907  const size_t end_row = start_row + num_ent;
908 
909  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
910  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
911  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
912 
913  k_error += unpackRow<ST,LO,GO,DT,BDT>(gids_out, pids_out, vals_out,
914  imports, offset, num_bytes,
915  num_ent, num_bytes_per_value);
916 
917  // Correct target PIDs.
918  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
919  const int pid = pids_out(j);
920  pids_out(j) = (pid != my_pid) ? pid : -1;
921  }
922  }, errors);
923 
924  return errors;
925 }
926 
927 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
928 void
930  const LocalMatrix & local_matrix,
931  const LocalMap & local_col_map,
933  const Kokkos::View<const char*, BufferDeviceType>& imports,
934  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
942  const size_t num_same_ids,
943  const size_t tgt_num_rows,
944  const size_t tgt_num_nonzeros,
945  const int my_tgt_pid,
946  const size_t num_bytes_per_value)
947 {
948  using Kokkos::View;
949  using Kokkos::subview;
950  using Kokkos::parallel_for;
951  using Kokkos::MemoryUnmanaged;
953  typedef typename LocalMap::device_type DT;
954  typedef typename LocalMap::local_ordinal_type LO;
955  typedef typename DT::execution_space XS;
956  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
957  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
958  typedef BufferDeviceType BDT;
959 
960  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
961 
962  const size_t N = tgt_num_rows;
963  const size_t mynnz = tgt_num_nonzeros;
964 
965  // In the case of reduced communicators, the sourceMatrix won't have
966  // the right "my_pid", so thus we have to supply it.
967  const int my_pid = my_tgt_pid;
968 
969  // Zero the rowptr
970  parallel_for(range_policy(0, N+1),
971  KOKKOS_LAMBDA(const size_t i) {
972  tgt_rowptr(i) = 0;
973  }
974  );
975 
976  // same IDs: Always first, always in the same place
977  parallel_for(range_policy(0, num_same_ids),
978  KOKKOS_LAMBDA(const size_t i) {
979  const LO tgt_lid = static_cast<LO>(i);
980  const LO src_lid = static_cast<LO>(i);
981  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
982  - local_matrix.graph.row_map(src_lid);
983  }
984  );
985 
986  // Permute IDs: Still local, but reordered
987  const size_type num_permute_to_lids = permute_to_lids.extent(0);
988  parallel_for(range_policy(0, num_permute_to_lids),
989  KOKKOS_LAMBDA(const size_t i) {
990  const LO tgt_lid = permute_to_lids(i);
991  const LO src_lid = permute_from_lids(i);
992  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
993  - local_matrix.graph.row_map(src_lid);
994  }
995  );
996 
997  // Get the offsets from the number of packets per LID
998  const size_type num_import_lids = import_lids.extent(0);
999  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1000  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1001 
1002 #ifdef HAVE_TPETRA_DEBUG
1003  {
1004  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1005  const bool condition =
1006  nth_offset_h != static_cast<size_t>(imports.extent (0));
1007  TEUCHOS_TEST_FOR_EXCEPTION
1008  (condition, std::logic_error, prefix
1009  << "The final offset in bytes " << nth_offset_h
1010  << " != imports.size() = " << imports.extent(0)
1011  << ". Please report this bug to the Tpetra developers.");
1012  }
1013 #endif // HAVE_TPETRA_DEBUG
1014 
1015  // Setup row pointers for remotes
1016  int k_error =
1017  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1018  import_lids, imports, num_packets_per_lid, offsets);
1019  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1020  << " Error transferring data to target row pointers. "
1021  "Please report this bug to the Tpetra developers.");
1022 
1023  // If multiple processes contribute to the same row, we may need to
1024  // update row offsets. This tracks that.
1025  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1026 
1027  // Turn row length into a real CRS row pointer
1028  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1029  {
1030  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
1031  bool condition = nth_tgt_rowptr_h != mynnz;
1032  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
1033  prefix << "CRS_rowptr[last] = " <<
1034  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
1035  }
1036 
1037  // SameIDs: Copy the data over
1038  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1039  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1040 
1041  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1042  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1043  local_matrix, local_col_map, my_pid);
1044 
1045  if (imports.extent(0) <= 0) {
1046  return;
1047  }
1048 
1049  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1050  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1051  local_matrix, local_col_map, my_pid, num_bytes_per_value);
1052  TEUCHOS_TEST_FOR_EXCEPTION(
1053  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1054  "should never happen. Please report this bug to the Tpetra developers.");
1055 
1056  return;
1057 }
1058 
1059 } // namespace UnpackAndCombineCrsMatrixImpl
1060 
1100 template<typename ST, typename LO, typename GO, typename Node>
1101 void
1103  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1104  const Teuchos::ArrayView<const char>& imports,
1105  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1106  const Teuchos::ArrayView<const LO>& importLIDs,
1107  size_t constantNumPackets,
1108  Distributor & distor,
1109  CombineMode combineMode,
1110  const bool atomic)
1111 {
1112  using Kokkos::View;
1113  typedef typename Node::device_type device_type;
1114  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_type local_matrix_type;
1115  static_assert (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1116  "Node::device_type and LocalMatrix::device_type must be the same.");
1117 
1118  // Execution space.
1119  typedef typename device_type::execution_space XS;
1120 
1121  // Convert all Teuchos::Array to Kokkos::View.
1122  typename XS::device_type outputDevice;
1123 
1124  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1125  // them to device. Since unpacking is done directly in to the local matrix
1126  // (lclMatrix), no copying needs to be performed after unpacking.
1127  auto num_packets_per_lid_d =
1128  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1129  numPacketsPerLID.size(), true, "num_packets_per_lid");
1130 
1131  auto import_lids_d =
1132  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1133  importLIDs.size(), true, "import_lids");
1134 
1135  auto imports_d =
1136  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1137  imports.size(), true, "imports");
1138 
1139  auto local_matrix = sourceMatrix.getLocalMatrix();
1140  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1141 
1142  // Now do the actual unpack!
1143  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1144  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1145  import_lids_d, combineMode, false, atomic);
1146 
1147  return;
1148 }
1149 
1150 template<typename ST, typename LO, typename GO, typename NT>
1151 void
1152 unpackCrsMatrixAndCombineNew (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1153  const Kokkos::DualView<const char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& imports,
1154  const Kokkos::DualView<const size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
1155  const Kokkos::DualView<const LO*, typename NT::device_type>& importLIDs,
1156  const size_t constantNumPackets,
1157  Distributor& distor,
1158  const CombineMode combineMode,
1159  const bool atomic)
1160 {
1162  using Kokkos::View;
1163  typedef typename NT::device_type device_type;
1164  typedef CrsMatrix<ST, LO, GO, NT> crs_matrix_type;
1165  typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
1166  typedef DistObject<char, LO, GO, NT> dist_object_type;
1167  typedef typename dist_object_type::buffer_device_type buffer_device_type;
1168  typedef typename buffer_device_type::memory_space BMS;
1169  typedef typename device_type::memory_space MS;
1170 
1171  static_assert (std::is_same<device_type,
1172  typename local_matrix_type::device_type>::value,
1173  "NT::device_type and LocalMatrix::device_type must be "
1174  "the same.");
1175 
1176  {
1177  auto numPacketsPerLID_nc = castAwayConstDualView (numPacketsPerLID);
1178  numPacketsPerLID_nc.template sync<BMS> ();
1179  }
1180  auto num_packets_per_lid_d = numPacketsPerLID.template view<BMS> ();
1181 
1182  {
1183  auto importLIDs_nc = castAwayConstDualView (importLIDs);
1184  importLIDs_nc.template sync<MS> ();
1185  }
1186  auto import_lids_d = importLIDs.template view<MS> ();
1187 
1188  {
1189  auto imports_nc = castAwayConstDualView (imports);
1190  imports_nc.template sync<BMS> ();
1191  }
1192  auto imports_d = imports.template view<BMS> ();
1193 
1194  auto local_matrix = sourceMatrix.getLocalMatrix ();
1195  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1196  typedef decltype (local_col_map) local_map_type;
1197 
1198  // Now do the actual unpack!
1199  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1200  local_matrix_type,
1201  local_map_type,
1202  buffer_device_type
1203  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1204  import_lids_d, combineMode, false, atomic);
1205 }
1206 
1253 //
1262 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1263 size_t
1266  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1267  const Teuchos::ArrayView<const char> &imports,
1268  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1269  size_t constantNumPackets,
1270  Distributor &distor,
1271  CombineMode combineMode,
1272  size_t numSameIDs,
1273  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1274  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1275 {
1276  using Kokkos::MemoryUnmanaged;
1277  using Kokkos::View;
1278  typedef typename Node::device_type DT;
1280  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1281 
1282  TEUCHOS_TEST_FOR_EXCEPTION
1283  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1284  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1285  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1286  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1287  // process, then the matrix is neither locally nor globally indexed.
1288  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1289  TEUCHOS_TEST_FOR_EXCEPTION
1290  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1291  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1292  TEUCHOS_TEST_FOR_EXCEPTION
1293  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1294  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1295  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1296 
1297  auto local_matrix = sourceMatrix.getLocalMatrix ();
1298  auto permute_from_lids_d =
1300  permuteFromLIDs.getRawPtr (),
1301  permuteFromLIDs.size (), true,
1302  "permute_from_lids");
1303  auto imports_d =
1305  imports.getRawPtr (),
1306  imports.size (), true,
1307  "imports");
1308  auto num_packets_per_lid_d =
1310  numPacketsPerLID.getRawPtr (),
1311  numPacketsPerLID.size (), true,
1312  "num_packets_per_lid");
1313 
1314  return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1315  local_matrix, permute_from_lids_d, imports_d,
1316  num_packets_per_lid_d, numSameIDs);
1317 }
1318 
1333 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1334 void
1337  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1338  const Teuchos::ArrayView<const char>& imports,
1339  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1340  const size_t constantNumPackets,
1341  Distributor& distor,
1342  const CombineMode combineMode,
1343  const size_t numSameIDs,
1344  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1345  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1346  size_t TargetNumRows,
1347  size_t TargetNumNonzeros,
1348  const int MyTargetPID,
1349  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1350  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1351  const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1352  const Teuchos::ArrayView<const int>& SourcePids,
1353  Teuchos::Array<int>& TargetPids)
1354 {
1356 
1357  using Kokkos::View;
1358  using Kokkos::deep_copy;
1359 
1360  using Teuchos::ArrayView;
1361  using Teuchos::outArg;
1362  using Teuchos::REDUCE_MAX;
1363  using Teuchos::reduceAll;
1364 
1365  typedef LocalOrdinal LO;
1366 
1367  typedef typename Node::device_type DT;
1368  typedef typename DT::execution_space XS;
1369 
1371  typedef typename matrix_type::impl_scalar_type ST;
1372  typedef typename ArrayView<const LO>::size_type size_type;
1373 
1374  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1375 
1376  TEUCHOS_TEST_FOR_EXCEPTION(
1377  TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1378  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1379  CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1380 
1381  TEUCHOS_TEST_FOR_EXCEPTION(
1382  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1383  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1384  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1385  const size_type numImportLIDs = importLIDs.size ();
1386 
1387  TEUCHOS_TEST_FOR_EXCEPTION(
1388  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1389  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1390  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1391 
1392  // Preseed TargetPids with -1 for local
1393  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1394  TargetPids.resize (TargetNumNonzeros);
1395  }
1396  TargetPids.assign (TargetNumNonzeros, -1);
1397 
1398  // Grab pointers for sourceMatrix
1399  auto local_matrix = sourceMatrix.getLocalMatrix();
1400  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1401 
1402  // Convert input arrays to Kokkos::View
1403  typename XS::device_type outputDevice;
1404  auto import_lids_d =
1405  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1406  importLIDs.size(), true, "import_lids");
1407 
1408  auto imports_d =
1409  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1410  imports.size(), true, "imports");
1411 
1412  auto num_packets_per_lid_d =
1413  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1414  numPacketsPerLID.size(), true, "num_packets_per_lid");
1415 
1416  auto permute_from_lids_d =
1417  create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1418  permuteFromLIDs.size(), true, "permute_from_lids");
1419 
1420  auto permute_to_lids_d =
1421  create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1422  permuteToLIDs.size(), true, "permute_to_lids");
1423 
1424  auto crs_rowptr_d =
1425  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1426  CRS_rowptr.size(), true, "crs_rowptr");
1427 
1428  auto crs_colind_d =
1429  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1430  CRS_colind.size(), true, "crs_colidx");
1431 
1432 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1433  static_assert (! std::is_same<
1434  typename std::remove_const<
1435  typename std::decay<
1436  decltype (CRS_vals)
1437  >::type::value_type
1438  >::type,
1439  std::complex<double> >::value,
1440  "CRS_vals::value_type is std::complex<double>; this should never happen"
1441  ", since std::complex does not work in Kokkos::View objects.");
1442 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1443 
1444  auto crs_vals_d =
1445  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1446  CRS_vals.size(), true, "crs_vals");
1447 
1448 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1449  static_assert (! std::is_same<
1450  typename decltype (crs_vals_d)::non_const_value_type,
1451  std::complex<double> >::value,
1452  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1453  "never happen, since std::complex does not work in Kokkos::View objects.");
1454 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1455 
1456  auto src_pids_d =
1457  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1458  SourcePids.size(), true, "src_pids");
1459 
1460  auto tgt_pids_d =
1461  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1462  TargetPids.size(), true, "tgt_pids");
1463 
1464  size_t num_bytes_per_value = 0;
1466  // assume that ST is default constructible
1467  num_bytes_per_value = PackTraits<ST,DT>::packValueCount(ST());
1468  }
1469  else {
1470  // Since the packed data come from the source matrix, we can use the source
1471  // matrix to get the number of bytes per Scalar value stored in the matrix.
1472  // This assumes that all Scalar values in the source matrix require the same
1473  // number of bytes. If the source matrix has no entries on the calling
1474  // process, then we hope that some process does have some idea how big
1475  // a Scalar value is. Of course, if no processes have any entries, then no
1476  // values should be packed (though this does assume that in our packing
1477  // scheme, rows with zero entries take zero bytes).
1478  size_t num_bytes_per_value_l = 0;
1479  if (local_matrix.values.extent(0) > 0) {
1480  const ST& val = local_matrix.values(0);
1481  num_bytes_per_value_l = PackTraits<ST,DT>::packValueCount(val);
1482  } else {
1483  const ST& val = crs_vals_d(0);
1484  num_bytes_per_value_l = PackTraits<ST,DT>::packValueCount(val);
1485  }
1486  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1487  Teuchos::REDUCE_MAX,
1488  num_bytes_per_value_l,
1489  outArg(num_bytes_per_value));
1490  }
1491 
1492 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1493  static_assert (! std::is_same<
1494  typename decltype (crs_vals_d)::non_const_value_type,
1495  std::complex<double> >::value,
1496  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1497  "never happen, since std::complex does not work in Kokkos::View objects.");
1498 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1499 
1500  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1501  local_matrix, local_col_map, import_lids_d, imports_d,
1502  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1503  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1504  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1505  num_bytes_per_value);
1506 
1507  // Copy outputs back to host
1508  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1509  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1510  deep_copy(crs_rowptr_h, crs_rowptr_d);
1511 
1512  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1513  CRS_colind.getRawPtr(), CRS_colind.size());
1514  deep_copy(crs_colind_h, crs_colind_d);
1515 
1516  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1517  CRS_vals.getRawPtr(), CRS_vals.size());
1518  deep_copy(crs_vals_h, crs_vals_d);
1519 
1520  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1521  TargetPids.getRawPtr(), TargetPids.size());
1522  deep_copy(tgt_pids_h, tgt_pids_d);
1523 
1524 }
1525 
1526 } // namespace Details
1527 } // namespace Tpetra
1528 
1529 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1530  template void \
1531  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1532  const CrsMatrix<ST, LO, GO, NT>&, \
1533  const Teuchos::ArrayView<const char>&, \
1534  const Teuchos::ArrayView<const size_t>&, \
1535  const Teuchos::ArrayView<const LO>&, \
1536  size_t, \
1537  Distributor&, \
1538  CombineMode, \
1539  const bool); \
1540  template void \
1541  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1542  const CrsMatrix<ST, LO, GO, NT>&, \
1543  const Kokkos::DualView<const char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1544  const Kokkos::DualView<const size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1545  const Kokkos::DualView<const LO*, NT::device_type>&, \
1546  const size_t, \
1547  Distributor&, \
1548  const CombineMode, \
1549  const bool); \
1550  template void \
1551  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1552  const CrsMatrix<ST, LO, GO, NT> &, \
1553  const Teuchos::ArrayView<const LO>&, \
1554  const Teuchos::ArrayView<const char>&, \
1555  const Teuchos::ArrayView<const size_t>&, \
1556  const size_t, \
1557  Distributor&, \
1558  const CombineMode, \
1559  const size_t, \
1560  const Teuchos::ArrayView<const LO>&, \
1561  const Teuchos::ArrayView<const LO>&, \
1562  size_t, \
1563  size_t, \
1564  const int, \
1565  const Teuchos::ArrayView<size_t>&, \
1566  const Teuchos::ArrayView<GO>&, \
1567  const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1568  const Teuchos::ArrayView<const int>&, \
1569  Teuchos::Array<int>&); \
1570  template size_t \
1571  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1572  const CrsMatrix<ST, LO, GO, NT> &, \
1573  const Teuchos::ArrayView<const LO> &, \
1574  const Teuchos::ArrayView<const char> &, \
1575  const Teuchos::ArrayView<const size_t>&, \
1576  size_t, \
1577  Distributor &, \
1578  CombineMode, \
1579  size_t, \
1580  const Teuchos::ArrayView<const LO>&, \
1581  const Teuchos::ArrayView<const LO>&);
1582 
1583 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Base class for distributed Tpetra objects that support data redistribution.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type, typename LocalMap::device_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode, const bool unpack_pids, const bool atomic)
Perform the unpack operation for the matrix.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
KokkosSparse::CrsMatrix< impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
KOKKOS_INLINE_FUNCTION GlobalOrdinal getGlobalElement(const LocalOrdinal localIndex) const
Get the global index corresponding to the given local index.
Implementation details of Tpetra.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Insert new values that don&#39;t currently exist.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
"Local" part of Map suitable for Kokkos kernels.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, const bool atomic)
Unpack the imported column indices and values, and combine into matrix.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Declaration and definition of Tpetra::Details::getEntryOnHost.
local_matrix_type getLocalMatrix() const
The local sparse matrix.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...