Tpetra parallel linear algebra  Version of the Day
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
50 #include "Teuchos_TimeMonitor.hpp"
51 #ifdef HAVE_TPETRA_DEBUG
52 # include <set>
53 #endif // HAVE_TPETRA_DEBUG
54 
55 //
56 // mfh 25 May 2016: Temporary fix for #393.
57 //
58 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
59 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
60 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
61 // use them in that case, either.
62 //
63 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
64 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
65 // thing for any GCC version.
66 //
67 #if defined(__CUDACC__)
68  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
69 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
70 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
71 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72 
73 #elif defined(__GNUC__)
74 
75 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
76 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
77 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78 
79 #else // some other compiler
80 
81  // Optimistically assume that other compilers aren't broken.
82 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
83 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
84 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85 #endif // defined(__CUDACC__), defined(__GNUC__)
86 
87 
88 namespace Tpetra {
89 namespace Experimental {
90 namespace Classes {
91 
92 namespace Impl {
93 
94 #if 0
95 template<class AlphaCoeffType,
96  class GraphType,
97  class MatrixValuesType,
98  class InVecType,
99  class BetaCoeffType,
100  class OutVecType>
101 class BcrsApplyNoTrans1VecTeamFunctor {
102 private:
103  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
104  "MatrixValuesType must be a Kokkos::View.");
105  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
106  "OutVecType must be a Kokkos::View.");
107  static_assert (Kokkos::Impl::is_view<InVecType>::value,
108  "InVecType must be a Kokkos::View.");
109  static_assert (std::is_same<MatrixValuesType,
110  typename MatrixValuesType::const_type>::value,
111  "MatrixValuesType must be a const Kokkos::View.");
112  static_assert (std::is_same<OutVecType,
113  typename OutVecType::non_const_type>::value,
114  "OutVecType must be a nonconst Kokkos::View.");
115  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
116  "InVecType must be a const Kokkos::View.");
117  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
118  "MatrixValuesType must be a rank-1 Kokkos::View.");
119  static_assert (static_cast<int> (InVecType::rank) == 1,
120  "InVecType must be a rank-1 Kokkos::View.");
121  static_assert (static_cast<int> (OutVecType::rank) == 1,
122  "OutVecType must be a rank-1 Kokkos::View.");
123  typedef typename MatrixValuesType::non_const_value_type scalar_type;
124  typedef typename GraphType::device_type device_type;
125  typedef typename device_type::execution_space execution_space;
126  typedef typename execution_space::scratch_memory_space shmem_space;
127 
128 public:
130  typedef typename std::remove_const<typename GraphType::data_type>::type
133  typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,
134  execution_space,
135  local_ordinal_type> policy_type;
142  void setX (const InVecType& X) { X_ = X; }
143 
150  void setY (const OutVecType& Y) { Y_ = Y; }
151 
155  KOKKOS_INLINE_FUNCTION local_ordinal_type
156  getScratchSizePerTeam () const
157  {
158  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
159  // that have run-time sizes, like some of those in Stokhos.
160  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
161  return blockSize_ * sizeof (y_value_type);
162  }
163 
167  KOKKOS_INLINE_FUNCTION local_ordinal_type
168  getScratchSizePerThread () const
169  {
170  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
171  // that have run-time sizes, like some of those in Stokhos.
172  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
173  return blockSize_ * sizeof (y_value_type);
174  }
175 
176 private:
177  KOKKOS_INLINE_FUNCTION local_ordinal_type
178  getNumLclMeshRows () const
179  {
180  return ptr_.extent (0) == 0 ?
181  static_cast<local_ordinal_type> (0) :
182  static_cast<local_ordinal_type> (ptr_.extent (0) - 1);
183  }
184 
185  static constexpr local_ordinal_type defaultRowsPerTeam = 20;
186 
187 public:
189  local_ordinal_type getNumTeams () const {
190  return 1;
191  // const local_ordinal_type numLclMeshRows = getNumLclMeshRows ();
192  // return (numLclMeshRows + rowsPerTeam_ - 1) / rowsPerTeam_;
193  }
194 
196  BcrsApplyNoTrans1VecTeamFunctor (const typename std::decay<AlphaCoeffType>::type& alpha,
197  const GraphType& graph,
198  const MatrixValuesType& val,
199  const local_ordinal_type blockSize,
200  const InVecType& X,
201  const typename std::decay<BetaCoeffType>::type& beta,
202  const OutVecType& Y,
203  const local_ordinal_type rowsPerTeam = defaultRowsPerTeam) :
204  alpha_ (alpha),
205  ptr_ (graph.row_map),
206  ind_ (graph.entries),
207  val_ (val),
208  blockSize_ (blockSize),
209  X_ (X),
210  beta_ (beta),
211  Y_ (Y),
212  rowsPerTeam_ (rowsPerTeam)
213  {
214  // std::ostringstream os;
215  // os << "Functor ctor: numLclMeshRows = " << getNumLclMeshRows () << std::endl;
216  // std::cerr << os.str ();
217  }
218 
219  KOKKOS_INLINE_FUNCTION void
220  operator () (const typename policy_type::member_type& member) const
221  {
226  using Kokkos::Details::ArithTraits;
227  // I'm not writing 'using Kokkos::make_pair;' here, because that
228  // may break builds for users who make the mistake of putting
229  // 'using namespace std;' in the global namespace. Please don't
230  // ever do that! But just in case you do, I'll take this
231  // precaution.
232  using Kokkos::parallel_for;
233  using Kokkos::subview;
234  typedef local_ordinal_type LO;
235  typedef typename decltype (ptr_)::non_const_value_type offset_type;
236  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
237  shmem_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
238  shared_array_type;
239  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
240  Kokkos::LayoutRight,
241  device_type,
242  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
243  out_little_vec_type;
244  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
245  Kokkos::LayoutRight,
246  device_type,
247  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
248  little_block_type;
249 
250  const LO leagueRank = member.league_rank();
251 
252  // This looks wrong! All the threads are writing into the same local storage.
253  // shared_array_type threadLocalMem =
254  // shared_array_type (member.thread_scratch (1), blockSize_);
255  shared_array_type threadLocalMem =
256  shared_array_type (member.thread_scratch (1), blockSize_ * rowsPerTeam_);
257 
258  // This looks wrong! All the threads are writing into the same local storage.
259  //out_little_vec_type Y_tlm (threadLocalMem.data (), blockSize_, 1);
260 
261  const LO numLclMeshRows = getNumLclMeshRows ();
262  const LO rowBeg = leagueRank * rowsPerTeam_;
263  const LO rowTmp = rowBeg + rowsPerTeam_;
264  const LO rowEnd = rowTmp < numLclMeshRows ? rowTmp : numLclMeshRows;
265 
266  // {
267  // std::ostringstream os;
268  // os << leagueRank << "," << member.team_rank () << ": "
269  // << rowBeg << "," << rowEnd << std::endl;
270  // std::cerr << os.str ();
271  // }
272 
273  // Each team takes rowsPerTeam_ (block) rows.
274  // Each thread in the team takes a single (block) row.
275  parallel_for (Kokkos::TeamThreadRange (member, rowBeg, rowEnd),
276  [&] (const LO& lclRow) {
277  // Each thread in the team gets its own temp storage.
278  out_little_vec_type Y_tlm (threadLocalMem.data () + blockSize_ * member.team_rank (), blockSize_);
279 
280  const offset_type Y_ptBeg = lclRow * blockSize_;
281  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
282  auto Y_cur =
283  subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
284  if (beta_ == ArithTraits<BetaCoeffType>::zero ()) {
285  FILL (Y_tlm, ArithTraits<BetaCoeffType>::zero ());
286  }
287  else if (beta_ == ArithTraits<BetaCoeffType>::one ()) {
288  COPY (Y_cur, Y_tlm);
289  }
290  else { // beta != 0 && beta != 1
291  COPY (Y_cur, Y_tlm);
292  SCAL (beta_, Y_tlm);
293  }
294 
295  if (alpha_ != ArithTraits<AlphaCoeffType>::zero ()) {
296  const offset_type blkBeg = ptr_[lclRow];
297  const offset_type blkEnd = ptr_[lclRow+1];
298  // Precompute to save integer math in the inner loop.
299  const offset_type bs2 = blockSize_ * blockSize_;
300  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
301  ++absBlkOff) {
302  little_block_type A_cur (val_.data () + absBlkOff * bs2,
303  blockSize_, blockSize_);
304  const offset_type X_blkCol = ind_[absBlkOff];
305  const offset_type X_ptBeg = X_blkCol * blockSize_;
306  const offset_type X_ptEnd = X_ptBeg + blockSize_;
307  auto X_cur =
308  subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
309  // Y_tlm += alpha*A_cur*X_cur
310  GEMV (alpha_, A_cur, X_cur, Y_tlm);
311  } // for each entry in current local block row of matrix
312  COPY (Y_tlm, Y_cur);
313  }
314  });
315  }
316 
317 private:
318  typename std::decay<AlphaCoeffType>::type alpha_;
319  typename GraphType::row_map_type::const_type ptr_;
320  typename GraphType::entries_type::const_type ind_;
321  MatrixValuesType val_;
322  local_ordinal_type blockSize_;
323  InVecType X_;
324  typename std::decay<BetaCoeffType>::type beta_;
325  OutVecType Y_;
326  local_ordinal_type rowsPerTeam_;
327 };
328 #endif // 0
329 
330 template<class AlphaCoeffType,
331  class GraphType,
332  class MatrixValuesType,
333  class InVecType,
334  class BetaCoeffType,
335  class OutVecType>
336 class BcrsApplyNoTrans1VecFunctor {
337 private:
338  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
339  "MatrixValuesType must be a Kokkos::View.");
340  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
341  "OutVecType must be a Kokkos::View.");
342  static_assert (Kokkos::Impl::is_view<InVecType>::value,
343  "InVecType must be a Kokkos::View.");
344  static_assert (std::is_same<MatrixValuesType,
345  typename MatrixValuesType::const_type>::value,
346  "MatrixValuesType must be a const Kokkos::View.");
347  static_assert (std::is_same<OutVecType,
348  typename OutVecType::non_const_type>::value,
349  "OutVecType must be a nonconst Kokkos::View.");
350  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
351  "InVecType must be a const Kokkos::View.");
352  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
353  "MatrixValuesType must be a rank-1 Kokkos::View.");
354  static_assert (static_cast<int> (InVecType::rank) == 1,
355  "InVecType must be a rank-1 Kokkos::View.");
356  static_assert (static_cast<int> (OutVecType::rank) == 1,
357  "OutVecType must be a rank-1 Kokkos::View.");
358  typedef typename MatrixValuesType::non_const_value_type scalar_type;
359 
360 public:
361  typedef typename GraphType::device_type device_type;
362 
364  typedef typename std::remove_const<typename GraphType::data_type>::type
365  local_ordinal_type;
367  typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,
368  typename device_type::execution_space,
369  local_ordinal_type> policy_type;
376  void setX (const InVecType& X) { X_ = X; }
377 
384  void setY (const OutVecType& Y) { Y_ = Y; }
385 
386  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
387  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
388 
390  BcrsApplyNoTrans1VecFunctor (const alpha_coeff_type& alpha,
391  const GraphType& graph,
392  const MatrixValuesType& val,
393  const local_ordinal_type blockSize,
394  const InVecType& X,
395  const beta_coeff_type& beta,
396  const OutVecType& Y) :
397  alpha_ (alpha),
398  ptr_ (graph.row_map),
399  ind_ (graph.entries),
400  val_ (val),
401  blockSize_ (blockSize),
402  X_ (X),
403  beta_ (beta),
404  Y_ (Y)
405  {}
406 
407  KOKKOS_INLINE_FUNCTION void
408  operator () (const local_ordinal_type& lclRow) const
409  {
414  using Kokkos::Details::ArithTraits;
415  // I'm not writing 'using Kokkos::make_pair;' here, because that
416  // may break builds for users who make the mistake of putting
417  // 'using namespace std;' in the global namespace. Please don't
418  // ever do that! But just in case you do, I'll take this
419  // precaution.
420  using Kokkos::parallel_for;
421  using Kokkos::subview;
422  typedef typename decltype (ptr_)::non_const_value_type offset_type;
423  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
424  Kokkos::LayoutRight,
425  device_type,
426  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
427  little_block_type;
428 
429  const offset_type Y_ptBeg = lclRow * blockSize_;
430  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
431  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
432 
433  // This version of the code does not use temporary storage.
434  // Each thread writes to its own block of the target vector.
435  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
436  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
437  }
438  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
439  SCAL (beta_, Y_cur);
440  }
441 
442  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
443  const offset_type blkBeg = ptr_[lclRow];
444  const offset_type blkEnd = ptr_[lclRow+1];
445  // Precompute to save integer math in the inner loop.
446  const offset_type bs2 = blockSize_ * blockSize_;
447  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
448  ++absBlkOff) {
449  little_block_type A_cur (val_.data () + absBlkOff * bs2,
450  blockSize_, blockSize_);
451  const offset_type X_blkCol = ind_[absBlkOff];
452  const offset_type X_ptBeg = X_blkCol * blockSize_;
453  const offset_type X_ptEnd = X_ptBeg + blockSize_;
454  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
455 
456  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
457  } // for each entry in current local block row of matrix
458  }
459  }
460 
461 private:
462  alpha_coeff_type alpha_;
463  typename GraphType::row_map_type::const_type ptr_;
464  typename GraphType::entries_type::const_type ind_;
465  MatrixValuesType val_;
466  local_ordinal_type blockSize_;
467  InVecType X_;
468  beta_coeff_type beta_;
469  OutVecType Y_;
470 };
471 
472 template<class AlphaCoeffType,
473  class GraphType,
474  class MatrixValuesType,
475  class InMultiVecType,
476  class BetaCoeffType,
477  class OutMultiVecType>
478 void
479 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
480  const GraphType& graph,
481  const MatrixValuesType& val,
482  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
483  const InMultiVecType& X,
484  const BetaCoeffType& beta,
485  const OutMultiVecType& Y
486 #if 0
487  , const typename std::remove_const<typename GraphType::data_type>::type rowsPerTeam = 20
488 #endif // 0
489  )
490 {
491  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
492  "MatrixValuesType must be a Kokkos::View.");
493  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
494  "OutMultiVecType must be a Kokkos::View.");
495  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
496  "InMultiVecType must be a Kokkos::View.");
497  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
498  "MatrixValuesType must be a rank-1 Kokkos::View.");
499  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
500  "OutMultiVecType must be a rank-2 Kokkos::View.");
501  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
502  "InMultiVecType must be a rank-2 Kokkos::View.");
503 
504  typedef typename MatrixValuesType::const_type matrix_values_type;
505  typedef typename OutMultiVecType::non_const_type out_multivec_type;
506  typedef typename InMultiVecType::const_type in_multivec_type;
507  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
508  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
509  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
510 
511  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
512  static_cast<LO> (0) :
513  static_cast<LO> (graph.row_map.extent (0) - 1);
514  const LO numVecs = Y.extent (1);
515  if (numLocalMeshRows == 0 || numVecs == 0) {
516  return; // code below doesn't handle numVecs==0 correctly
517  }
518 
519  // These assignments avoid instantiating the functor extra times
520  // unnecessarily, e.g., for X const vs. nonconst. We only need the
521  // X const case, so only instantiate for that case.
522  in_multivec_type X_in = X;
523  out_multivec_type Y_out = Y;
524 
525  // The functor only knows how to handle one vector at a time, and it
526  // expects 1-D Views. Thus, we need to know the type of each column
527  // of X and Y.
528  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
529  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
530 #if 0
531  typedef BcrsApplyNoTrans1VecTeamFunctor<alpha_type, GraphType,
532  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
533 #else
534  typedef BcrsApplyNoTrans1VecFunctor<alpha_type, GraphType,
535  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
536 #endif // 0
537  typedef typename functor_type::policy_type policy_type;
538 
539  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
540  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
541 #if 0
542  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0, rowsPerTeam);
543  const LO numTeams = functor.getNumTeams ();
544  policy_type policy (numTeams, Kokkos::AUTO ());
545  {
546  // KJ : hierarchy level of memory allocated e.g., cache (1),
547  // HBM (2), DDR (3), not used for now
548  const LO level = 1;
549  // KJ : for now provide two options for parallelizing (for vs. reduce)
550  const LO scratchSizePerTeam = functor.getScratchSizePerTeam (); // used for team parallel_red
551  const LO scratchSizePerThread = functor.getScratchSizePerThread (); // used for team parallel_for
552  policy =
553  policy.set_scratch_size (level,
554  Kokkos::PerTeam (scratchSizePerTeam),
555  Kokkos::PerThread (scratchSizePerThread));
556  }
557 #else
558  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
559  policy_type policy (0, numLocalMeshRows);
560 #endif // 0
561 
562  // Compute the first column of Y.
563  Kokkos::parallel_for (policy, functor);
564 
565  // Compute the remaining columns of Y.
566  for (LO j = 1; j < numVecs; ++j) {
567  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
568  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
569  functor.setX (X_j);
570  functor.setY (Y_j);
571  Kokkos::parallel_for (policy, functor);
572  }
573 }
574 
575 } // namespace Impl
576 
577 namespace { // (anonymous)
578 
579 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
580 // version that takes two Kokkos::View arguments).
581 template<class Scalar, class LO, class GO, class Node>
582 class GetLocalDiagCopy {
583 public:
584  typedef typename Node::device_type device_type;
585  typedef size_t diag_offset_type;
586  typedef Kokkos::View<const size_t*, device_type,
587  Kokkos::MemoryUnmanaged> diag_offsets_type;
588  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
589  typedef typename global_graph_type::local_graph_type local_graph_type;
590  typedef typename local_graph_type::row_map_type row_offsets_type;
591  typedef typename ::Tpetra::Experimental::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
592  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
593  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
594 
595  // Constructor
596  GetLocalDiagCopy (const diag_type& diag,
597  const values_type& val,
598  const diag_offsets_type& diagOffsets,
599  const row_offsets_type& ptr,
600  const LO blockSize) :
601  diag_ (diag),
602  diagOffsets_ (diagOffsets),
603  ptr_ (ptr),
604  blockSize_ (blockSize),
605  offsetPerBlock_ (blockSize_*blockSize_),
606  val_(val)
607  {}
608 
609  KOKKOS_INLINE_FUNCTION void
610  operator() (const LO& lclRowInd) const
611  {
612  using Kokkos::ALL;
613 
614  // Get row offset
615  const size_t absOffset = ptr_[lclRowInd];
616 
617  // Get offset relative to start of row
618  const size_t relOffset = diagOffsets_[lclRowInd];
619 
620  // Get the total offset
621  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
622 
623  // Get a view of the block. BCRS currently uses LayoutRight
624  // regardless of the device.
625  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
626  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
627  const_little_block_type;
628  const_little_block_type D_in (val_.data () + pointOffset,
629  blockSize_, blockSize_);
630  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
631  COPY (D_in, D_out);
632  }
633 
634  private:
635  diag_type diag_;
636  diag_offsets_type diagOffsets_;
637  row_offsets_type ptr_;
638  LO blockSize_;
639  LO offsetPerBlock_;
640  values_type val_;
641  };
642 } // namespace (anonymous)
643 
644  template<class Scalar, class LO, class GO, class Node>
645  std::ostream&
646  BlockCrsMatrix<Scalar, LO, GO, Node>::
647  markLocalErrorAndGetStream ()
648  {
649  * (this->localError_) = true;
650  if ((*errs_).is_null ()) {
651  *errs_ = Teuchos::rcp (new std::ostringstream ());
652  }
653  return **errs_;
654  }
655 
656  template<class Scalar, class LO, class GO, class Node>
659  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
660  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
661  blockSize_ (static_cast<LO> (0)),
662  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
663  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
664  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
665  offsetPerBlock_ (0),
666  localError_ (new bool (false)),
667  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
668  {
669  }
670 
671  template<class Scalar, class LO, class GO, class Node>
674  const LO blockSize) :
675  dist_object_type (graph.getMap ()),
676  graph_ (graph),
677  rowMeshMap_ (* (graph.getRowMap ())),
678  blockSize_ (blockSize),
679  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
680  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
681  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
682  offsetPerBlock_ (blockSize * blockSize),
683  localError_ (new bool (false)),
684  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
685  {
686  TEUCHOS_TEST_FOR_EXCEPTION(
687  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
688  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
689  "rows (isSorted() is false). This class assumes sorted rows.");
690 
691  graphRCP_ = Teuchos::rcpFromRef(graph_);
692 
693  // Trick to test whether LO is nonpositive, without a compiler
694  // warning in case LO is unsigned (which is generally a bad idea
695  // anyway). I don't promise that the trick works, but it
696  // generally does with gcc at least, in my experience.
697  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
698  TEUCHOS_TEST_FOR_EXCEPTION(
699  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
700  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
701  " <= 0. The block size must be positive.");
702 
703  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
704  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
705 
706  {
707  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
708  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
709 
710  row_map_type ptr_d = graph.getLocalGraph ().row_map;
711  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
712  Kokkos::deep_copy (ptr_h_nc, ptr_d);
713  ptrHost_ = ptr_h_nc;
714  }
715  {
716  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
717  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
718 
719  entries_type ind_d = graph.getLocalGraph ().entries;
720  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
721  Kokkos::deep_copy (ind_h_nc, ind_d);
722  indHost_ = ind_h_nc;
723  }
724 
725  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
726  val_ = decltype (val_) ("val", numValEnt);
727  }
728 
729  template<class Scalar, class LO, class GO, class Node>
732  const map_type& domainPointMap,
733  const map_type& rangePointMap,
734  const LO blockSize) :
735  dist_object_type (graph.getMap ()),
736  graph_ (graph),
737  rowMeshMap_ (* (graph.getRowMap ())),
738  domainPointMap_ (domainPointMap),
739  rangePointMap_ (rangePointMap),
740  blockSize_ (blockSize),
741  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
742  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
743  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
744  offsetPerBlock_ (blockSize * blockSize),
745  localError_ (new bool (false)),
746  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
747  {
748  TEUCHOS_TEST_FOR_EXCEPTION(
749  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
750  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
751  "rows (isSorted() is false). This class assumes sorted rows.");
752 
753  graphRCP_ = Teuchos::rcpFromRef(graph_);
754 
755  // Trick to test whether LO is nonpositive, without a compiler
756  // warning in case LO is unsigned (which is generally a bad idea
757  // anyway). I don't promise that the trick works, but it
758  // generally does with gcc at least, in my experience.
759  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
760  TEUCHOS_TEST_FOR_EXCEPTION(
761  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
762  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
763  " <= 0. The block size must be positive.");
764 
765  {
766  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
767  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
768 
769  row_map_type ptr_d = graph.getLocalGraph ().row_map;
770  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
771  Kokkos::deep_copy (ptr_h_nc, ptr_d);
772  ptrHost_ = ptr_h_nc;
773  }
774  {
775  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
776  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
777 
778  entries_type ind_d = graph.getLocalGraph ().entries;
779  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
780  Kokkos::deep_copy (ind_h_nc, ind_d);
781  indHost_ = ind_h_nc;
782  }
783 
784  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
785  val_ = decltype (val_) ("val", numValEnt);
786  }
787 
788  template<class Scalar, class LO, class GO, class Node>
789  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
792  { // Copy constructor of map_type does a shallow copy.
793  // We're only returning by RCP for backwards compatibility.
794  return Teuchos::rcp (new map_type (domainPointMap_));
795  }
796 
797  template<class Scalar, class LO, class GO, class Node>
798  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
800  getRangeMap () const
801  { // Copy constructor of map_type does a shallow copy.
802  // We're only returning by RCP for backwards compatibility.
803  return Teuchos::rcp (new map_type (rangePointMap_));
804  }
805 
806  template<class Scalar, class LO, class GO, class Node>
807  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
809  getRowMap () const
810  {
811  return graph_.getRowMap();
812  }
813 
814  template<class Scalar, class LO, class GO, class Node>
815  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
817  getColMap () const
818  {
819  return graph_.getColMap();
820  }
821 
822  template<class Scalar, class LO, class GO, class Node>
826  {
827  return graph_.getGlobalNumRows();
828  }
829 
830  template<class Scalar, class LO, class GO, class Node>
831  size_t
834  {
835  return graph_.getNodeNumRows();
836  }
837 
838  template<class Scalar, class LO, class GO, class Node>
839  size_t
842  {
843  return graph_.getNodeMaxNumRowEntries();
844  }
845 
846  template<class Scalar, class LO, class GO, class Node>
847  void
849  apply (const mv_type& X,
850  mv_type& Y,
851  Teuchos::ETransp mode,
852  Scalar alpha,
853  Scalar beta) const
854  {
856  TEUCHOS_TEST_FOR_EXCEPTION(
857  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
858  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
859  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
860  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
861 
862  BMV X_view;
863  BMV Y_view;
864  const LO blockSize = getBlockSize ();
865  try {
866  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
867  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
868  }
869  catch (std::invalid_argument& e) {
870  TEUCHOS_TEST_FOR_EXCEPTION(
871  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
872  "apply: Either the input MultiVector X or the output MultiVector Y "
873  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
874  "graph. BlockMultiVector's constructor threw the following exception: "
875  << e.what ());
876  }
877 
878  try {
879  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
880  // either that or mark fields of this class as 'mutable'. The
881  // problem is that applyBlock wants to do lazy initialization of
882  // temporary block multivectors.
883  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
884  } catch (std::invalid_argument& e) {
885  TEUCHOS_TEST_FOR_EXCEPTION(
886  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
887  "apply: The implementation method applyBlock complained about having "
888  "an invalid argument. It reported the following: " << e.what ());
889  } catch (std::logic_error& e) {
890  TEUCHOS_TEST_FOR_EXCEPTION(
891  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
892  "apply: The implementation method applyBlock complained about a "
893  "possible bug in its implementation. It reported the following: "
894  << e.what ());
895  } catch (std::exception& e) {
896  TEUCHOS_TEST_FOR_EXCEPTION(
897  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
898  "apply: The implementation method applyBlock threw an exception which "
899  "is neither std::invalid_argument nor std::logic_error, but is a "
900  "subclass of std::exception. It reported the following: "
901  << e.what ());
902  } catch (...) {
903  TEUCHOS_TEST_FOR_EXCEPTION(
904  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
905  "apply: The implementation method applyBlock threw an exception which "
906  "is not an instance of a subclass of std::exception. This probably "
907  "indicates a bug. Please report this to the Tpetra developers.");
908  }
909  }
910 
911  template<class Scalar, class LO, class GO, class Node>
912  void
916  Teuchos::ETransp mode,
917  const Scalar alpha,
918  const Scalar beta)
919  {
920  TEUCHOS_TEST_FOR_EXCEPTION(
921  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
922  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
923  "X and Y have different block sizes. X.getBlockSize() = "
924  << X.getBlockSize () << " != Y.getBlockSize() = "
925  << Y.getBlockSize () << ".");
926 
927  if (mode == Teuchos::NO_TRANS) {
928  applyBlockNoTrans (X, Y, alpha, beta);
929  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
930  applyBlockTrans (X, Y, mode, alpha, beta);
931  } else {
932  TEUCHOS_TEST_FOR_EXCEPTION(
933  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
934  "applyBlock: Invalid 'mode' argument. Valid values are "
935  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
936  }
937  }
938 
939  template<class Scalar, class LO, class GO, class Node>
940  void
942  setAllToScalar (const Scalar& alpha)
943  {
944 #ifdef HAVE_TPETRA_DEBUG
945  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::setAllToScalar: ";
946 #endif // HAVE_TPETRA_DEBUG
947 
948  if (this->template need_sync<device_type> ()) {
949  // If we need to sync to device, then the data were last
950  // modified on host. In that case, we should again modify them
951  // on host.
952 #ifdef HAVE_TPETRA_DEBUG
953  TEUCHOS_TEST_FOR_EXCEPTION
954  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
955  prefix << "The matrix's values need sync on both device and host.");
956 #endif // HAVE_TPETRA_DEBUG
957  this->template modify<Kokkos::HostSpace> ();
958  Kokkos::deep_copy (this->template getValues<Kokkos::HostSpace> (), alpha);
959  }
960  else if (this->template need_sync<Kokkos::HostSpace> ()) {
961  // If we need to sync to host, then the data were last modified
962  // on device. In that case, we should again modify them on
963  // device.
964 #ifdef HAVE_TPETRA_DEBUG
965  TEUCHOS_TEST_FOR_EXCEPTION
966  (this->template need_sync<device_type> (), std::runtime_error,
967  prefix << "The matrix's values need sync on both host and device.");
968 #endif // HAVE_TPETRA_DEBUG
969  this->template modify<device_type> ();
970  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
971  }
972  else { // neither host nor device marked as modified, so modify on device
973  this->template modify<device_type> ();
974  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
975  }
976  }
977 
978  template<class Scalar, class LO, class GO, class Node>
979  LO
981  replaceLocalValues (const LO localRowInd,
982  const LO colInds[],
983  const Scalar vals[],
984  const LO numColInds) const
985  {
986 #ifdef HAVE_TPETRA_DEBUG
987  const char prefix[] =
988  "Tpetra::Experimental::BlockCrsMatrix::replaceLocalValues: ";
989 #endif // HAVE_TPETRA_DEBUG
990 
991  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
992  // We modified no values, because the input local row index is
993  // invalid on the calling process. That may not be an error, if
994  // numColInds is zero anyway; it doesn't matter. This is the
995  // advantage of returning the number of valid indices.
996  return static_cast<LO> (0);
997  }
998  const impl_scalar_type* const vIn =
999  reinterpret_cast<const impl_scalar_type*> (vals);
1000  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1001  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1002  const LO perBlockSize = this->offsetPerBlock ();
1003  LO hint = 0; // Guess for the relative offset into the current row
1004  LO pointOffset = 0; // Current offset into input values
1005  LO validCount = 0; // number of valid column indices in colInds
1006 
1007 #ifdef HAVE_TPETRA_DEBUG
1008  TEUCHOS_TEST_FOR_EXCEPTION
1009  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1010  prefix << "The matrix's data were last modified on device, but have "
1011  "not been sync'd to host. Please sync to host (by calling "
1012  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1013  "method.");
1014 #endif // HAVE_TPETRA_DEBUG
1015 
1016  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1017  // version of the data always exists (no lazy allocation for host
1018  // data).
1020  auto vals_host_out =
1021  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1022  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1023 
1024  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1025  const LO relBlockOffset =
1026  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1027  if (relBlockOffset != LINV) {
1028  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1029  // are stored contiguously, with no padding. "Contiguously"
1030  // means that all memory between the first and last entries
1031  // belongs to the block (no striding). "No padding" means
1032  // that getBlockSize() * getBlockSize() is exactly the number
1033  // of entries that the block uses. For another place where
1034  // this assumption is encoded, see sumIntoLocalValues.
1035 
1036  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1037  // little_block_type A_old =
1038  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1039  impl_scalar_type* const A_old =
1040  vals_host_out_raw + absBlockOffset * perBlockSize;
1041  // const_little_block_type A_new =
1042  // getConstLocalBlockFromInput (vIn, pointOffset);
1043  const impl_scalar_type* const A_new = vIn + pointOffset;
1044  // COPY (A_new, A_old);
1045  for (LO i = 0; i < perBlockSize; ++i) {
1046  A_old[i] = A_new[i];
1047  }
1048  hint = relBlockOffset + 1;
1049  ++validCount;
1050  }
1051  }
1052  return validCount;
1053  }
1054 
1055  template <class Scalar, class LO, class GO, class Node>
1056  void
1058  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1059  Kokkos::MemoryUnmanaged>& offsets) const
1060  {
1061  graph_.getLocalDiagOffsets (offsets);
1062  }
1063 
1064  template <class Scalar, class LO, class GO, class Node>
1065  void TPETRA_DEPRECATED
1067  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1068  {
1069  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1070  // this method in CrsGraph too, so don't call it (otherwise build
1071  // warnings will show up and annoy users). Instead, copy results
1072  // in and out, if the memory space requires it.
1073 
1074  const size_t lclNumRows = graph_.getNodeNumRows ();
1075  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1076  offsets.resize (lclNumRows);
1077  }
1078 
1079  // The input ArrayRCP must always be a host pointer. Thus, if
1080  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1081  // to write to that allocation directly as a Kokkos::View.
1082  typedef typename device_type::memory_space memory_space;
1083  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1084  // It is always syntactically correct to assign a raw host
1085  // pointer to a device View, so this code will compile correctly
1086  // even if this branch never runs.
1087  typedef Kokkos::View<size_t*, device_type,
1088  Kokkos::MemoryUnmanaged> output_type;
1089  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1090  graph_.getLocalDiagOffsets (offsetsOut);
1091  }
1092  else {
1093  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1094  graph_.getLocalDiagOffsets (offsetsTmp);
1095  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1096  Kokkos::MemoryUnmanaged> output_type;
1097  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1098  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1099  }
1100  }
1101 
1102  template <class Scalar, class LO, class GO, class Node>
1103  void
1107  const Kokkos::View<impl_scalar_type***, device_type,
1108  Kokkos::MemoryUnmanaged>& D_inv,
1109  const Scalar& omega,
1110  const ESweepDirection direction) const
1111  {
1112  using Kokkos::ALL;
1113  const impl_scalar_type zero =
1114  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1115  const impl_scalar_type one =
1116  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1117  const LO numLocalMeshRows =
1118  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1119  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1120 
1121  // If using (new) Kokkos, replace localMem with thread-local
1122  // memory. Note that for larger block sizes, this will affect the
1123  // two-level parallelization. Look to Stokhos for best practice
1124  // on making this fast for GPUs.
1125  const LO blockSize = getBlockSize ();
1126  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1127  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1128  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1129 
1130  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1131  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1132  if (direction == Forward) {
1133  rowBegin = 1;
1134  rowEnd = numLocalMeshRows+1;
1135  rowStride = 1;
1136  }
1137  else if (direction == Backward) {
1138  rowBegin = numLocalMeshRows;
1139  rowEnd = 0;
1140  rowStride = -1;
1141  }
1142  else if (direction == Symmetric) {
1143  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1144  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1145  return;
1146  }
1147 
1148  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1149  const Scalar minus_omega = -omega;
1150 
1151  if (numVecs == 1) {
1152  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1153  const LO actlRow = lclRow - 1;
1154 
1155  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1156  COPY (B_cur, X_lcl);
1157  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1158 
1159  const size_t meshBeg = ptrHost_[actlRow];
1160  const size_t meshEnd = ptrHost_[actlRow+1];
1161  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1162  const LO meshCol = indHost_[absBlkOff];
1163  const_little_block_type A_cur =
1164  getConstLocalBlockFromAbsOffset (absBlkOff);
1165  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1166 
1167  // X_lcl += alpha*A_cur*X_cur
1168  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1169  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1170  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1171  } // for each entry in the current local row of the matrix
1172 
1173  // NOTE (mfh 20 Jan 2016) The two input Views here are
1174  // unmanaged already, so we don't have to take unmanaged
1175  // subviews first.
1176  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1177  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1178  FILL (X_update, zero);
1179  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1180  } // for each local row of the matrix
1181  }
1182  else {
1183  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1184  for (LO j = 0; j < numVecs; ++j) {
1185  LO actlRow = lclRow-1;
1186 
1187  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1188  COPY (B_cur, X_lcl);
1189  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1190 
1191  const size_t meshBeg = ptrHost_[actlRow];
1192  const size_t meshEnd = ptrHost_[actlRow+1];
1193  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1194  const LO meshCol = indHost_[absBlkOff];
1195  const_little_block_type A_cur =
1196  getConstLocalBlockFromAbsOffset (absBlkOff);
1197  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1198 
1199  // X_lcl += alpha*A_cur*X_cur
1200  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1201  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1202  } // for each entry in the current local row of the matrx
1203 
1204  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1205  auto X_update = X.getLocalBlock (actlRow, j);
1206  FILL (X_update, zero);
1207  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1208  } // for each entry in the current local row of the matrix
1209  } // for each local row of the matrix
1210  }
1211  }
1212 
1213  template <class Scalar, class LO, class GO, class Node>
1214  void
1219  const Scalar& dampingFactor,
1220  const ESweepDirection direction,
1221  const int numSweeps,
1222  const bool zeroInitialGuess) const
1223  {
1224  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1225  // interface for block Gauss-Seidel.
1226  TEUCHOS_TEST_FOR_EXCEPTION(
1227  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1228  "gaussSeidelCopy: Not implemented.");
1229  }
1230 
1231  template <class Scalar, class LO, class GO, class Node>
1232  void
1237  const Teuchos::ArrayView<LO>& rowIndices,
1238  const Scalar& dampingFactor,
1239  const ESweepDirection direction,
1240  const int numSweeps,
1241  const bool zeroInitialGuess) const
1242  {
1243  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1244  // interface for block Gauss-Seidel.
1245  TEUCHOS_TEST_FOR_EXCEPTION(
1246  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1247  "reorderedGaussSeidelCopy: Not implemented.");
1248  }
1249 
1250  template <class Scalar, class LO, class GO, class Node>
1251  void TPETRA_DEPRECATED
1254  const Teuchos::ArrayView<const size_t>& offsets) const
1255  {
1256  using Teuchos::ArrayRCP;
1257  using Teuchos::ArrayView;
1258  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1259 
1260  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1261  const LO* columnIndices;
1262  Scalar* vals;
1263  LO numColumns;
1264  Teuchos::Array<LO> cols(1);
1265 
1266  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1267  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1268  for (size_t i = 0; i < myNumRows; ++i) {
1269  cols[0] = i;
1270  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1271  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1272  }
1273  else {
1274  getLocalRowView (i, columnIndices, vals, numColumns);
1275  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1276  }
1277  }
1278  }
1279 
1280 
1281  template <class Scalar, class LO, class GO, class Node>
1282  void
1284  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1285  Kokkos::MemoryUnmanaged>& diag,
1286  const Kokkos::View<const size_t*, device_type,
1287  Kokkos::MemoryUnmanaged>& offsets) const
1288  {
1289  using Kokkos::parallel_for;
1290  typedef typename device_type::execution_space execution_space;
1291  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1292 
1293  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1294  const LO blockSize = this->getBlockSize ();
1295  TEUCHOS_TEST_FOR_EXCEPTION
1296  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1297  static_cast<LO> (diag.extent (1)) < blockSize ||
1298  static_cast<LO> (diag.extent (2)) < blockSize,
1299  std::invalid_argument, prefix <<
1300  "The input Kokkos::View is not big enough to hold all the data.");
1301  TEUCHOS_TEST_FOR_EXCEPTION
1302  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1303  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1304  "diagonal blocks " << lclNumMeshRows << ".");
1305 
1306 #ifdef HAVE_TPETRA_DEBUG
1307  TEUCHOS_TEST_FOR_EXCEPTION
1308  (this->template need_sync<device_type> (), std::runtime_error,
1309  prefix << "The matrix's data were last modified on host, but have "
1310  "not been sync'd to device. Please sync to device (by calling "
1311  "sync<device_type>() on this matrix) before calling this method.");
1312 #endif // HAVE_TPETRA_DEBUG
1313 
1314  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1315  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1316 
1317  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1318  // we reserve the right to do lazy allocation of device data. (We
1319  // don't plan to do lazy allocation for host data; the host
1320  // version of the data always exists.)
1322  auto vals_dev =
1323  const_cast<this_type*> (this)->template getValues<device_type> ();
1324 
1325  parallel_for (policy_type (0, lclNumMeshRows),
1326  functor_type (diag, vals_dev, offsets,
1327  graph_.getLocalGraph ().row_map, blockSize_));
1328  }
1329 
1330 
1331  template <class Scalar, class LO, class GO, class Node>
1332  void
1334  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1335  Kokkos::MemoryUnmanaged>& diag,
1336  const Teuchos::ArrayView<const size_t>& offsets) const
1337  {
1338  using Kokkos::ALL;
1339  using Kokkos::parallel_for;
1340  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1341  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1342 
1343  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1344  const LO blockSize = this->getBlockSize ();
1345  TEUCHOS_TEST_FOR_EXCEPTION
1346  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1347  static_cast<LO> (diag.extent (1)) < blockSize ||
1348  static_cast<LO> (diag.extent (2)) < blockSize,
1349  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1350  "The input Kokkos::View is not big enough to hold all the data.");
1351  TEUCHOS_TEST_FOR_EXCEPTION
1352  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1353  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1354  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1355  "blocks " << lclNumMeshRows << ".");
1356 
1357  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1358  // quite made everything work with CUDA yet.
1359  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1360  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1361  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1362  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1363  COPY (D_in, D_out);
1364  });
1365  }
1366 
1367 
1368  template<class Scalar, class LO, class GO, class Node>
1369  LO
1371  absMaxLocalValues (const LO localRowInd,
1372  const LO colInds[],
1373  const Scalar vals[],
1374  const LO numColInds) const
1375  {
1376  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1377  // We modified no values, because the input local row index is
1378  // invalid on the calling process. That may not be an error, if
1379  // numColInds is zero anyway; it doesn't matter. This is the
1380  // advantage of returning the number of valid indices.
1381  return static_cast<LO> (0);
1382  }
1383  const impl_scalar_type* const vIn =
1384  reinterpret_cast<const impl_scalar_type*> (vals);
1385  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1386  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1387  const LO perBlockSize = this->offsetPerBlock ();
1388  LO hint = 0; // Guess for the relative offset into the current row
1389  LO pointOffset = 0; // Current offset into input values
1390  LO validCount = 0; // number of valid column indices in colInds
1391 
1392  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1393  const LO relBlockOffset =
1394  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1395  if (relBlockOffset != LINV) {
1396  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1397  little_block_type A_old =
1398  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1399  const_little_block_type A_new =
1400  getConstLocalBlockFromInput (vIn, pointOffset);
1401 
1402  ::Tpetra::Experimental::Impl::absMax (A_old, A_new);
1403  hint = relBlockOffset + 1;
1404  ++validCount;
1405  }
1406  }
1407  return validCount;
1408  }
1409 
1410 
1411  template<class Scalar, class LO, class GO, class Node>
1412  LO
1414  sumIntoLocalValues (const LO localRowInd,
1415  const LO colInds[],
1416  const Scalar vals[],
1417  const LO numColInds) const
1418  {
1419 #ifdef HAVE_TPETRA_DEBUG
1420  const char prefix[] =
1421  "Tpetra::Experimental::BlockCrsMatrix::sumIntoLocalValues: ";
1422 #endif // HAVE_TPETRA_DEBUG
1423 
1424  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1425  // We modified no values, because the input local row index is
1426  // invalid on the calling process. That may not be an error, if
1427  // numColInds is zero anyway; it doesn't matter. This is the
1428  // advantage of returning the number of valid indices.
1429  return static_cast<LO> (0);
1430  }
1431  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1432  const impl_scalar_type* const vIn =
1433  reinterpret_cast<const impl_scalar_type*> (vals);
1434  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1435  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1436  const LO perBlockSize = this->offsetPerBlock ();
1437  LO hint = 0; // Guess for the relative offset into the current row
1438  LO pointOffset = 0; // Current offset into input values
1439  LO validCount = 0; // number of valid column indices in colInds
1440 
1441 #ifdef HAVE_TPETRA_DEBUG
1442  TEUCHOS_TEST_FOR_EXCEPTION
1443  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1444  prefix << "The matrix's data were last modified on device, but have not "
1445  "been sync'd to host. Please sync to host (by calling "
1446  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1447 #endif // HAVE_TPETRA_DEBUG
1448 
1449  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1450  // version of the data always exists (no lazy allocation for host
1451  // data).
1453  auto vals_host_out =
1454  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1455  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1456 
1457  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1458  const LO relBlockOffset =
1459  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1460  if (relBlockOffset != LINV) {
1461  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1462  // are stored contiguously, with no padding. "Contiguously"
1463  // means that all memory between the first and last entries
1464  // belongs to the block (no striding). "No padding" means
1465  // that getBlockSize() * getBlockSize() is exactly the number
1466  // of entries that the block uses. For another place where
1467  // this assumption is encoded, see replaceLocalValues.
1468 
1469  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1470  // little_block_type A_old =
1471  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1472  impl_scalar_type* const A_old =
1473  vals_host_out_raw + absBlockOffset * perBlockSize;
1474  // const_little_block_type A_new =
1475  // getConstLocalBlockFromInput (vIn, pointOffset);
1476  const impl_scalar_type* const A_new = vIn + pointOffset;
1477  // AXPY (ONE, A_new, A_old);
1478  for (LO i = 0; i < perBlockSize; ++i) {
1479  A_old[i] += A_new[i];
1480  }
1481  hint = relBlockOffset + 1;
1482  ++validCount;
1483  }
1484  }
1485  return validCount;
1486  }
1487 
1488  template<class Scalar, class LO, class GO, class Node>
1489  LO
1491  getLocalRowView (const LO localRowInd,
1492  const LO*& colInds,
1493  Scalar*& vals,
1494  LO& numInds) const
1495  {
1496 #ifdef HAVE_TPETRA_DEBUG
1497  const char prefix[] =
1498  "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: ";
1499 #endif // HAVE_TPETRA_DEBUG
1500 
1501  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1502  colInds = NULL;
1503  vals = NULL;
1504  numInds = 0;
1505  return Teuchos::OrdinalTraits<LO>::invalid ();
1506  }
1507  else {
1508  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1509  colInds = indHost_.data () + absBlockOffsetStart;
1510 
1511 #ifdef HAVE_TPETRA_DEBUG
1512  TEUCHOS_TEST_FOR_EXCEPTION
1513  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1514  prefix << "The matrix's data were last modified on device, but have "
1515  "not been sync'd to host. Please sync to host (by calling "
1516  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1517  "method.");
1518 #endif // HAVE_TPETRA_DEBUG
1519 
1520  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1521  // version of the data always exists (no lazy allocation for host
1522  // data).
1524  auto vals_host_out =
1525  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1526  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1527  impl_scalar_type* const vOut = vals_host_out_raw +
1528  absBlockOffsetStart * offsetPerBlock ();
1529  vals = reinterpret_cast<Scalar*> (vOut);
1530 
1531  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1532  return 0; // indicates no error
1533  }
1534  }
1535 
1536  template<class Scalar, class LO, class GO, class Node>
1537  void
1539  getLocalRowCopy (LO LocalRow,
1540  const Teuchos::ArrayView<LO>& Indices,
1541  const Teuchos::ArrayView<Scalar>& Values,
1542  size_t &NumEntries) const
1543  {
1544  const LO *colInds;
1545  Scalar *vals;
1546  LO numInds;
1547  getLocalRowView(LocalRow,colInds,vals,numInds);
1548  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1549  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1550  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1551  << numInds << " row entries");
1552  }
1553  for (LO i=0; i<numInds; ++i) {
1554  Indices[i] = colInds[i];
1555  }
1556  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1557  Values[i] = vals[i];
1558  }
1559  NumEntries = numInds;
1560  }
1561 
1562  template<class Scalar, class LO, class GO, class Node>
1563  LO
1565  getLocalRowOffsets (const LO localRowInd,
1566  ptrdiff_t offsets[],
1567  const LO colInds[],
1568  const LO numColInds) const
1569  {
1570  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1571  // We got no offsets, because the input local row index is
1572  // invalid on the calling process. That may not be an error, if
1573  // numColInds is zero anyway; it doesn't matter. This is the
1574  // advantage of returning the number of valid indices.
1575  return static_cast<LO> (0);
1576  }
1577 
1578  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1579  LO hint = 0; // Guess for the relative offset into the current row
1580  LO validCount = 0; // number of valid column indices in colInds
1581 
1582  for (LO k = 0; k < numColInds; ++k) {
1583  const LO relBlockOffset =
1584  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1585  if (relBlockOffset != LINV) {
1586  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1587  hint = relBlockOffset + 1;
1588  ++validCount;
1589  }
1590  }
1591  return validCount;
1592  }
1593 
1594 
1595  template<class Scalar, class LO, class GO, class Node>
1596  LO
1598  replaceLocalValuesByOffsets (const LO localRowInd,
1599  const ptrdiff_t offsets[],
1600  const Scalar vals[],
1601  const LO numOffsets) const
1602  {
1603  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1604  // We modified no values, because the input local row index is
1605  // invalid on the calling process. That may not be an error, if
1606  // numColInds is zero anyway; it doesn't matter. This is the
1607  // advantage of returning the number of valid indices.
1608  return static_cast<LO> (0);
1609  }
1610  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1611 
1612  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1613  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1614  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1615  size_t pointOffset = 0; // Current offset into input values
1616  LO validCount = 0; // number of valid offsets
1617 
1618  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1619  const size_t relBlockOffset = offsets[k];
1620  if (relBlockOffset != STINV) {
1621  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1622  little_block_type A_old =
1623  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1624  const_little_block_type A_new =
1625  getConstLocalBlockFromInput (vIn, pointOffset);
1626  COPY (A_new, A_old);
1627  ++validCount;
1628  }
1629  }
1630  return validCount;
1631  }
1632 
1633 
1634  template<class Scalar, class LO, class GO, class Node>
1635  LO
1637  absMaxLocalValuesByOffsets (const LO localRowInd,
1638  const ptrdiff_t offsets[],
1639  const Scalar vals[],
1640  const LO numOffsets) const
1641  {
1642  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1643  // We modified no values, because the input local row index is
1644  // invalid on the calling process. That may not be an error, if
1645  // numColInds is zero anyway; it doesn't matter. This is the
1646  // advantage of returning the number of valid indices.
1647  return static_cast<LO> (0);
1648  }
1649  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1650 
1651  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1652  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1653  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1654  size_t pointOffset = 0; // Current offset into input values
1655  LO validCount = 0; // number of valid offsets
1656 
1657  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1658  const size_t relBlockOffset = offsets[k];
1659  if (relBlockOffset != STINV) {
1660  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1661  little_block_type A_old =
1662  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1663  const_little_block_type A_new =
1664  getConstLocalBlockFromInput (vIn, pointOffset);
1665  ::Tpetra::Experimental::Impl::absMax (A_old, A_new);
1666  ++validCount;
1667  }
1668  }
1669  return validCount;
1670  }
1671 
1672 
1673  template<class Scalar, class LO, class GO, class Node>
1674  LO
1676  sumIntoLocalValuesByOffsets (const LO localRowInd,
1677  const ptrdiff_t offsets[],
1678  const Scalar vals[],
1679  const LO numOffsets) const
1680  {
1681  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1682  // We modified no values, because the input local row index is
1683  // invalid on the calling process. That may not be an error, if
1684  // numColInds is zero anyway; it doesn't matter. This is the
1685  // advantage of returning the number of valid indices.
1686  return static_cast<LO> (0);
1687  }
1688  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1689  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1690 
1691  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1692  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1693  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1694  size_t pointOffset = 0; // Current offset into input values
1695  LO validCount = 0; // number of valid offsets
1696 
1697  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1698  const size_t relBlockOffset = offsets[k];
1699  if (relBlockOffset != STINV) {
1700  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1701  little_block_type A_old =
1702  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1703  const_little_block_type A_new =
1704  getConstLocalBlockFromInput (vIn, pointOffset);
1705  //A_old.update (ONE, A_new);
1706  AXPY (ONE, A_new, A_old);
1707  ++validCount;
1708  }
1709  }
1710  return validCount;
1711  }
1712 
1713 
1714  template<class Scalar, class LO, class GO, class Node>
1715  size_t
1717  getNumEntriesInLocalRow (const LO localRowInd) const
1718  {
1719  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1720  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1721  return static_cast<LO> (0); // the calling process doesn't have that row
1722  } else {
1723  return static_cast<LO> (numEntInGraph);
1724  }
1725  }
1726 
1727  template<class Scalar, class LO, class GO, class Node>
1728  void
1732  const Teuchos::ETransp mode,
1733  const Scalar alpha,
1734  const Scalar beta)
1735  {
1736  (void) X;
1737  (void) Y;
1738  (void) mode;
1739  (void) alpha;
1740  (void) beta;
1741 
1742  TEUCHOS_TEST_FOR_EXCEPTION(
1743  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
1744  "transpose and conjugate transpose modes are not yet implemented.");
1745  }
1746 
1747  template<class Scalar, class LO, class GO, class Node>
1748  void
1752  const Scalar alpha,
1753  const Scalar beta)
1754  {
1755  using Teuchos::RCP;
1756  using Teuchos::rcp;
1757  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1758  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1759  const Scalar zero = STS::zero ();
1760  const Scalar one = STS::one ();
1761  RCP<const import_type> import = graph_.getImporter ();
1762  // "export" is a reserved C++ keyword, so we can't use it.
1763  RCP<const export_type> theExport = graph_.getExporter ();
1764  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1765 
1766  if (alpha == zero) {
1767  if (beta == zero) {
1768  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1769  }
1770  else if (beta != one) {
1771  Y.scale (beta);
1772  }
1773  }
1774  else { // alpha != 0
1775  const BMV* X_colMap = NULL;
1776  if (import.is_null ()) {
1777  try {
1778  X_colMap = &X;
1779  }
1780  catch (std::exception& e) {
1781  TEUCHOS_TEST_FOR_EXCEPTION
1782  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1783  "operator= threw an exception: " << std::endl << e.what ());
1784  }
1785  }
1786  else {
1787  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1788  // Y_rowMap_ below. This lets us do lazy initialization
1789  // correctly with view semantics of BlockCrsMatrix. All views
1790  // of this BlockCrsMatrix have the same outer pointer. That
1791  // way, we can set the inner pointer in one view, and all
1792  // other views will see it.
1793  if ((*X_colMap_).is_null () ||
1794  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1795  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1796  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1797  static_cast<LO> (X.getNumVectors ())));
1798  }
1799 #ifdef HAVE_TPETRA_BCRS_DO_POINT_IMPORT
1800  if (pointImporter_->is_null ()) {
1801  // The Import ctor needs RCPs. Map's copy ctor does a shallow copy, so
1802  // these are small operations.
1803  const auto domainPointMap = rcp (new typename BMV::map_type (domainPointMap_));
1804  const auto colPointMap = rcp (new typename BMV::map_type (
1805  BMV::makePointMap (*graph_.getColMap(),
1806  blockSize_)));
1807  *pointImporter_ = rcp (new typename crs_graph_type::import_type (
1808  domainPointMap, colPointMap));
1809  }
1810  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1811  **pointImporter_,
1813 #else
1814  (**X_colMap_).doImport (X, *import, ::Tpetra::REPLACE);
1815 #endif
1816  try {
1817  X_colMap = &(**X_colMap_);
1818  }
1819  catch (std::exception& e) {
1820  TEUCHOS_TEST_FOR_EXCEPTION
1821  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1822  "operator= threw an exception: " << std::endl << e.what ());
1823  }
1824  }
1825 
1826  BMV* Y_rowMap = NULL;
1827  if (theExport.is_null ()) {
1828  Y_rowMap = &Y;
1829  }
1830  else if ((*Y_rowMap_).is_null () ||
1831  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1832  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1833  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1834  static_cast<LO> (X.getNumVectors ())));
1835  try {
1836  Y_rowMap = &(**Y_rowMap_);
1837  }
1838  catch (std::exception& e) {
1839  TEUCHOS_TEST_FOR_EXCEPTION(
1840  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1841  "operator= threw an exception: " << std::endl << e.what ());
1842  }
1843  }
1844 
1845  try {
1846  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1847  }
1848  catch (std::exception& e) {
1849  TEUCHOS_TEST_FOR_EXCEPTION
1850  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1851  "an exception: " << e.what ());
1852  }
1853  catch (...) {
1854  TEUCHOS_TEST_FOR_EXCEPTION
1855  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1856  "an exception not a subclass of std::exception.");
1857  }
1858 
1859  if (! theExport.is_null ()) {
1860  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1861  }
1862  }
1863  }
1864 
1865  template<class Scalar, class LO, class GO, class Node>
1866  void
1870  const Scalar alpha,
1871  const Scalar beta)
1872  {
1873  using ::Tpetra::Experimental::Classes::Impl::bcrsLocalApplyNoTrans;
1874 
1875  const impl_scalar_type alpha_impl = alpha;
1876  const auto graph = this->graph_.getLocalGraph ();
1877  const impl_scalar_type beta_impl = beta;
1878  const LO blockSize = this->getBlockSize ();
1879 
1880  auto X_mv = X.getMultiVectorView ();
1881  auto Y_mv = Y.getMultiVectorView ();
1882  Y_mv.template modify<device_type> ();
1883 
1884  auto X_lcl = X_mv.template getLocalView<device_type> ();
1885  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1886  auto val = this->val_.template view<device_type> ();
1887 
1888  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1889  beta_impl, Y_lcl);
1890  }
1891 
1892  template<class Scalar, class LO, class GO, class Node>
1893  LO
1895  findRelOffsetOfColumnIndex (const LO localRowIndex,
1896  const LO colIndexToFind,
1897  const LO hint) const
1898  {
1899  const size_t absStartOffset = ptrHost_[localRowIndex];
1900  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1901  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1902  // Amortize pointer arithmetic over the search loop.
1903  const LO* const curInd = indHost_.data () + absStartOffset;
1904 
1905  // If the hint was correct, then the hint is the offset to return.
1906  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1907  // Always return the offset relative to the current row.
1908  return hint;
1909  }
1910 
1911  // The hint was wrong, so we must search for the given column
1912  // index in the column indices for the given row.
1913  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1914 
1915  // We require that the graph have sorted rows. However, binary
1916  // search only pays if the current row is longer than a certain
1917  // amount. We set this to 32, but you might want to tune this.
1918  const LO maxNumEntriesForLinearSearch = 32;
1919  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1920  // Use binary search. It would probably be better for us to
1921  // roll this loop by hand. If we wrote it right, a smart
1922  // compiler could perhaps use conditional loads and avoid
1923  // branches (according to Jed Brown on May 2014).
1924  const LO* beg = curInd;
1925  const LO* end = curInd + numEntriesInRow;
1926  std::pair<const LO*, const LO*> p =
1927  std::equal_range (beg, end, colIndexToFind);
1928  if (p.first != p.second) {
1929  // offset is relative to the current row
1930  relOffset = static_cast<LO> (p.first - beg);
1931  }
1932  }
1933  else { // use linear search
1934  for (LO k = 0; k < numEntriesInRow; ++k) {
1935  if (colIndexToFind == curInd[k]) {
1936  relOffset = k; // offset is relative to the current row
1937  break;
1938  }
1939  }
1940  }
1941 
1942  return relOffset;
1943  }
1944 
1945  template<class Scalar, class LO, class GO, class Node>
1946  LO
1948  offsetPerBlock () const
1949  {
1950  return offsetPerBlock_;
1951  }
1952 
1953  template<class Scalar, class LO, class GO, class Node>
1957  const size_t pointOffset) const
1958  {
1959  // Row major blocks
1960  const LO rowStride = blockSize_;
1961  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1962  }
1963 
1964  template<class Scalar, class LO, class GO, class Node>
1968  const size_t pointOffset) const
1969  {
1970  // Row major blocks
1971  const LO rowStride = blockSize_;
1972  return little_block_type (val + pointOffset, blockSize_, rowStride);
1973  }
1974 
1975  template<class Scalar, class LO, class GO, class Node>
1978  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1979  {
1980 #ifdef HAVE_TPETRA_DEBUG
1981  const char prefix[] =
1982  "Tpetra::Experimental::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
1983 #endif // HAVE_TPETRA_DEBUG
1984 
1985  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1986  // An empty block signifies an error. We don't expect to see
1987  // this error in correct code, but it's helpful for avoiding
1988  // memory corruption in case there is a bug.
1989  return const_little_block_type ();
1990  }
1991  else {
1992 #ifdef HAVE_TPETRA_DEBUG
1993  TEUCHOS_TEST_FOR_EXCEPTION
1994  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1995  prefix << "The matrix's data were last modified on device, but have "
1996  "not been sync'd to host. Please sync to host (by calling "
1997  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1998  "method.");
1999 #endif // HAVE_TPETRA_DEBUG
2000  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2001 
2002  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
2003  // version of the data always exists (no lazy allocation for host
2004  // data).
2006  auto vals_host =
2007  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
2008  const impl_scalar_type* vals_host_raw = vals_host.data ();
2009 
2010  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2011  }
2012  }
2013 
2014  template<class Scalar, class LO, class GO, class Node>
2017  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
2018  const size_t relMeshOffset) const
2019  {
2020  typedef impl_scalar_type IST;
2021 
2022  const LO* lclColInds = NULL;
2023  Scalar* lclVals = NULL;
2024  LO numEnt = 0;
2025 
2026  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
2027  if (err != 0) {
2028  // An empty block signifies an error. We don't expect to see
2029  // this error in correct code, but it's helpful for avoiding
2030  // memory corruption in case there is a bug.
2031  return const_little_block_type ();
2032  }
2033  else {
2034  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
2035  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
2036  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
2037  relPointOffset);
2038  }
2039  }
2040 
2041  template<class Scalar, class LO, class GO, class Node>
2044  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2045  {
2046 #ifdef HAVE_TPETRA_DEBUG
2047  const char prefix[] =
2048  "Tpetra::Experimental::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
2049 #endif // HAVE_TPETRA_DEBUG
2050 
2051  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2052  // An empty block signifies an error. We don't expect to see
2053  // this error in correct code, but it's helpful for avoiding
2054  // memory corruption in case there is a bug.
2055  return little_block_type ();
2056  }
2057  else {
2058  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2059 #ifdef HAVE_TPETRA_DEBUG
2060  TEUCHOS_TEST_FOR_EXCEPTION
2061  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
2062  prefix << "The matrix's data were last modified on device, but have "
2063  "not been sync'd to host. Please sync to host (by calling "
2064  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2065  "method.");
2066 #endif // HAVE_TPETRA_DEBUG
2067  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
2068  // version of the data always exists (no lazy allocation for host
2069  // data).
2071  auto vals_host =
2072  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
2073  impl_scalar_type* vals_host_raw = vals_host.data ();
2074  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2075  }
2076  }
2077 
2078  template<class Scalar, class LO, class GO, class Node>
2081  getLocalBlock (const LO localRowInd, const LO localColInd) const
2082  {
2083  const size_t absRowBlockOffset = ptrHost_[localRowInd];
2084  const LO relBlockOffset =
2085  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
2086 
2087  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
2088  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
2089  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
2090  }
2091  else {
2092  return little_block_type ();
2093  }
2094  }
2095 
2096  // template<class Scalar, class LO, class GO, class Node>
2097  // void
2098  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2099  // clearLocalErrorStateAndStream ()
2100  // {
2101  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2102  // * (const_cast<this_type*> (this)->localError_) = false;
2103  // *errs_ = Teuchos::null;
2104  // }
2105 
2106  template<class Scalar, class LO, class GO, class Node>
2107  bool
2109  checkSizes (const ::Tpetra::SrcDistObject& source)
2110  {
2111  using std::endl;
2113  const this_type* src = dynamic_cast<const this_type* > (&source);
2114 
2115  if (src == NULL) {
2116  std::ostream& err = markLocalErrorAndGetStream ();
2117  err << "checkSizes: The source object of the Import or Export "
2118  "must be a BlockCrsMatrix with the same template parameters as the "
2119  "target object." << endl;
2120  }
2121  else {
2122  // Use a string of ifs, not if-elseifs, because we want to know
2123  // all the errors.
2124  if (src->getBlockSize () != this->getBlockSize ()) {
2125  std::ostream& err = markLocalErrorAndGetStream ();
2126  err << "checkSizes: The source and target objects of the Import or "
2127  << "Export must have the same block sizes. The source's block "
2128  << "size = " << src->getBlockSize () << " != the target's block "
2129  << "size = " << this->getBlockSize () << "." << endl;
2130  }
2131  if (! src->graph_.isFillComplete ()) {
2132  std::ostream& err = markLocalErrorAndGetStream ();
2133  err << "checkSizes: The source object of the Import or Export is "
2134  "not fill complete. Both source and target objects must be fill "
2135  "complete." << endl;
2136  }
2137  if (! this->graph_.isFillComplete ()) {
2138  std::ostream& err = markLocalErrorAndGetStream ();
2139  err << "checkSizes: The target object of the Import or Export is "
2140  "not fill complete. Both source and target objects must be fill "
2141  "complete." << endl;
2142  }
2143  if (src->graph_.getColMap ().is_null ()) {
2144  std::ostream& err = markLocalErrorAndGetStream ();
2145  err << "checkSizes: The source object of the Import or Export does "
2146  "not have a column Map. Both source and target objects must have "
2147  "column Maps." << endl;
2148  }
2149  if (this->graph_.getColMap ().is_null ()) {
2150  std::ostream& err = markLocalErrorAndGetStream ();
2151  err << "checkSizes: The target object of the Import or Export does "
2152  "not have a column Map. Both source and target objects must have "
2153  "column Maps." << endl;
2154  }
2155  }
2156 
2157  return ! (* (this->localError_));
2158  }
2159 
2160  template<class Scalar, class LO, class GO, class Node>
2161  void
2163  copyAndPermute (const ::Tpetra::SrcDistObject& source,
2164  size_t numSameIDs,
2165  const Teuchos::ArrayView<const LO>& permuteToLIDs,
2166  const Teuchos::ArrayView<const LO>& permuteFromLIDs)
2167  {
2168  using std::endl;
2170  const bool debug = false;
2171 
2172  if (debug) {
2173  std::ostringstream os;
2174  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2175  os << "Proc " << myRank << ": copyAndPermute: "
2176  << "numSameIDs: " << numSameIDs
2177  << ", permuteToLIDs.size(): " << permuteToLIDs.size ()
2178  << ", permuteFromLIDs.size(): " << permuteFromLIDs.size ()
2179  << endl;
2180  std::cerr << os.str ();
2181  }
2182 
2183  // There's no communication in this method, so it's safe just to
2184  // return on error.
2185 
2186  if (* (this->localError_)) {
2187  std::ostream& err = this->markLocalErrorAndGetStream ();
2188  err << "copyAndPermute: The target object of the Import or Export is "
2189  "already in an error state." << endl;
2190  return;
2191  }
2192  if (permuteToLIDs.size () != permuteFromLIDs.size ()) {
2193  std::ostream& err = this->markLocalErrorAndGetStream ();
2194  err << "copyAndPermute: permuteToLIDs.size() = " << permuteToLIDs.size ()
2195  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."
2196  << endl;
2197  return;
2198  }
2199 
2200  const this_type* src = dynamic_cast<const this_type* > (&source);
2201  if (src == NULL) {
2202  std::ostream& err = this->markLocalErrorAndGetStream ();
2203  err << "copyAndPermute: The source object of the Import or Export is "
2204  "either not a BlockCrsMatrix, or does not have the right template "
2205  "parameters. checkSizes() should have caught this. "
2206  "Please report this bug to the Tpetra developers." << endl;
2207  return;
2208  }
2209  if (* (src->localError_)) {
2210  std::ostream& err = this->markLocalErrorAndGetStream ();
2211  err << "copyAndPermute: The source object of the Import or Export is "
2212  "already in an error state." << endl;
2213  return;
2214  }
2215 
2216  bool lclErr = false;
2217 #ifdef HAVE_TPETRA_DEBUG
2218  std::set<LO> invalidSrcCopyRows;
2219  std::set<LO> invalidDstCopyRows;
2220  std::set<LO> invalidDstCopyCols;
2221  std::set<LO> invalidDstPermuteCols;
2222  std::set<LO> invalidPermuteFromRows;
2223 #endif // HAVE_TPETRA_DEBUG
2224 
2225  // Copy the initial sequence of rows that are the same.
2226  //
2227  // The two graphs might have different column Maps, so we need to
2228  // do this using global column indices. This is purely local, so
2229  // we only need to check for local sameness of the two column
2230  // Maps.
2231 
2232 #ifdef HAVE_TPETRA_DEBUG
2233  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2234 #endif // HAVE_TPETRA_DEBUG
2235  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2236  const map_type& srcColMap = * (src->graph_.getColMap ());
2237  const map_type& dstColMap = * (this->graph_.getColMap ());
2238  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2239  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.size ());
2240 
2241  if (debug) {
2242  std::ostringstream os;
2243  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2244  os << "Proc " << myRank << ": copyAndPermute: "
2245  << "canUseLocalColumnIndices: "
2246  << (canUseLocalColumnIndices ? "true" : "false")
2247  << ", numPermute: " << numPermute
2248  << endl;
2249  std::cerr << os.str ();
2250  }
2251 
2252  if (canUseLocalColumnIndices) {
2253  // Copy local rows that are the "same" in both source and target.
2254  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2255 #ifdef HAVE_TPETRA_DEBUG
2256  if (! srcRowMap.isNodeLocalElement (localRow)) {
2257  lclErr = true;
2258  invalidSrcCopyRows.insert (localRow);
2259  continue; // skip invalid rows
2260  }
2261 #endif // HAVE_TPETRA_DEBUG
2262 
2263  const LO* lclSrcCols;
2264  Scalar* vals;
2265  LO numEntries;
2266  // If this call fails, that means the mesh row local index is
2267  // invalid. That means the Import or Export is invalid somehow.
2268  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2269  if (err != 0) {
2270  lclErr = true;
2271 #ifdef HAVE_TPETRA_DEBUG
2272  (void) invalidSrcCopyRows.insert (localRow);
2273 #endif // HAVE_TPETRA_DEBUG
2274  }
2275  else {
2276  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2277  if (err != numEntries) {
2278  lclErr = true;
2279  if (! dstRowMap.isNodeLocalElement (localRow)) {
2280 #ifdef HAVE_TPETRA_DEBUG
2281  invalidDstCopyRows.insert (localRow);
2282 #endif // HAVE_TPETRA_DEBUG
2283  }
2284  else {
2285  // Once there's an error, there's no sense in saving
2286  // time, so we check whether the column indices were
2287  // invalid. However, only remember which ones were
2288  // invalid in a debug build, because that might take a
2289  // lot of space.
2290  for (LO k = 0; k < numEntries; ++k) {
2291  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2292  lclErr = true;
2293 #ifdef HAVE_TPETRA_DEBUG
2294  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2295 #endif // HAVE_TPETRA_DEBUG
2296  }
2297  }
2298  }
2299  }
2300  }
2301  } // for each "same" local row
2302 
2303  // Copy the "permute" local rows.
2304  for (size_t k = 0; k < numPermute; ++k) {
2305  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
2306  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
2307 
2308  const LO* lclSrcCols;
2309  Scalar* vals;
2310  LO numEntries;
2311  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2312  if (err != 0) {
2313  lclErr = true;
2314 #ifdef HAVE_TPETRA_DEBUG
2315  invalidPermuteFromRows.insert (srcLclRow);
2316 #endif // HAVE_TPETRA_DEBUG
2317  }
2318  else {
2319  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2320  if (err != numEntries) {
2321  lclErr = true;
2322 #ifdef HAVE_TPETRA_DEBUG
2323  for (LO c = 0; c < numEntries; ++c) {
2324  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2325  invalidDstPermuteCols.insert (lclSrcCols[c]);
2326  }
2327  }
2328 #endif // HAVE_TPETRA_DEBUG
2329  }
2330  }
2331  }
2332  }
2333  else { // must convert column indices to global
2334  // Reserve space to store the destination matrix's local column indices.
2335  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2336  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2337 
2338  // Copy local rows that are the "same" in both source and target.
2339  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2340  const LO* lclSrcCols;
2341  Scalar* vals;
2342  LO numEntries;
2343  // If this call fails, that means the mesh row local index is
2344  // invalid. That means the Import or Export is invalid somehow.
2345  LO err = 0;
2346  try {
2347  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2348  } catch (std::exception& e) {
2349  if (debug) {
2350  std::ostringstream os;
2351  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2352  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2353  << localRow << ", src->getLocalRowView() threw an exception: "
2354  << e.what ();
2355  std::cerr << os.str ();
2356  }
2357  throw e;
2358  }
2359 
2360  if (err != 0) {
2361  lclErr = true;
2362 #ifdef HAVE_TPETRA_DEBUG
2363  invalidSrcCopyRows.insert (localRow);
2364 #endif // HAVE_TPETRA_DEBUG
2365  }
2366  else {
2367  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2368  lclErr = true;
2369  if (debug) {
2370  std::ostringstream os;
2371  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2372  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2373  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2374  << maxNumEnt << endl;
2375  std::cerr << os.str ();
2376  }
2377  }
2378  else {
2379  // Convert the source matrix's local column indices to the
2380  // destination matrix's local column indices.
2381  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2382  for (LO j = 0; j < numEntries; ++j) {
2383  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2384  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2385  lclErr = true;
2386 #ifdef HAVE_TPETRA_DEBUG
2387  invalidDstCopyCols.insert (lclDstColsView[j]);
2388 #endif // HAVE_TPETRA_DEBUG
2389  }
2390  }
2391  try {
2392  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2393  } catch (std::exception& e) {
2394  if (debug) {
2395  std::ostringstream os;
2396  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2397  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2398  << localRow << ", this->replaceLocalValues() threw an exception: "
2399  << e.what ();
2400  std::cerr << os.str ();
2401  }
2402  throw e;
2403  }
2404  if (err != numEntries) {
2405  lclErr = true;
2406  if (debug) {
2407  std::ostringstream os;
2408  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2409  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2410  "localRow " << localRow << ", this->replaceLocalValues "
2411  "returned " << err << " instead of numEntries = "
2412  << numEntries << endl;
2413  std::cerr << os.str ();
2414  }
2415  }
2416  }
2417  }
2418  }
2419 
2420  // Copy the "permute" local rows.
2421  for (size_t k = 0; k < numPermute; ++k) {
2422  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
2423  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
2424 
2425  const LO* lclSrcCols;
2426  Scalar* vals;
2427  LO numEntries;
2428  LO err = 0;
2429  try {
2430  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2431  } catch (std::exception& e) {
2432  if (debug) {
2433  std::ostringstream os;
2434  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2435  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2436  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2437  << ", src->getLocalRowView() threw an exception: " << e.what ();
2438  std::cerr << os.str ();
2439  }
2440  throw e;
2441  }
2442 
2443  if (err != 0) {
2444  lclErr = true;
2445 #ifdef HAVE_TPETRA_DEBUG
2446  invalidPermuteFromRows.insert (srcLclRow);
2447 #endif // HAVE_TPETRA_DEBUG
2448  }
2449  else {
2450  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2451  lclErr = true;
2452  }
2453  else {
2454  // Convert the source matrix's local column indices to the
2455  // destination matrix's local column indices.
2456  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2457  for (LO j = 0; j < numEntries; ++j) {
2458  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2459  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2460  lclErr = true;
2461 #ifdef HAVE_TPETRA_DEBUG
2462  invalidDstPermuteCols.insert (lclDstColsView[j]);
2463 #endif // HAVE_TPETRA_DEBUG
2464  }
2465  }
2466  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2467  if (err != numEntries) {
2468  lclErr = true;
2469  }
2470  }
2471  }
2472  }
2473  }
2474 
2475  if (lclErr) {
2476  std::ostream& err = this->markLocalErrorAndGetStream ();
2477 #ifdef HAVE_TPETRA_DEBUG
2478  err << "copyAndPermute: The graph structure of the source of the "
2479  "Import or Export must be a subset of the graph structure of the "
2480  "target. ";
2481  err << "invalidSrcCopyRows = [";
2482  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2483  it != invalidSrcCopyRows.end (); ++it) {
2484  err << *it;
2485  typename std::set<LO>::const_iterator itp1 = it;
2486  itp1++;
2487  if (itp1 != invalidSrcCopyRows.end ()) {
2488  err << ",";
2489  }
2490  }
2491  err << "], invalidDstCopyRows = [";
2492  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2493  it != invalidDstCopyRows.end (); ++it) {
2494  err << *it;
2495  typename std::set<LO>::const_iterator itp1 = it;
2496  itp1++;
2497  if (itp1 != invalidDstCopyRows.end ()) {
2498  err << ",";
2499  }
2500  }
2501  err << "], invalidDstCopyCols = [";
2502  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2503  it != invalidDstCopyCols.end (); ++it) {
2504  err << *it;
2505  typename std::set<LO>::const_iterator itp1 = it;
2506  itp1++;
2507  if (itp1 != invalidDstCopyCols.end ()) {
2508  err << ",";
2509  }
2510  }
2511  err << "], invalidDstPermuteCols = [";
2512  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2513  it != invalidDstPermuteCols.end (); ++it) {
2514  err << *it;
2515  typename std::set<LO>::const_iterator itp1 = it;
2516  itp1++;
2517  if (itp1 != invalidDstPermuteCols.end ()) {
2518  err << ",";
2519  }
2520  }
2521  err << "], invalidPermuteFromRows = [";
2522  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2523  it != invalidPermuteFromRows.end (); ++it) {
2524  err << *it;
2525  typename std::set<LO>::const_iterator itp1 = it;
2526  itp1++;
2527  if (itp1 != invalidPermuteFromRows.end ()) {
2528  err << ",";
2529  }
2530  }
2531  err << "]" << std::endl;
2532 #else
2533  err << "copyAndPermute: The graph structure of the source of the "
2534  "Import or Export must be a subset of the graph structure of the "
2535  "target." << std::endl;
2536 #endif // HAVE_TPETRA_DEBUG
2537  }
2538 
2539  if (debug) {
2540  std::ostringstream os;
2541  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2542  const bool lclSuccess = ! (* (this->localError_));
2543  os << "*** Proc " << myRank << ": copyAndPermute "
2544  << (lclSuccess ? "succeeded" : "FAILED");
2545  if (lclSuccess) {
2546  os << endl;
2547  } else {
2548  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2549  }
2550  std::cerr << os.str ();
2551  }
2552  }
2553 
2554  namespace { // (anonymous)
2555 
2574  template<class LO, class GO, class D>
2575  size_t
2576  packRowCount (const size_t numEnt,
2577  const size_t numBytesPerValue,
2578  const size_t blkSize)
2579  {
2580  using ::Tpetra::Details::PackTraits;
2581 
2582  if (numEnt == 0) {
2583  // Empty rows always take zero bytes, to ensure sparsity.
2584  return 0;
2585  }
2586  else {
2587  // We store the number of entries as a local index (LO).
2588  LO numEntLO = 0; // packValueCount wants this.
2589  GO gid;
2590  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2591  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2592  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2593  return numEntLen + gidsLen + valsLen;
2594  }
2595  }
2596 
2607  template<class ST, class LO, class GO, class D>
2608  size_t
2609  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2610  const size_t offset,
2611  const size_t numBytes,
2612  const size_t numBytesPerValue)
2613  {
2614  using Kokkos::subview;
2615  using ::Tpetra::Details::PackTraits;
2616 
2617  if (numBytes == 0) {
2618  // Empty rows always take zero bytes, to ensure sparsity.
2619  return static_cast<size_t> (0);
2620  }
2621  else {
2622  LO numEntLO = 0;
2623 #ifdef HAVE_TPETRA_DEBUG
2624  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2625  TEUCHOS_TEST_FOR_EXCEPTION(
2626  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2627  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2628  << ".");
2629 #endif // HAVE_TPETRA_DEBUG
2630  const char* const inBuf = imports.data () + offset;
2631  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2632 #ifdef HAVE_TPETRA_DEBUG
2633  TEUCHOS_TEST_FOR_EXCEPTION(
2634  actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2635  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2636  << ".");
2637 #else
2638  (void) actualNumBytes;
2639 #endif // HAVE_TPETRA_DEBUG
2640  return static_cast<size_t> (numEntLO);
2641  }
2642  }
2643 
2651  template<class ST, class LO, class GO, class D>
2652  size_t
2653  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
2654  const size_t offset,
2655  const size_t numEnt,
2656  const typename ::Tpetra::Details::PackTraits<GO, D>::input_array_type& gidsIn,
2657  const typename ::Tpetra::Details::PackTraits<ST, D>::input_array_type& valsIn,
2658  const size_t numBytesPerValue,
2659  const size_t blockSize)
2660  {
2661  using Kokkos::subview;
2662  using ::Tpetra::Details::PackTraits;
2663 
2664  if (numEnt == 0) {
2665  // Empty rows always take zero bytes, to ensure sparsity.
2666  return 0;
2667  }
2668  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2669 
2670  const GO gid = 0; // packValueCount wants this
2671  const LO numEntLO = static_cast<size_t> (numEnt);
2672 
2673  const size_t numEntBeg = offset;
2674  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2675  const size_t gidsBeg = numEntBeg + numEntLen;
2676  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2677  const size_t valsBeg = gidsBeg + gidsLen;
2678  const size_t valsLen = numScalarEnt * numBytesPerValue;
2679 
2680  char* const numEntOut = exports.data () + numEntBeg;
2681  char* const gidsOut = exports.data () + gidsBeg;
2682  char* const valsOut = exports.data () + valsBeg;
2683 
2684  size_t numBytesOut = 0;
2685  int errorCode = 0;
2686  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2687 
2688  {
2689  Kokkos::pair<int, size_t> p;
2690  p = PackTraits<GO, D>::packArray (gidsOut, gidsIn.data (), numEnt);
2691  errorCode += p.first;
2692  numBytesOut += p.second;
2693 
2694  p = PackTraits<ST, D>::packArray (valsOut, valsIn.data (), numScalarEnt);
2695  errorCode += p.first;
2696  numBytesOut += p.second;
2697  }
2698 
2699  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2700  TEUCHOS_TEST_FOR_EXCEPTION(
2701  numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
2702  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2703  << expectedNumBytes << ".");
2704 
2705  TEUCHOS_TEST_FOR_EXCEPTION(
2706  errorCode != 0, std::runtime_error, "packRow: "
2707  "PackTraits::packArray returned a nonzero error code");
2708 
2709  return numBytesOut;
2710  }
2711 
2712  // Return the number of bytes actually read / used.
2713  template<class ST, class LO, class GO, class D>
2714  size_t
2715  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2716  const typename ::Tpetra::Details::PackTraits<ST, D>::output_array_type& valsOut,
2717  const typename ::Tpetra::Details::PackTraits<int, D>::input_buffer_type& imports,
2718  const size_t offset,
2719  const size_t numBytes,
2720  const size_t numEnt,
2721  const size_t numBytesPerValue,
2722  const size_t blockSize)
2723  {
2724  using ::Tpetra::Details::PackTraits;
2725 
2726  if (numBytes == 0) {
2727  // Rows with zero bytes always have zero entries.
2728  return 0;
2729  }
2730  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2731  TEUCHOS_TEST_FOR_EXCEPTION(
2732  static_cast<size_t> (imports.extent (0)) <= offset,
2733  std::logic_error, "unpackRow: imports.extent(0) = "
2734  << imports.extent (0) << " <= offset = " << offset << ".");
2735  TEUCHOS_TEST_FOR_EXCEPTION(
2736  static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2737  std::logic_error, "unpackRow: imports.extent(0) = "
2738  << imports.extent (0) << " < offset + numBytes = "
2739  << (offset + numBytes) << ".");
2740 
2741  const GO gid = 0; // packValueCount wants this
2742  const LO lid = 0; // packValueCount wants this
2743 
2744  const size_t numEntBeg = offset;
2745  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2746  const size_t gidsBeg = numEntBeg + numEntLen;
2747  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2748  const size_t valsBeg = gidsBeg + gidsLen;
2749  const size_t valsLen = numScalarEnt * numBytesPerValue;
2750 
2751  const char* const numEntIn = imports.data () + numEntBeg;
2752  const char* const gidsIn = imports.data () + gidsBeg;
2753  const char* const valsIn = imports.data () + valsBeg;
2754 
2755  size_t numBytesOut = 0;
2756  int errorCode = 0;
2757  LO numEntOut;
2758  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2759  TEUCHOS_TEST_FOR_EXCEPTION(
2760  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2761  "unpackRow: Expected number of entries " << numEnt
2762  << " != actual number of entries " << numEntOut << ".");
2763 
2764  {
2765  Kokkos::pair<int, size_t> p;
2766  p = PackTraits<GO, D>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2767  errorCode += p.first;
2768  numBytesOut += p.second;
2769 
2770  p = PackTraits<ST, D>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2771  errorCode += p.first;
2772  numBytesOut += p.second;
2773  }
2774 
2775  TEUCHOS_TEST_FOR_EXCEPTION(
2776  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
2777  << numBytesOut << " != numBytes = " << numBytes << ".");
2778 
2779  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2780  TEUCHOS_TEST_FOR_EXCEPTION(
2781  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2782  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2783  << expectedNumBytes << ".");
2784 
2785  TEUCHOS_TEST_FOR_EXCEPTION(
2786  errorCode != 0, std::runtime_error, "unpackRow: "
2787  "PackTraits::unpackArray returned a nonzero error code");
2788 
2789  return numBytesOut;
2790  }
2791  } // namespace (anonymous)
2792 
2793  template<class Scalar, class LO, class GO, class Node>
2794  void
2796  packAndPrepare (const ::Tpetra::SrcDistObject& source,
2797  const Teuchos::ArrayView<const LO>& exportLIDs,
2798  Teuchos::Array<packet_type>& exports,
2799  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2800  size_t& constantNumPackets,
2801  ::Tpetra::Distributor& /* distor */)
2802  {
2803  using std::endl;
2804  using ::Tpetra::Details::PackTraits;
2805  using Kokkos::MemoryUnmanaged;
2806  using Kokkos::subview;
2807  using Kokkos::View;
2808  typedef typename ::Tpetra::MultiVector<Scalar, LO, GO, Node>::impl_scalar_type ST;
2809  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
2811  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2812  const bool debug = false;
2813 
2814  if (debug) {
2815  std::ostringstream os;
2816  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2817  os << "Proc " << myRank << ": packAndPrepare: exportLIDs.size() = "
2818  << exportLIDs.size () << ", numPacketsPerLID.size() = "
2819  << numPacketsPerLID.size () << endl;
2820  std::cerr << os.str ();
2821  }
2822 
2823  if (* (this->localError_)) {
2824  std::ostream& err = this->markLocalErrorAndGetStream ();
2825  err << "packAndPrepare: The target object of the Import or Export is "
2826  "already in an error state." << endl;
2827  return;
2828  }
2829 
2830  const this_type* src = dynamic_cast<const this_type* > (&source);
2831  // Should have checked for these cases in checkSizes().
2832  if (src == NULL) {
2833  std::ostream& err = this->markLocalErrorAndGetStream ();
2834  err << "packAndPrepare: The source (input) object of the Import or "
2835  "Export is either not a BlockCrsMatrix, or does not have the right "
2836  "template parameters. checkSizes() should have caught this. "
2837  "Please report this bug to the Tpetra developers." << endl;
2838  return;
2839  }
2840 
2841  const crs_graph_type& srcGraph = src->graph_;
2842  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2843  const size_type numExportLIDs = exportLIDs.size ();
2844 
2845  if (numExportLIDs != numPacketsPerLID.size ()) {
2846  std::ostream& err = this->markLocalErrorAndGetStream ();
2847  err << "packAndPrepare: exportLIDs.size() = " << numExportLIDs
2848  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
2849  << "." << endl;
2850  return;
2851  }
2852 
2853  // Graphs and matrices are allowed to have a variable number of
2854  // entries per row. We could test whether all rows have the same
2855  // number of entries, but DistObject can only use this
2856  // optimization if all rows on _all_ processes have the same
2857  // number of entries. Rather than do the all-reduce necessary to
2858  // test for this unlikely case, we tell DistObject (by setting
2859  // constantNumPackets to zero) to assume that different rows may
2860  // have different numbers of entries.
2861  constantNumPackets = 0;
2862 
2863  // Compute the number of bytes ("packets") per row to pack. While
2864  // we're at it, compute the total # of block entries to send, and
2865  // the max # of block entries in any of the rows we're sending.
2866  size_t totalNumBytes = 0;
2867  size_t totalNumEntries = 0;
2868  size_t maxRowLength = 0;
2869  for (size_type i = 0; i < numExportLIDs; ++i) {
2870  const LO lclRow = exportLIDs[i];
2871  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2872  // If any given LIDs are invalid, the above might return either
2873  // zero or the invalid size_t value. If the former, we have no
2874  // way to tell, but that's OK; it just means the calling process
2875  // won't pack anything (it has nothing to pack anyway). If the
2876  // latter, we replace it with zero (that row is not owned by the
2877  // calling process, so it has no entries to pack).
2878  if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2879  numEnt = 0;
2880  }
2881  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2882 
2883  // The 'if' branch implicitly assumes that packRowCount() returns
2884  // zero if numEnt == 0.
2885  size_t numBytesPerValue = 0;
2886  if (numEnt > 0) {
2887  // Get a locally indexed view of the current row's data. If
2888  // the current row has > 0 entries, we need an entry in order
2889  // to figure out the byte count of the packed row. (We really
2890  // only need it if ST's size is determined at run time.)
2891  Scalar* valsRaw = NULL;
2892  const LO* lidsRaw = NULL;
2893  LO actualNumEnt = 0;
2894  const LO errCode =
2895  src->getLocalRowView (lclRow, lidsRaw, valsRaw, actualNumEnt);
2896 
2897  if (numEnt != static_cast<size_t> (actualNumEnt)) {
2898  std::ostream& err = this->markLocalErrorAndGetStream ();
2899  err << "packAndPrepare: Local row " << i << " claims to have " <<
2900  numEnt << "entry/ies, but the View returned by getLocalRowView() "
2901  "has " << actualNumEnt << " entry/ies. This should never happen. "
2902  "Please report this bug to the Tpetra developers." << endl;
2903  return;
2904  }
2905  if (errCode == Teuchos::OrdinalTraits<LO>::invalid ()) {
2906  std::ostream& err = this->markLocalErrorAndGetStream ();
2907  err << "packAndPrepare: Local row " << i << " is not in the row Map "
2908  "of the source object on the calling process." << endl;
2909  return;
2910  }
2911 
2912  const ST* valsRawST =
2913  const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2914  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2915 
2916  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2917  // space here for now, this doesn't assume UVM. That may change
2918  // in the future, if we ever start packing on the device.
2919  numBytesPerValue = PackTraits<ST, HES>::packValueCount (vals(0));
2920  }
2921 
2922  const size_t numBytes =
2923  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue, blockSize);
2924  numPacketsPerLID[i] = numBytes;
2925  totalNumBytes += numBytes;
2926  totalNumEntries += numEnt;
2927  maxRowLength = std::max (maxRowLength, numEnt);
2928  }
2929 
2930  if (debug) {
2931  const int myRank = graph_.getComm ()->getRank ();
2932  std::ostringstream os;
2933  os << "Proc " << myRank << ": packAndPrepare: totalNumBytes = "
2934  << totalNumBytes << endl;
2935  std::cerr << os.str ();
2936  }
2937 
2938  // We use a "struct of arrays" approach to packing each row's
2939  // entries. All the column indices (as global indices) go first,
2940  // then all their owning process ranks, and then the values.
2941  exports.resize (totalNumBytes);
2942  if (totalNumEntries > 0) {
2943  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
2944  totalNumBytes);
2945 
2946  // Current position (in bytes) in the 'exports' output array.
2947  size_t offset = 0;
2948 
2949  // For each block row of the matrix owned by the calling
2950  // process, pack that block row's column indices and values into
2951  // the exports array.
2952 
2953  // Source matrix's column Map. We verified in checkSizes() that
2954  // the column Map exists (is not null).
2955  const map_type& srcColMap = * (srcGraph.getColMap ());
2956 
2957  // Temporary buffer for global column indices.
2958  View<GO*, HES> gblColInds;
2959  {
2960  GO gid = 0;
2961  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
2962  }
2963 
2964  // Pack the data for each row to send, into the 'exports' buffer.
2965  for (size_type i = 0; i < numExportLIDs; ++i) {
2966  const LO lclRowInd = exportLIDs[i];
2967  const LO* lclColIndsRaw;
2968  Scalar* valsRaw;
2969  LO numEntLO;
2970  // It's OK to ignore the return value, since if the calling
2971  // process doesn't own that local row, then the number of
2972  // entries in that row on the calling process is zero.
2973  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2974  const size_t numEnt = static_cast<size_t> (numEntLO);
2975  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2976  View<const LO*, HES, MemoryUnmanaged> lclColInds (lclColIndsRaw, numEnt);
2977  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2978  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2979 
2980  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2981  // space here for now, this doesn't assume UVM. That may
2982  // change in the future, if we ever start packing on device.
2983  const size_t numBytesPerValue = numEnt == 0 ?
2984  static_cast<size_t> (0) :
2985  PackTraits<ST, HES>::packValueCount (vals(0));
2986 
2987  // Convert column indices from local to global.
2988  for (size_t j = 0; j < numEnt; ++j) {
2989  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2990  }
2991 
2992  // Copy the row's data into the current spot in the exports array.
2993  const size_t numBytes =
2994  packRowForBlockCrs<ST, LO, GO, HES> (exportsK, offset, numEnt, gblColInds,
2995  vals, numBytesPerValue, blockSize);
2996  // Keep track of how many bytes we packed.
2997  offset += numBytes;
2998  } // for each LID (of a row) to send
2999 
3000  if (offset != totalNumBytes) {
3001  std::ostream& err = this->markLocalErrorAndGetStream ();
3002  err << "packAndPreapre: At end of method, the final offset (in bytes) "
3003  << offset << " does not equal the total number of bytes packed "
3004  << totalNumBytes << ". "
3005  << "Please report this bug to the Tpetra developers." << endl;
3006  return;
3007  }
3008  } // if totalNumEntries > 0
3009 
3010  if (debug) {
3011  std::ostringstream os;
3012  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3013  const bool lclSuccess = ! (* (this->localError_));
3014  os << "*** Proc " << myRank << ": packAndPrepare "
3015  << (lclSuccess ? "succeeded" : "FAILED")
3016  << " (totalNumEntries = " << totalNumEntries << ") ***" << endl;
3017  std::cerr << os.str ();
3018  }
3019  }
3020 
3021 
3022  template<class Scalar, class LO, class GO, class Node>
3023  void
3025  unpackAndCombine (const Teuchos::ArrayView<const LO>& importLIDs,
3026  const Teuchos::ArrayView<const packet_type>& imports,
3027  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
3028  size_t /* constantNumPackets */, // not worthwhile to use this
3029  ::Tpetra::Distributor& /* distor */,
3030  ::Tpetra::CombineMode CM)
3031  {
3032  using std::endl;
3033  using ::Tpetra::Details::PackTraits;
3034  using Kokkos::MemoryUnmanaged;
3035  using Kokkos::subview;
3036  using Kokkos::View;
3037  typedef typename ::Tpetra::MultiVector<Scalar, LO, GO, Node>::impl_scalar_type ST;
3038  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
3039  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
3040  typedef std::pair<typename View<int*, HES>::size_type,
3041  typename View<int*, HES>::size_type> pair_type;
3042  typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
3043  typedef View<LO*, HES, MemoryUnmanaged> lids_out_type;
3044  typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
3045  typedef typename PackTraits<GO, HES>::input_buffer_type input_buffer_type;
3046  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::unpackAndCombine: ";
3047  const bool debug = false;
3048 
3049  if (debug) {
3050  std::ostringstream os;
3051  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3052  os << "Proc " << myRank << ": unpackAndCombine" << endl;
3053  std::cerr << os.str ();
3054  }
3055 
3056  // It should not cause deadlock to return on error in this method,
3057  // since this method does not communicate.
3058 
3059  if (* (this->localError_)) {
3060  std::ostream& err = this->markLocalErrorAndGetStream ();
3061  err << prefix << "The target object of the Import or Export is "
3062  "already in an error state." << endl;
3063  return;
3064  }
3065  if (importLIDs.size () != numPacketsPerLID.size ()) {
3066  std::ostream& err = this->markLocalErrorAndGetStream ();
3067  err << prefix << "importLIDs.size() = " << importLIDs.size () << " != "
3068  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "." << endl;
3069  return;
3070  }
3071  if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
3072  std::ostream& err = this->markLocalErrorAndGetStream ();
3073  err << prefix << "Invalid CombineMode value " << CM << ". Valid "
3074  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3075  << endl;
3076  return;
3077  }
3078 
3079  // Target matrix's column Map. Use to convert the global column
3080  // indices in the receive buffer to local indices. We verified in
3081  // checkSizes() that the column Map exists (is not null).
3082  const map_type& tgtColMap = * (this->graph_.getColMap ());
3083 
3084  const size_type numImportLIDs = importLIDs.size ();
3085  if (CM == ZERO || numImportLIDs == 0) {
3086  if (debug) {
3087  std::ostringstream os;
3088  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3089  os << "Proc " << myRank << ": unpackAndCombine: Nothing to do" << endl;
3090  std::cerr << os.str ();
3091  }
3092  return; // nothing to do; no need to combine entries
3093  }
3094 
3095  if (debug) {
3096  std::ostringstream os;
3097  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3098  os << "Proc " << myRank << ": unpackAndCombine: Getting ready" << endl;
3099  std::cerr << os.str ();
3100  }
3101 
3102  input_buffer_type importsK (imports.getRawPtr (), imports.size ());
3103  const size_t blockSize = this->getBlockSize ();
3104  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3105  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3106 
3107  // Determine the number of bytes in a Scalar instance.
3108  size_t numBytesPerValue;
3109 
3110  // mfh 19 Sep 2017: Only Stokhos has Scalar types with run-time
3111  // size. For all of those types, any one allocated instance has
3112  // the same size as any other allocated instance. Thus, it
3113  // suffices to find some allocated instance as a representative
3114  // value.
3115  if (this->val_.h_view.extent (0) != 0) {
3116  const ST& val = this->val_.h_view[0];
3117  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
3118  }
3119  else {
3120  // FIXME (mfh 19 Sep 2017): I don't have any values on my
3121  // process, so I don't know how big the value should be, if it's
3122  // run-time-sized. The best I can do is use a default-allocated
3123  // Scalar instance's size. If we ever want to fix this, then
3124  // each sending process should pack the run-time size. This is
3125  // only necessary if the size is not a compile-time constant.
3126  Scalar val;
3127  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
3128  }
3129 
3130  // Temporary space to cache incoming global column indices and
3131  // values. Column indices come in as global indices, in case the
3132  // source object's column Map differs from the target object's
3133  // (this's) column Map.
3134  View<GO*, HES> gblColInds;
3135  View<LO*, HES> lclColInds;
3136  View<ST*, HES> vals;
3137  {
3138  GO gid = 0;
3139  LO lid = 0;
3140  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
3141  // run-time size? We already assume that all entries in both
3142  // the source and target matrices have the same size. If the
3143  // calling process owns at least one entry in either matrix, we
3144  // can use that entry to set the size. However, it is possible
3145  // that the calling process owns no entries. In that case,
3146  // we're in trouble. One way to fix this would be for each
3147  // row's data to contain the run-time size. This is only
3148  // necessary if the size is not a compile-time constant.
3149  Scalar val;
3150  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
3151  lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
3152  vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumScalarEnt, "vals");
3153  }
3154 
3155  size_t offset = 0;
3156  bool errorDuringUnpack = false;
3157  for (size_type i = 0; i < numImportLIDs; ++i) {
3158  const size_t numBytes = numPacketsPerLID[i];
3159  if (numBytes == 0) {
3160  continue; // empty buffer for that row means that the row is empty
3161  }
3162  const size_t numEnt =
3163  unpackRowCount<ST, LO, GO, HES> (importsK, offset, numBytes,
3164  numBytesPerValue);
3165  if (numEnt > maxRowNumEnt) {
3166  errorDuringUnpack = true;
3167 #ifdef HAVE_TPETRA_DEBUG
3168  std::ostream& err = this->markLocalErrorAndGetStream ();
3169  err << prefix << "At i = " << i << ", numEnt = " << numEnt
3170  << " > maxRowNumEnt = " << maxRowNumEnt << endl;
3171 #endif // HAVE_TPETRA_DEBUG
3172  continue;
3173  }
3174 
3175  const size_t numScalarEnt = numEnt * blockSize * blockSize;
3176  const LO lclRow = importLIDs[i];
3177 
3178  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
3179  vals_out_type valsOut = subview (vals, pair_type (0, numScalarEnt));
3180 
3181  const size_t numBytesOut =
3182  unpackRowForBlockCrs<ST, LO, GO, HES> (gidsOut, valsOut, importsK,
3183  offset, numBytes, numEnt,
3184  numBytesPerValue, blockSize);
3185  if (numBytes != numBytesOut) {
3186  errorDuringUnpack = true;
3187 #ifdef HAVE_TPETRA_DEBUG
3188  std::ostream& err = this->markLocalErrorAndGetStream ();
3189  err << prefix << "At i = " << i << ", numBytes = " << numBytes
3190  << " != numBytesOut = " << numBytesOut << ".";
3191 #endif // HAVE_TPETRA_DEBUG
3192  continue;
3193  }
3194 
3195  // Convert incoming global indices to local indices.
3196  lids_out_type lidsOut = subview (lclColInds, pair_type (0, numEnt));
3197  for (size_t k = 0; k < numEnt; ++k) {
3198  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3199  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3200  errorDuringUnpack = true;
3201 #ifdef HAVE_TPETRA_DEBUG
3202  std::ostream& err = this->markLocalErrorAndGetStream ();
3203  err << prefix << "At i = " << i << ", GID " << gidsOut(k)
3204  << " is not owned by the calling process.";
3205 #endif // HAVE_TPETRA_DEBUG
3206  continue;
3207  }
3208  }
3209 
3210  // Combine the incoming data with the matrix's current data.
3211  LO numCombd = 0;
3212  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
3213  const Scalar* const valsRaw =
3214  reinterpret_cast<const Scalar*> (const_cast<const ST*> (valsOut.data ()));
3215  if (CM == ADD) {
3216  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3217  } else if (CM == INSERT || CM == REPLACE) {
3218  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3219  } else if (CM == ABSMAX) {
3220  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3221  }
3222 
3223  if (static_cast<LO> (numEnt) != numCombd) {
3224  errorDuringUnpack = true;
3225 #ifdef HAVE_TPETRA_DEBUG
3226  std::ostream& err = this->markLocalErrorAndGetStream ();
3227  err << prefix << "At i = " << i << ", numEnt = " << numEnt
3228  << " != numCombd = " << numCombd << ".";
3229 #endif // HAVE_TPETRA_DEBUG
3230  continue;
3231  }
3232 
3233  // Don't update offset until current LID has succeeded.
3234  offset += numBytes;
3235  } // for each import LID i
3236 
3237  if (errorDuringUnpack) {
3238  std::ostream& err = this->markLocalErrorAndGetStream ();
3239  err << prefix << "Unpacking failed.";
3240 #ifndef HAVE_TPETRA_DEBUG
3241  err << " Please run again with a debug build to get more verbose "
3242  "diagnostic output.";
3243 #endif // ! HAVE_TPETRA_DEBUG
3244  err << endl;
3245  }
3246 
3247  if (debug) {
3248  std::ostringstream os;
3249  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3250  const bool lclSuccess = ! (* (this->localError_));
3251  os << "*** Proc " << myRank << ": unpackAndCombine "
3252  << (lclSuccess ? "succeeded" : "FAILED")
3253  << " ***" << endl;
3254  std::cerr << os.str ();
3255  }
3256  }
3257 
3258 
3259  template<class Scalar, class LO, class GO, class Node>
3260  std::string
3262  {
3263  using Teuchos::TypeNameTraits;
3264  std::ostringstream os;
3265  os << "\"Tpetra::BlockCrsMatrix\": { "
3266  << "Template parameters: { "
3267  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3268  << "LO: " << TypeNameTraits<LO>::name ()
3269  << "GO: " << TypeNameTraits<GO>::name ()
3270  << "Node: " << TypeNameTraits<Node>::name ()
3271  << " }"
3272  << ", Label: \"" << this->getObjectLabel () << "\""
3273  << ", Global dimensions: ["
3274  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3275  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3276  << ", Block size: " << getBlockSize ()
3277  << " }";
3278  return os.str ();
3279  }
3280 
3281 
3282  template<class Scalar, class LO, class GO, class Node>
3283  void
3285  describe (Teuchos::FancyOStream& out,
3286  const Teuchos::EVerbosityLevel verbLevel) const
3287  {
3288  using Teuchos::ArrayRCP;
3289  using Teuchos::CommRequest;
3290  using Teuchos::FancyOStream;
3291  using Teuchos::getFancyOStream;
3292  using Teuchos::ireceive;
3293  using Teuchos::isend;
3294  using Teuchos::outArg;
3295  using Teuchos::TypeNameTraits;
3296  using Teuchos::VERB_DEFAULT;
3297  using Teuchos::VERB_NONE;
3298  using Teuchos::VERB_LOW;
3299  using Teuchos::VERB_MEDIUM;
3300  // using Teuchos::VERB_HIGH;
3301  using Teuchos::VERB_EXTREME;
3302  using Teuchos::RCP;
3303  using Teuchos::wait;
3304  using std::endl;
3305 #ifdef HAVE_TPETRA_DEBUG
3306  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::describe: ";
3307 #endif // HAVE_TPETRA_DEBUG
3308 
3309  // Set default verbosity if applicable.
3310  const Teuchos::EVerbosityLevel vl =
3311  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3312 
3313  if (vl == VERB_NONE) {
3314  return; // print nothing
3315  }
3316 
3317  // describe() always starts with a tab before it prints anything.
3318  Teuchos::OSTab tab0 (out);
3319 
3320  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3321  Teuchos::OSTab tab1 (out);
3322 
3323  out << "Template parameters:" << endl;
3324  {
3325  Teuchos::OSTab tab2 (out);
3326  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3327  << "LO: " << TypeNameTraits<LO>::name () << endl
3328  << "GO: " << TypeNameTraits<GO>::name () << endl
3329  << "Node: " << TypeNameTraits<Node>::name () << endl;
3330  }
3331  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3332  << "Global dimensions: ["
3333  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3334  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3335 
3336  const LO blockSize = getBlockSize ();
3337  out << "Block size: " << blockSize << endl;
3338 
3339  // constituent objects
3340  if (vl >= VERB_MEDIUM) {
3341  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3342  const int myRank = comm.getRank ();
3343  if (myRank == 0) {
3344  out << "Row Map:" << endl;
3345  }
3346  getRowMap()->describe (out, vl);
3347 
3348  if (! getColMap ().is_null ()) {
3349  if (getColMap() == getRowMap()) {
3350  if (myRank == 0) {
3351  out << "Column Map: same as row Map" << endl;
3352  }
3353  }
3354  else {
3355  if (myRank == 0) {
3356  out << "Column Map:" << endl;
3357  }
3358  getColMap ()->describe (out, vl);
3359  }
3360  }
3361  if (! getDomainMap ().is_null ()) {
3362  if (getDomainMap () == getRowMap ()) {
3363  if (myRank == 0) {
3364  out << "Domain Map: same as row Map" << endl;
3365  }
3366  }
3367  else if (getDomainMap () == getColMap ()) {
3368  if (myRank == 0) {
3369  out << "Domain Map: same as column Map" << endl;
3370  }
3371  }
3372  else {
3373  if (myRank == 0) {
3374  out << "Domain Map:" << endl;
3375  }
3376  getDomainMap ()->describe (out, vl);
3377  }
3378  }
3379  if (! getRangeMap ().is_null ()) {
3380  if (getRangeMap () == getDomainMap ()) {
3381  if (myRank == 0) {
3382  out << "Range Map: same as domain Map" << endl;
3383  }
3384  }
3385  else if (getRangeMap () == getRowMap ()) {
3386  if (myRank == 0) {
3387  out << "Range Map: same as row Map" << endl;
3388  }
3389  }
3390  else {
3391  if (myRank == 0) {
3392  out << "Range Map: " << endl;
3393  }
3394  getRangeMap ()->describe (out, vl);
3395  }
3396  }
3397  }
3398 
3399  if (vl >= VERB_EXTREME) {
3400  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3401  // to host, since it's supposed to be const. However, that's
3402  // the easiest and least memory-intensive way to implement this
3403  // method.
3405  const_cast<this_type*> (this)->template sync<Kokkos::HostSpace> ();
3406 
3407 #ifdef HAVE_TPETRA_DEBUG
3408  TEUCHOS_TEST_FOR_EXCEPTION
3409  (this->template need_sync<Kokkos::HostSpace> (), std::logic_error,
3410  prefix << "Right after sync to host, the matrix claims that it needs "
3411  "sync to host. Please report this bug to the Tpetra developers.");
3412 #endif // HAVE_TPETRA_DEBUG
3413 
3414  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3415  const int myRank = comm.getRank ();
3416  const int numProcs = comm.getSize ();
3417 
3418  // Print the calling process' data to the given output stream.
3419  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3420  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3421  FancyOStream& os = *osPtr;
3422  os << "Process " << myRank << ":" << endl;
3423  Teuchos::OSTab tab2 (os);
3424 
3425  const map_type& meshRowMap = * (graph_.getRowMap ());
3426  const map_type& meshColMap = * (graph_.getColMap ());
3427  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3428  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3429  ++meshLclRow) {
3430  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3431  os << "Row " << meshGblRow << ": {";
3432 
3433  const LO* lclColInds = NULL;
3434  Scalar* vals = NULL;
3435  LO numInds = 0;
3436  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3437 
3438  for (LO k = 0; k < numInds; ++k) {
3439  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3440 
3441  os << "Col " << gblCol << ": [";
3442  for (LO i = 0; i < blockSize; ++i) {
3443  for (LO j = 0; j < blockSize; ++j) {
3444  os << vals[blockSize*blockSize*k + i*blockSize + j];
3445  if (j + 1 < blockSize) {
3446  os << ", ";
3447  }
3448  }
3449  if (i + 1 < blockSize) {
3450  os << "; ";
3451  }
3452  }
3453  os << "]";
3454  if (k + 1 < numInds) {
3455  os << ", ";
3456  }
3457  }
3458  os << "}" << endl;
3459  }
3460 
3461  // Print data on Process 0. This will automatically respect the
3462  // current indentation level.
3463  if (myRank == 0) {
3464  out << lclOutStrPtr->str ();
3465  lclOutStrPtr = Teuchos::null; // clear it to save space
3466  }
3467 
3468  const int sizeTag = 1337;
3469  const int dataTag = 1338;
3470 
3471  ArrayRCP<char> recvDataBuf; // only used on Process 0
3472 
3473  // Send string sizes and data from each process in turn to
3474  // Process 0, and print on that process.
3475  for (int p = 1; p < numProcs; ++p) {
3476  if (myRank == 0) {
3477  // Receive the incoming string's length.
3478  ArrayRCP<size_t> recvSize (1);
3479  recvSize[0] = 0;
3480  RCP<CommRequest<int> > recvSizeReq =
3481  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3482  wait<int> (comm, outArg (recvSizeReq));
3483  const size_t numCharsToRecv = recvSize[0];
3484 
3485  // Allocate space for the string to receive. Reuse receive
3486  // buffer space if possible. We can do this because in the
3487  // current implementation, we only have one receive in
3488  // flight at a time. Leave space for the '\0' at the end,
3489  // in case the sender doesn't send it.
3490  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3491  recvDataBuf.resize (numCharsToRecv + 1);
3492  }
3493  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3494  // Post the receive of the actual string data.
3495  RCP<CommRequest<int> > recvDataReq =
3496  ireceive<int, char> (recvData, p, dataTag, comm);
3497  wait<int> (comm, outArg (recvDataReq));
3498 
3499  // Print the received data. This will respect the current
3500  // indentation level. Make sure that the string is
3501  // null-terminated.
3502  recvDataBuf[numCharsToRecv] = '\0';
3503  out << recvDataBuf.getRawPtr ();
3504  }
3505  else if (myRank == p) { // if I am not Process 0, and my rank is p
3506  // This deep-copies the string at most twice, depending on
3507  // whether std::string reference counts internally (it
3508  // generally does, so this won't deep-copy at all).
3509  const std::string stringToSend = lclOutStrPtr->str ();
3510  lclOutStrPtr = Teuchos::null; // clear original to save space
3511 
3512  // Send the string's length to Process 0.
3513  const size_t numCharsToSend = stringToSend.size ();
3514  ArrayRCP<size_t> sendSize (1);
3515  sendSize[0] = numCharsToSend;
3516  RCP<CommRequest<int> > sendSizeReq =
3517  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3518  wait<int> (comm, outArg (sendSizeReq));
3519 
3520  // Send the actual string to Process 0. We know that the
3521  // string has length > 0, so it's save to take the address
3522  // of the first entry. Make a nonowning ArrayRCP to hold
3523  // the string. Process 0 will add a null termination
3524  // character at the end of the string, after it receives the
3525  // message.
3526  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3527  RCP<CommRequest<int> > sendDataReq =
3528  isend<int, char> (sendData, 0, dataTag, comm);
3529  wait<int> (comm, outArg (sendDataReq));
3530  }
3531  } // for each process rank p other than 0
3532  } // extreme verbosity level (print the whole matrix)
3533  }
3534 
3535  template<class Scalar, class LO, class GO, class Node>
3536  Teuchos::RCP<const Teuchos::Comm<int> >
3538  getComm() const
3539  {
3540  return graph_.getComm();
3541  }
3542 
3543  template<class Scalar, class LO, class GO, class Node>
3544  Teuchos::RCP<Node>
3546  getNode() const
3547  {
3548  return graph_.getNode();
3549 
3550  }
3551 
3552  template<class Scalar, class LO, class GO, class Node>
3556  {
3557  return graph_.getGlobalNumCols();
3558  }
3559 
3560  template<class Scalar, class LO, class GO, class Node>
3561  size_t
3564  {
3565  return graph_.getNodeNumCols();
3566  }
3567 
3568  template<class Scalar, class LO, class GO, class Node>
3569  GO
3572  {
3573  return graph_.getIndexBase();
3574  }
3575 
3576  template<class Scalar, class LO, class GO, class Node>
3580  {
3581  return graph_.getGlobalNumEntries();
3582  }
3583 
3584  template<class Scalar, class LO, class GO, class Node>
3585  size_t
3588  {
3589  return graph_.getNodeNumEntries();
3590  }
3591 
3592  template<class Scalar, class LO, class GO, class Node>
3593  size_t
3595  getNumEntriesInGlobalRow (GO globalRow) const
3596  {
3597  return graph_.getNumEntriesInGlobalRow(globalRow);
3598  }
3599 
3600  template<class Scalar, class LO, class GO, class Node>
3604  {
3606  return dynamic_cast<const HDM&> (this->graph_).getGlobalNumDiagsImpl ();
3607  }
3608 
3609  template<class Scalar, class LO, class GO, class Node>
3610  size_t
3613  {
3615  return dynamic_cast<const HDM&> (this->graph_).getNodeNumDiagsImpl ();
3616  }
3617 
3618  template<class Scalar, class LO, class GO, class Node>
3619  size_t
3622  {
3623  return graph_.getGlobalMaxNumRowEntries();
3624  }
3625 
3626  template<class Scalar, class LO, class GO, class Node>
3627  bool
3629  hasColMap() const
3630  {
3631  return graph_.hasColMap();
3632  }
3633 
3634  template<class Scalar, class LO, class GO, class Node>
3635  bool
3638  {
3640  return dynamic_cast<const HDM&> (this->graph_).isLowerTriangularImpl ();
3641  }
3642 
3643  template<class Scalar, class LO, class GO, class Node>
3644  bool
3647  {
3649  return dynamic_cast<const HDM&> (this->graph_).isUpperTriangularImpl ();
3650  }
3651 
3652  template<class Scalar, class LO, class GO, class Node>
3653  bool
3656  {
3657  return graph_.isLocallyIndexed();
3658  }
3659 
3660  template<class Scalar, class LO, class GO, class Node>
3661  bool
3664  {
3665  return graph_.isGloballyIndexed();
3666  }
3667 
3668  template<class Scalar, class LO, class GO, class Node>
3669  bool
3672  {
3673  return graph_.isFillComplete ();
3674  }
3675 
3676  template<class Scalar, class LO, class GO, class Node>
3677  bool
3680  {
3681  return false;
3682  }
3683 
3684 
3685  template<class Scalar, class LO, class GO, class Node>
3686  void
3688  getGlobalRowCopy (GO GlobalRow,
3689  const Teuchos::ArrayView<GO> &Indices,
3690  const Teuchos::ArrayView<Scalar> &Values,
3691  size_t &NumEntries) const
3692  {
3693  TEUCHOS_TEST_FOR_EXCEPTION(
3694  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
3695  "This class doesn't support global matrix indexing.");
3696 
3697  }
3698 
3699  template<class Scalar, class LO, class GO, class Node>
3700  void
3702  getGlobalRowView (GO GlobalRow,
3703  Teuchos::ArrayView<const GO> &indices,
3704  Teuchos::ArrayView<const Scalar> &values) const
3705  {
3706  TEUCHOS_TEST_FOR_EXCEPTION(
3707  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3708  "This class doesn't support global matrix indexing.");
3709 
3710  }
3711 
3712  template<class Scalar, class LO, class GO, class Node>
3713  void
3715  getLocalRowView (LO LocalRow,
3716  Teuchos::ArrayView<const LO>& indices,
3717  Teuchos::ArrayView<const Scalar>& values) const
3718  {
3719  TEUCHOS_TEST_FOR_EXCEPTION(
3720  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: "
3721  "This class doesn't support local matrix indexing.");
3722  }
3723 
3724  template<class Scalar, class LO, class GO, class Node>
3725  void
3728  {
3729 #ifdef HAVE_TPETRA_DEBUG
3730  const char prefix[] =
3731  "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: ";
3732 #endif // HAVE_TPETRA_DEBUG
3733 
3734  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3735 
3736  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3737  graph_.getLocalDiagOffsets (diagOffsets);
3738 
3739  // The code below works on host, so use a host View.
3740  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3741  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3742  // We're filling diag on host for now.
3743  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3744 
3745 #ifdef HAVE_TPETRA_DEBUG
3746  TEUCHOS_TEST_FOR_EXCEPTION
3747  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
3748  prefix << "The matrix's data were last modified on device, but have "
3749  "not been sync'd to host. Please sync to host (by calling "
3750  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3751  "method.");
3752 #endif // HAVE_TPETRA_DEBUG
3753 
3754  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
3755  // version of the data always exists (no lazy allocation for host
3756  // data).
3758  auto vals_host_out =
3759  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
3760  Scalar* vals_host_out_raw =
3761  reinterpret_cast<Scalar*> (vals_host_out.data ());
3762 
3763  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3764  size_t rowOffset = 0;
3765  size_t offset = 0;
3766  LO bs = getBlockSize();
3767  for(size_t r=0; r<getNodeNumRows(); r++)
3768  {
3769  // move pointer to start of diagonal block
3770  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3771  for(int b=0; b<bs; b++)
3772  {
3773  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3774  }
3775  // move pointer to start of next block row
3776  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3777  }
3778 
3779  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3780  }
3781 
3782  template<class Scalar, class LO, class GO, class Node>
3783  void
3785  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& x)
3786  {
3787  TEUCHOS_TEST_FOR_EXCEPTION(
3788  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
3789  "not implemented.");
3790 
3791  }
3792 
3793  template<class Scalar, class LO, class GO, class Node>
3794  void
3796  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& x)
3797  {
3798  TEUCHOS_TEST_FOR_EXCEPTION(
3799  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
3800  "not implemented.");
3801 
3802  }
3803 
3804  template<class Scalar, class LO, class GO, class Node>
3805  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3807  getGraph() const
3808  {
3809  return graphRCP_;
3810  }
3811 
3812  template<class Scalar, class LO, class GO, class Node>
3813  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3816  {
3817  TEUCHOS_TEST_FOR_EXCEPTION(
3818  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
3819  "not implemented.");
3820  }
3821 
3822 } // namespace Classes
3823 } // namespace Experimental
3824 } // namespace Tpetra
3825 
3826 //
3827 // Explicit instantiation macro
3828 //
3829 // Must be expanded from within the Tpetra namespace!
3830 //
3831 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3832  namespace Experimental { \
3833  namespace Classes { \
3834  template class BlockCrsMatrix< S, LO, GO, NODE >; \
3835  } \
3836  }
3837 
3838 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
Base class for distributed Tpetra objects that support data redistribution.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
global_size_t getGlobalNumCols() const override
Returns the number of global columns in the graph.
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in any row of the matrix, on this process.
mv_type getMultiVectorView() const
Get a Tpetra::MultiVector that views this BlockMultiVector&#39;s data.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
virtual size_t TPETRA_DEPRECATED getNodeNumDiags() const
Number of diagonal entries in the matrix&#39;s graph, on the calling process.
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, CombineMode CM)
Export data into this object using an Export object ("forward mode").
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
LocalOrdinal getMinLocalIndex() const
The minimum local index.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
size_t getNodeNumRows() const override
Returns the number of graph rows owned on the calling node.
local_graph_type getLocalGraph() const
Get the local graph.
A distributed dense vector.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void putScalar(const Scalar &val)
Fill all entries with the given value val.
void reorderedGaussSeidelCopy(::Tpetra::MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
std::string description() const
One-line description of this object.
One or more distributed dense vectors.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
size_t getNodeNumEntries() const override
The local number of entries in the graph.
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
MultiVector for multiple degrees of freedom per mesh point.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
size_t getNodeNumRows() const
get the local number of block rows
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Returns the current number of entries on this node in the specified global row.
std::string errorMessages() const
The current stream of error messages.
int local_ordinal_type
Default value of Scalar template parameter.
virtual bool TPETRA_DEPRECATED isUpperTriangular() const
Whether the matrix&#39;s graph is locally upper triangular.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
bool hasColMap() const override
Whether the graph has a column Map.
global_size_t getGlobalNumRows() const
get the global number of block rows
size_t global_size_t
Global size_t object.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the graph, over all processes in the graph&#39;s communicator...
GlobalOrdinal getIndexBase() const override
Returns the index base for global indices for this graph.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
virtual bool hasColMap() const
Whether this matrix has a well-defined column map.
size_t getNodeNumCols() const override
Returns the number of columns connected to the locally owned rows of this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
Insert new values that don&#39;t currently exist.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
global_size_t getGlobalNumEntries() const override
Returns the global number of entries in the graph.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the graph.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
virtual global_size_t TPETRA_DEPRECATED getGlobalNumDiags() const
Number of diagonal entries in the matrix&#39;s graph, over all processes in the matrix&#39;s communicator...
Sum new values into existing values.
bool locallySameAs(const Map< LocalOrdinal, GlobalOrdinal, node_type > &map) const
Is this Map locally the same as the input Map?
bool isGloballyIndexed() const override
If graph indices are in the global range, this function returns true. Otherwise, this function return...
LocalOrdinal getLocalElement(GlobalOrdinal globalIndex) const
The local index corresponding to the given global index.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
virtual bool isFillComplete() const
Whether fillComplete() has been called.
Replace old value with maximum of magnitudes of old and new values.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries across all rows/columns on all nodes.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
Replace existing values with new values.
Replace old values with zero.
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
virtual bool TPETRA_DEPRECATED isLowerTriangular() const
Whether the matrix&#39;s graph is locally lower triangular.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
LO getBlockSize() const
The number of degrees of freedom per mesh point.
Teuchos::RCP< node_type > getNode() const override
Returns the underlying node.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
Returns the communicator.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const override
Get the number of entries in the given row (local index).
bool isNodeLocalElement(LocalOrdinal localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
virtual GO getIndexBase() const
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
bool isLocallyIndexed() const override
If graph indices are in the local range, this function returns true. Otherwise, this function returns...
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
bool isFillComplete() const override
Returns true if fillComplete() has been called and the graph is in compute mode.
global_size_t getGlobalNumRows() const override
Returns the number of global rows in the graph.
device_type::memory_space memory_space
The Kokkos memory space that this class uses.
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
A parallel distribution of indices over processes.
virtual typename ::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
virtual Teuchos::RCP< Node > getNode() const
The Kokkos Node instance.
Teuchos::RCP< const export_type > getExporter() const override
Returns the exporter associated with this graph.
Teuchos::RCP< const import_type > getImporter() const override
Returns the importer associated with this graph.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the graph, on this process.
Declaration of Tpetra::Experimental::BlockCrsMatrix.
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
Mix-in to avoid spurious deprecation warnings due to #2630.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
Kokkos::View< impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
void scale(const Scalar &val)
Multiply all entries in place by the given value val.
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row&#39;s entries.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Kokkos::View< const impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.