Tpetra parallel linear algebra  Version of the Day
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
44 
47 
51 #include "Tpetra_BlockMultiVector.hpp"
52 #include "Tpetra_BlockView.hpp"
53 
54 #include "Teuchos_TimeMonitor.hpp"
55 #ifdef HAVE_TPETRA_DEBUG
56 # include <set>
57 #endif // HAVE_TPETRA_DEBUG
58 
59 //
60 // mfh 25 May 2016: Temporary fix for #393.
61 //
62 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
63 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
64 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
65 // use them in that case, either.
66 //
67 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
68 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
69 // thing for any GCC version.
70 //
71 #if defined(__CUDACC__)
72  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
73 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
75 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
76 
77 #elif defined(__GNUC__)
78 
79 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
81 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
82 
83 #else // some other compiler
84 
85  // Optimistically assume that other compilers aren't broken.
86 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
88 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
89 #endif // defined(__CUDACC__), defined(__GNUC__)
90 
91 
92 namespace Tpetra {
93 
94 namespace Impl {
95 
96  template<typename T>
97  struct BlockCrsRowStruct {
98  T totalNumEntries, totalNumBytes, maxRowLength;
99  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct() = default;
100  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
101  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
102  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
103  };
104 
105  template<typename T>
106  static
107  KOKKOS_INLINE_FUNCTION
108  void operator+=(volatile BlockCrsRowStruct<T> &a,
109  volatile const BlockCrsRowStruct<T> &b) {
110  a.totalNumEntries += b.totalNumEntries;
111  a.totalNumBytes += b.totalNumBytes;
112  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
113  }
114 
115  template<typename T>
116  static
117  KOKKOS_INLINE_FUNCTION
118  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
119  a.totalNumEntries += b.totalNumEntries;
120  a.totalNumBytes += b.totalNumBytes;
121  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
122  }
123 
124  template<typename T, typename ExecSpace>
125  struct BlockCrsReducer {
126  typedef BlockCrsReducer reducer;
127  typedef T value_type;
128  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
129  value_type *value;
130 
131  KOKKOS_INLINE_FUNCTION
132  BlockCrsReducer(value_type &val) : value(&val) {}
133 
134  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
135  KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const { dst += src; }
136  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
137  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
138  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
139  };
140 
141  template<class AlphaCoeffType,
142  class GraphType,
143  class MatrixValuesType,
144  class InVecType,
145  class BetaCoeffType,
146  class OutVecType,
147  bool IsBuiltInType>
148  class BcrsApplyNoTransFunctor {
149  private:
150  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
151  "MatrixValuesType must be a Kokkos::View.");
152  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
153  "OutVecType must be a Kokkos::View.");
154  static_assert (Kokkos::Impl::is_view<InVecType>::value,
155  "InVecType must be a Kokkos::View.");
156  static_assert (std::is_same<MatrixValuesType,
157  typename MatrixValuesType::const_type>::value,
158  "MatrixValuesType must be a const Kokkos::View.");
159  static_assert (std::is_same<OutVecType,
160  typename OutVecType::non_const_type>::value,
161  "OutVecType must be a nonconst Kokkos::View.");
162  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
163  "InVecType must be a const Kokkos::View.");
164  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
165  "MatrixValuesType must be a rank-1 Kokkos::View.");
166  static_assert (static_cast<int> (InVecType::rank) == 1,
167  "InVecType must be a rank-1 Kokkos::View.");
168  static_assert (static_cast<int> (OutVecType::rank) == 1,
169  "OutVecType must be a rank-1 Kokkos::View.");
170  typedef typename MatrixValuesType::non_const_value_type scalar_type;
171 
172  public:
173  typedef typename GraphType::device_type device_type;
174 
176  typedef typename std::remove_const<typename GraphType::data_type>::type
178 
185  void setX (const InVecType& X) { X_ = X; }
186 
193  void setY (const OutVecType& Y) { Y_ = Y; }
194 
195  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
196  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
197 
199  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
200  const GraphType& graph,
201  const MatrixValuesType& val,
202  const local_ordinal_type blockSize,
203  const InVecType& X,
204  const beta_coeff_type& beta,
205  const OutVecType& Y) :
206  alpha_ (alpha),
207  ptr_ (graph.row_map),
208  ind_ (graph.entries),
209  val_ (val),
210  blockSize_ (blockSize),
211  X_ (X),
212  beta_ (beta),
213  Y_ (Y)
214  {}
215 
216  // Dummy team version
217  KOKKOS_INLINE_FUNCTION void
218  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
219  {
220  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
221  }
222 
223  // Range Policy for non built-in types
224  KOKKOS_INLINE_FUNCTION void
225  operator () (const local_ordinal_type& lclRow) const
226  {
231  using Kokkos::Details::ArithTraits;
232  // I'm not writing 'using Kokkos::make_pair;' here, because that
233  // may break builds for users who make the mistake of putting
234  // 'using namespace std;' in the global namespace. Please don't
235  // ever do that! But just in case you do, I'll take this
236  // precaution.
237  using Kokkos::parallel_for;
238  using Kokkos::subview;
239  typedef typename decltype (ptr_)::non_const_value_type offset_type;
240  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
241  Kokkos::LayoutRight,
242  device_type,
243  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
244  little_block_type;
245 
246  const offset_type Y_ptBeg = lclRow * blockSize_;
247  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
248  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
249 
250  // This version of the code does not use temporary storage.
251  // Each thread writes to its own block of the target vector.
252  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
253  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
254  }
255  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
256  SCAL (beta_, Y_cur);
257  }
258 
259  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
260  const offset_type blkBeg = ptr_[lclRow];
261  const offset_type blkEnd = ptr_[lclRow+1];
262  // Precompute to save integer math in the inner loop.
263  const offset_type bs2 = blockSize_ * blockSize_;
264  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
265  ++absBlkOff) {
266  little_block_type A_cur (val_.data () + absBlkOff * bs2,
267  blockSize_, blockSize_);
268  const offset_type X_blkCol = ind_[absBlkOff];
269  const offset_type X_ptBeg = X_blkCol * blockSize_;
270  const offset_type X_ptEnd = X_ptBeg + blockSize_;
271  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
272 
273  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
274  } // for each entry in current local block row of matrix
275  }
276  }
277 
278  private:
279  alpha_coeff_type alpha_;
280  typename GraphType::row_map_type::const_type ptr_;
281  typename GraphType::entries_type::const_type ind_;
282  MatrixValuesType val_;
283  local_ordinal_type blockSize_;
284  InVecType X_;
285  beta_coeff_type beta_;
286  OutVecType Y_;
287  };
288 
289  template<class AlphaCoeffType,
290  class GraphType,
291  class MatrixValuesType,
292  class InVecType,
293  class BetaCoeffType,
294  class OutVecType>
295  class BcrsApplyNoTransFunctor<AlphaCoeffType,
296  GraphType,
297  MatrixValuesType,
298  InVecType,
299  BetaCoeffType,
300  OutVecType,
301  true> {
302  private:
303  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
304  "MatrixValuesType must be a Kokkos::View.");
305  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
306  "OutVecType must be a Kokkos::View.");
307  static_assert (Kokkos::Impl::is_view<InVecType>::value,
308  "InVecType must be a Kokkos::View.");
309  static_assert (std::is_same<MatrixValuesType,
310  typename MatrixValuesType::const_type>::value,
311  "MatrixValuesType must be a const Kokkos::View.");
312  static_assert (std::is_same<OutVecType,
313  typename OutVecType::non_const_type>::value,
314  "OutVecType must be a nonconst Kokkos::View.");
315  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
316  "InVecType must be a const Kokkos::View.");
317  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
318  "MatrixValuesType must be a rank-1 Kokkos::View.");
319  static_assert (static_cast<int> (InVecType::rank) == 1,
320  "InVecType must be a rank-1 Kokkos::View.");
321  static_assert (static_cast<int> (OutVecType::rank) == 1,
322  "OutVecType must be a rank-1 Kokkos::View.");
323  typedef typename MatrixValuesType::non_const_value_type scalar_type;
324 
325  public:
326  typedef typename GraphType::device_type device_type;
327 
329  typedef typename std::remove_const<typename GraphType::data_type>::type
331 
338  void setX (const InVecType& X) { X_ = X; }
339 
346  void setY (const OutVecType& Y) { Y_ = Y; }
347 
348  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
349  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
350 
352  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
353  const GraphType& graph,
354  const MatrixValuesType& val,
355  const local_ordinal_type blockSize,
356  const InVecType& X,
357  const beta_coeff_type& beta,
358  const OutVecType& Y) :
359  alpha_ (alpha),
360  ptr_ (graph.row_map),
361  ind_ (graph.entries),
362  val_ (val),
363  blockSize_ (blockSize),
364  X_ (X),
365  beta_ (beta),
366  Y_ (Y)
367  {}
368 
369  // dummy Range version
370  KOKKOS_INLINE_FUNCTION void
371  operator () (const local_ordinal_type& lclRow) const
372  {
373  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
374  }
375 
376  // Team policy for built-in types
377  KOKKOS_INLINE_FUNCTION void
378  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
379  {
380  const local_ordinal_type lclRow = member.league_rank();
381 
382  using Kokkos::Details::ArithTraits;
383  // I'm not writing 'using Kokkos::make_pair;' here, because that
384  // may break builds for users who make the mistake of putting
385  // 'using namespace std;' in the global namespace. Please don't
386  // ever do that! But just in case you do, I'll take this
387  // precaution.
388  using Kokkos::parallel_for;
389  using Kokkos::subview;
390  typedef typename decltype (ptr_)::non_const_value_type offset_type;
391  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
392  Kokkos::LayoutRight,
393  device_type,
394  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
395  little_block_type;
396 
397  const offset_type Y_ptBeg = lclRow * blockSize_;
398  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
399  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
400 
401  // This version of the code does not use temporary storage.
402  // Each thread writes to its own block of the target vector.
403  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
404  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
405  [&](const local_ordinal_type &i) {
406  Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
407  });
408  }
409  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
410  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
411  [&](const local_ordinal_type &i) {
412  Y_cur(i) *= beta_;
413  });
414  }
415  member.team_barrier();
416 
417  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
418  const offset_type blkBeg = ptr_[lclRow];
419  const offset_type blkEnd = ptr_[lclRow+1];
420  // Precompute to save integer math in the inner loop.
421  const offset_type bs2 = blockSize_ * blockSize_;
422  little_block_type A_cur (val_.data (), blockSize_, blockSize_);
423  auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
424  Kokkos::parallel_for
425  (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
426  [&](const local_ordinal_type &absBlkOff) {
427  A_cur.assign_data(val_.data () + absBlkOff * bs2);
428  const offset_type X_blkCol = ind_[absBlkOff];
429  const offset_type X_ptBeg = X_blkCol * blockSize_;
430  X_cur.assign_data(&X_(X_ptBeg));
431 
432  Kokkos::parallel_for
433  (Kokkos::ThreadVectorRange(member, blockSize_),
434  [&](const local_ordinal_type &k0) {
435  scalar_type val(0);
436  for (local_ordinal_type k1=0;k1<blockSize_;++k1)
437  val += A_cur(k0,k1)*X_cur(k1);
438 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
439  // host space team size is always 1
440  Y_cur(k0) += alpha_*val;
441 #else
442  // cuda space team size can be larger than 1
443  // atomic is not allowed for sacado type;
444  // thus this needs to be specialized or
445  // sacado atomic should be supported.
446  Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
447 #endif
448  });
449  }); // for each entry in current local block row of matrix
450  }
451  }
452 
453  private:
454  alpha_coeff_type alpha_;
455  typename GraphType::row_map_type::const_type ptr_;
456  typename GraphType::entries_type::const_type ind_;
457  MatrixValuesType val_;
458  local_ordinal_type blockSize_;
459  InVecType X_;
460  beta_coeff_type beta_;
461  OutVecType Y_;
462  };
463 
464 
465  template<class AlphaCoeffType,
466  class GraphType,
467  class MatrixValuesType,
468  class InMultiVecType,
469  class BetaCoeffType,
470  class OutMultiVecType>
471  void
472  bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
473  const GraphType& graph,
474  const MatrixValuesType& val,
475  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
476  const InMultiVecType& X,
477  const BetaCoeffType& beta,
478  const OutMultiVecType& Y
479  )
480  {
481  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
482  "MatrixValuesType must be a Kokkos::View.");
483  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
484  "OutMultiVecType must be a Kokkos::View.");
485  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
486  "InMultiVecType must be a Kokkos::View.");
487  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
488  "MatrixValuesType must be a rank-1 Kokkos::View.");
489  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
490  "OutMultiVecType must be a rank-2 Kokkos::View.");
491  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
492  "InMultiVecType must be a rank-2 Kokkos::View.");
493 
494  typedef typename GraphType::device_type::execution_space execution_space;
495  typedef typename MatrixValuesType::const_type matrix_values_type;
496  typedef typename OutMultiVecType::non_const_type out_multivec_type;
497  typedef typename InMultiVecType::const_type in_multivec_type;
498  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
499  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
500  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
501 
502  constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
503 
504  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
505  static_cast<LO> (0) :
506  static_cast<LO> (graph.row_map.extent (0) - 1);
507  const LO numVecs = Y.extent (1);
508  if (numLocalMeshRows == 0 || numVecs == 0) {
509  return; // code below doesn't handle numVecs==0 correctly
510  }
511 
512  // These assignments avoid instantiating the functor extra times
513  // unnecessarily, e.g., for X const vs. nonconst. We only need the
514  // X const case, so only instantiate for that case.
515  in_multivec_type X_in = X;
516  out_multivec_type Y_out = Y;
517 
518  // The functor only knows how to handle one vector at a time, and it
519  // expects 1-D Views. Thus, we need to know the type of each column
520  // of X and Y.
521  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
522  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
523  typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
524  matrix_values_type, in_vec_type, beta_type, out_vec_type,
525  is_builtin_type_enabled> functor_type;
526 
527  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
528  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
529 
530  // Compute the first column of Y.
531  if (is_builtin_type_enabled) {
532  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
533  // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
534  typedef Kokkos::TeamPolicy<execution_space> policy_type;
535  policy_type policy(1,1);
536 #if defined(KOKKOS_ENABLE_CUDA)
537  constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
538 #else
539  constexpr bool is_cuda = false;
540 #endif // defined(KOKKOS_ENABLE_CUDA)
541  if (is_cuda) {
542  LO team_size = 8, vector_size = 1;
543  if (blockSize <= 5) vector_size = 4;
544  else if (blockSize <= 9) vector_size = 8;
545  else if (blockSize <= 12) vector_size = 12;
546  else if (blockSize <= 20) vector_size = 20;
547  else vector_size = 20;
548  policy = policy_type(numLocalMeshRows, team_size, vector_size);
549  } else {
550  policy = policy_type(numLocalMeshRows, 1, 1);
551  }
552  Kokkos::parallel_for (policy, functor);
553 
554  // Compute the remaining columns of Y.
555  for (LO j = 1; j < numVecs; ++j) {
556  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
557  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
558  functor.setX (X_j);
559  functor.setY (Y_j);
560  Kokkos::parallel_for (policy, functor);
561  }
562  } else {
563  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
564  Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
565  Kokkos::parallel_for (policy, functor);
566  for (LO j = 1; j < numVecs; ++j) {
567  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
568  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
569  functor.setX (X_j);
570  functor.setY (Y_j);
571  Kokkos::parallel_for (policy, functor);
572  }
573  }
574  }
575 } // namespace Impl
576 
577 namespace { // (anonymous)
578 
579 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
580 // version that takes two Kokkos::View arguments).
581 template<class Scalar, class LO, class GO, class Node>
582 class GetLocalDiagCopy {
583 public:
584  typedef typename Node::device_type device_type;
585  typedef size_t diag_offset_type;
586  typedef Kokkos::View<const size_t*, device_type,
587  Kokkos::MemoryUnmanaged> diag_offsets_type;
588  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
589  typedef typename global_graph_type::local_graph_type local_graph_type;
590  typedef typename local_graph_type::row_map_type row_offsets_type;
591  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
592  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
593  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
594 
595  // Constructor
596  GetLocalDiagCopy (const diag_type& diag,
597  const values_type& val,
598  const diag_offsets_type& diagOffsets,
599  const row_offsets_type& ptr,
600  const LO blockSize) :
601  diag_ (diag),
602  diagOffsets_ (diagOffsets),
603  ptr_ (ptr),
604  blockSize_ (blockSize),
605  offsetPerBlock_ (blockSize_*blockSize_),
606  val_(val)
607  {}
608 
609  KOKKOS_INLINE_FUNCTION void
610  operator() (const LO& lclRowInd) const
611  {
612  using Kokkos::ALL;
613 
614  // Get row offset
615  const size_t absOffset = ptr_[lclRowInd];
616 
617  // Get offset relative to start of row
618  const size_t relOffset = diagOffsets_[lclRowInd];
619 
620  // Get the total offset
621  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
622 
623  // Get a view of the block. BCRS currently uses LayoutRight
624  // regardless of the device.
625  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
626  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
627  const_little_block_type;
628  const_little_block_type D_in (val_.data () + pointOffset,
629  blockSize_, blockSize_);
630  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
631  COPY (D_in, D_out);
632  }
633 
634  private:
635  diag_type diag_;
636  diag_offsets_type diagOffsets_;
637  row_offsets_type ptr_;
638  LO blockSize_;
639  LO offsetPerBlock_;
640  values_type val_;
641  };
642 } // namespace (anonymous)
643 
644  template<class Scalar, class LO, class GO, class Node>
645  std::ostream&
646  BlockCrsMatrix<Scalar, LO, GO, Node>::
647  markLocalErrorAndGetStream ()
648  {
649  * (this->localError_) = true;
650  if ((*errs_).is_null ()) {
651  *errs_ = Teuchos::rcp (new std::ostringstream ());
652  }
653  return **errs_;
654  }
655 
656  template<class Scalar, class LO, class GO, class Node>
658  BlockCrsMatrix () :
659  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
660  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
661  blockSize_ (static_cast<LO> (0)),
662  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
663  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
664  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
665  offsetPerBlock_ (0),
666  localError_ (new bool (false)),
667  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
668  {
669  }
670 
671  template<class Scalar, class LO, class GO, class Node>
673  BlockCrsMatrix (const crs_graph_type& graph,
674  const LO blockSize) :
675  dist_object_type (graph.getMap ()),
676  graph_ (graph),
677  rowMeshMap_ (* (graph.getRowMap ())),
678  blockSize_ (blockSize),
679  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
680  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
681  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
682  offsetPerBlock_ (blockSize * blockSize),
683  localError_ (new bool (false)),
684  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
685  {
686  TEUCHOS_TEST_FOR_EXCEPTION(
687  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
688  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
689  "rows (isSorted() is false). This class assumes sorted rows.");
690 
691  graphRCP_ = Teuchos::rcpFromRef(graph_);
692 
693  // Trick to test whether LO is nonpositive, without a compiler
694  // warning in case LO is unsigned (which is generally a bad idea
695  // anyway). I don't promise that the trick works, but it
696  // generally does with gcc at least, in my experience.
697  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
698  TEUCHOS_TEST_FOR_EXCEPTION(
699  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
700  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
701  " <= 0. The block size must be positive.");
702 
703  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
704  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
705 
706  {
707  // These are rcp
708  const auto domainPointMap = getDomainMap();
709  const auto colPointMap = Teuchos::rcp
710  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
711  *pointImporter_ = Teuchos::rcp
712  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
713  }
714  {
715  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
716  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
717 
718  row_map_type ptr_d = graph.getLocalGraph ().row_map;
719  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
720  Kokkos::deep_copy (ptr_h_nc, ptr_d);
721  ptrHost_ = ptr_h_nc;
722  }
723  {
724  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
725  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
726 
727  entries_type ind_d = graph.getLocalGraph ().entries;
728  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
729  Kokkos::deep_copy (ind_h_nc, ind_d);
730  indHost_ = ind_h_nc;
731  }
732 
733  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
734  val_ = decltype (val_) ("val", numValEnt);
735  }
736 
737  template<class Scalar, class LO, class GO, class Node>
739  BlockCrsMatrix (const crs_graph_type& graph,
740  const map_type& domainPointMap,
741  const map_type& rangePointMap,
742  const LO blockSize) :
743  dist_object_type (graph.getMap ()),
744  graph_ (graph),
745  rowMeshMap_ (* (graph.getRowMap ())),
746  domainPointMap_ (domainPointMap),
747  rangePointMap_ (rangePointMap),
748  blockSize_ (blockSize),
749  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
750  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
751  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
752  offsetPerBlock_ (blockSize * blockSize),
753  localError_ (new bool (false)),
754  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
755  {
756  TEUCHOS_TEST_FOR_EXCEPTION(
757  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
758  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
759  "rows (isSorted() is false). This class assumes sorted rows.");
760 
761  graphRCP_ = Teuchos::rcpFromRef(graph_);
762 
763  // Trick to test whether LO is nonpositive, without a compiler
764  // warning in case LO is unsigned (which is generally a bad idea
765  // anyway). I don't promise that the trick works, but it
766  // generally does with gcc at least, in my experience.
767  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
768  TEUCHOS_TEST_FOR_EXCEPTION(
769  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
770  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
771  " <= 0. The block size must be positive.");
772  {
773  // These are rcp
774  const auto rcpDomainPointMap = getDomainMap();
775  const auto colPointMap = Teuchos::rcp
776  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
777  *pointImporter_ = Teuchos::rcp
778  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
779  }
780  {
781  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
782  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
783 
784  row_map_type ptr_d = graph.getLocalGraph ().row_map;
785  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
786  Kokkos::deep_copy (ptr_h_nc, ptr_d);
787  ptrHost_ = ptr_h_nc;
788  }
789  {
790  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
791  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
792 
793  entries_type ind_d = graph.getLocalGraph ().entries;
794  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
795  Kokkos::deep_copy (ind_h_nc, ind_d);
796  indHost_ = ind_h_nc;
797  }
798 
799  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
800  val_ = decltype (val_) ("val", numValEnt);
801  }
802 
803  template<class Scalar, class LO, class GO, class Node>
804  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
806  getDomainMap () const
807  { // Copy constructor of map_type does a shallow copy.
808  // We're only returning by RCP for backwards compatibility.
809  return Teuchos::rcp (new map_type (domainPointMap_));
810  }
811 
812  template<class Scalar, class LO, class GO, class Node>
813  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
815  getRangeMap () const
816  { // Copy constructor of map_type does a shallow copy.
817  // We're only returning by RCP for backwards compatibility.
818  return Teuchos::rcp (new map_type (rangePointMap_));
819  }
820 
821  template<class Scalar, class LO, class GO, class Node>
822  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
824  getRowMap () const
825  {
826  return graph_.getRowMap();
827  }
828 
829  template<class Scalar, class LO, class GO, class Node>
830  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
832  getColMap () const
833  {
834  return graph_.getColMap();
835  }
836 
837  template<class Scalar, class LO, class GO, class Node>
840  getGlobalNumRows() const
841  {
842  return graph_.getGlobalNumRows();
843  }
844 
845  template<class Scalar, class LO, class GO, class Node>
846  size_t
848  getNodeNumRows() const
849  {
850  return graph_.getNodeNumRows();
851  }
852 
853  template<class Scalar, class LO, class GO, class Node>
854  size_t
857  {
858  return graph_.getNodeMaxNumRowEntries();
859  }
860 
861  template<class Scalar, class LO, class GO, class Node>
862  void
864  apply (const mv_type& X,
865  mv_type& Y,
866  Teuchos::ETransp mode,
867  Scalar alpha,
868  Scalar beta) const
869  {
871  TEUCHOS_TEST_FOR_EXCEPTION(
872  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
873  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
874  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
875  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
876 
877  BMV X_view;
878  BMV Y_view;
879  const LO blockSize = getBlockSize ();
880  try {
881  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
882  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
883  }
884  catch (std::invalid_argument& e) {
885  TEUCHOS_TEST_FOR_EXCEPTION(
886  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
887  "apply: Either the input MultiVector X or the output MultiVector Y "
888  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
889  "graph. BlockMultiVector's constructor threw the following exception: "
890  << e.what ());
891  }
892 
893  try {
894  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
895  // either that or mark fields of this class as 'mutable'. The
896  // problem is that applyBlock wants to do lazy initialization of
897  // temporary block multivectors.
898  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
899  } catch (std::invalid_argument& e) {
900  TEUCHOS_TEST_FOR_EXCEPTION(
901  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
902  "apply: The implementation method applyBlock complained about having "
903  "an invalid argument. It reported the following: " << e.what ());
904  } catch (std::logic_error& e) {
905  TEUCHOS_TEST_FOR_EXCEPTION(
906  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
907  "apply: The implementation method applyBlock complained about a "
908  "possible bug in its implementation. It reported the following: "
909  << e.what ());
910  } catch (std::exception& e) {
911  TEUCHOS_TEST_FOR_EXCEPTION(
912  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
913  "apply: The implementation method applyBlock threw an exception which "
914  "is neither std::invalid_argument nor std::logic_error, but is a "
915  "subclass of std::exception. It reported the following: "
916  << e.what ());
917  } catch (...) {
918  TEUCHOS_TEST_FOR_EXCEPTION(
919  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
920  "apply: The implementation method applyBlock threw an exception which "
921  "is not an instance of a subclass of std::exception. This probably "
922  "indicates a bug. Please report this to the Tpetra developers.");
923  }
924  }
925 
926  template<class Scalar, class LO, class GO, class Node>
927  void
931  Teuchos::ETransp mode,
932  const Scalar alpha,
933  const Scalar beta)
934  {
935  TEUCHOS_TEST_FOR_EXCEPTION(
936  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
937  "Tpetra::BlockCrsMatrix::applyBlock: "
938  "X and Y have different block sizes. X.getBlockSize() = "
939  << X.getBlockSize () << " != Y.getBlockSize() = "
940  << Y.getBlockSize () << ".");
941 
942  if (mode == Teuchos::NO_TRANS) {
943  applyBlockNoTrans (X, Y, alpha, beta);
944  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
945  applyBlockTrans (X, Y, mode, alpha, beta);
946  } else {
947  TEUCHOS_TEST_FOR_EXCEPTION(
948  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
949  "applyBlock: Invalid 'mode' argument. Valid values are "
950  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
951  }
952  }
953 
954  template<class Scalar, class LO, class GO, class Node>
955  void
957  setAllToScalar (const Scalar& alpha)
958  {
959 #ifdef HAVE_TPETRA_DEBUG
960  const char prefix[] = "Tpetra::BlockCrsMatrix::setAllToScalar: ";
961 #endif // HAVE_TPETRA_DEBUG
962 
963  if (this->need_sync_device ()) {
964  // If we need to sync to device, then the data were last
965  // modified on host. In that case, we should again modify them
966  // on host.
967 #ifdef HAVE_TPETRA_DEBUG
968  TEUCHOS_TEST_FOR_EXCEPTION
969  (this->need_sync_host (), std::runtime_error,
970  prefix << "The matrix's values need sync on both device and host.");
971 #endif // HAVE_TPETRA_DEBUG
972  this->modify_host ();
973  Kokkos::deep_copy (getValuesHost (), alpha);
974  }
975  else {
976  // If we need to sync to host, then the data were last modified
977  // on device. In that case, we should again modify them on
978  // device. Also, prefer modifying on device if neither side is
979  // marked as modified.
980  this->modify_device ();
981  Kokkos::deep_copy (this->getValuesDevice (), alpha);
982  }
983  }
984 
985  template<class Scalar, class LO, class GO, class Node>
986  LO
988  replaceLocalValues (const LO localRowInd,
989  const LO colInds[],
990  const Scalar vals[],
991  const LO numColInds) const
992  {
993 #ifdef HAVE_TPETRA_DEBUG
994  const char prefix[] =
995  "Tpetra::BlockCrsMatrix::replaceLocalValues: ";
996 #endif // HAVE_TPETRA_DEBUG
997 
998  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
999  // We modified no values, because the input local row index is
1000  // invalid on the calling process. That may not be an error, if
1001  // numColInds is zero anyway; it doesn't matter. This is the
1002  // advantage of returning the number of valid indices.
1003  return static_cast<LO> (0);
1004  }
1005  const impl_scalar_type* const vIn =
1006  reinterpret_cast<const impl_scalar_type*> (vals);
1007  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1008  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1009  const LO perBlockSize = this->offsetPerBlock ();
1010  LO hint = 0; // Guess for the relative offset into the current row
1011  LO pointOffset = 0; // Current offset into input values
1012  LO validCount = 0; // number of valid column indices in colInds
1013 
1014 #ifdef HAVE_TPETRA_DEBUG
1015  TEUCHOS_TEST_FOR_EXCEPTION
1016  (this->need_sync_host (), std::runtime_error,
1017  prefix << "The matrix's data were last modified on device, but have "
1018  "not been sync'd to host. Please sync to host (by calling "
1019  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1020  "method.");
1021 #endif // HAVE_TPETRA_DEBUG
1022 
1023  auto vals_host_out = getValuesHost ();
1024  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1025 
1026  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1027  const LO relBlockOffset =
1028  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1029  if (relBlockOffset != LINV) {
1030  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1031  // are stored contiguously, with no padding. "Contiguously"
1032  // means that all memory between the first and last entries
1033  // belongs to the block (no striding). "No padding" means
1034  // that getBlockSize() * getBlockSize() is exactly the number
1035  // of entries that the block uses. For another place where
1036  // this assumption is encoded, see sumIntoLocalValues.
1037 
1038  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1039  // little_block_type A_old =
1040  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1041  impl_scalar_type* const A_old =
1042  vals_host_out_raw + absBlockOffset * perBlockSize;
1043  // const_little_block_type A_new =
1044  // getConstLocalBlockFromInput (vIn, pointOffset);
1045  const impl_scalar_type* const A_new = vIn + pointOffset;
1046  // COPY (A_new, A_old);
1047  for (LO i = 0; i < perBlockSize; ++i) {
1048  A_old[i] = A_new[i];
1049  }
1050  hint = relBlockOffset + 1;
1051  ++validCount;
1052  }
1053  }
1054  return validCount;
1055  }
1056 
1057  template <class Scalar, class LO, class GO, class Node>
1058  void
1060  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1061  Kokkos::MemoryUnmanaged>& offsets) const
1062  {
1063  graph_.getLocalDiagOffsets (offsets);
1064  }
1065 
1066 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1067  template <class Scalar, class LO, class GO, class Node>
1068  void TPETRA_DEPRECATED
1070  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1071  {
1072  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1073  // this method in CrsGraph too, so don't call it (otherwise build
1074  // warnings will show up and annoy users). Instead, copy results
1075  // in and out, if the memory space requires it.
1076 
1077  const size_t lclNumRows = graph_.getNodeNumRows ();
1078  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1079  offsets.resize (lclNumRows);
1080  }
1081 
1082  // The input ArrayRCP must always be a host pointer. Thus, if
1083  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1084  // to write to that allocation directly as a Kokkos::View.
1085  typedef typename device_type::memory_space memory_space;
1086  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1087  // It is always syntactically correct to assign a raw host
1088  // pointer to a device View, so this code will compile correctly
1089  // even if this branch never runs.
1090  typedef Kokkos::View<size_t*, device_type,
1091  Kokkos::MemoryUnmanaged> output_type;
1092  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1093  graph_.getLocalDiagOffsets (offsetsOut);
1094  }
1095  else {
1096  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1097  graph_.getLocalDiagOffsets (offsetsTmp);
1098  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1099  Kokkos::MemoryUnmanaged> output_type;
1100  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1101  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1102  }
1103  }
1104 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1105 
1106  template <class Scalar, class LO, class GO, class Node>
1107  void
1111  const Kokkos::View<impl_scalar_type***, device_type,
1112  Kokkos::MemoryUnmanaged>& D_inv,
1113  const Scalar& omega,
1114  const ESweepDirection direction) const
1115  {
1116  using Kokkos::ALL;
1117  const impl_scalar_type zero =
1118  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1119  const impl_scalar_type one =
1120  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1121  const LO numLocalMeshRows =
1122  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1123  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1124 
1125  // If using (new) Kokkos, replace localMem with thread-local
1126  // memory. Note that for larger block sizes, this will affect the
1127  // two-level parallelization. Look to Stokhos for best practice
1128  // on making this fast for GPUs.
1129  const LO blockSize = getBlockSize ();
1130  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1131  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1132  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1133 
1134  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1135  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1136  if (direction == Forward) {
1137  rowBegin = 1;
1138  rowEnd = numLocalMeshRows+1;
1139  rowStride = 1;
1140  }
1141  else if (direction == Backward) {
1142  rowBegin = numLocalMeshRows;
1143  rowEnd = 0;
1144  rowStride = -1;
1145  }
1146  else if (direction == Symmetric) {
1147  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1148  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1149  return;
1150  }
1151 
1152  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1153  const Scalar minus_omega = -omega;
1154 
1155  if (numVecs == 1) {
1156  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1157  const LO actlRow = lclRow - 1;
1158 
1159  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1160  COPY (B_cur, X_lcl);
1161  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1162 
1163  const size_t meshBeg = ptrHost_[actlRow];
1164  const size_t meshEnd = ptrHost_[actlRow+1];
1165  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1166  const LO meshCol = indHost_[absBlkOff];
1167  const_little_block_type A_cur =
1168  getConstLocalBlockFromAbsOffset (absBlkOff);
1169  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1170 
1171  // X_lcl += alpha*A_cur*X_cur
1172  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1173  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1174  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1175  } // for each entry in the current local row of the matrix
1176 
1177  // NOTE (mfh 20 Jan 2016) The two input Views here are
1178  // unmanaged already, so we don't have to take unmanaged
1179  // subviews first.
1180  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1181  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1182  FILL (X_update, zero);
1183  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1184  } // for each local row of the matrix
1185  }
1186  else {
1187  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1188  for (LO j = 0; j < numVecs; ++j) {
1189  LO actlRow = lclRow-1;
1190 
1191  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1192  COPY (B_cur, X_lcl);
1193  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1194 
1195  const size_t meshBeg = ptrHost_[actlRow];
1196  const size_t meshEnd = ptrHost_[actlRow+1];
1197  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1198  const LO meshCol = indHost_[absBlkOff];
1199  const_little_block_type A_cur =
1200  getConstLocalBlockFromAbsOffset (absBlkOff);
1201  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1202 
1203  // X_lcl += alpha*A_cur*X_cur
1204  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1205  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1206  } // for each entry in the current local row of the matrx
1207 
1208  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1209  auto X_update = X.getLocalBlock (actlRow, j);
1210  FILL (X_update, zero);
1211  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1212  } // for each entry in the current local row of the matrix
1213  } // for each local row of the matrix
1214  }
1215  }
1216 
1217  template <class Scalar, class LO, class GO, class Node>
1218  void
1221  const MultiVector<Scalar,LO,GO,Node> &/* B */,
1222  const MultiVector<Scalar,LO,GO,Node> &/* D */,
1223  const Scalar& /* dampingFactor */,
1224  const ESweepDirection /* direction */,
1225  const int /* numSweeps */,
1226  const bool /* zeroInitialGuess */) const
1227  {
1228  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1229  // interface for block Gauss-Seidel.
1230  TEUCHOS_TEST_FOR_EXCEPTION(
1231  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
1232  "gaussSeidelCopy: Not implemented.");
1233  }
1234 
1235  template <class Scalar, class LO, class GO, class Node>
1236  void
1239  const MultiVector<Scalar,LO,GO,Node>& /* B */,
1240  const MultiVector<Scalar,LO,GO,Node>& /* D */,
1241  const Teuchos::ArrayView<LO>& /* rowIndices */,
1242  const Scalar& /* dampingFactor */,
1243  const ESweepDirection /* direction */,
1244  const int /* numSweeps */,
1245  const bool /* zeroInitialGuess */) const
1246  {
1247  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1248  // interface for block Gauss-Seidel.
1249  TEUCHOS_TEST_FOR_EXCEPTION(
1250  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
1251  "reorderedGaussSeidelCopy: Not implemented.");
1252  }
1253 
1254 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1255  template <class Scalar, class LO, class GO, class Node>
1256  void TPETRA_DEPRECATED
1259  const Teuchos::ArrayView<const size_t>& offsets) const
1260  {
1261  using Teuchos::ArrayRCP;
1262  using Teuchos::ArrayView;
1263  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1264 
1265  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1266  const LO* columnIndices;
1267  Scalar* vals;
1268  LO numColumns;
1269  Teuchos::Array<LO> cols(1);
1270 
1271  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1272  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1273  for (size_t i = 0; i < myNumRows; ++i) {
1274  cols[0] = i;
1275  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1276  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1277  }
1278  else {
1279  getLocalRowView (i, columnIndices, vals, numColumns);
1280  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1281  }
1282  }
1283  }
1284 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1285 
1286  template <class Scalar, class LO, class GO, class Node>
1287  void
1289  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1290  Kokkos::MemoryUnmanaged>& diag,
1291  const Kokkos::View<const size_t*, device_type,
1292  Kokkos::MemoryUnmanaged>& offsets) const
1293  {
1294  using Kokkos::parallel_for;
1295  typedef typename device_type::execution_space execution_space;
1296  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1297 
1298  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1299  const LO blockSize = this->getBlockSize ();
1300  TEUCHOS_TEST_FOR_EXCEPTION
1301  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1302  static_cast<LO> (diag.extent (1)) < blockSize ||
1303  static_cast<LO> (diag.extent (2)) < blockSize,
1304  std::invalid_argument, prefix <<
1305  "The input Kokkos::View is not big enough to hold all the data.");
1306  TEUCHOS_TEST_FOR_EXCEPTION
1307  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1308  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1309  "diagonal blocks " << lclNumMeshRows << ".");
1310 
1311 #ifdef HAVE_TPETRA_DEBUG
1312  TEUCHOS_TEST_FOR_EXCEPTION
1313  (this->template need_sync<device_type> (), std::runtime_error,
1314  prefix << "The matrix's data were last modified on host, but have "
1315  "not been sync'd to device. Please sync to device (by calling "
1316  "sync<device_type>() on this matrix) before calling this method.");
1317 #endif // HAVE_TPETRA_DEBUG
1318 
1319  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1320  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1321 
1322  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1323  // we reserve the right to do lazy allocation of device data. (We
1324  // don't plan to do lazy allocation for host data; the host
1325  // version of the data always exists.)
1327  auto vals_dev =
1328  const_cast<this_type*> (this)->template getValues<device_type> ();
1329 
1330  parallel_for (policy_type (0, lclNumMeshRows),
1331  functor_type (diag, vals_dev, offsets,
1332  graph_.getLocalGraph ().row_map, blockSize_));
1333  }
1334 
1335 
1336  template <class Scalar, class LO, class GO, class Node>
1337  void
1339  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1340  Kokkos::MemoryUnmanaged>& diag,
1341  const Teuchos::ArrayView<const size_t>& offsets) const
1342  {
1343  using Kokkos::ALL;
1344  using Kokkos::parallel_for;
1345  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1346  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1347 
1348  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1349  const LO blockSize = this->getBlockSize ();
1350  TEUCHOS_TEST_FOR_EXCEPTION
1351  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1352  static_cast<LO> (diag.extent (1)) < blockSize ||
1353  static_cast<LO> (diag.extent (2)) < blockSize,
1354  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1355  "The input Kokkos::View is not big enough to hold all the data.");
1356  TEUCHOS_TEST_FOR_EXCEPTION
1357  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1358  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1359  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1360  "blocks " << lclNumMeshRows << ".");
1361 
1362  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1363  // quite made everything work with CUDA yet.
1364  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1365  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1366  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1367  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1368  COPY (D_in, D_out);
1369  });
1370  }
1371 
1372 
1373  template<class Scalar, class LO, class GO, class Node>
1374  LO
1376  absMaxLocalValues (const LO localRowInd,
1377  const LO colInds[],
1378  const Scalar vals[],
1379  const LO numColInds) const
1380  {
1381  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1382  // We modified no values, because the input local row index is
1383  // invalid on the calling process. That may not be an error, if
1384  // numColInds is zero anyway; it doesn't matter. This is the
1385  // advantage of returning the number of valid indices.
1386  return static_cast<LO> (0);
1387  }
1388  const impl_scalar_type* const vIn =
1389  reinterpret_cast<const impl_scalar_type*> (vals);
1390  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1391  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1392  const LO perBlockSize = this->offsetPerBlock ();
1393  LO hint = 0; // Guess for the relative offset into the current row
1394  LO pointOffset = 0; // Current offset into input values
1395  LO validCount = 0; // number of valid column indices in colInds
1396 
1397  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1398  const LO relBlockOffset =
1399  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1400  if (relBlockOffset != LINV) {
1401  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1402  little_block_type A_old =
1403  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1404  const_little_block_type A_new =
1405  getConstLocalBlockFromInput (vIn, pointOffset);
1406 
1407  ::Tpetra::Impl::absMax (A_old, A_new);
1408  hint = relBlockOffset + 1;
1409  ++validCount;
1410  }
1411  }
1412  return validCount;
1413  }
1414 
1415 
1416  template<class Scalar, class LO, class GO, class Node>
1417  LO
1419  sumIntoLocalValues (const LO localRowInd,
1420  const LO colInds[],
1421  const Scalar vals[],
1422  const LO numColInds) const
1423  {
1424 #ifdef HAVE_TPETRA_DEBUG
1425  const char prefix[] =
1426  "Tpetra::BlockCrsMatrix::sumIntoLocalValues: ";
1427 #endif // HAVE_TPETRA_DEBUG
1428 
1429  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1430  // We modified no values, because the input local row index is
1431  // invalid on the calling process. That may not be an error, if
1432  // numColInds is zero anyway; it doesn't matter. This is the
1433  // advantage of returning the number of valid indices.
1434  return static_cast<LO> (0);
1435  }
1436  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1437  const impl_scalar_type* const vIn =
1438  reinterpret_cast<const impl_scalar_type*> (vals);
1439  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1440  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1441  const LO perBlockSize = this->offsetPerBlock ();
1442  LO hint = 0; // Guess for the relative offset into the current row
1443  LO pointOffset = 0; // Current offset into input values
1444  LO validCount = 0; // number of valid column indices in colInds
1445 
1446 #ifdef HAVE_TPETRA_DEBUG
1447  TEUCHOS_TEST_FOR_EXCEPTION
1448  (this->need_sync_host (), std::runtime_error,
1449  prefix << "The matrix's data were last modified on device, but have not "
1450  "been sync'd to host. Please sync to host (by calling "
1451  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1452 #endif // HAVE_TPETRA_DEBUG
1453 
1454  auto vals_host_out =
1455  getValuesHost ();
1456  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1457 
1458  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1459  const LO relBlockOffset =
1460  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1461  if (relBlockOffset != LINV) {
1462  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1463  // are stored contiguously, with no padding. "Contiguously"
1464  // means that all memory between the first and last entries
1465  // belongs to the block (no striding). "No padding" means
1466  // that getBlockSize() * getBlockSize() is exactly the number
1467  // of entries that the block uses. For another place where
1468  // this assumption is encoded, see replaceLocalValues.
1469 
1470  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1471  // little_block_type A_old =
1472  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1473  impl_scalar_type* const A_old =
1474  vals_host_out_raw + absBlockOffset * perBlockSize;
1475  // const_little_block_type A_new =
1476  // getConstLocalBlockFromInput (vIn, pointOffset);
1477  const impl_scalar_type* const A_new = vIn + pointOffset;
1478  // AXPY (ONE, A_new, A_old);
1479  for (LO i = 0; i < perBlockSize; ++i) {
1480  A_old[i] += A_new[i];
1481  }
1482  hint = relBlockOffset + 1;
1483  ++validCount;
1484  }
1485  }
1486  return validCount;
1487  }
1488 
1489  template<class Scalar, class LO, class GO, class Node>
1490  LO
1492  getLocalRowView (const LO localRowInd,
1493  const LO*& colInds,
1494  Scalar*& vals,
1495  LO& numInds) const
1496  {
1497 #ifdef HAVE_TPETRA_DEBUG
1498  const char prefix[] =
1499  "Tpetra::BlockCrsMatrix::getLocalRowView: ";
1500 #endif // HAVE_TPETRA_DEBUG
1501 
1502  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1503  colInds = NULL;
1504  vals = NULL;
1505  numInds = 0;
1506  return Teuchos::OrdinalTraits<LO>::invalid ();
1507  }
1508  else {
1509  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1510  colInds = indHost_.data () + absBlockOffsetStart;
1511 
1512 #ifdef HAVE_TPETRA_DEBUG
1513  TEUCHOS_TEST_FOR_EXCEPTION
1514  (this->need_sync_host (), std::runtime_error,
1515  prefix << "The matrix's data were last modified on device, but have "
1516  "not been sync'd to host. Please sync to host (by calling "
1517  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1518  "method.");
1519 #endif // HAVE_TPETRA_DEBUG
1520 
1521  auto vals_host_out = getValuesHost ();
1522  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1523  impl_scalar_type* const vOut = vals_host_out_raw +
1524  absBlockOffsetStart * offsetPerBlock ();
1525  vals = reinterpret_cast<Scalar*> (vOut);
1526 
1527  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1528  return 0; // indicates no error
1529  }
1530  }
1531 
1532  template<class Scalar, class LO, class GO, class Node>
1533  void
1535  getLocalRowCopy (LO LocalRow,
1536  const Teuchos::ArrayView<LO>& Indices,
1537  const Teuchos::ArrayView<Scalar>& Values,
1538  size_t &NumEntries) const
1539  {
1540  const LO *colInds;
1541  Scalar *vals;
1542  LO numInds;
1543  getLocalRowView(LocalRow,colInds,vals,numInds);
1544  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1545  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1546  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1547  << numInds << " row entries");
1548  }
1549  for (LO i=0; i<numInds; ++i) {
1550  Indices[i] = colInds[i];
1551  }
1552  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1553  Values[i] = vals[i];
1554  }
1555  NumEntries = numInds;
1556  }
1557 
1558  template<class Scalar, class LO, class GO, class Node>
1559  LO
1561  getLocalRowOffsets (const LO localRowInd,
1562  ptrdiff_t offsets[],
1563  const LO colInds[],
1564  const LO numColInds) const
1565  {
1566  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1567  // We got no offsets, because the input local row index is
1568  // invalid on the calling process. That may not be an error, if
1569  // numColInds is zero anyway; it doesn't matter. This is the
1570  // advantage of returning the number of valid indices.
1571  return static_cast<LO> (0);
1572  }
1573 
1574  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1575  LO hint = 0; // Guess for the relative offset into the current row
1576  LO validCount = 0; // number of valid column indices in colInds
1577 
1578  for (LO k = 0; k < numColInds; ++k) {
1579  const LO relBlockOffset =
1580  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1581  if (relBlockOffset != LINV) {
1582  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1583  hint = relBlockOffset + 1;
1584  ++validCount;
1585  }
1586  }
1587  return validCount;
1588  }
1589 
1590 
1591  template<class Scalar, class LO, class GO, class Node>
1592  LO
1594  replaceLocalValuesByOffsets (const LO localRowInd,
1595  const ptrdiff_t offsets[],
1596  const Scalar vals[],
1597  const LO numOffsets) const
1598  {
1599  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1600  // We modified no values, because the input local row index is
1601  // invalid on the calling process. That may not be an error, if
1602  // numColInds is zero anyway; it doesn't matter. This is the
1603  // advantage of returning the number of valid indices.
1604  return static_cast<LO> (0);
1605  }
1606  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1607 
1608  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1609  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1610  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1611  size_t pointOffset = 0; // Current offset into input values
1612  LO validCount = 0; // number of valid offsets
1613 
1614  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1615  const size_t relBlockOffset = offsets[k];
1616  if (relBlockOffset != STINV) {
1617  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1618  little_block_type A_old =
1619  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1620  const_little_block_type A_new =
1621  getConstLocalBlockFromInput (vIn, pointOffset);
1622  COPY (A_new, A_old);
1623  ++validCount;
1624  }
1625  }
1626  return validCount;
1627  }
1628 
1629 
1630  template<class Scalar, class LO, class GO, class Node>
1631  LO
1633  absMaxLocalValuesByOffsets (const LO localRowInd,
1634  const ptrdiff_t offsets[],
1635  const Scalar vals[],
1636  const LO numOffsets) const
1637  {
1638  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1639  // We modified no values, because the input local row index is
1640  // invalid on the calling process. That may not be an error, if
1641  // numColInds is zero anyway; it doesn't matter. This is the
1642  // advantage of returning the number of valid indices.
1643  return static_cast<LO> (0);
1644  }
1645  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1646 
1647  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1648  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1649  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1650  size_t pointOffset = 0; // Current offset into input values
1651  LO validCount = 0; // number of valid offsets
1652 
1653  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1654  const size_t relBlockOffset = offsets[k];
1655  if (relBlockOffset != STINV) {
1656  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1657  little_block_type A_old =
1658  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1659  const_little_block_type A_new =
1660  getConstLocalBlockFromInput (vIn, pointOffset);
1661  ::Tpetra::Impl::absMax (A_old, A_new);
1662  ++validCount;
1663  }
1664  }
1665  return validCount;
1666  }
1667 
1668 
1669  template<class Scalar, class LO, class GO, class Node>
1670  LO
1672  sumIntoLocalValuesByOffsets (const LO localRowInd,
1673  const ptrdiff_t offsets[],
1674  const Scalar vals[],
1675  const LO numOffsets) const
1676  {
1677  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1678  // We modified no values, because the input local row index is
1679  // invalid on the calling process. That may not be an error, if
1680  // numColInds is zero anyway; it doesn't matter. This is the
1681  // advantage of returning the number of valid indices.
1682  return static_cast<LO> (0);
1683  }
1684  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1685  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1686 
1687  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1688  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1689  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1690  size_t pointOffset = 0; // Current offset into input values
1691  LO validCount = 0; // number of valid offsets
1692 
1693  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1694  const size_t relBlockOffset = offsets[k];
1695  if (relBlockOffset != STINV) {
1696  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1697  little_block_type A_old =
1698  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1699  const_little_block_type A_new =
1700  getConstLocalBlockFromInput (vIn, pointOffset);
1701  //A_old.update (ONE, A_new);
1702  AXPY (ONE, A_new, A_old);
1703  ++validCount;
1704  }
1705  }
1706  return validCount;
1707  }
1708 
1709 
1710  template<class Scalar, class LO, class GO, class Node>
1711  size_t
1713  getNumEntriesInLocalRow (const LO localRowInd) const
1714  {
1715  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1716  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1717  return static_cast<LO> (0); // the calling process doesn't have that row
1718  } else {
1719  return static_cast<LO> (numEntInGraph);
1720  }
1721  }
1722 
1723  template<class Scalar, class LO, class GO, class Node>
1724  void
1728  const Teuchos::ETransp mode,
1729  const Scalar alpha,
1730  const Scalar beta)
1731  {
1732  (void) X;
1733  (void) Y;
1734  (void) mode;
1735  (void) alpha;
1736  (void) beta;
1737 
1738  TEUCHOS_TEST_FOR_EXCEPTION(
1739  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
1740  "transpose and conjugate transpose modes are not yet implemented.");
1741  }
1742 
1743  template<class Scalar, class LO, class GO, class Node>
1744  void
1745  BlockCrsMatrix<Scalar, LO, GO, Node>::
1746  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1747  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1748  const Scalar alpha,
1749  const Scalar beta)
1750  {
1751  using Teuchos::RCP;
1752  using Teuchos::rcp;
1753  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1754  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1755  const Scalar zero = STS::zero ();
1756  const Scalar one = STS::one ();
1757  RCP<const import_type> import = graph_.getImporter ();
1758  // "export" is a reserved C++ keyword, so we can't use it.
1759  RCP<const export_type> theExport = graph_.getExporter ();
1760  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1761 
1762  if (alpha == zero) {
1763  if (beta == zero) {
1764  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1765  }
1766  else if (beta != one) {
1767  Y.scale (beta);
1768  }
1769  }
1770  else { // alpha != 0
1771  const BMV* X_colMap = NULL;
1772  if (import.is_null ()) {
1773  try {
1774  X_colMap = &X;
1775  }
1776  catch (std::exception& e) {
1777  TEUCHOS_TEST_FOR_EXCEPTION
1778  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1779  "operator= threw an exception: " << std::endl << e.what ());
1780  }
1781  }
1782  else {
1783  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1784  // Y_rowMap_ below. This lets us do lazy initialization
1785  // correctly with view semantics of BlockCrsMatrix. All views
1786  // of this BlockCrsMatrix have the same outer pointer. That
1787  // way, we can set the inner pointer in one view, and all
1788  // other views will see it.
1789  if ((*X_colMap_).is_null () ||
1790  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1791  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1792  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1793  static_cast<LO> (X.getNumVectors ())));
1794  }
1795  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1796  **pointImporter_,
1797  ::Tpetra::REPLACE);
1798  try {
1799  X_colMap = &(**X_colMap_);
1800  }
1801  catch (std::exception& e) {
1802  TEUCHOS_TEST_FOR_EXCEPTION
1803  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1804  "operator= threw an exception: " << std::endl << e.what ());
1805  }
1806  }
1807 
1808  BMV* Y_rowMap = NULL;
1809  if (theExport.is_null ()) {
1810  Y_rowMap = &Y;
1811  }
1812  else if ((*Y_rowMap_).is_null () ||
1813  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1814  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1815  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1816  static_cast<LO> (X.getNumVectors ())));
1817  try {
1818  Y_rowMap = &(**Y_rowMap_);
1819  }
1820  catch (std::exception& e) {
1821  TEUCHOS_TEST_FOR_EXCEPTION(
1822  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1823  "operator= threw an exception: " << std::endl << e.what ());
1824  }
1825  }
1826 
1827  try {
1828  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1829  }
1830  catch (std::exception& e) {
1831  TEUCHOS_TEST_FOR_EXCEPTION
1832  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1833  "an exception: " << e.what ());
1834  }
1835  catch (...) {
1836  TEUCHOS_TEST_FOR_EXCEPTION
1837  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1838  "an exception not a subclass of std::exception.");
1839  }
1840 
1841  if (! theExport.is_null ()) {
1842  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1843  }
1844  }
1845  }
1846 
1847  template<class Scalar, class LO, class GO, class Node>
1848  void
1849  BlockCrsMatrix<Scalar, LO, GO, Node>::
1850  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1851  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1852  const Scalar alpha,
1853  const Scalar beta)
1854  {
1855  using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1856 
1857  const impl_scalar_type alpha_impl = alpha;
1858  const auto graph = this->graph_.getLocalGraph ();
1859  const impl_scalar_type beta_impl = beta;
1860  const LO blockSize = this->getBlockSize ();
1861 
1862  auto X_mv = X.getMultiVectorView ();
1863  auto Y_mv = Y.getMultiVectorView ();
1864  Y_mv.template modify<device_type> ();
1865 
1866  auto X_lcl = X_mv.template getLocalView<device_type> ();
1867  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1868  auto val = this->val_.template view<device_type> ();
1869 
1870  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1871  beta_impl, Y_lcl);
1872  }
1873 
1874  template<class Scalar, class LO, class GO, class Node>
1875  LO
1876  BlockCrsMatrix<Scalar, LO, GO, Node>::
1877  findRelOffsetOfColumnIndex (const LO localRowIndex,
1878  const LO colIndexToFind,
1879  const LO hint) const
1880  {
1881  const size_t absStartOffset = ptrHost_[localRowIndex];
1882  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1883  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1884  // Amortize pointer arithmetic over the search loop.
1885  const LO* const curInd = indHost_.data () + absStartOffset;
1886 
1887  // If the hint was correct, then the hint is the offset to return.
1888  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1889  // Always return the offset relative to the current row.
1890  return hint;
1891  }
1892 
1893  // The hint was wrong, so we must search for the given column
1894  // index in the column indices for the given row.
1895  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1896 
1897  // We require that the graph have sorted rows. However, binary
1898  // search only pays if the current row is longer than a certain
1899  // amount. We set this to 32, but you might want to tune this.
1900  const LO maxNumEntriesForLinearSearch = 32;
1901  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1902  // Use binary search. It would probably be better for us to
1903  // roll this loop by hand. If we wrote it right, a smart
1904  // compiler could perhaps use conditional loads and avoid
1905  // branches (according to Jed Brown on May 2014).
1906  const LO* beg = curInd;
1907  const LO* end = curInd + numEntriesInRow;
1908  std::pair<const LO*, const LO*> p =
1909  std::equal_range (beg, end, colIndexToFind);
1910  if (p.first != p.second) {
1911  // offset is relative to the current row
1912  relOffset = static_cast<LO> (p.first - beg);
1913  }
1914  }
1915  else { // use linear search
1916  for (LO k = 0; k < numEntriesInRow; ++k) {
1917  if (colIndexToFind == curInd[k]) {
1918  relOffset = k; // offset is relative to the current row
1919  break;
1920  }
1921  }
1922  }
1923 
1924  return relOffset;
1925  }
1926 
1927  template<class Scalar, class LO, class GO, class Node>
1928  LO
1929  BlockCrsMatrix<Scalar, LO, GO, Node>::
1930  offsetPerBlock () const
1931  {
1932  return offsetPerBlock_;
1933  }
1934 
1935  template<class Scalar, class LO, class GO, class Node>
1937  BlockCrsMatrix<Scalar, LO, GO, Node>::
1938  getConstLocalBlockFromInput (const impl_scalar_type* val,
1939  const size_t pointOffset) const
1940  {
1941  // Row major blocks
1942  const LO rowStride = blockSize_;
1943  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1944  }
1945 
1946  template<class Scalar, class LO, class GO, class Node>
1948  BlockCrsMatrix<Scalar, LO, GO, Node>::
1949  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1950  const size_t pointOffset) const
1951  {
1952  // Row major blocks
1953  const LO rowStride = blockSize_;
1954  return little_block_type (val + pointOffset, blockSize_, rowStride);
1955  }
1956 
1957  template<class Scalar, class LO, class GO, class Node>
1959  BlockCrsMatrix<Scalar, LO, GO, Node>::
1960  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1961  {
1962 #ifdef HAVE_TPETRA_DEBUG
1963  const char prefix[] =
1964  "Tpetra::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
1965 #endif // HAVE_TPETRA_DEBUG
1966 
1967  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1968  // An empty block signifies an error. We don't expect to see
1969  // this error in correct code, but it's helpful for avoiding
1970  // memory corruption in case there is a bug.
1971  return const_little_block_type ();
1972  }
1973  else {
1974 #ifdef HAVE_TPETRA_DEBUG
1975  TEUCHOS_TEST_FOR_EXCEPTION
1976  (this->need_sync_host (), std::runtime_error,
1977  prefix << "The matrix's data were last modified on device, but have "
1978  "not been sync'd to host. Please sync to host (by calling "
1979  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1980  "method.");
1981 #endif // HAVE_TPETRA_DEBUG
1982  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1983 
1984  auto vals_host = getValuesHost ();
1985  const impl_scalar_type* vals_host_raw = vals_host.data ();
1986 
1987  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
1988  }
1989  }
1990 
1991  template<class Scalar, class LO, class GO, class Node>
1993  BlockCrsMatrix<Scalar, LO, GO, Node>::
1994  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
1995  const size_t relMeshOffset) const
1996  {
1997  typedef impl_scalar_type IST;
1998 
1999  const LO* lclColInds = NULL;
2000  Scalar* lclVals = NULL;
2001  LO numEnt = 0;
2002 
2003  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
2004  if (err != 0) {
2005  // An empty block signifies an error. We don't expect to see
2006  // this error in correct code, but it's helpful for avoiding
2007  // memory corruption in case there is a bug.
2008  return const_little_block_type ();
2009  }
2010  else {
2011  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
2012  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
2013  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
2014  relPointOffset);
2015  }
2016  }
2017 
2018  template<class Scalar, class LO, class GO, class Node>
2020  BlockCrsMatrix<Scalar, LO, GO, Node>::
2021  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2022  {
2023 #ifdef HAVE_TPETRA_DEBUG
2024  const char prefix[] =
2025  "Tpetra::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
2026 #endif // HAVE_TPETRA_DEBUG
2027 
2028  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2029  // An empty block signifies an error. We don't expect to see
2030  // this error in correct code, but it's helpful for avoiding
2031  // memory corruption in case there is a bug.
2032  return little_block_type ();
2033  }
2034  else {
2035  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2036 #ifdef HAVE_TPETRA_DEBUG
2037  TEUCHOS_TEST_FOR_EXCEPTION
2038  (this->need_sync_host (), std::runtime_error,
2039  prefix << "The matrix's data were last modified on device, but have "
2040  "not been sync'd to host. Please sync to host (by calling "
2041  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2042  "method.");
2043 #endif // HAVE_TPETRA_DEBUG
2044  auto vals_host = getValuesHost();
2045  impl_scalar_type* vals_host_raw = vals_host.data ();
2046  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2047  }
2048  }
2049 
2050  template<class Scalar, class LO, class GO, class Node>
2052  BlockCrsMatrix<Scalar, LO, GO, Node>::
2053  getLocalBlock (const LO localRowInd, const LO localColInd) const
2054  {
2055  const size_t absRowBlockOffset = ptrHost_[localRowInd];
2056  const LO relBlockOffset =
2057  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
2058 
2059  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
2060  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
2061  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
2062  }
2063  else {
2064  return little_block_type ();
2065  }
2066  }
2067 
2068  // template<class Scalar, class LO, class GO, class Node>
2069  // void
2070  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2071  // clearLocalErrorStateAndStream ()
2072  // {
2073  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2074  // * (const_cast<this_type*> (this)->localError_) = false;
2075  // *errs_ = Teuchos::null;
2076  // }
2077 
2078  template<class Scalar, class LO, class GO, class Node>
2079  bool
2080  BlockCrsMatrix<Scalar, LO, GO, Node>::
2081  checkSizes (const ::Tpetra::SrcDistObject& source)
2082  {
2083  using std::endl;
2084  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2085  const this_type* src = dynamic_cast<const this_type* > (&source);
2086 
2087  if (src == NULL) {
2088  std::ostream& err = markLocalErrorAndGetStream ();
2089  err << "checkSizes: The source object of the Import or Export "
2090  "must be a BlockCrsMatrix with the same template parameters as the "
2091  "target object." << endl;
2092  }
2093  else {
2094  // Use a string of ifs, not if-elseifs, because we want to know
2095  // all the errors.
2096  if (src->getBlockSize () != this->getBlockSize ()) {
2097  std::ostream& err = markLocalErrorAndGetStream ();
2098  err << "checkSizes: The source and target objects of the Import or "
2099  << "Export must have the same block sizes. The source's block "
2100  << "size = " << src->getBlockSize () << " != the target's block "
2101  << "size = " << this->getBlockSize () << "." << endl;
2102  }
2103  if (! src->graph_.isFillComplete ()) {
2104  std::ostream& err = markLocalErrorAndGetStream ();
2105  err << "checkSizes: The source object of the Import or Export is "
2106  "not fill complete. Both source and target objects must be fill "
2107  "complete." << endl;
2108  }
2109  if (! this->graph_.isFillComplete ()) {
2110  std::ostream& err = markLocalErrorAndGetStream ();
2111  err << "checkSizes: The target object of the Import or Export is "
2112  "not fill complete. Both source and target objects must be fill "
2113  "complete." << endl;
2114  }
2115  if (src->graph_.getColMap ().is_null ()) {
2116  std::ostream& err = markLocalErrorAndGetStream ();
2117  err << "checkSizes: The source object of the Import or Export does "
2118  "not have a column Map. Both source and target objects must have "
2119  "column Maps." << endl;
2120  }
2121  if (this->graph_.getColMap ().is_null ()) {
2122  std::ostream& err = markLocalErrorAndGetStream ();
2123  err << "checkSizes: The target object of the Import or Export does "
2124  "not have a column Map. Both source and target objects must have "
2125  "column Maps." << endl;
2126  }
2127  }
2128 
2129  return ! (* (this->localError_));
2130  }
2131 
2132  template<class Scalar, class LO, class GO, class Node>
2133  void
2134  BlockCrsMatrix<Scalar, LO, GO, Node>::
2135 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2136  copyAndPermuteNew
2137 #else // TPETRA_ENABLE_DEPRECATED_CODE
2138  copyAndPermute
2139 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2140  (const ::Tpetra::SrcDistObject& source,
2141  const size_t numSameIDs,
2142  const Kokkos::DualView<const local_ordinal_type*,
2143  buffer_device_type>& permuteToLIDs,
2144  const Kokkos::DualView<const local_ordinal_type*,
2145  buffer_device_type>& permuteFromLIDs)
2146  {
2147  using ::Tpetra::Details::Behavior;
2149  using ::Tpetra::Details::ProfilingRegion;
2150  using std::endl;
2151  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2152 
2153  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
2154  const bool debug = Behavior::debug();
2155  const bool verbose = Behavior::verbose();
2156 
2157  // Define this function prefix
2158  std::string prefix;
2159  {
2160  std::ostringstream os;
2161  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2162  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
2163  prefix = os.str();
2164  }
2165 
2166  // check if this already includes a local error
2167  if (* (this->localError_)) {
2168  std::ostream& err = this->markLocalErrorAndGetStream ();
2169  err << prefix
2170  << "The target object of the Import or Export is already in an error state."
2171  << endl;
2172  return;
2173  }
2174 
2175  //
2176  // Verbose input dual view status
2177  //
2178  if (verbose) {
2179  std::ostringstream os;
2180  os << prefix << endl
2181  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
2182  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
2183  std::cerr << os.str ();
2184  }
2185 
2189  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
2190  std::ostream& err = this->markLocalErrorAndGetStream ();
2191  err << prefix
2192  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
2193  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
2194  << "." << endl;
2195  return;
2196  }
2197  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
2198  std::ostream& err = this->markLocalErrorAndGetStream ();
2199  err << prefix
2200  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
2201  << endl;
2202  return;
2203  }
2204 
2205  const this_type* src = dynamic_cast<const this_type* > (&source);
2206  if (src == NULL) {
2207  std::ostream& err = this->markLocalErrorAndGetStream ();
2208  err << prefix
2209  << "The source (input) object of the Import or "
2210  "Export is either not a BlockCrsMatrix, or does not have the right "
2211  "template parameters. checkSizes() should have caught this. "
2212  "Please report this bug to the Tpetra developers." << endl;
2213  return;
2214  }
2215  else {
2216  // Kyungjoo: where is val_ modified ?
2217  // When we have dual view as a member variable,
2218  // which function should make sure the val_ is upto date ?
2219  // IMO, wherever it is used, the function should check its
2220  // availability.
2221  const_cast<this_type*>(src)->sync_host();
2222  }
2223  this->sync_host();
2224 
2225  bool lclErr = false;
2226 #ifdef HAVE_TPETRA_DEBUG
2227  std::set<LO> invalidSrcCopyRows;
2228  std::set<LO> invalidDstCopyRows;
2229  std::set<LO> invalidDstCopyCols;
2230  std::set<LO> invalidDstPermuteCols;
2231  std::set<LO> invalidPermuteFromRows;
2232 #endif // HAVE_TPETRA_DEBUG
2233 
2234  // Copy the initial sequence of rows that are the same.
2235  //
2236  // The two graphs might have different column Maps, so we need to
2237  // do this using global column indices. This is purely local, so
2238  // we only need to check for local sameness of the two column
2239  // Maps.
2240 
2241 #ifdef HAVE_TPETRA_DEBUG
2242  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2243 #endif // HAVE_TPETRA_DEBUG
2244  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2245  const map_type& srcColMap = * (src->graph_.getColMap ());
2246  const map_type& dstColMap = * (this->graph_.getColMap ());
2247  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2248 
2249  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
2250  if (verbose) {
2251  std::ostringstream os;
2252  os << prefix
2253  << "canUseLocalColumnIndices: "
2254  << (canUseLocalColumnIndices ? "true" : "false")
2255  << ", numPermute: " << numPermute
2256  << endl;
2257  std::cerr << os.str ();
2258  }
2259 
2260  const auto permuteToLIDsHost = permuteToLIDs.view_host();
2261  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
2262 
2263  if (canUseLocalColumnIndices) {
2264  // Copy local rows that are the "same" in both source and target.
2265  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2266 #ifdef HAVE_TPETRA_DEBUG
2267  if (! srcRowMap.isNodeLocalElement (localRow)) {
2268  lclErr = true;
2269  invalidSrcCopyRows.insert (localRow);
2270  continue; // skip invalid rows
2271  }
2272 #endif // HAVE_TPETRA_DEBUG
2273 
2274  const LO* lclSrcCols;
2275  Scalar* vals;
2276  LO numEntries;
2277  // If this call fails, that means the mesh row local index is
2278  // invalid. That means the Import or Export is invalid somehow.
2279  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2280  if (err != 0) {
2281  lclErr = true;
2282 #ifdef HAVE_TPETRA_DEBUG
2283  (void) invalidSrcCopyRows.insert (localRow);
2284 #endif // HAVE_TPETRA_DEBUG
2285  }
2286  else {
2287  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2288  if (err != numEntries) {
2289  lclErr = true;
2290  if (! dstRowMap.isNodeLocalElement (localRow)) {
2291 #ifdef HAVE_TPETRA_DEBUG
2292  invalidDstCopyRows.insert (localRow);
2293 #endif // HAVE_TPETRA_DEBUG
2294  }
2295  else {
2296  // Once there's an error, there's no sense in saving
2297  // time, so we check whether the column indices were
2298  // invalid. However, only remember which ones were
2299  // invalid in a debug build, because that might take a
2300  // lot of space.
2301  for (LO k = 0; k < numEntries; ++k) {
2302  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2303  lclErr = true;
2304 #ifdef HAVE_TPETRA_DEBUG
2305  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2306 #endif // HAVE_TPETRA_DEBUG
2307  }
2308  }
2309  }
2310  }
2311  }
2312  } // for each "same" local row
2313 
2314  // Copy the "permute" local rows.
2315  for (size_t k = 0; k < numPermute; ++k) {
2316  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2317  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2318 
2319  const LO* lclSrcCols;
2320  Scalar* vals;
2321  LO numEntries;
2322  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2323  if (err != 0) {
2324  lclErr = true;
2325 #ifdef HAVE_TPETRA_DEBUG
2326  invalidPermuteFromRows.insert (srcLclRow);
2327 #endif // HAVE_TPETRA_DEBUG
2328  }
2329  else {
2330  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2331  if (err != numEntries) {
2332  lclErr = true;
2333 #ifdef HAVE_TPETRA_DEBUG
2334  for (LO c = 0; c < numEntries; ++c) {
2335  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2336  invalidDstPermuteCols.insert (lclSrcCols[c]);
2337  }
2338  }
2339 #endif // HAVE_TPETRA_DEBUG
2340  }
2341  }
2342  }
2343  }
2344  else { // must convert column indices to global
2345  // Reserve space to store the destination matrix's local column indices.
2346  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2347  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2348 
2349  // Copy local rows that are the "same" in both source and target.
2350  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2351  const LO* lclSrcCols;
2352  Scalar* vals;
2353  LO numEntries;
2354  // If this call fails, that means the mesh row local index is
2355  // invalid. That means the Import or Export is invalid somehow.
2356  LO err = 0;
2357  try {
2358  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2359  } catch (std::exception& e) {
2360  if (debug) {
2361  std::ostringstream os;
2362  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2363  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2364  << localRow << ", src->getLocalRowView() threw an exception: "
2365  << e.what ();
2366  std::cerr << os.str ();
2367  }
2368  throw e;
2369  }
2370 
2371  if (err != 0) {
2372  lclErr = true;
2373 #ifdef HAVE_TPETRA_DEBUG
2374  invalidSrcCopyRows.insert (localRow);
2375 #endif // HAVE_TPETRA_DEBUG
2376  }
2377  else {
2378  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2379  lclErr = true;
2380  if (debug) {
2381  std::ostringstream os;
2382  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2383  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2384  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2385  << maxNumEnt << endl;
2386  std::cerr << os.str ();
2387  }
2388  }
2389  else {
2390  // Convert the source matrix's local column indices to the
2391  // destination matrix's local column indices.
2392  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2393  for (LO j = 0; j < numEntries; ++j) {
2394  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2395  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2396  lclErr = true;
2397 #ifdef HAVE_TPETRA_DEBUG
2398  invalidDstCopyCols.insert (lclDstColsView[j]);
2399 #endif // HAVE_TPETRA_DEBUG
2400  }
2401  }
2402  try {
2403  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2404  } catch (std::exception& e) {
2405  if (debug) {
2406  std::ostringstream os;
2407  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2408  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2409  << localRow << ", this->replaceLocalValues() threw an exception: "
2410  << e.what ();
2411  std::cerr << os.str ();
2412  }
2413  throw e;
2414  }
2415  if (err != numEntries) {
2416  lclErr = true;
2417  if (debug) {
2418  std::ostringstream os;
2419  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2420  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2421  "localRow " << localRow << ", this->replaceLocalValues "
2422  "returned " << err << " instead of numEntries = "
2423  << numEntries << endl;
2424  std::cerr << os.str ();
2425  }
2426  }
2427  }
2428  }
2429  }
2430 
2431  // Copy the "permute" local rows.
2432  for (size_t k = 0; k < numPermute; ++k) {
2433  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2434  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2435 
2436  const LO* lclSrcCols;
2437  Scalar* vals;
2438  LO numEntries;
2439  LO err = 0;
2440  try {
2441  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2442  } catch (std::exception& e) {
2443  if (debug) {
2444  std::ostringstream os;
2445  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2446  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2447  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2448  << ", src->getLocalRowView() threw an exception: " << e.what ();
2449  std::cerr << os.str ();
2450  }
2451  throw e;
2452  }
2453 
2454  if (err != 0) {
2455  lclErr = true;
2456 #ifdef HAVE_TPETRA_DEBUG
2457  invalidPermuteFromRows.insert (srcLclRow);
2458 #endif // HAVE_TPETRA_DEBUG
2459  }
2460  else {
2461  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2462  lclErr = true;
2463  }
2464  else {
2465  // Convert the source matrix's local column indices to the
2466  // destination matrix's local column indices.
2467  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2468  for (LO j = 0; j < numEntries; ++j) {
2469  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2470  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2471  lclErr = true;
2472 #ifdef HAVE_TPETRA_DEBUG
2473  invalidDstPermuteCols.insert (lclDstColsView[j]);
2474 #endif // HAVE_TPETRA_DEBUG
2475  }
2476  }
2477  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2478  if (err != numEntries) {
2479  lclErr = true;
2480  }
2481  }
2482  }
2483  }
2484  }
2485 
2486  if (lclErr) {
2487  std::ostream& err = this->markLocalErrorAndGetStream ();
2488 #ifdef HAVE_TPETRA_DEBUG
2489  err << "copyAndPermute: The graph structure of the source of the "
2490  "Import or Export must be a subset of the graph structure of the "
2491  "target. ";
2492  err << "invalidSrcCopyRows = [";
2493  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2494  it != invalidSrcCopyRows.end (); ++it) {
2495  err << *it;
2496  typename std::set<LO>::const_iterator itp1 = it;
2497  itp1++;
2498  if (itp1 != invalidSrcCopyRows.end ()) {
2499  err << ",";
2500  }
2501  }
2502  err << "], invalidDstCopyRows = [";
2503  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2504  it != invalidDstCopyRows.end (); ++it) {
2505  err << *it;
2506  typename std::set<LO>::const_iterator itp1 = it;
2507  itp1++;
2508  if (itp1 != invalidDstCopyRows.end ()) {
2509  err << ",";
2510  }
2511  }
2512  err << "], invalidDstCopyCols = [";
2513  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2514  it != invalidDstCopyCols.end (); ++it) {
2515  err << *it;
2516  typename std::set<LO>::const_iterator itp1 = it;
2517  itp1++;
2518  if (itp1 != invalidDstCopyCols.end ()) {
2519  err << ",";
2520  }
2521  }
2522  err << "], invalidDstPermuteCols = [";
2523  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2524  it != invalidDstPermuteCols.end (); ++it) {
2525  err << *it;
2526  typename std::set<LO>::const_iterator itp1 = it;
2527  itp1++;
2528  if (itp1 != invalidDstPermuteCols.end ()) {
2529  err << ",";
2530  }
2531  }
2532  err << "], invalidPermuteFromRows = [";
2533  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2534  it != invalidPermuteFromRows.end (); ++it) {
2535  err << *it;
2536  typename std::set<LO>::const_iterator itp1 = it;
2537  itp1++;
2538  if (itp1 != invalidPermuteFromRows.end ()) {
2539  err << ",";
2540  }
2541  }
2542  err << "]" << endl;
2543 #else
2544  err << "copyAndPermute: The graph structure of the source of the "
2545  "Import or Export must be a subset of the graph structure of the "
2546  "target." << endl;
2547 #endif // HAVE_TPETRA_DEBUG
2548  }
2549 
2550  if (debug) {
2551  std::ostringstream os;
2552  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2553  const bool lclSuccess = ! (* (this->localError_));
2554  os << "*** Proc " << myRank << ": copyAndPermute "
2555  << (lclSuccess ? "succeeded" : "FAILED");
2556  if (lclSuccess) {
2557  os << endl;
2558  } else {
2559  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2560  }
2561  std::cerr << os.str ();
2562  }
2563  }
2564 
2565  namespace { // (anonymous)
2566 
2585  template<class LO, class GO, class D>
2586  size_t
2587  packRowCount (const size_t numEnt,
2588  const size_t numBytesPerValue,
2589  const size_t blkSize)
2590  {
2591  using ::Tpetra::Details::PackTraits;
2592 
2593  if (numEnt == 0) {
2594  // Empty rows always take zero bytes, to ensure sparsity.
2595  return 0;
2596  }
2597  else {
2598  // We store the number of entries as a local index (LO).
2599  LO numEntLO = 0; // packValueCount wants this.
2600  GO gid;
2601  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2602  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2603  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2604  return numEntLen + gidsLen + valsLen;
2605  }
2606  }
2607 
2618  template<class ST, class LO, class GO, class D>
2619  size_t
2620  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2621  const size_t offset,
2622  const size_t numBytes,
2623  const size_t /* numBytesPerValue */)
2624  {
2625  using Kokkos::subview;
2626  using ::Tpetra::Details::PackTraits;
2627 
2628  if (numBytes == 0) {
2629  // Empty rows always take zero bytes, to ensure sparsity.
2630  return static_cast<size_t> (0);
2631  }
2632  else {
2633  LO numEntLO = 0;
2634  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2635  TEUCHOS_TEST_FOR_EXCEPTION
2636  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2637  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2638  << ".");
2639  const char* const inBuf = imports.data () + offset;
2640  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2641  TEUCHOS_TEST_FOR_EXCEPTION
2642  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2643  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2644  << ".");
2645  return static_cast<size_t> (numEntLO);
2646  }
2647  }
2648 
2652  template<class ST, class LO, class GO, class D>
2653  size_t
2654  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO, D>::output_buffer_type exports,
2655  const size_t offset,
2656  const size_t numEnt,
2657  const typename ::Tpetra::Details::PackTraits<GO, D>::input_array_type& gidsIn,
2658  const typename ::Tpetra::Details::PackTraits<ST, D>::input_array_type& valsIn,
2659  const size_t numBytesPerValue,
2660  const size_t blockSize)
2661  {
2662  using Kokkos::subview;
2663  using ::Tpetra::Details::PackTraits;
2664 
2665  if (numEnt == 0) {
2666  // Empty rows always take zero bytes, to ensure sparsity.
2667  return 0;
2668  }
2669  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2670 
2671  const GO gid = 0; // packValueCount wants this
2672  const LO numEntLO = static_cast<size_t> (numEnt);
2673 
2674  const size_t numEntBeg = offset;
2675  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2676  const size_t gidsBeg = numEntBeg + numEntLen;
2677  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2678  const size_t valsBeg = gidsBeg + gidsLen;
2679  const size_t valsLen = numScalarEnt * numBytesPerValue;
2680 
2681  char* const numEntOut = exports.data () + numEntBeg;
2682  char* const gidsOut = exports.data () + gidsBeg;
2683  char* const valsOut = exports.data () + valsBeg;
2684 
2685  size_t numBytesOut = 0;
2686  int errorCode = 0;
2687  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2688 
2689  {
2690  Kokkos::pair<int, size_t> p;
2691  p = PackTraits<GO, D>::packArray (gidsOut, gidsIn.data (), numEnt);
2692  errorCode += p.first;
2693  numBytesOut += p.second;
2694 
2695  p = PackTraits<ST, D>::packArray (valsOut, valsIn.data (), numScalarEnt);
2696  errorCode += p.first;
2697  numBytesOut += p.second;
2698  }
2699 
2700  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2701  TEUCHOS_TEST_FOR_EXCEPTION
2702  (numBytesOut != expectedNumBytes, std::logic_error,
2703  "packRowForBlockCrs: numBytesOut = " << numBytesOut
2704  << " != expectedNumBytes = " << expectedNumBytes << ".");
2705 
2706  TEUCHOS_TEST_FOR_EXCEPTION
2707  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2708  "PackTraits::packArray returned a nonzero error code");
2709 
2710  return numBytesOut;
2711  }
2712 
2713  // Return the number of bytes actually read / used.
2714  template<class ST, class LO, class GO, class D>
2715  size_t
2716  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2717  const typename ::Tpetra::Details::PackTraits<ST, D>::output_array_type& valsOut,
2718  const typename ::Tpetra::Details::PackTraits<int, D>::input_buffer_type& imports,
2719  const size_t offset,
2720  const size_t numBytes,
2721  const size_t numEnt,
2722  const size_t numBytesPerValue,
2723  const size_t blockSize)
2724  {
2725  using ::Tpetra::Details::PackTraits;
2726 
2727  if (numBytes == 0) {
2728  // Rows with zero bytes always have zero entries.
2729  return 0;
2730  }
2731  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2732  TEUCHOS_TEST_FOR_EXCEPTION
2733  (static_cast<size_t> (imports.extent (0)) <= offset,
2734  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2735  << imports.extent (0) << " <= offset = " << offset << ".");
2736  TEUCHOS_TEST_FOR_EXCEPTION
2737  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2738  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2739  << imports.extent (0) << " < offset + numBytes = "
2740  << (offset + numBytes) << ".");
2741 
2742  const GO gid = 0; // packValueCount wants this
2743  const LO lid = 0; // packValueCount wants this
2744 
2745  const size_t numEntBeg = offset;
2746  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2747  const size_t gidsBeg = numEntBeg + numEntLen;
2748  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2749  const size_t valsBeg = gidsBeg + gidsLen;
2750  const size_t valsLen = numScalarEnt * numBytesPerValue;
2751 
2752  const char* const numEntIn = imports.data () + numEntBeg;
2753  const char* const gidsIn = imports.data () + gidsBeg;
2754  const char* const valsIn = imports.data () + valsBeg;
2755 
2756  size_t numBytesOut = 0;
2757  int errorCode = 0;
2758  LO numEntOut;
2759  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2760  TEUCHOS_TEST_FOR_EXCEPTION
2761  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2762  "unpackRowForBlockCrs: Expected number of entries " << numEnt
2763  << " != actual number of entries " << numEntOut << ".");
2764 
2765  {
2766  Kokkos::pair<int, size_t> p;
2767  p = PackTraits<GO, D>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2768  errorCode += p.first;
2769  numBytesOut += p.second;
2770 
2771  p = PackTraits<ST, D>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2772  errorCode += p.first;
2773  numBytesOut += p.second;
2774  }
2775 
2776  TEUCHOS_TEST_FOR_EXCEPTION
2777  (numBytesOut != numBytes, std::logic_error,
2778  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2779  << " != numBytes = " << numBytes << ".");
2780 
2781  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2782  TEUCHOS_TEST_FOR_EXCEPTION
2783  (numBytesOut != expectedNumBytes, std::logic_error,
2784  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2785  << " != expectedNumBytes = " << expectedNumBytes << ".");
2786 
2787  TEUCHOS_TEST_FOR_EXCEPTION
2788  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
2789  "PackTraits::unpackArray returned a nonzero error code");
2790 
2791  return numBytesOut;
2792  }
2793  } // namespace (anonymous)
2794 
2795  template<class Scalar, class LO, class GO, class Node>
2796  void
2797  BlockCrsMatrix<Scalar, LO, GO, Node>::
2798 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2799  packAndPrepareNew
2800 #else // TPETRA_ENABLE_DEPRECATED_CODE
2801  packAndPrepare
2802 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2803  (const ::Tpetra::SrcDistObject& source,
2804  const Kokkos::DualView<const local_ordinal_type*,
2805  buffer_device_type>& exportLIDs,
2806  Kokkos::DualView<packet_type*,
2807  buffer_device_type>& exports, // output
2808  Kokkos::DualView<size_t*,
2809  buffer_device_type> numPacketsPerLID, // output
2810  size_t& constantNumPackets,
2811  Distributor& /* distor */)
2812  {
2813  using ::Tpetra::Details::Behavior;
2815  using ::Tpetra::Details::ProfilingRegion;
2816  using ::Tpetra::Details::PackTraits;
2817 
2818  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2819 
2820  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2821 
2822  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2823 
2824  const bool debug = Behavior::debug();
2825  const bool verbose = Behavior::verbose();
2826 
2827  // Define this function prefix
2828  std::string prefix;
2829  {
2830  std::ostringstream os;
2831  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2832  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2833  prefix = os.str();
2834  }
2835 
2836  // check if this already includes a local error
2837  if (* (this->localError_)) {
2838  std::ostream& err = this->markLocalErrorAndGetStream ();
2839  err << prefix
2840  << "The target object of the Import or Export is already in an error state."
2841  << std::endl;
2842  return;
2843  }
2844 
2845  //
2846  // Verbose input dual view status
2847  //
2848  if (verbose) {
2849  std::ostringstream os;
2850  os << prefix << std::endl
2851  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2852  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2853  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2854  std::cerr << os.str ();
2855  }
2856 
2860  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2861  std::ostream& err = this->markLocalErrorAndGetStream ();
2862  err << prefix
2863  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2864  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2865  << "." << std::endl;
2866  return;
2867  }
2868  if (exportLIDs.need_sync_host ()) {
2869  std::ostream& err = this->markLocalErrorAndGetStream ();
2870  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2871  return;
2872  }
2873 
2874  const this_type* src = dynamic_cast<const this_type* > (&source);
2875  if (src == NULL) {
2876  std::ostream& err = this->markLocalErrorAndGetStream ();
2877  err << prefix
2878  << "The source (input) object of the Import or "
2879  "Export is either not a BlockCrsMatrix, or does not have the right "
2880  "template parameters. checkSizes() should have caught this. "
2881  "Please report this bug to the Tpetra developers." << std::endl;
2882  return;
2883  }
2884 
2885  // Graphs and matrices are allowed to have a variable number of
2886  // entries per row. We could test whether all rows have the same
2887  // number of entries, but DistObject can only use this
2888  // optimization if all rows on _all_ processes have the same
2889  // number of entries. Rather than do the all-reduce necessary to
2890  // test for this unlikely case, we tell DistObject (by setting
2891  // constantNumPackets to zero) to assume that different rows may
2892  // have different numbers of entries.
2893  constantNumPackets = 0;
2894 
2895  // const values
2896  const crs_graph_type& srcGraph = src->graph_;
2897  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2898  const size_t numExportLIDs = exportLIDs.extent (0);
2899  const size_t numBytesPerValue =
2900  PackTraits<impl_scalar_type, host_exec>
2901  ::packValueCount(this->val_.extent(0) ? this->val_.view_host()(0) : impl_scalar_type());
2902 
2903  // Compute the number of bytes ("packets") per row to pack. While
2904  // we're at it, compute the total # of block entries to send, and
2905  // the max # of block entries in any of the rows we're sending.
2906 
2907  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2908 
2909  // Graph information is on host; let's do this on host parallel reduce
2910  auto exportLIDsHost = exportLIDs.view_host();
2911  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2912  numPacketsPerLID.modify_host ();
2913  {
2914  using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2915  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2916  Kokkos::parallel_reduce
2917  (policy,
2918  [=](const size_t &i, typename reducer_type::value_type &update) {
2919  const LO lclRow = exportLIDsHost(i);
2920  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2921  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2922 
2923  const size_t numBytes =
2924  packRowCount<LO, GO, host_exec> (numEnt, numBytesPerValue, blockSize);
2925  numPacketsPerLIDHost(i) = numBytes;
2926  update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2927  }, rowReducerStruct);
2928  }
2929 
2930  // Compute the number of bytes ("packets") per row to pack. While
2931  // we're at it, compute the total # of block entries to send, and
2932  // the max # of block entries in any of the rows we're sending.
2933  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2934  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2935  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2936 
2937  if (verbose) {
2938  std::ostringstream os;
2939  os << prefix
2940  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2941  << std::endl;
2942  std::cerr << os.str ();
2943  }
2944 
2945  // We use a "struct of arrays" approach to packing each row's
2946  // entries. All the column indices (as global indices) go first,
2947  // then all their owning process ranks, and then the values.
2948  if(exports.extent(0) != totalNumBytes)
2949  {
2950  const std::string oldLabel = exports.d_view.label ();
2951  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2952  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2953  }
2954  if (totalNumEntries > 0) {
2955  // Current position (in bytes) in the 'exports' output array.
2956  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2957  {
2958  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2959  Kokkos::parallel_scan
2960  (policy,
2961  [=](const size_t &i, size_t &update, const bool &final) {
2962  if (final) offset(i) = update;
2963  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2964  });
2965  }
2966  if (offset(numExportLIDs) != totalNumBytes) {
2967  std::ostream& err = this->markLocalErrorAndGetStream ();
2968  err << prefix
2969  << "At end of method, the final offset (in bytes) "
2970  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2971  << totalNumBytes << ". "
2972  << "Please report this bug to the Tpetra developers." << std::endl;
2973  return;
2974  }
2975 
2976  // For each block row of the matrix owned by the calling
2977  // process, pack that block row's column indices and values into
2978  // the exports array.
2979 
2980  // Source matrix's column Map. We verified in checkSizes() that
2981  // the column Map exists (is not null).
2982  const map_type& srcColMap = * (srcGraph.getColMap ());
2983 
2984  // Pack the data for each row to send, into the 'exports' buffer.
2985  // exports will be modified on host.
2986  exports.modify_host();
2987  {
2988  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2989  const auto policy =
2990  policy_type(numExportLIDs, 1, 1)
2991  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2992  Kokkos::parallel_for
2993  (policy,
2994  [=](const typename policy_type::member_type &member) {
2995  const size_t i = member.league_rank();
2996  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2997  gblColInds(member.team_scratch(0), maxRowLength);
2998 
2999  const LO lclRowInd = exportLIDsHost(i);
3000  const LO* lclColIndsRaw;
3001  Scalar* valsRaw;
3002  LO numEntLO;
3003  // It's OK to ignore the return value, since if the calling
3004  // process doesn't own that local row, then the number of
3005  // entries in that row on the calling process is zero.
3006  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
3007 
3008  const size_t numEnt = static_cast<size_t> (numEntLO);
3009  Kokkos::View<const LO*,host_exec> lclColInds (lclColIndsRaw, numEnt);
3010 
3011  // Convert column indices from local to global.
3012  for (size_t j = 0; j < numEnt; ++j)
3013  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
3014 
3015  // Kyungjoo: additional wrapping scratch view is necessary
3016  // the following function interface need the same execution space
3017  // host scratch space somehow is not considered same as the host_exec
3018  // Copy the row's data into the current spot in the exports array.
3019  const size_t numBytes = packRowForBlockCrs<impl_scalar_type,LO,GO,host_exec>
3020  (exports.view_host(),
3021  offset(i),
3022  numEnt,
3023  Kokkos::View<const GO*,host_exec>(gblColInds.data(), maxRowLength),
3024  Kokkos::View<const impl_scalar_type*,host_exec>(reinterpret_cast<const impl_scalar_type*>(valsRaw), numEnt*blockSize*blockSize),
3025  numBytesPerValue,
3026  blockSize);
3027 
3028  // numBytes should be same as the difference between offsets
3029  if (debug) {
3030  const size_t offsetDiff = offset(i+1) - offset(i);
3031  if (numBytes != offsetDiff) {
3032  std::ostringstream os;
3033  os << prefix
3034  << "numBytes computed from packRowForBlockCrs is different from "
3035  << "precomputed offset values, LID = " << i << std::endl;
3036  std::cerr << os.str ();
3037  }
3038  }
3039  }); // for each LID (of a row) to send
3040  }
3041  } // if totalNumEntries > 0
3042 
3043  if (debug) {
3044  std::ostringstream os;
3045  const bool lclSuccess = ! (* (this->localError_));
3046  os << prefix
3047  << (lclSuccess ? "succeeded" : "FAILED")
3048  << std::endl;
3049  std::cerr << os.str ();
3050  }
3051  }
3052 
3053  template<class Scalar, class LO, class GO, class Node>
3054  void
3055  BlockCrsMatrix<Scalar, LO, GO, Node>::
3056 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3057  unpackAndCombineNew
3058 #else // TPETRA_ENABLE_DEPRECATED_CODE
3059  unpackAndCombine
3060 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3061  (const Kokkos::DualView<const local_ordinal_type*,
3062  buffer_device_type>& importLIDs,
3063  Kokkos::DualView<packet_type*,
3064  buffer_device_type> imports,
3065  Kokkos::DualView<size_t*,
3066  buffer_device_type> numPacketsPerLID,
3067  const size_t /* constantNumPackets */,
3068  Distributor& /* distor */,
3069  const CombineMode combineMode)
3070  {
3071  using ::Tpetra::Details::Behavior;
3073  using ::Tpetra::Details::ProfilingRegion;
3074  using ::Tpetra::Details::PackTraits;
3075  using std::endl;
3076  using host_exec =
3077  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
3078 
3079  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
3080  const bool verbose = Behavior::verbose ();
3081 
3082  // Define this function prefix
3083  std::string prefix;
3084  {
3085  std::ostringstream os;
3086  auto map = this->graph_.getRowMap ();
3087  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
3088  const int myRank = comm.is_null () ? -1 : comm->getRank ();
3089  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
3090  prefix = os.str ();
3091  if (verbose) {
3092  os << "Start" << endl;
3093  std::cerr << os.str ();
3094  }
3095  }
3096 
3097  // check if this already includes a local error
3098  if (* (this->localError_)) {
3099  std::ostream& err = this->markLocalErrorAndGetStream ();
3100  std::ostringstream os;
3101  os << prefix << "{Im/Ex}port target is already in error." << endl;
3102  if (verbose) {
3103  std::cerr << os.str ();
3104  }
3105  err << os.str ();
3106  return;
3107  }
3108 
3112  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
3113  std::ostream& err = this->markLocalErrorAndGetStream ();
3114  std::ostringstream os;
3115  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
3116  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
3117  << "." << endl;
3118  if (verbose) {
3119  std::cerr << os.str ();
3120  }
3121  err << os.str ();
3122  return;
3123  }
3124 
3125  if (combineMode != ADD && combineMode != INSERT &&
3126  combineMode != REPLACE && combineMode != ABSMAX &&
3127  combineMode != ZERO) {
3128  std::ostream& err = this->markLocalErrorAndGetStream ();
3129  std::ostringstream os;
3130  os << prefix
3131  << "Invalid CombineMode value " << combineMode << ". Valid "
3132  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3133  << std::endl;
3134  if (verbose) {
3135  std::cerr << os.str ();
3136  }
3137  err << os.str ();
3138  return;
3139  }
3140 
3141  if (this->graph_.getColMap ().is_null ()) {
3142  std::ostream& err = this->markLocalErrorAndGetStream ();
3143  std::ostringstream os;
3144  os << prefix << "Target matrix's column Map is null." << endl;
3145  if (verbose) {
3146  std::cerr << os.str ();
3147  }
3148  err << os.str ();
3149  return;
3150  }
3151 
3152  // Target matrix's column Map. Use to convert the global column
3153  // indices in the receive buffer to local indices. We verified in
3154  // checkSizes() that the column Map exists (is not null).
3155  const map_type& tgtColMap = * (this->graph_.getColMap ());
3156 
3157  // Const values
3158  const size_t blockSize = this->getBlockSize ();
3159  const size_t numImportLIDs = importLIDs.extent(0);
3160  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
3161  // default-constructed instance could have a different size than
3162  // other instances. (We assume all nominally constructed
3163  // instances have the same size; that's not the issue here.) This
3164  // could be bad if the calling process has no entries, but other
3165  // processes have entries that they want to send to us.
3166  const size_t numBytesPerValue =
3167  PackTraits<impl_scalar_type, host_exec>::packValueCount
3168  (this->val_.extent (0) ? this->val_.view_host () (0) : impl_scalar_type ());
3169  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3170  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3171 
3172  if (verbose) {
3173  std::ostringstream os;
3174  os << prefix << "combineMode: "
3175  << ::Tpetra::combineModeToString (combineMode)
3176  << ", blockSize: " << blockSize
3177  << ", numImportLIDs: " << numImportLIDs
3178  << ", numBytesPerValue: " << numBytesPerValue
3179  << ", maxRowNumEnt: " << maxRowNumEnt
3180  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
3181  std::cerr << os.str ();
3182  }
3183 
3184  if (combineMode == ZERO || numImportLIDs == 0) {
3185  if (verbose) {
3186  std::ostringstream os;
3187  os << prefix << "Nothing to unpack. Done!" << std::endl;
3188  std::cerr << os.str ();
3189  }
3190  return;
3191  }
3192 
3193  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
3194  // we can remove this sync.
3195  if (imports.need_sync_host ()) {
3196  imports.sync_host ();
3197  }
3198 
3199  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
3200  // sync'd numPacketsPerLID to host, since it needs to do that in
3201  // order to post MPI_Irecv messages with the correct lengths. A
3202  // hypothetical device-specific boundary exchange implementation
3203  // could possibly receive data without sync'ing lengths to host,
3204  // but we don't need to design for that nonexistent thing yet.
3205 
3206  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
3207  importLIDs.need_sync_host ()) {
3208  std::ostream& err = this->markLocalErrorAndGetStream ();
3209  std::ostringstream os;
3210  os << prefix << "All input DualViews must be sync'd to host by now. "
3211  << "imports_nc.need_sync_host()="
3212  << (imports.need_sync_host () ? "true" : "false")
3213  << ", numPacketsPerLID.need_sync_host()="
3214  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
3215  << ", importLIDs.need_sync_host()="
3216  << (importLIDs.need_sync_host () ? "true" : "false")
3217  << "." << endl;
3218  if (verbose) {
3219  std::cerr << os.str ();
3220  }
3221  err << os.str ();
3222  return;
3223  }
3224 
3225  const auto importLIDsHost = importLIDs.view_host ();
3226  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
3227 
3228  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
3229  // loop, by only unpacking on final==true (when we know the
3230  // current offset's value).
3231 
3232  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
3233  {
3234  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
3235  Kokkos::parallel_scan
3236  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
3237  [=] (const size_t &i, size_t &update, const bool &final) {
3238  if (final) offset(i) = update;
3239  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
3240  });
3241  }
3242 
3243  // this variable does not matter with a race condition (just error flag)
3244  //
3245  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
3246  // or atomics on bool, so we use int instead. (I know we're not
3247  // launching a CUDA loop here, but we could in the future, and I'd
3248  // like to avoid potential trouble.)
3249  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
3250  errorDuringUnpack ("errorDuringUnpack");
3251  errorDuringUnpack () = 0;
3252  {
3253  using policy_type = Kokkos::TeamPolicy<host_exec>;
3254  const auto policy = policy_type (numImportLIDs, 1, 1)
3255  .set_scratch_size (0, Kokkos::PerTeam (sizeof (GO) * maxRowNumEnt +
3256  sizeof (LO) * maxRowNumEnt +
3257  numBytesPerValue * maxRowNumScalarEnt));
3258  using host_scratch_space = typename host_exec::scratch_memory_space;
3259  using pair_type = Kokkos::pair<size_t, size_t>;
3260  Kokkos::parallel_for
3261  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
3262  [=] (const typename policy_type::member_type& member) {
3263  const size_t i = member.league_rank();
3264 
3265  Kokkos::View<GO*, host_scratch_space> gblColInds
3266  (member.team_scratch (0), maxRowNumEnt);
3267  Kokkos::View<LO*, host_scratch_space> lclColInds
3268  (member.team_scratch (0), maxRowNumEnt);
3269  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
3270  (member.team_scratch (0), maxRowNumScalarEnt);
3271 
3272  const size_t offval = offset(i);
3273  const LO lclRow = importLIDsHost(i);
3274  const size_t numBytes = numPacketsPerLIDHost(i);
3275  const size_t numEnt =
3276  unpackRowCount<impl_scalar_type, LO, GO, host_exec>
3277  (imports.view_host (), offval, numBytes, numBytesPerValue);
3278 
3279  if (numBytes > 0) {
3280  if (numEnt > maxRowNumEnt) {
3281  errorDuringUnpack() = 1;
3282  if (verbose) {
3283  std::ostringstream os;
3284  os << prefix
3285  << "At i = " << i << ", numEnt = " << numEnt
3286  << " > maxRowNumEnt = " << maxRowNumEnt
3287  << std::endl;
3288  std::cerr << os.str ();
3289  }
3290  return;
3291  }
3292  }
3293  const size_t numScalarEnt = numEnt*blockSize*blockSize;
3294  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
3295  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
3296  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
3297 
3298  // Kyungjoo: additional wrapping scratch view is necessary
3299  // the following function interface need the same execution space
3300  // host scratch space somehow is not considered same as the host_exec
3301  size_t numBytesOut = 0;
3302  try {
3303  numBytesOut =
3304  unpackRowForBlockCrs<impl_scalar_type, LO, GO, host_exec>
3305  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
3306  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
3307  imports.view_host(),
3308  offval, numBytes, numEnt,
3309  numBytesPerValue, blockSize);
3310  }
3311  catch (std::exception& e) {
3312  errorDuringUnpack() = 1;
3313  if (verbose) {
3314  std::ostringstream os;
3315  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
3316  << e.what () << endl;
3317  std::cerr << os.str ();
3318  }
3319  return;
3320  }
3321 
3322  if (numBytes != numBytesOut) {
3323  errorDuringUnpack() = 1;
3324  if (verbose) {
3325  std::ostringstream os;
3326  os << prefix
3327  << "At i = " << i << ", numBytes = " << numBytes
3328  << " != numBytesOut = " << numBytesOut << "."
3329  << std::endl;
3330  std::cerr << os.str();
3331  }
3332  return;
3333  }
3334 
3335  // Convert incoming global indices to local indices.
3336  for (size_t k = 0; k < numEnt; ++k) {
3337  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3338  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3339  if (verbose) {
3340  std::ostringstream os;
3341  os << prefix
3342  << "At i = " << i << ", GID " << gidsOut(k)
3343  << " is not owned by the calling process."
3344  << std::endl;
3345  std::cerr << os.str();
3346  }
3347  return;
3348  }
3349  }
3350 
3351  // Combine the incoming data with the matrix's current data.
3352  LO numCombd = 0;
3353  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
3354  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
3355  (const_cast<const impl_scalar_type*> (valsOut.data ()));
3356  if (combineMode == ADD) {
3357  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3358  } else if (combineMode == INSERT || combineMode == REPLACE) {
3359  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3360  } else if (combineMode == ABSMAX) {
3361  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3362  }
3363 
3364  if (static_cast<LO> (numEnt) != numCombd) {
3365  errorDuringUnpack() = 1;
3366  if (verbose) {
3367  std::ostringstream os;
3368  os << prefix << "At i = " << i << ", numEnt = " << numEnt
3369  << " != numCombd = " << numCombd << "."
3370  << endl;
3371  std::cerr << os.str ();
3372  }
3373  return;
3374  }
3375  }); // for each import LID i
3376  }
3377 
3378  if (errorDuringUnpack () != 0) {
3379  std::ostream& err = this->markLocalErrorAndGetStream ();
3380  err << prefix << "Unpacking failed.";
3381  if (! verbose) {
3382  err << " Please run again with the environment variable setting "
3383  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3384  }
3385  err << endl;
3386  }
3387 
3388  if (verbose) {
3389  std::ostringstream os;
3390  const bool lclSuccess = ! (* (this->localError_));
3391  os << prefix
3392  << (lclSuccess ? "succeeded" : "FAILED")
3393  << std::endl;
3394  std::cerr << os.str ();
3395  }
3396  }
3397 
3398  template<class Scalar, class LO, class GO, class Node>
3399  std::string
3401  {
3402  using Teuchos::TypeNameTraits;
3403  std::ostringstream os;
3404  os << "\"Tpetra::BlockCrsMatrix\": { "
3405  << "Template parameters: { "
3406  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3407  << "LO: " << TypeNameTraits<LO>::name ()
3408  << "GO: " << TypeNameTraits<GO>::name ()
3409  << "Node: " << TypeNameTraits<Node>::name ()
3410  << " }"
3411  << ", Label: \"" << this->getObjectLabel () << "\""
3412  << ", Global dimensions: ["
3413  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3414  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3415  << ", Block size: " << getBlockSize ()
3416  << " }";
3417  return os.str ();
3418  }
3419 
3420 
3421  template<class Scalar, class LO, class GO, class Node>
3422  void
3424  describe (Teuchos::FancyOStream& out,
3425  const Teuchos::EVerbosityLevel verbLevel) const
3426  {
3427  using Teuchos::ArrayRCP;
3428  using Teuchos::CommRequest;
3429  using Teuchos::FancyOStream;
3430  using Teuchos::getFancyOStream;
3431  using Teuchos::ireceive;
3432  using Teuchos::isend;
3433  using Teuchos::outArg;
3434  using Teuchos::TypeNameTraits;
3435  using Teuchos::VERB_DEFAULT;
3436  using Teuchos::VERB_NONE;
3437  using Teuchos::VERB_LOW;
3438  using Teuchos::VERB_MEDIUM;
3439  // using Teuchos::VERB_HIGH;
3440  using Teuchos::VERB_EXTREME;
3441  using Teuchos::RCP;
3442  using Teuchos::wait;
3443  using std::endl;
3444 #ifdef HAVE_TPETRA_DEBUG
3445  const char prefix[] = "Tpetra::BlockCrsMatrix::describe: ";
3446 #endif // HAVE_TPETRA_DEBUG
3447 
3448  // Set default verbosity if applicable.
3449  const Teuchos::EVerbosityLevel vl =
3450  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3451 
3452  if (vl == VERB_NONE) {
3453  return; // print nothing
3454  }
3455 
3456  // describe() always starts with a tab before it prints anything.
3457  Teuchos::OSTab tab0 (out);
3458 
3459  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3460  Teuchos::OSTab tab1 (out);
3461 
3462  out << "Template parameters:" << endl;
3463  {
3464  Teuchos::OSTab tab2 (out);
3465  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3466  << "LO: " << TypeNameTraits<LO>::name () << endl
3467  << "GO: " << TypeNameTraits<GO>::name () << endl
3468  << "Node: " << TypeNameTraits<Node>::name () << endl;
3469  }
3470  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3471  << "Global dimensions: ["
3472  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3473  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3474 
3475  const LO blockSize = getBlockSize ();
3476  out << "Block size: " << blockSize << endl;
3477 
3478  // constituent objects
3479  if (vl >= VERB_MEDIUM) {
3480  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3481  const int myRank = comm.getRank ();
3482  if (myRank == 0) {
3483  out << "Row Map:" << endl;
3484  }
3485  getRowMap()->describe (out, vl);
3486 
3487  if (! getColMap ().is_null ()) {
3488  if (getColMap() == getRowMap()) {
3489  if (myRank == 0) {
3490  out << "Column Map: same as row Map" << endl;
3491  }
3492  }
3493  else {
3494  if (myRank == 0) {
3495  out << "Column Map:" << endl;
3496  }
3497  getColMap ()->describe (out, vl);
3498  }
3499  }
3500  if (! getDomainMap ().is_null ()) {
3501  if (getDomainMap () == getRowMap ()) {
3502  if (myRank == 0) {
3503  out << "Domain Map: same as row Map" << endl;
3504  }
3505  }
3506  else if (getDomainMap () == getColMap ()) {
3507  if (myRank == 0) {
3508  out << "Domain Map: same as column Map" << endl;
3509  }
3510  }
3511  else {
3512  if (myRank == 0) {
3513  out << "Domain Map:" << endl;
3514  }
3515  getDomainMap ()->describe (out, vl);
3516  }
3517  }
3518  if (! getRangeMap ().is_null ()) {
3519  if (getRangeMap () == getDomainMap ()) {
3520  if (myRank == 0) {
3521  out << "Range Map: same as domain Map" << endl;
3522  }
3523  }
3524  else if (getRangeMap () == getRowMap ()) {
3525  if (myRank == 0) {
3526  out << "Range Map: same as row Map" << endl;
3527  }
3528  }
3529  else {
3530  if (myRank == 0) {
3531  out << "Range Map: " << endl;
3532  }
3533  getRangeMap ()->describe (out, vl);
3534  }
3535  }
3536  }
3537 
3538  if (vl >= VERB_EXTREME) {
3539  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3540  // to host, since it's supposed to be const. However, that's
3541  // the easiest and least memory-intensive way to implement this
3542  // method.
3544  const_cast<this_type*> (this)->sync_host ();
3545 
3546 #ifdef HAVE_TPETRA_DEBUG
3547  TEUCHOS_TEST_FOR_EXCEPTION
3548  (this->need_sync_host (), std::logic_error,
3549  prefix << "Right after sync to host, the matrix claims that it needs "
3550  "sync to host. Please report this bug to the Tpetra developers.");
3551 #endif // HAVE_TPETRA_DEBUG
3552 
3553  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3554  const int myRank = comm.getRank ();
3555  const int numProcs = comm.getSize ();
3556 
3557  // Print the calling process' data to the given output stream.
3558  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3559  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3560  FancyOStream& os = *osPtr;
3561  os << "Process " << myRank << ":" << endl;
3562  Teuchos::OSTab tab2 (os);
3563 
3564  const map_type& meshRowMap = * (graph_.getRowMap ());
3565  const map_type& meshColMap = * (graph_.getColMap ());
3566  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3567  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3568  ++meshLclRow) {
3569  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3570  os << "Row " << meshGblRow << ": {";
3571 
3572  const LO* lclColInds = NULL;
3573  Scalar* vals = NULL;
3574  LO numInds = 0;
3575  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3576 
3577  for (LO k = 0; k < numInds; ++k) {
3578  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3579 
3580  os << "Col " << gblCol << ": [";
3581  for (LO i = 0; i < blockSize; ++i) {
3582  for (LO j = 0; j < blockSize; ++j) {
3583  os << vals[blockSize*blockSize*k + i*blockSize + j];
3584  if (j + 1 < blockSize) {
3585  os << ", ";
3586  }
3587  }
3588  if (i + 1 < blockSize) {
3589  os << "; ";
3590  }
3591  }
3592  os << "]";
3593  if (k + 1 < numInds) {
3594  os << ", ";
3595  }
3596  }
3597  os << "}" << endl;
3598  }
3599 
3600  // Print data on Process 0. This will automatically respect the
3601  // current indentation level.
3602  if (myRank == 0) {
3603  out << lclOutStrPtr->str ();
3604  lclOutStrPtr = Teuchos::null; // clear it to save space
3605  }
3606 
3607  const int sizeTag = 1337;
3608  const int dataTag = 1338;
3609 
3610  ArrayRCP<char> recvDataBuf; // only used on Process 0
3611 
3612  // Send string sizes and data from each process in turn to
3613  // Process 0, and print on that process.
3614  for (int p = 1; p < numProcs; ++p) {
3615  if (myRank == 0) {
3616  // Receive the incoming string's length.
3617  ArrayRCP<size_t> recvSize (1);
3618  recvSize[0] = 0;
3619  RCP<CommRequest<int> > recvSizeReq =
3620  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3621  wait<int> (comm, outArg (recvSizeReq));
3622  const size_t numCharsToRecv = recvSize[0];
3623 
3624  // Allocate space for the string to receive. Reuse receive
3625  // buffer space if possible. We can do this because in the
3626  // current implementation, we only have one receive in
3627  // flight at a time. Leave space for the '\0' at the end,
3628  // in case the sender doesn't send it.
3629  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3630  recvDataBuf.resize (numCharsToRecv + 1);
3631  }
3632  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3633  // Post the receive of the actual string data.
3634  RCP<CommRequest<int> > recvDataReq =
3635  ireceive<int, char> (recvData, p, dataTag, comm);
3636  wait<int> (comm, outArg (recvDataReq));
3637 
3638  // Print the received data. This will respect the current
3639  // indentation level. Make sure that the string is
3640  // null-terminated.
3641  recvDataBuf[numCharsToRecv] = '\0';
3642  out << recvDataBuf.getRawPtr ();
3643  }
3644  else if (myRank == p) { // if I am not Process 0, and my rank is p
3645  // This deep-copies the string at most twice, depending on
3646  // whether std::string reference counts internally (it
3647  // generally does, so this won't deep-copy at all).
3648  const std::string stringToSend = lclOutStrPtr->str ();
3649  lclOutStrPtr = Teuchos::null; // clear original to save space
3650 
3651  // Send the string's length to Process 0.
3652  const size_t numCharsToSend = stringToSend.size ();
3653  ArrayRCP<size_t> sendSize (1);
3654  sendSize[0] = numCharsToSend;
3655  RCP<CommRequest<int> > sendSizeReq =
3656  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3657  wait<int> (comm, outArg (sendSizeReq));
3658 
3659  // Send the actual string to Process 0. We know that the
3660  // string has length > 0, so it's save to take the address
3661  // of the first entry. Make a nonowning ArrayRCP to hold
3662  // the string. Process 0 will add a null termination
3663  // character at the end of the string, after it receives the
3664  // message.
3665  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3666  RCP<CommRequest<int> > sendDataReq =
3667  isend<int, char> (sendData, 0, dataTag, comm);
3668  wait<int> (comm, outArg (sendDataReq));
3669  }
3670  } // for each process rank p other than 0
3671  } // extreme verbosity level (print the whole matrix)
3672  }
3673 
3674  template<class Scalar, class LO, class GO, class Node>
3675  Teuchos::RCP<const Teuchos::Comm<int> >
3677  getComm() const
3678  {
3679  return graph_.getComm();
3680  }
3681 
3682 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3683  template<class Scalar, class LO, class GO, class Node>
3684  TPETRA_DEPRECATED
3685  Teuchos::RCP<Node>
3687  getNode() const
3688  {
3689  return Teuchos::null;
3690 
3691  }
3692 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3693 
3694  template<class Scalar, class LO, class GO, class Node>
3697  getGlobalNumCols() const
3698  {
3699  return graph_.getGlobalNumCols();
3700  }
3701 
3702  template<class Scalar, class LO, class GO, class Node>
3703  size_t
3705  getNodeNumCols() const
3706  {
3707  return graph_.getNodeNumCols();
3708  }
3709 
3710  template<class Scalar, class LO, class GO, class Node>
3711  GO
3712  BlockCrsMatrix<Scalar, LO, GO, Node>::
3713  getIndexBase() const
3714  {
3715  return graph_.getIndexBase();
3716  }
3717 
3718  template<class Scalar, class LO, class GO, class Node>
3721  getGlobalNumEntries() const
3722  {
3723  return graph_.getGlobalNumEntries();
3724  }
3725 
3726  template<class Scalar, class LO, class GO, class Node>
3727  size_t
3729  getNodeNumEntries() const
3730  {
3731  return graph_.getNodeNumEntries();
3732  }
3733 
3734  template<class Scalar, class LO, class GO, class Node>
3735  size_t
3737  getNumEntriesInGlobalRow (GO globalRow) const
3738  {
3739  return graph_.getNumEntriesInGlobalRow(globalRow);
3740  }
3741 
3742 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3743  template<class Scalar, class LO, class GO, class Node>
3744  global_size_t TPETRA_DEPRECATED
3746  getGlobalNumDiags () const
3747  {
3748  using HDM = Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3749  return dynamic_cast<const HDM&> (this->graph_).getGlobalNumDiagsImpl ();
3750  }
3751 
3752  template<class Scalar, class LO, class GO, class Node>
3753  size_t TPETRA_DEPRECATED
3754  BlockCrsMatrix<Scalar, LO, GO, Node>::
3755  getNodeNumDiags () const
3756  {
3757  using HDM = Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3758  return dynamic_cast<const HDM&> (this->graph_).getNodeNumDiagsImpl ();
3759  }
3760 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3761 
3762  template<class Scalar, class LO, class GO, class Node>
3763  size_t
3766  {
3767  return graph_.getGlobalMaxNumRowEntries();
3768  }
3769 
3770  template<class Scalar, class LO, class GO, class Node>
3771  bool
3773  hasColMap() const
3774  {
3775  return graph_.hasColMap();
3776  }
3777 
3778 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3779  template<class Scalar, class LO, class GO, class Node>
3780  bool TPETRA_DEPRECATED
3782  isLowerTriangular () const
3783  {
3784  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3785  return dynamic_cast<const HDM&> (this->graph_).isLowerTriangularImpl ();
3786  }
3787 
3788  template<class Scalar, class LO, class GO, class Node>
3789  bool TPETRA_DEPRECATED
3790  BlockCrsMatrix<Scalar, LO, GO, Node>::
3791  isUpperTriangular () const
3792  {
3793  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3794  return dynamic_cast<const HDM&> (this->graph_).isUpperTriangularImpl ();
3795  }
3796 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3797 
3798  template<class Scalar, class LO, class GO, class Node>
3799  bool
3801  isLocallyIndexed() const
3802  {
3803  return graph_.isLocallyIndexed();
3804  }
3805 
3806  template<class Scalar, class LO, class GO, class Node>
3807  bool
3809  isGloballyIndexed() const
3810  {
3811  return graph_.isGloballyIndexed();
3812  }
3813 
3814  template<class Scalar, class LO, class GO, class Node>
3815  bool
3817  isFillComplete() const
3818  {
3819  return graph_.isFillComplete ();
3820  }
3821 
3822  template<class Scalar, class LO, class GO, class Node>
3823  bool
3825  supportsRowViews() const
3826  {
3827  return false;
3828  }
3829 
3830 
3831  template<class Scalar, class LO, class GO, class Node>
3832  void
3834  getGlobalRowCopy (GO /* GlobalRow */,
3835  const Teuchos::ArrayView<GO> &/* Indices */,
3836  const Teuchos::ArrayView<Scalar> &/* Values */,
3837  size_t &/* NumEntries */) const
3838  {
3839  TEUCHOS_TEST_FOR_EXCEPTION(
3840  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3841  "This class doesn't support global matrix indexing.");
3842 
3843  }
3844 
3845  template<class Scalar, class LO, class GO, class Node>
3846  void
3848  getGlobalRowView (GO /* GlobalRow */,
3849  Teuchos::ArrayView<const GO> &/* indices */,
3850  Teuchos::ArrayView<const Scalar> &/* values */) const
3851  {
3852  TEUCHOS_TEST_FOR_EXCEPTION(
3853  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3854  "This class doesn't support global matrix indexing.");
3855 
3856  }
3857 
3858  template<class Scalar, class LO, class GO, class Node>
3859  void
3861  getLocalRowView (LO /* LocalRow */,
3862  Teuchos::ArrayView<const LO>& /* indices */,
3863  Teuchos::ArrayView<const Scalar>& /* values */) const
3864  {
3865  TEUCHOS_TEST_FOR_EXCEPTION(
3866  true, std::logic_error, "Tpetra::BlockCrsMatrix::getLocalRowView: "
3867  "This class doesn't support local matrix indexing.");
3868  }
3869 
3870  template<class Scalar, class LO, class GO, class Node>
3871  void
3874  {
3875 #ifdef HAVE_TPETRA_DEBUG
3876  const char prefix[] =
3877  "Tpetra::BlockCrsMatrix::getLocalDiagCopy: ";
3878 #endif // HAVE_TPETRA_DEBUG
3879 
3880  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3881 
3882  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3883  graph_.getLocalDiagOffsets (diagOffsets);
3884 
3885  // The code below works on host, so use a host View.
3886  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3887  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3888  // We're filling diag on host for now.
3889  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3890 
3891 #ifdef HAVE_TPETRA_DEBUG
3892  TEUCHOS_TEST_FOR_EXCEPTION
3893  (this->need_sync_host (), std::runtime_error,
3894  prefix << "The matrix's data were last modified on device, but have "
3895  "not been sync'd to host. Please sync to host (by calling "
3896  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3897  "method.");
3898 #endif // HAVE_TPETRA_DEBUG
3899 
3900  auto vals_host_out = getValuesHost ();
3901  Scalar* vals_host_out_raw =
3902  reinterpret_cast<Scalar*> (vals_host_out.data ());
3903 
3904  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3905  size_t rowOffset = 0;
3906  size_t offset = 0;
3907  LO bs = getBlockSize();
3908  for(size_t r=0; r<getNodeNumRows(); r++)
3909  {
3910  // move pointer to start of diagonal block
3911  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3912  for(int b=0; b<bs; b++)
3913  {
3914  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3915  }
3916  // move pointer to start of next block row
3917  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3918  }
3919 
3920  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3921  }
3922 
3923  template<class Scalar, class LO, class GO, class Node>
3924  void
3926  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3927  {
3928  TEUCHOS_TEST_FOR_EXCEPTION(
3929  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3930  "not implemented.");
3931 
3932  }
3933 
3934  template<class Scalar, class LO, class GO, class Node>
3935  void
3937  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3938  {
3939  TEUCHOS_TEST_FOR_EXCEPTION(
3940  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3941  "not implemented.");
3942 
3943  }
3944 
3945  template<class Scalar, class LO, class GO, class Node>
3946  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3948  getGraph() const
3949  {
3950  return graphRCP_;
3951  }
3952 
3953  template<class Scalar, class LO, class GO, class Node>
3954  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3956  getFrobeniusNorm () const
3957  {
3958  TEUCHOS_TEST_FOR_EXCEPTION(
3959  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3960  "not implemented.");
3961  }
3962 
3963 } // namespace Tpetra
3964 
3965 //
3966 // Explicit instantiation macro
3967 //
3968 // Must be expanded from within the Tpetra namespace!
3969 //
3970 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3971  template class BlockCrsMatrix< S, LO, GO, NODE >;
3972 
3973 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Tpetra::BlockCrsMatrix::getComm
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3677
Tpetra::BlockCrsMatrix::getGlobalNumEntries
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3721
Tpetra::BlockCrsMatrix::applyBlock
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
Definition: Tpetra_BlockCrsMatrix_def.hpp:929
Tpetra::BlockCrsMatrix::reorderedGaussSeidelCopy
void reorderedGaussSeidelCopy(::Tpetra::MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1238
Tpetra::ESweepDirection
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Definition: Tpetra_ConfigDefs.hpp:245
Tpetra::CrsGraph::getDomainMap
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Definition: Tpetra_CrsGraph_def.hpp:1116
Tpetra::BlockCrsMatrix::execution_space
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:177
Tpetra::BlockCrsMatrix::localGaussSeidel
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1109
Tpetra::COPY
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
Definition: Tpetra_BlockView.hpp:720
Tpetra::BlockCrsMatrix::apply
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Definition: Tpetra_BlockCrsMatrix_def.hpp:864
Tpetra::BlockCrsMatrix::getColMap
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:832
Tpetra::Map::getMinLocalIndex
LocalOrdinal getMinLocalIndex() const
The minimum local index.
Definition: Tpetra_Map_decl.hpp:636
Tpetra::BlockCrsMatrix::getGlobalRowView
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3848
Tpetra::REPLACE
@ REPLACE
Replace existing values with new values.
Definition: Tpetra_CombineMode.hpp:97
Tpetra::GEMV
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
Definition: Tpetra_BlockView.hpp:746
Tpetra::BlockCrsMatrix::getGraph
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3948
Tpetra::Map::getMaxLocalIndex
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
Definition: Tpetra_Map_decl.hpp:650
Tpetra::BlockCrsMatrix::impl_scalar_type
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:162
Tpetra::MultiVector
One or more distributed dense vectors.
Definition: Tpetra_MultiVector_decl.hpp:425
Tpetra::BlockMultiVector::getNumVectors
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
Definition: Tpetra_BlockMultiVector_decl.hpp:346
Tpetra::Map::getGlobalElement
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
Definition: Tpetra_Map_def.hpp:1246
Tpetra::BlockCrsMatrix::getLocalRowView
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1492
Tpetra::BlockCrsMatrix::replaceLocalValues
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
Definition: Tpetra_BlockCrsMatrix_def.hpp:988
Tpetra::BlockCrsMatrix::getDomainMap
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:806
Tpetra_Details_Behavior.hpp
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Tpetra::BlockCrsMatrix::little_vec_type
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:205
Tpetra::BlockCrsMatrix::getNumEntriesInLocalRow
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1713
Tpetra::Map< LO, GO, node_type >
Tpetra::BlockCrsMatrix::isLocallyIndexed
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3801
Tpetra::BlockCrsMatrix::getNodeNumEntries
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3729
Tpetra::BlockCrsMatrix::isFillComplete
virtual bool isFillComplete() const
Whether fillComplete() has been called.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3817
Tpetra::BlockCrsMatrix::getNumEntriesInGlobalRow
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3737
Tpetra::BlockCrsMatrix::setAllToScalar
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Definition: Tpetra_BlockCrsMatrix_def.hpp:957
Tpetra::BlockCrsMatrix::getLocalRowCopy
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1535
Tpetra_Details_PackTraits.hpp
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Tpetra::BlockCrsMatrix::device_type
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:175
Tpetra::DistObject< char, LO, GO, Node >
Tpetra::BlockCrsMatrix::BlockCrsMatrix
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:658
Tpetra::ZERO
@ ZERO
Replace old values with zero.
Definition: Tpetra_CombineMode.hpp:99
Tpetra::BlockCrsMatrix::getNodeNumRows
size_t getNodeNumRows() const
get the local number of block rows
Definition: Tpetra_BlockCrsMatrix_def.hpp:848
Tpetra::BlockCrsMatrix::absMaxLocalValuesByOffsets
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1633
Tpetra::BlockCrsMatrix::getGlobalNumCols
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3697
Tpetra::Impl::absMax
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Definition: Tpetra_BlockView.hpp:166
Tpetra::ADD
@ ADD
Sum new values into existing values.
Definition: Tpetra_CombineMode.hpp:95
Tpetra::Vector::replaceLocalValue
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
Definition: Tpetra_Vector_def.hpp:138
Tpetra::BlockCrsMatrix::isGloballyIndexed
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3809
Tpetra::CrsGraph::isSorted
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Definition: Tpetra_CrsGraph_def.hpp:1419
Tpetra::BlockCrsMatrix::sumIntoLocalValuesByOffsets
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1672
Tpetra::ABSMAX
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
Definition: Tpetra_CombineMode.hpp:98
Tpetra::AXPY
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Definition: Tpetra_BlockView.hpp:694
Tpetra_Details_Profiling.hpp
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Tpetra::Details::getGlobalNumDiags
CrsGraphType::global_ordinal_type getGlobalNumDiags(const CrsGraphType &G)
Number of populated diagonal entries in the given sparse graph, over all processes in the graph's (MP...
Definition: Tpetra_Details_getNumDiags.hpp:406
Tpetra::BlockCrsMatrix::getRangeMap
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:815
Tpetra::BlockCrsMatrix::getLocalRowOffsets
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1561
Tpetra::BlockCrsMatrix::rightScale
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3937
Tpetra::Details::dualViewStatusToString
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
Definition: Tpetra_Util.hpp:946
Tpetra::BlockCrsMatrix::little_block_type
Kokkos::View< impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:193
Tpetra::Vector
A distributed dense vector.
Definition: Tpetra_Vector_decl.hpp:82
Tpetra::Distributor
Sets up and executes a communication plan for a Tpetra DistObject.
Definition: Tpetra_Distributor.hpp:192
Tpetra::CrsGraph::getNodeNumEntries
size_t getNodeNumEntries() const override
The local number of entries in the graph.
Definition: Tpetra_CrsGraph_def.hpp:1200
Tpetra::BlockCrsMatrix::replaceLocalValuesByOffsets
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1594
Tpetra::CrsGraph< LO, GO, node_type >::import_type
::Tpetra::Import< LO, GO, node_type > import_type
The Import specialization used by this class.
Definition: Tpetra_CrsGraph_decl.hpp:323
Tpetra::BlockCrsMatrix::getGlobalRowCopy
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row's entries.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3834
Tpetra::Details::DefaultTypes::local_ordinal_type
int local_ordinal_type
Default value of Scalar template parameter.
Definition: Tpetra_Details_DefaultTypes.hpp:72
Tpetra::BlockCrsMatrix::hasColMap
virtual bool hasColMap() const
Whether this matrix has a well-defined column Map.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3773
Tpetra::BlockCrsMatrix::gaussSeidelCopy
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1220
Tpetra::BlockCrsMatrix::sumIntoLocalValues
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
Definition: Tpetra_BlockCrsMatrix_def.hpp:1419
Tpetra::BlockCrsMatrix::supportsRowViews
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
Definition: Tpetra_BlockCrsMatrix_def.hpp:3825
Tpetra::CrsGraph::getLocalGraph
local_graph_type getLocalGraph() const
Get the local graph.
Definition: Tpetra_CrsGraph_def.hpp:4874
Tpetra::CrsGraph< LO, GO, node_type >
Tpetra::CrsGraph::local_graph_type
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Definition: Tpetra_CrsGraph_decl.hpp:308
Tpetra::FILL
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
Definition: Tpetra_BlockView.hpp:676
Tpetra::BlockCrsMatrix::const_little_block_type
Kokkos::View< const impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:199
Tpetra::BlockMultiVector::getBlockSize
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Definition: Tpetra_BlockMultiVector_decl.hpp:341
Tpetra::BlockCrsMatrix::description
std::string description() const
One-line description of this object.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3400
Tpetra::BlockMultiVector::getLocalBlock
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
Definition: Tpetra_BlockMultiVector_def.hpp:421
Tpetra::BlockCrsMatrix::getGlobalNumRows
global_size_t getGlobalNumRows() const
get the global number of block rows
Definition: Tpetra_BlockCrsMatrix_def.hpp:840
Tpetra::BlockCrsMatrix::getLocalDiagCopy
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
Definition: Tpetra_BlockCrsMatrix_def.hpp:1289
Tpetra::BlockCrsMatrix::leftScale
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3926
Tpetra::BlockCrsMatrix::getLocalDiagOffsets
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1060
Tpetra::BlockCrsMatrix::absMaxLocalValues
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Definition: Tpetra_BlockCrsMatrix_def.hpp:1376
Tpetra::SCAL
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Definition: Tpetra_BlockView.hpp:664
Tpetra::CrsGraph::getRangeMap
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
Definition: Tpetra_CrsGraph_def.hpp:1124
Tpetra::global_size_t
size_t global_size_t
Global size_t object.
Definition: Tpetra_ConfigDefs.hpp:109
Tpetra
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Tpetra::BlockCrsMatrix::describe
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3424
Tpetra::deep_copy
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Definition: Tpetra_MultiVector_decl.hpp:2557
Tpetra::BlockCrsMatrix::getGlobalMaxNumRowEntries
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries in any row over all processes in the matrix's communicator.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3765
Tpetra::BlockMultiVector
MultiVector for multiple degrees of freedom per mesh point.
Definition: Tpetra_BlockMultiVector_decl.hpp:147
Tpetra::BlockMultiVector::makePointMap
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Definition: Tpetra_BlockMultiVector_def.hpp:258
Tpetra::CrsGraph::getColMap
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Definition: Tpetra_CrsGraph_def.hpp:1108
Tpetra::BlockCrsMatrix
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
Definition: Tpetra_BlockCrsMatrix_decl.hpp:139
Tpetra_BlockView.hpp
Linear algebra kernels for small dense matrices and vectors.
Tpetra::BlockCrsMatrix::getRowMap
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:824
Tpetra::INSERT
@ INSERT
Insert new values that don't currently exist.
Definition: Tpetra_CombineMode.hpp:96
Tpetra::BlockCrsMatrix::getFrobeniusNorm
virtual typename ::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
Definition: Tpetra_BlockCrsMatrix_def.hpp:3956
Tpetra::CombineMode
CombineMode
Rule for combining data in an Import or Export.
Definition: Tpetra_CombineMode.hpp:94