41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP 42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP 45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" 46 #include "Teuchos_VerboseObject.hpp" 47 #include "Teuchos_Array.hpp" 49 #include "Tpetra_ConfigDefs.hpp" 50 #include "Tpetra_CrsMatrix.hpp" 52 #include "Tpetra_RowMatrixTransposer.hpp" 53 #include "Tpetra_ConfigDefs.hpp" 54 #include "Tpetra_Map.hpp" 55 #include "Tpetra_Export.hpp" 60 #include "Teuchos_FancyOStream.hpp" 71 namespace TripleMatrixMultiply{
79 template <
class Scalar,
91 bool call_FillComplete_on_result,
92 const std::string& label,
93 const Teuchos::RCP<Teuchos::ParameterList>& params)
98 typedef LocalOrdinal LO;
99 typedef GlobalOrdinal GO;
108 #ifdef HAVE_TPETRA_MMM_TIMINGS 109 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
110 using Teuchos::TimeMonitor;
111 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Setup"))));
114 const std::string prefix =
"TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
119 TEUCHOS_TEST_FOR_EXCEPTION(!R.
isFillComplete(), std::runtime_error, prefix <<
"Matrix R is not fill complete.");
120 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
121 TEUCHOS_TEST_FOR_EXCEPTION(!P.
isFillComplete(), std::runtime_error, prefix <<
"Matrix P is not fill complete.");
126 RCP<const crs_matrix_type> Rprime = null;
130 RCP<const crs_matrix_type> Aprime = null;
134 RCP<const crs_matrix_type> Pprime = null;
144 const bool newFlag = !Ac.
getGraph()->isLocallyIndexed() && !Ac.
getGraph()->isGloballyIndexed();
146 if (transposeR && &R != &P) {
147 transposer_type transposer(rcpFromRef (R));
148 Rprime = transposer.createTranspose();
150 Rprime = rcpFromRef(R);
154 transposer_type transposer(rcpFromRef (A));
155 Aprime = transposer.createTranspose();
157 Aprime = rcpFromRef(A);
161 transposer_type transposer(rcpFromRef (P));
162 Pprime = transposer.createTranspose();
164 Pprime = rcpFromRef(P);
177 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
178 prefix <<
"ERROR, inner dimensions of op(R) and op(A) " 179 "must match for matrix-matrix product. op(R) is " 180 << Rleft <<
"x" << Rright <<
", op(A) is "<< Aleft <<
"x" << Aright);
182 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
183 prefix <<
"ERROR, inner dimensions of op(A) and op(P) " 184 "must match for matrix-matrix product. op(A) is " 185 << Aleft <<
"x" << Aright <<
", op(P) is "<< Pleft <<
"x" << Pright);
191 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.
getGlobalNumRows(), std::runtime_error,
192 prefix <<
"ERROR, dimensions of result Ac must " 193 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.
getGlobalNumRows()
194 <<
" rows, should have at least " << Rleft << std::endl);
206 int numProcs = P.
getComm()->getSize();
210 crs_matrix_struct_type Rview;
211 crs_matrix_struct_type Aview;
212 crs_matrix_struct_type Pview;
214 RCP<const map_type> targetMap_R = Rprime->getRowMap();
215 RCP<const map_type> targetMap_A = Aprime->getRowMap();
216 RCP<const map_type> targetMap_P = Pprime->getRowMap();
218 #ifdef HAVE_TPETRA_MMM_TIMINGS 219 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All I&X"))));
225 RCP<const import_type> dummyImporter;
227 if (!(transposeR && &R == &P))
228 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter,
true, label, params);
230 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
235 targetMap_P = Aprime->getColMap();
238 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(),
false, label, params);
241 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
243 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
244 if (needs_final_export)
247 Actemp = rcp(&Ac,
false);
249 #ifdef HAVE_TPETRA_MMM_TIMINGS 250 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Multiply"))));
254 if (call_FillComplete_on_result && newFlag) {
255 if (transposeR && &R == &P)
256 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
258 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
259 }
else if (call_FillComplete_on_result) {
262 if (transposeR && &R == &P)
263 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
265 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
288 if (transposeR && &R == &P)
289 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
291 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
294 if (needs_final_export) {
295 #ifdef HAVE_TPETRA_MMM_TIMINGS 296 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP exportAndFillComplete"))));
298 Teuchos::ParameterList labelList;
299 labelList.set(
"Timer Label", label);
300 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
301 if(!params.is_null()) labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
302 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
303 Actemp->exportAndFillComplete(Acprime,
305 Acprime->getDomainMap(),
306 Acprime->getRangeMap(),
307 rcp(&labelList,
false));
310 #ifdef HAVE_TPETRA_MMM_STATISTICS 311 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(
" RAP MMM"));
322 template<
class CrsMatrixType>
323 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
324 size_t nnzPerRowA = 100, Pcols = 100;
325 if (A.getNodeNumEntries() > 0)
326 nnzPerRowA = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
327 if (P.getNodeNumEntries() > 0)
328 Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
329 return (
size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
335 template<
class Scalar,
339 void mult_R_A_P_newmatrix(
344 const std::string& label,
345 const Teuchos::RCP<Teuchos::ParameterList>& params)
347 using Teuchos::Array;
348 using Teuchos::ArrayRCP;
349 using Teuchos::ArrayView;
354 typedef LocalOrdinal LO;
355 typedef GlobalOrdinal GO;
361 #ifdef HAVE_TPETRA_MMM_TIMINGS 362 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
363 using Teuchos::TimeMonitor;
364 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
366 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
369 RCP<const import_type> Acimport;
370 RCP<const map_type> Accolmap;
371 RCP<const import_type> Pimport = Pview.
origMatrix->getGraph()->getImporter();
372 RCP<const import_type> Iimport = Pview.
importMatrix.is_null() ?
373 Teuchos::null : Pview.
importMatrix->getGraph()->getImporter();
380 Array<LO> Pcol2Accol(Pview.
colMap->getNodeNumElements()), PIcol2Accol;
387 for (
size_t i = 0; i < Pview.
colMap->getNodeNumElements(); i++)
388 Pcol2Accol[i] = Teuchos::as<LO>(i);
399 if (!Pimport.is_null() && !Iimport.is_null())
400 Acimport = Pimport->setUnion(*Iimport);
402 else if (!Pimport.is_null() && Iimport.is_null())
403 Acimport = Pimport->setUnion();
405 else if (Pimport.is_null() && !Iimport.is_null())
406 Acimport = Iimport->setUnion();
409 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
411 Accolmap = Acimport->getTargetMap();
416 TEUCHOS_TEST_FOR_EXCEPTION(!Acimport->getSourceMap()->isSameAs(*Pview.
origMatrix->getDomainMap()),
417 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
424 PIcol2Accol.resize(Pview.
importMatrix->getColMap()->getNodeNumElements());
425 ArrayView<const GO> Pgid = Pview.
origMatrix->getColMap()->getNodeElementList();
426 ArrayView<const GO> Igid = Pview.
importMatrix->getColMap()->getNodeElementList();
428 for (
size_t i = 0; i < Pview.
origMatrix->getColMap()->getNodeNumElements(); i++)
429 Pcol2Accol[i] = Accolmap->getLocalElement(Pgid[i]);
430 for (
size_t i = 0; i < Pview.
importMatrix->getColMap()->getNodeNumElements(); i++)
431 PIcol2Accol[i] = Accolmap->getLocalElement(Igid[i]);
456 Array<LO> Acol2PRow (Aview.
colMap->getNodeNumElements(), LO_INVALID);
457 Array<LO> Acol2PRowImport(Aview.
colMap->getNodeNumElements(), LO_INVALID);
459 for (LO i = Aview.
colMap->getMinLocalIndex(); i <= Aview.
colMap->getMaxLocalIndex(); i++) {
460 LO P_LID = Pview.
origMatrix->getRowMap()->getLocalElement(Aview.
colMap->getGlobalElement(i));
461 if (P_LID != LO_INVALID) {
462 Acol2PRow[i] = P_LID;
465 LO I_LID = Pview.
importMatrix->getRowMap()->getLocalElement(Aview.
colMap->getGlobalElement(i));
466 Acol2PRowImport[i] = I_LID;
472 KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
473 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
474 Acol2PRow, Acol2PRowImport, Pcol2Accol, PIcol2Accol,
475 Ac, Acimport, label, params);
479 template<
class InRowptrArrayType,
class InColindArrayType,
class InValsArrayType,
480 class OutRowptrType,
class OutColindType,
class OutValsType>
481 void copy_out_from_thread_memory(
const InRowptrArrayType & Inrowptr,
const InColindArrayType &Incolind,
const InValsArrayType & Invalues,
482 size_t m,
double thread_chunk,
483 OutRowptrType & row_mapC, OutColindType &entriesC, OutValsType & valuesC ) {
484 typedef OutRowptrType lno_view_t;
485 typedef OutColindType lno_nnz_view_t;
486 typedef OutValsType scalar_view_t;
487 typedef typename lno_view_t::execution_space execution_space;
488 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
491 size_t thread_max = Inrowptr.size();
493 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
495 lno_view_t thread_nnz_count(
"thread_nnz_counts", thread_max);
496 for(
size_t i = 0; i < thread_max; i++)
497 thread_nnz_count(i) = Inrowptr(i)(Inrowptr(i).extent(0) - 1);
500 c_nnz_size = thread_start_nnz(thread_max);
503 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
504 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
507 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
508 size_t my_thread_start = tid * thread_chunk;
509 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
510 size_t nnz_thread_start = thread_start_nnz(tid);
512 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
513 size_t ii = i - my_thread_start;
515 row_mapC(i) = nnz_thread_start + Inrowptr(tid)(ii);
517 row_mapC(m) = nnz_thread_start + Inrowptr(tid)(ii+1);
521 for(
size_t j = Inrowptr(tid)(ii); j<Inrowptr(tid)(ii+1); j++) {
522 entriesC(nnz_thread_start + j) = Incolind(tid)(j);
523 valuesC(nnz_thread_start + j) = Invalues(tid)(j);
532 template<
class Scalar,
539 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
540 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
541 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
542 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
545 const std::string& label,
546 const Teuchos::RCP<Teuchos::ParameterList>& params) {
547 #ifdef HAVE_TPETRA_MMM_TIMINGS 548 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
549 using Teuchos::TimeMonitor;
550 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
553 using Teuchos::Array;
554 using Teuchos::ArrayRCP;
555 using Teuchos::ArrayView;
560 typedef LocalOrdinal LO;
561 typedef GlobalOrdinal GO;
564 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
565 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
566 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
569 RCP<const map_type> Accolmap = Ac.
getColMap();
570 size_t m = Rview.
origMatrix->getNodeNumRows();
571 size_t n = Accolmap->getNodeNumElements();
574 ArrayRCP<const size_t> Rrowptr_RCP, Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
575 ArrayRCP<size_t> Acrowptr_RCP;
576 ArrayRCP<const LO> Rcolind_RCP, Acolind_RCP, Pcolind_RCP, Icolind_RCP;
577 ArrayRCP<LO> Accolind_RCP;
578 ArrayRCP<const Scalar> Rvals_RCP, Avals_RCP, Pvals_RCP, Ivals_RCP;
579 ArrayRCP<SC> Acvals_RCP;
586 Rview.
origMatrix->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
587 Aview.
origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
588 Pview.
origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
591 Pview.
importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
598 ArrayView<const size_t> Rrowptr, Arowptr, Prowptr, Irowptr;
599 ArrayView<const LO> Rcolind, Acolind, Pcolind, Icolind;
600 ArrayView<const SC> Rvals, Avals, Pvals, Ivals;
601 ArrayView<size_t> Acrowptr;
602 ArrayView<LO> Accolind;
603 ArrayView<SC> Acvals;
604 Rrowptr = Rrowptr_RCP(); Rcolind = Rcolind_RCP(); Rvals = Rvals_RCP();
605 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
606 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
608 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
619 Acrowptr_RCP.resize(m+1); Acrowptr = Acrowptr_RCP();
620 Accolind_RCP.resize(nnzAllocated); Accolind = Accolind_RCP();
621 Acvals_RCP.resize(nnzAllocated); Acvals = Acvals_RCP();
631 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
632 Array<size_t> ac_status(n, ST_INVALID);
642 size_t nnz = 0, nnz_old = 0;
643 for (
size_t i = 0; i < m; i++) {
649 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
651 const SC Rik = Rvals[kk];
655 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
657 const SC Akl = Avals[ll];
662 if (Acol2PRow[l] != LO_INVALID) {
669 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
672 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
674 LO Acj = Pcol2Accol[j];
677 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
678 #ifdef HAVE_TPETRA_DEBUG 680 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
682 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
685 ac_status[Acj] = nnz;
687 Acvals[nnz] = Rik*Akl*Plj;
690 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
700 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
701 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
703 LO Acj = PIcol2Accol[j];
706 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
707 #ifdef HAVE_TPETRA_DEBUG 709 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
711 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
714 ac_status[Acj] = nnz;
716 Acvals[nnz] = Rik*Akl*Plj;
719 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
726 if (nnz + n > nnzAllocated) {
728 Accolind_RCP.resize(nnzAllocated); Accolind = Accolind_RCP();
729 Acvals_RCP.resize(nnzAllocated); Acvals = Acvals_RCP();
737 Acvals_RCP.resize(nnz);
738 Accolind_RCP.resize(nnz);
740 #ifdef HAVE_TPETRA_MMM_TIMINGS 741 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
748 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
750 Ac.
setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
752 #ifdef HAVE_TPETRA_MMM_TIMINGS 753 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
764 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
765 labelList->set(
"Timer Label",label);
766 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
767 RCP<const Export<LO,GO,NO> > dummyExport;
776 #ifdef HAVE_TPETRA_INST_OPENMP 780 template<
class Scalar,
783 struct KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>
788 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
789 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
790 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
791 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
794 const std::string& label,
795 const Teuchos::RCP<Teuchos::ParameterList>& params) {
797 using Tpetra::MatrixMatrix::UnmanagedView;
798 #ifdef HAVE_TPETRA_MMM_TIMINGS 799 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
800 using Teuchos::TimeMonitor;
801 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
804 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
806 typedef LocalOrdinal LO;
807 typedef GlobalOrdinal GO;
811 typedef typename KCRS::StaticCrsGraphType graph_t;
812 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
813 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
814 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
815 typedef typename KCRS::values_type::non_const_type scalar_view_t;
816 typedef typename KCRS::device_type device_t;
817 typedef typename device_t::execution_space execution_space;
818 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
821 typedef UnmanagedView<lno_view_t> u_lno_view_t;
822 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
823 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
825 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
826 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
829 RCP<const map_type> Accolmap = Ac.
getColMap();
830 size_t m = Rview.
origMatrix->getNodeNumRows();
831 size_t n = Accolmap->getNodeNumElements();
834 const KCRS & Rmat = Rview.
origMatrix->getLocalMatrix();
835 const KCRS & Amat = Aview.
origMatrix->getLocalMatrix();
836 const KCRS & Pmat = Pview.
origMatrix->getLocalMatrix();
838 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
839 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
840 lno_nnz_view_t Icolind;
841 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
846 const KCRS& Imat = Pview.
importMatrix->getLocalMatrix();
847 Irowptr = Imat.graph.row_map;
848 Icolind = Imat.graph.entries;
860 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.
origMatrix, *Pview.
origMatrix) / m);
861 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
864 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
865 if(!params.is_null()) {
866 if(params->isParameter(
"openmp: ltg thread max"))
867 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
870 double thread_chunk = (double)(m) / thread_max;
880 Kokkos::View<u_lno_view_t*> tl_rowptr(
"top_rowptr", thread_max);
881 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind", thread_max);
882 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values", thread_max);
885 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
888 size_t my_thread_start = tid * thread_chunk;
889 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
890 size_t my_thread_m = my_thread_stop - my_thread_start;
892 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
894 std::vector<size_t> ac_status(n, INVALID);
897 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
898 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
899 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
902 size_t nnz = 0, nnz_old = 0;
904 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
905 Acrowptr(i - my_thread_start) = nnz;
907 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
909 const SC Rik = Rvals(kk);
913 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
915 const SC Akl = Avals(ll);
919 if (Acol2PRow[l] != LO_INVALID) {
926 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
929 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
931 LO Acj = Pcol2Accol[j];
934 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
935 #ifdef HAVE_TPETRA_DEBUG 937 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
939 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
942 ac_status[Acj] = nnz;
944 Acvals(nnz) = Rik*Akl*Plj;
947 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
957 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
958 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
960 LO Acj = PIcol2Accol[j];
963 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
964 #ifdef HAVE_TPETRA_DEBUG 966 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
968 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
971 ac_status[Acj] = nnz;
973 Acvals(nnz) = Rik*Akl*Plj;
976 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
983 if (nnz + n > nnzAllocated) {
985 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
986 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
990 Acrowptr(my_thread_m) = nnz;
991 tl_rowptr(tid) = Acrowptr;
992 tl_colind(tid) = Accolind;
993 tl_values(tid) = Acvals;
995 #ifdef HAVE_TPETRA_MMM_TIMINGS 996 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
999 lno_view_t rowmapAc(
"non_const_lnow_row", m + 1);
1000 lno_nnz_view_t entriesAc;
1001 scalar_view_t valuesAc;
1002 copy_out_from_thread_memory(tl_rowptr, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1004 for(
size_t i=0; i<thread_max; i++) {
1005 if(tl_rowptr(i).data()) free(tl_rowptr(i).data());
1006 if(tl_colind(i).data()) free(tl_colind(i).data());
1007 if(tl_values(i).data()) free(tl_values(i).data());
1010 #ifdef HAVE_TPETRA_MMM_TIMINGS 1011 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1015 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1019 #ifdef HAVE_TPETRA_MMM_TIMINGS 1020 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1031 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1032 labelList->set(
"Timer Label",label);
1033 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1034 RCP<const Export<LO,GO,NO> > dummyExport;
1048 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1049 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1050 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1051 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1054 const std::string& label,
1055 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1056 #ifdef HAVE_TPETRA_MMM_TIMINGS 1057 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1058 using Teuchos::TimeMonitor;
1059 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix OpenMP"))));
1063 using Tpetra::MatrixMatrix::UnmanagedView;
1065 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1067 typedef LocalOrdinal LO;
1068 typedef GlobalOrdinal GO;
1072 typedef typename KCRS::StaticCrsGraphType graph_t;
1073 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1074 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1075 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1076 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1077 typedef typename KCRS::device_type device_t;
1078 typedef typename device_t::execution_space execution_space;
1079 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1082 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1083 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1084 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1086 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1087 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1092 size_t n = Ac.
getRowMap()->getNodeNumElements();
1093 LO maxAccol = Ac.
getColMap()->getMaxLocalIndex();
1095 #ifdef HAVE_TPETRA_MMM_TIMINGS 1096 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1100 transposer_type transposer (Pview.
origMatrix,label+std::string(
"XP: "));
1101 RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(
new Teuchos::ParameterList);
1102 if (!params.is_null())
1103 transposeParams->set(
"compute global constants",
1104 params->get(
"compute global constants: temporaries",
1106 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1109 const KCRS & Rmat = Ptrans->getLocalMatrix();
1110 const KCRS & Amat = Aview.
origMatrix->getLocalMatrix();
1111 const KCRS & Pmat = Pview.
origMatrix->getLocalMatrix();
1113 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
1114 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
1115 lno_nnz_view_t Icolind;
1116 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1117 scalar_view_t Ivals;
1121 const KCRS & Imat = Pview.
importMatrix->getLocalMatrix();
1122 Irowptr = Imat.graph.row_map;
1123 Icolind = Imat.graph.entries;
1124 Ivals = Imat.values;
1127 #ifdef HAVE_TPETRA_MMM_TIMINGS 1128 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Alloc"))));
1133 size_t Acest_nnz_per_row = std::ceil(Acest_total_nnz / n);
1134 size_t nnzPerRowA = 100;
1135 if (Aview.
origMatrix->getNodeNumEntries() > 0)
1146 const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1149 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1150 if(!params.is_null()) {
1151 if(params->isParameter(
"openmp: ltg thread max"))
1152 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1155 double thread_chunk = (double)(n) / thread_max;
1158 #ifdef HAVE_TPETRA_MMM_TIMINGS 1159 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1170 Kokkos::View<u_lno_view_t*> tl_rowptr(
"top_rowptr", thread_max);
1171 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind", thread_max);
1172 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values", thread_max);
1174 Kokkos::parallel_for(
"MMM::PTAP::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1177 size_t my_thread_start = tid * thread_chunk;
1178 size_t my_thread_stop = tid == thread_max-1 ? n : (tid+1)*thread_chunk;
1179 size_t my_thread_n = my_thread_stop - my_thread_start;
1181 size_t nnzAllocated = (size_t) (my_thread_n * Acest_nnz_per_row + 100);
1183 std::vector<size_t> ac_status(maxAccol + 1, ST_INVALID);
1186 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_n+1)), my_thread_n + 1);
1187 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1188 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1191 size_t nnz = 0, nnz_old = 0;
1193 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1194 Acrowptr(i - my_thread_start) = nnz;
1196 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1198 const SC Rik = Rvals(kk);
1202 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1204 const SC Akl = Avals(ll);
1208 if (Acol2PRow[l] != LO_INVALID) {
1215 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1218 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1223 LO Acj = Pcol2Accol[j];
1225 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1226 #ifdef HAVE_TPETRA_DEBUG 1228 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1230 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.extent(0) << std::endl);
1233 ac_status[Acj] = nnz;
1234 Accolind(nnz) = Acj;
1235 Acvals(nnz) = Rik*Akl*Plj;
1238 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1248 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1249 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1254 LO Acj = PIcol2Accol[j];
1256 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1257 #ifdef HAVE_TPETRA_DEBUG 1259 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1261 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1264 ac_status[Acj] = nnz;
1265 Accolind(nnz) = Acj;
1266 Acvals(nnz) = Rik*Akl*Plj;
1269 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1276 if (nnz + std::max(5*nnzPerRowA, n) > nnzAllocated) {
1278 nnzAllocated = std::max(nnzAllocated, nnz + std::max(5*nnzPerRowA, n));
1279 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1280 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1284 tl_rowptr(tid) = Acrowptr;
1285 tl_colind(tid) = Accolind;
1286 tl_values(tid) = Acvals;
1287 Acrowptr(my_thread_n) = nnz;
1290 #ifdef HAVE_TPETRA_MMM_TIMINGS 1291 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP copy from thread local"))));
1293 lno_view_t rowmapAc(
"non_const_lnow_row", n + 1);
1294 lno_nnz_view_t entriesAc;
1295 scalar_view_t valuesAc;
1296 copy_out_from_thread_memory(tl_rowptr, tl_colind, tl_values, n, thread_chunk, rowmapAc, entriesAc, valuesAc);
1298 #ifdef HAVE_TPETRA_MMM_TIMINGS 1299 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1306 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1311 #ifdef HAVE_TPETRA_MMM_TIMINGS 1312 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1323 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1324 labelList->set(
"Timer Label",label);
1326 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1327 RCP<const Export<LO,GO,NO> > dummyExport;
1331 dummyExport, labelList);
1338 template<
class Scalar,
1340 class GlobalOrdinal,
1342 void mult_PT_A_P_newmatrix(
1346 const std::string& label,
1347 const Teuchos::RCP<Teuchos::ParameterList>& params)
1349 using Teuchos::Array;
1350 using Teuchos::ArrayRCP;
1351 using Teuchos::ArrayView;
1356 typedef LocalOrdinal LO;
1357 typedef GlobalOrdinal GO;
1363 #ifdef HAVE_TPETRA_MMM_TIMINGS 1364 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1365 using Teuchos::TimeMonitor;
1366 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP")))));
1369 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1372 RCP<const import_type> Acimport;
1373 RCP<const map_type> Accolmap;
1374 RCP<const import_type> Pimport = Pview.
origMatrix->getGraph()->getImporter();
1375 RCP<const import_type> Iimport = Pview.
importMatrix.is_null() ?
1376 Teuchos::null : Pview.
importMatrix->getGraph()->getImporter();
1383 Array<LO> Pcol2Accol(Pview.
colMap->getNodeNumElements()), PIcol2Accol;
1390 for (
size_t i = 0; i < Pview.
colMap->getNodeNumElements(); i++)
1391 Pcol2Accol[i] = Teuchos::as<LO>(i);
1402 if (!Pimport.is_null() && !Iimport.is_null())
1403 Acimport = Pimport->setUnion(*Iimport);
1405 else if (!Pimport.is_null() && Iimport.is_null())
1406 Acimport = Pimport->setUnion();
1408 else if (Pimport.is_null() && !Iimport.is_null())
1409 Acimport = Iimport->setUnion();
1412 throw std::runtime_error(
"TpetraExt::PTAP status of matrix importers is nonsensical");
1414 Accolmap = Acimport->getTargetMap();
1419 TEUCHOS_TEST_FOR_EXCEPTION(!Acimport->getSourceMap()->isSameAs(*Pview.
origMatrix->getDomainMap()),
1420 std::runtime_error,
"Tpetra::PTAP: Import setUnion messed with the DomainMap in an unfortunate way");
1427 PIcol2Accol.resize(Pview.
importMatrix->getColMap()->getNodeNumElements());
1428 ArrayView<const GO> Pgid = Pview.
origMatrix->getColMap()->getNodeElementList();
1429 ArrayView<const GO> Igid = Pview.
importMatrix->getColMap()->getNodeElementList();
1431 for (
size_t i = 0; i < Pview.
origMatrix->getColMap()->getNodeNumElements(); i++)
1432 Pcol2Accol[i] = Accolmap->getLocalElement(Pgid[i]);
1434 for (
size_t i = 0; i < Pview.
importMatrix->getColMap()->getNodeNumElements(); i++)
1435 PIcol2Accol[i] = Accolmap->getLocalElement(Igid[i]);
1463 Array<LO> Acol2PRow (Aview.
colMap->getNodeNumElements(), LO_INVALID);
1464 Array<LO> Acol2PRowImport(Aview.
colMap->getNodeNumElements(), LO_INVALID);
1465 for (LO A_LID = Aview.
colMap->getMinLocalIndex(); A_LID <= Aview.
colMap->getMaxLocalIndex(); A_LID++) {
1466 GO A_GID = Aview.
colMap->getGlobalElement(A_LID);
1467 LO P_LID = Pview.
origMatrix->getRowMap()->getLocalElement(A_GID);
1468 if (P_LID != LO_INVALID) {
1469 Acol2PRow[A_LID] = P_LID;
1471 LO I_LID = Pview.
importMatrix->getRowMap()->getLocalElement(A_GID);
1472 Acol2PRowImport[A_LID] = I_LID;
1478 KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper(Aview,
1495 template<
class Scalar,
1497 class GlobalOrdinal,
1501 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1502 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1503 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1504 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1507 const std::string& label,
1508 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1509 #ifdef HAVE_TPETRA_MMM_TIMINGS 1510 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1511 using Teuchos::TimeMonitor;
1512 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1515 using Teuchos::Array;
1516 using Teuchos::ArrayRCP;
1517 using Teuchos::ArrayView;
1522 typedef LocalOrdinal LO;
1523 typedef GlobalOrdinal GO;
1526 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1527 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1532 size_t n = Ac.
getRowMap()->getNodeNumElements();
1533 LO maxAccol = Ac.
getColMap()->getMaxLocalIndex();
1536 ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1537 ArrayRCP<size_t> Acrowptr_RCP;
1538 ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1539 ArrayRCP<LO> Accolind_RCP;
1540 ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1541 ArrayRCP<SC> Acvals_RCP;
1548 Aview.
origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1549 Pview.
origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1552 Pview.
importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1559 ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1560 ArrayView<const LO> Acolind, Pcolind, Icolind;
1561 ArrayView<const SC> Avals, Pvals, Ivals;
1562 ArrayView<size_t> Acrowptr;
1563 ArrayView<LO> Accolind;
1564 ArrayView<SC> Acvals;
1565 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1566 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1568 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1571 #ifdef HAVE_TPETRA_MMM_TIMINGS 1572 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1579 ArrayRCP<const size_t> Rrowptr_RCP;
1580 ArrayRCP<const LO> Rcolind_RCP;
1581 ArrayRCP<const Scalar> Rvals_RCP;
1582 ArrayView<const size_t> Rrowptr;
1583 ArrayView<const LO> Rcolind;
1584 ArrayView<const SC> Rvals;
1586 transposer_type transposer (Pview.
origMatrix,label+std::string(
"XP: "));
1587 RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(
new Teuchos::ParameterList);
1588 if (!params.is_null())
1589 transposeParams->set(
"compute global constants",
1590 params->get(
"compute global constants: temporaries",
1592 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1594 Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1595 Rrowptr = Rrowptr_RCP();
1596 Rcolind = Rcolind_RCP();
1597 Rvals = Rvals_RCP();
1599 #ifdef HAVE_TPETRA_MMM_TIMINGS 1600 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Alloc"))));
1605 size_t nnzPerRowA = 100;
1606 if (Aview.
origMatrix->getNodeNumEntries() > 0)
1608 Acrowptr_RCP.resize(n+1); Acrowptr = Acrowptr_RCP();
1609 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1610 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1628 const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1629 Array<size_t> ac_status(maxAccol+1, ST_INVALID);
1632 #ifdef HAVE_TPETRA_MMM_TIMINGS 1633 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1643 size_t nnz = 0, nnz_old = 0;
1646 for (
size_t i = 0; i < n; i++) {
1652 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1654 const SC Rik = Rvals[kk];
1658 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1660 const SC Akl = Avals[ll];
1664 if (Acol2PRow[l] != LO_INVALID) {
1671 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1674 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1679 LO Acj = Pcol2Accol[j];
1681 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1682 #ifdef HAVE_TPETRA_DEBUG 1684 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1686 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1689 ac_status[Acj] = nnz;
1690 Accolind[nnz] = Acj;
1691 Acvals[nnz] = Rik*Akl*Plj;
1694 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
1704 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1705 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1710 LO Acj = PIcol2Accol[j];
1712 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1713 #ifdef HAVE_TPETRA_DEBUG 1715 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1717 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1720 ac_status[Acj] = nnz;
1721 Accolind[nnz] = Acj;
1722 Acvals[nnz] = Rik*Akl*Plj;
1725 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
1732 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1734 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1735 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1736 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1744 Acvals_RCP.resize(nnz);
1745 Accolind_RCP.resize(nnz);
1747 #ifdef HAVE_TPETRA_MMM_TIMINGS 1748 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1755 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1758 Ac.
setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1760 #ifdef HAVE_TPETRA_MMM_TIMINGS 1761 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1772 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1773 labelList->set(
"Timer Label",label);
1775 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1776 RCP<const Export<LO,GO,NO> > dummyExport;
1780 dummyExport, labelList);
1789 template<
class Scalar,
1791 class GlobalOrdinal,
1795 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1796 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1797 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1798 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1801 const std::string& label,
1802 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1803 #ifdef HAVE_TPETRA_MMM_TIMINGS 1804 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1805 using Teuchos::TimeMonitor;
1806 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1809 using Teuchos::Array;
1810 using Teuchos::ArrayRCP;
1811 using Teuchos::ArrayView;
1816 typedef LocalOrdinal LO;
1817 typedef GlobalOrdinal GO;
1820 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1821 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1826 size_t n = Ac.
getRowMap()->getNodeNumElements();
1827 LO maxAccol = Ac.
getColMap()->getMaxLocalIndex();
1830 ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1831 ArrayRCP<size_t> Acrowptr_RCP;
1832 ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1833 ArrayRCP<LO> Accolind_RCP;
1834 ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1835 ArrayRCP<SC> Acvals_RCP;
1842 Aview.
origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1843 Pview.
origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1846 Pview.
importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1853 ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1854 ArrayView<const LO> Acolind, Pcolind, Icolind;
1855 ArrayView<const SC> Avals, Pvals, Ivals;
1856 ArrayView<size_t> Acrowptr;
1857 ArrayView<LO> Accolind;
1858 ArrayView<SC> Acvals;
1859 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1860 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1862 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1874 #ifdef HAVE_TPETRA_MMM_TIMINGS 1875 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1882 ArrayRCP<const size_t> Rrowptr_RCP;
1883 ArrayRCP<const LO> Rcolind_RCP;
1884 ArrayRCP<const Scalar> Rvals_RCP;
1885 ArrayView<const size_t> Rrowptr;
1886 ArrayView<const LO> Rcolind;
1887 ArrayView<const SC> Rvals;
1889 transposer_type transposer (Pview.
origMatrix,label+std::string(
"XP: "));
1890 RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(
new Teuchos::ParameterList);
1891 if (!params.is_null())
1892 transposeParams->set(
"compute global constants",
1893 params->get(
"compute global constants: temporaries",
1895 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1897 Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1898 Rrowptr = Rrowptr_RCP();
1899 Rcolind = Rcolind_RCP();
1900 Rvals = Rvals_RCP();
1905 #ifdef HAVE_TPETRA_MMM_TIMINGS 1906 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP graph"))));
1909 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1910 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1913 size_t nnzPerRowA = 100;
1914 if (Aview.
origMatrix->getNodeNumEntries() > 0)
1916 Acrowptr_RCP.resize(n+1);
1917 Acrowptr = Acrowptr_RCP();
1918 Accolind_RCP.resize(nnz_alloc);
1919 Accolind = Accolind_RCP();
1921 size_t nnz = 0, nnz_old = 0;
1922 for (
size_t i = 0; i < n; i++) {
1928 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1931 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1934 if (Acol2PRow[l] != LO_INVALID) {
1941 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1944 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1946 LO Acj = Pcol2Accol[j];
1948 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1950 ac_status[Acj] = nnz;
1951 Accolind[nnz] = Acj;
1962 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1963 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1965 LO Acj = PIcol2Accol[j];
1967 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1969 ac_status[Acj] = nnz;
1970 Accolind[nnz] = Acj;
1980 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1982 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1983 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1984 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1991 Accolind_RCP.resize(nnz);
1992 Accolind = Accolind_RCP();
1995 Acvals_RCP.resize(nnz, SC_ZERO);
1996 Acvals = Acvals_RCP();
2003 #ifdef HAVE_TPETRA_MMM_TIMINGS 2004 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
2008 for (
size_t k = 0; k < n; k++) {
2009 for (
size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
2011 const SC Pki = Pvals[ii];
2012 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
2014 const SC Akl = Avals[ll];
2017 if (Acol2PRow[l] != LO_INVALID) {
2024 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
2025 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
2027 LO Acj = Pcol2Accol[j];
2029 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
2030 if (Accolind[pp] == Acj)
2034 Acvals[pp] += Pki*Akl*Pvals[jj];
2043 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
2044 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
2046 LO Acj = PIcol2Accol[j];
2048 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
2049 if (Accolind[pp] == Acj)
2053 Acvals[pp] += Pki*Akl*Ivals[jj];
2061 #ifdef HAVE_TPETRA_MMM_TIMINGS 2062 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
2069 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
2072 Ac.
setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
2074 #ifdef HAVE_TPETRA_MMM_TIMINGS 2075 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
2086 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2087 labelList->set(
"Timer Label",label);
2089 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2090 RCP<const Export<LO,GO,NO> > dummyExport;
2094 dummyExport, labelList);
2108 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \ 2111 void TripleMatrixMultiply::MultiplyRAP( \ 2112 const CrsMatrix< SCALAR , LO , GO , NODE >& R, \ 2114 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \ 2116 const CrsMatrix< SCALAR , LO , GO , NODE >& P, \ 2118 CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \ 2119 bool call_FillComplete_on_result, \ 2120 const std::string & label, \ 2121 const Teuchos::RCP<Teuchos::ParameterList>& params); \ 2125 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
size_t global_size_t
Global size_t object.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMatrix
The imported matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Utility functions for packing and unpacking sparse matrix entries.
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
A parallel distribution of indices over processes.