Tpetra parallel linear algebra Version of the Day
TpetraExt_TripleMatrixMultiply_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
43
45#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
46#include "Teuchos_VerboseObject.hpp"
47#include "Teuchos_Array.hpp"
48#include "Tpetra_Util.hpp"
49#include "Tpetra_ConfigDefs.hpp"
50#include "Tpetra_CrsMatrix.hpp"
52#include "Tpetra_RowMatrixTransposer.hpp"
53#include "Tpetra_ConfigDefs.hpp"
54#include "Tpetra_Map.hpp"
55#include "Tpetra_Export.hpp"
58#include <algorithm>
59#include <cmath>
60#include "Teuchos_FancyOStream.hpp"
61// #include "KokkosSparse_spgemm.hpp"
62
63
71/*********************************************************************************************************/
72// Include the architecture-specific kernel partial specializations here
73// NOTE: This needs to be outside all namespaces
74#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
75#include "TpetraExt_MatrixMatrix_Cuda.hpp"
76#include "TpetraExt_MatrixMatrix_HIP.hpp"
77
78namespace Tpetra {
79
80 namespace TripleMatrixMultiply{
81
82 //
83 // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
84 // op(A) == A if transposeA is false,
85 // op(A) == A^T if transposeA is true,
86 // and similarly for op(R) and op(P).
87 //
88 template <class Scalar,
89 class LocalOrdinal,
90 class GlobalOrdinal,
91 class Node>
// Computes the sparse triple product Ac = op(R) * op(A) * op(P), where
// op(X) == X or X^T depending on the corresponding transpose flag.
//
// NOTE(review): the documentation extraction dropped original lines 92-99
// here, which declare the function name (MultiplyRAP) and the CrsMatrix
// reference parameters R, A, P and the result Ac -- confirm against the
// repository source before editing this signature.
//
// Remaining parameters:
//   transposeR/A/P             - if true, use the transpose of that matrix.
//   call_FillComplete_on_result- if true, the result Ac is fillComplete'd
//                                by the kernels invoked below.
//   label                      - prefix used for timer names (see
//                                HAVE_TPETRA_MMM_TIMINGS blocks).
//   params                     - optional tuning parameters, forwarded to the
//                                kernels and to exportAndFillComplete.
94 bool transposeR,
96 bool transposeA,
98 bool transposeP,
100 bool call_FillComplete_on_result,
101 const std::string& label,
102 const Teuchos::RCP<Teuchos::ParameterList>& params)
103 {
104 using Teuchos::null;
105 using Teuchos::RCP;
106 typedef Scalar SC;
107 typedef LocalOrdinal LO;
108 typedef GlobalOrdinal GO;
109 typedef Node NO;
110 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
111 typedef Import<LO,GO,NO> import_type;
112 typedef Export<LO,GO,NO> export_type;
113 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
114 typedef Map<LO,GO,NO> map_type;
115 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
116
117#ifdef HAVE_TPETRA_MMM_TIMINGS
118 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
119 using Teuchos::TimeMonitor;
120 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
121#endif
122
123 const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
124
125 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
126
127 // The input matrices R, A and P must both be fillComplete.
128 TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
129 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
130 TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
131
132 // If transposeR is true, then Rprime will be the transpose of R
133 // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
134 // will just be a pointer to R.
135 RCP<const crs_matrix_type> Rprime = null;
136 // If transposeA is true, then Aprime will be the transpose of A
137 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
138 // will just be a pointer to A.
139 RCP<const crs_matrix_type> Aprime = null;
140 // If transposeP is true, then Pprime will be the transpose of P
141 // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
142 // will just be a pointer to P.
143 RCP<const crs_matrix_type> Pprime = null;
144
145 // Is this a "clean" matrix?
146 //
147 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
148 // locally nor globally indexed, then it was empty. I don't like
149 // this, because the most straightforward implementation presumes
150 // lazy allocation of indices. However, historical precedent
151 // demands that we keep around this predicate as a way to test
152 // whether the matrix is empty.
153 const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
154
155 using Teuchos::ParameterList;
156 RCP<ParameterList> transposeParams (new ParameterList);
157 transposeParams->set ("sort", false);
158
// Special case: when R and P are the same object and transposeR is set, the
// product is P^T * A * P; the explicit transpose of R is skipped here and the
// dedicated PTAP kernels are dispatched below instead.
159 if (transposeR && &R != &P) {
160 transposer_type transposer(rcpFromRef (R));
161 Rprime = transposer.createTranspose (transposeParams);
162 } else {
163 Rprime = rcpFromRef(R);
164 }
165
166 if (transposeA) {
167 transposer_type transposer(rcpFromRef (A));
168 Aprime = transposer.createTranspose (transposeParams);
169 } else {
170 Aprime = rcpFromRef(A);
171 }
172
173 if (transposeP) {
174 transposer_type transposer(rcpFromRef (P));
175 Pprime = transposer.createTranspose (transposeParams);
176 } else {
177 Pprime = rcpFromRef(P);
178 }
179
180 // Check size compatibility
// Xleft/Xright are the global dimensions of op(X): the transpose flag swaps
// the row count with the domain-map size.
181 global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
182 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
183 global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
184 global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
185 global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
186 global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
187 global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
188 global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
189 global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
190 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
191 prefix << "ERROR, inner dimensions of op(R) and op(A) "
192 "must match for matrix-matrix product. op(R) is "
193 << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);
194
195 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
196 prefix << "ERROR, inner dimensions of op(A) and op(P) "
197 "must match for matrix-matrix product. op(A) is "
198 << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
199
200 // The result matrix Ac must at least have a row-map that reflects the correct
201 // row-size. Don't check the number of columns because rectangular matrices
202 // which were constructed with only one map can still end up having the
203 // correct capacity and dimensions when filled.
204 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
205 prefix << "ERROR, dimensions of result Ac must "
206 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
207 << " rows, should have at least " << Rleft << std::endl);
208
209 // It doesn't matter whether Ac is already Filled or not. If it is already
210 // Filled, it must have space allocated for the positions that will be
211 // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
212 // we'll error out later when trying to store result values.
213
214 // CGB: However, matrix must be in active-fill
215 TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );
216
217 // We're going to need to import remotely-owned sections of P if
218 // more than one processor is performing this run, depending on the scenario.
219 int numProcs = P.getComm()->getSize();
220
221 // Declare a couple of structs that will be used to hold views of the data
222 // of R, A and P, to be used for fast access during the matrix-multiplication.
223 crs_matrix_struct_type Rview;
224 crs_matrix_struct_type Aview;
225 crs_matrix_struct_type Pview;
226
227 RCP<const map_type> targetMap_R = Rprime->getRowMap();
228 RCP<const map_type> targetMap_A = Aprime->getRowMap();
229 RCP<const map_type> targetMap_P = Pprime->getRowMap();
230
231#ifdef HAVE_TPETRA_MMM_TIMINGS
232 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
233#endif
234
235 // Now import any needed remote rows and populate the Aview struct
236 // NOTE: We assert that an import isn't needed --- since we do the transpose
237 // above to handle that.
238 RCP<const import_type> dummyImporter;
239
// Rview is only needed when R is distinct from P (otherwise the PTAP kernels
// work from Pview alone).
240 if (!(transposeR && &R == &P))
241 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
242
243 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
244
245 // We will also need local access to all rows of P that correspond to the
246 // column-map of op(A).
247 if (numProcs > 1)
248 targetMap_P = Aprime->getColMap();
249
250 // Import any needed remote rows and populate the Pview struct.
251 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
252
253
// If P has off-rank rows (its graph carries an Importer), the product is
// accumulated into a temporary matrix Actemp whose row map is P's column map,
// and the result is exported into Ac afterwards. Otherwise Actemp aliases Ac
// directly (non-owning RCP so Ac is not deallocated).
254 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
255
256 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
257 if (needs_final_export)
258 Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
259 else
260 Actemp = rcp(&Ac,false);// don't allow deallocation
261
262#ifdef HAVE_TPETRA_MMM_TIMINGS
263 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
264#endif
265
266 // Call the appropriate method to perform the actual multiplication.
// Dispatch: newmatrix kernels when Ac is still "clean" (no indices yet),
// reuse kernels when Ac already has its graph.
267 if (call_FillComplete_on_result && newFlag) {
268 if (transposeR && &R == &P)
269 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
270 else
271 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
272 } else if (call_FillComplete_on_result) {
273 if (transposeR && &R == &P)
274 MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
275 else
276 MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
277 } else {
278 // mfh 27 Sep 2016: Is this the "slow" case? This
279 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
280 // thread-parallel inserts, but that may take some effort.
281 // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
282
283 // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
284
285 // #ifdef HAVE_TPETRA_MMM_TIMINGS
286 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
287 // #endif
288 // if (call_FillComplete_on_result) {
289 // // We'll call FillComplete on the C matrix before we exit, and give it a
290 // // domain-map and a range-map.
291 // // The domain-map will be the domain-map of B, unless
292 // // op(B)==transpose(B), in which case the range-map of B will be used.
293 // // The range-map will be the range-map of A, unless op(A)==transpose(A),
294 // // in which case the domain-map of A will be used.
295 // if (!C.isFillComplete())
296 // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
297 // }
298 // Not implemented
// NOTE(review): the call_FillComplete_on_result == false path is not
// implemented; it falls through to the newmatrix kernels, so the result is
// fill-completed regardless of the caller's request.
299 if (transposeR && &R == &P)
300 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
301 else
302 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
303 }
304
// Export the temporary product back onto Ac's distribution and fillComplete
// it. The "matrixmatrix: kernel params" sublist forwards TAFC tuning knobs
// (core-count threshold, allreduce override) read from the caller's params.
305 if (needs_final_export) {
306#ifdef HAVE_TPETRA_MMM_TIMINGS
307 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
308#endif
309 Teuchos::ParameterList labelList;
310 labelList.set("Timer Label", label);
311 Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
312
313 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
314 bool isMM = true;
315 bool overrideAllreduce = false;
316 int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
317 if(!params.is_null()) {
318 Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
319 mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
// The effective core count is the minimum of the sublist value and the
// top-level value, each defaulting to the Behavior setting.
320 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
321 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
322 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
323 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
324 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
325
326 labelList.set("compute global constants",params->get("compute global constants",true));
327 }
328 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
329
330 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
331 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
332 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
333
// The Export is the reverse of P's Importer: it moves the temporary rows
// (living on P's column map) back to Ac's row distribution.
334 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
335 Actemp->exportAndFillComplete(Acprime,
336 exporter,
337 Acprime->getDomainMap(),
338 Acprime->getRangeMap(),
339 rcp(&labelList,false));
340
341 }
342#ifdef HAVE_TPETRA_MMM_STATISTICS
343 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
344#endif
345
346 }
347
348
349 } //End namespace TripleMatrixMultiply
350
351 namespace MMdetails{
352
353 // Kernel method for computing the local portion of Ac = R*A*P
// Kernel driver for the local portion of Ac = R*A*P when Ac has no graph yet
// ("newmatrix" mode). Builds Ac's column map and Import object, precomputes
// the local-index translation tables the kernels need, then dispatches to the
// architecture-specific KernelWrappers3 implementation.
354 template<class Scalar,
355 class LocalOrdinal,
356 class GlobalOrdinal,
357 class Node>
358 void mult_R_A_P_newmatrix(
359 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
360 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
361 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
362 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
363 const std::string& label,
364 const Teuchos::RCP<Teuchos::ParameterList>& params)
365 {
366 using Teuchos::Array;
367 using Teuchos::ArrayRCP;
368 using Teuchos::ArrayView;
369 using Teuchos::RCP;
370 using Teuchos::rcp;
371
372 //typedef Scalar SC; // unused
373 typedef LocalOrdinal LO;
374 typedef GlobalOrdinal GO;
375 typedef Node NO;
376
377 typedef Import<LO,GO,NO> import_type;
378 typedef Map<LO,GO,NO> map_type;
379
380 // Kokkos typedefs
381 typedef typename map_type::local_map_type local_map_type;
// NOTE(review): the doc extraction dropped original line 382 here, which
// defines KCRS (presumably the CrsMatrix's local Kokkos sparse-matrix type
// that StaticCrsGraphType below is taken from) -- confirm against the
// repository source.
383 typedef typename KCRS::StaticCrsGraphType graph_t;
384 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
385 typedef typename NO::execution_space execution_space;
386 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
387 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
388
389#ifdef HAVE_TPETRA_MMM_TIMINGS
390 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
391 using Teuchos::TimeMonitor;
392 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
393#endif
394 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
395
396 // Build the final importer / column map, hash table lookups for Ac
397 RCP<const import_type> Cimport;
398 RCP<const map_type> Ccolmap;
399 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
400 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
// Device-side (local) copies of the maps, usable inside KOKKOS_LAMBDAs.
// The I* variants exist only when Pview carries an imported remote part.
401 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
402 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
403 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
404 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
405 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
406
407
408 // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
409 // indices of B, to local column indices of Ac. (B and Ac have the
410 // same number of columns.) The kernel uses this, instead of
411 // copying the entire input matrix B and converting its column
412 // indices to those of C.
413 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
414
415 if (Pview.importMatrix.is_null()) {
416 // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
417 Cimport = Pimport;
418 Ccolmap = Pview.colMap;
419 const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
420 // Pcol2Ccol is trivial
// Identity mapping, filled on device.
421 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
422 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
423 KOKKOS_LAMBDA(const LO i) {
424 Pcol2Ccol(i) = i;
425 });
426 }
427 else {
428 // mfh 27 Sep 2016: P has "remotes," so we need to build the
429 // column Map of C, as well as C's Import object (from its domain
430 // Map to its column Map). C's column Map is the union of the
431 // column Maps of (the local part of) P, and the "remote" part of
432 // P. Ditto for the Import. We have optimized this "setUnion"
433 // operation on Import objects and Maps.
434
435 // Choose the right variant of setUnion
436 if (!Pimport.is_null() && !Iimport.is_null()) {
437 Cimport = Pimport->setUnion(*Iimport);
438 }
439 else if (!Pimport.is_null() && Iimport.is_null()) {
440 Cimport = Pimport->setUnion();
441 }
442 else if (Pimport.is_null() && !Iimport.is_null()) {
443 Cimport = Iimport->setUnion();
444 }
445 else {
446 throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
447 }
448 Ccolmap = Cimport->getTargetMap();
449
450 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
451 // in general. We should get rid of it in order to reduce
452 // communication costs of sparse matrix-matrix multiply.
453 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
454 std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
455
456 // NOTE: This is not efficient and should be folded into setUnion
457 //
458 // mfh 27 Sep 2016: What the above comment means, is that the
459 // setUnion operation on Import objects could also compute these
460 // local index - to - local index look-up tables.
// Translate each local column of P (and of the imported remote part) to
// its local index in C's column map, via the shared global index.
461 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
462 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
463 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
464 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
465 });
466 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
467 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
468 });
469
470 }
471
472 // Replace the column map
473 //
474 // mfh 27 Sep 2016: We do this because C was originally created
475 // without a column Map. Now we have its column Map.
476 Ac.replaceColMap(Ccolmap);
477
478 // mfh 27 Sep 2016: Construct tables that map from local column
479 // indices of A, to local row indices of either B_local (the locally
480 // owned part of B), or B_remote (the "imported" remote part of B).
481 //
482 // For column index Aik in row i of A, if the corresponding row of B
483 // exists in the local part of B ("orig") (which I'll call B_local),
484 // then targetMapToOrigRow[Aik] is the local index of that row of B.
485 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
486 //
487 // For column index Aik in row i of A, if the corresponding row of B
488 // exists in the remote part of B ("Import") (which I'll call
489 // B_remote), then targetMapToImportRow[Aik] is the local index of
490 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
491 // (a flag value).
492
493 // Run through all the hash table lookups once and for all
494 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
495 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
// For each local column i of A: exactly one of the two tables gets a valid
// local row index of P (local or imported part); the other gets LO_INVALID.
496 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
497 GO aidx = Acolmap_local.getGlobalElement(i);
498 LO P_LID = Prowmap_local.getLocalElement(aidx);
499 if (P_LID != LO_INVALID) {
500 targetMapToOrigRow(i) = P_LID;
501 targetMapToImportRow(i) = LO_INVALID;
502 } else {
503 LO I_LID = Irowmap_local.getLocalElement(aidx);
504 targetMapToOrigRow(i) = LO_INVALID;
505 targetMapToImportRow(i) = I_LID;
506 }
507 });
508
509 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
510 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
511 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
512 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
513 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
514 Ac, Cimport, label, params);
515 }
516
517
518 // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
// Kernel driver for the local portion of Ac = R*A*P when Ac already has its
// graph, column map and Import object from a prior call ("reuse" mode).
// Unlike the newmatrix variant, nothing is built here: the existing column
// map of Ac is used, and only the local-index translation tables are
// recomputed before dispatching to KernelWrappers3.
519 template<class Scalar,
520 class LocalOrdinal,
521 class GlobalOrdinal,
522 class Node>
523 void mult_R_A_P_reuse(
524 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
525 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
526 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
527 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
528 const std::string& label,
529 const Teuchos::RCP<Teuchos::ParameterList>& params)
530 {
531 using Teuchos::Array;
532 using Teuchos::ArrayRCP;
533 using Teuchos::ArrayView;
534 using Teuchos::RCP;
535 using Teuchos::rcp;
536
537 //typedef Scalar SC; // unused
538 typedef LocalOrdinal LO;
539 typedef GlobalOrdinal GO;
540 typedef Node NO;
541
542 typedef Import<LO,GO,NO> import_type;
543 typedef Map<LO,GO,NO> map_type;
544
545 // Kokkos typedefs
546 typedef typename map_type::local_map_type local_map_type;
// NOTE(review): the doc extraction dropped original line 547 here, which
// defines KCRS (presumably the CrsMatrix's local Kokkos sparse-matrix type
// that StaticCrsGraphType below is taken from) -- confirm against the
// repository source.
548 typedef typename KCRS::StaticCrsGraphType graph_t;
549 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
550 typedef typename NO::execution_space execution_space;
551 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
552 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
553
554#ifdef HAVE_TPETRA_MMM_TIMINGS
555 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
556 using Teuchos::TimeMonitor;
557 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
558#endif
559 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
560
// In reuse mode, Ac's importer and column map already exist -- take them
// from the matrix rather than constructing them.
561 // Build the final importer / column map, hash table lookups for Ac
562 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
563 RCP<const map_type> Ccolmap = Ac.getColMap();
564 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
565 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
// Device-side (local) copies of the maps, usable inside KOKKOS_LAMBDAs.
// The I* variants exist only when Pview carries an imported remote part.
566 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
567 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
568 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
569 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
570 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
571 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
572
573 // Build the final importer / column map, hash table lookups for C
574 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
575 {
576 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
577 // So, column map of C may be a strict subset of the column map of B
578 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
579 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
580 });
581
582 if (!Pview.importMatrix.is_null()) {
583 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
584 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
585
// Same translation for the imported remote part of P.
586 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
587 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
588 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
589 });
590 }
591 }
592
593 // Run through all the hash table lookups once and for all
// For each local column i of A: exactly one of the two tables gets a valid
// local row index of P (local or imported part); the other gets LO_INVALID.
594 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
595 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
596 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
597 GO aidx = Acolmap_local.getGlobalElement(i);
598 LO B_LID = Prowmap_local.getLocalElement(aidx);
599 if (B_LID != LO_INVALID) {
600 targetMapToOrigRow(i) = B_LID;
601 targetMapToImportRow(i) = LO_INVALID;
602 } else {
603 LO I_LID = Irowmap_local.getLocalElement(aidx);
604 targetMapToOrigRow(i) = LO_INVALID;
605 targetMapToImportRow(i) = I_LID;
606
607 }
608 });
609
610 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
611 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
612 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
613 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
614 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
615 Ac, Cimport, label, params);
616 }
617
618
619 // Kernel method for computing the local portion of Ac = R*A*P
620 template<class Scalar,
621 class LocalOrdinal,
622 class GlobalOrdinal,
623 class Node>
624 void mult_PT_A_P_newmatrix(
625 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
626 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
627 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
628 const std::string& label,
629 const Teuchos::RCP<Teuchos::ParameterList>& params)
630 {
631 using Teuchos::Array;
632 using Teuchos::ArrayRCP;
633 using Teuchos::ArrayView;
634 using Teuchos::RCP;
635 using Teuchos::rcp;
636
637 //typedef Scalar SC; // unused
638 typedef LocalOrdinal LO;
639 typedef GlobalOrdinal GO;
640 typedef Node NO;
641
642 typedef Import<LO,GO,NO> import_type;
643 typedef Map<LO,GO,NO> map_type;
644
645 // Kokkos typedefs
646 typedef typename map_type::local_map_type local_map_type;
648 typedef typename KCRS::StaticCrsGraphType graph_t;
649 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
650 typedef typename NO::execution_space execution_space;
651 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
652 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
653
654#ifdef HAVE_TPETRA_MMM_TIMINGS
655 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
656 using Teuchos::TimeMonitor;
657 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
658#endif
659 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
660
661 // Build the final importer / column map, hash table lookups for Ac
662 RCP<const import_type> Cimport;
663 RCP<const map_type> Ccolmap;
664 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
665 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
666 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
667 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
668 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
669 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
670 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
671
672
673 // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
674 // indices of B, to local column indices of Ac. (B and Ac have the
675 // same number of columns.) The kernel uses this, instead of
676 // copying the entire input matrix B and converting its column
677 // indices to those of C.
678 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
679
680 if (Pview.importMatrix.is_null()) {
681 // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
682 Cimport = Pimport;
683 Ccolmap = Pview.colMap;
684 const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
685 // Pcol2Ccol is trivial
686 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
687 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
688 KOKKOS_LAMBDA(const LO i) {
689 Pcol2Ccol(i) = i;
690 });
691 }
692 else {
693 // mfh 27 Sep 2016: P has "remotes," so we need to build the
694 // column Map of C, as well as C's Import object (from its domain
695 // Map to its column Map). C's column Map is the union of the
696 // column Maps of (the local part of) P, and the "remote" part of
697 // P. Ditto for the Import. We have optimized this "setUnion"
698 // operation on Import objects and Maps.
699
700 // Choose the right variant of setUnion
701 if (!Pimport.is_null() && !Iimport.is_null()) {
702 Cimport = Pimport->setUnion(*Iimport);
703 }
704 else if (!Pimport.is_null() && Iimport.is_null()) {
705 Cimport = Pimport->setUnion();
706 }
707 else if (Pimport.is_null() && !Iimport.is_null()) {
708 Cimport = Iimport->setUnion();
709 }
710 else {
711 throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
712 }
713 Ccolmap = Cimport->getTargetMap();
714
715 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
716 // in general. We should get rid of it in order to reduce
717 // communication costs of sparse matrix-matrix multiply.
718 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
719 std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
720
721 // NOTE: This is not efficient and should be folded into setUnion
722 //
723 // mfh 27 Sep 2016: What the above comment means, is that the
724 // setUnion operation on Import objects could also compute these
725 // local index - to - local index look-up tables.
726 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
727 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
728 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
729 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
730 });
731 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
732 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
733 });
734
735 }
736
737 // Replace the column map
738 //
739 // mfh 27 Sep 2016: We do this because C was originally created
740 // without a column Map. Now we have its column Map.
741 Ac.replaceColMap(Ccolmap);
742
743 // mfh 27 Sep 2016: Construct tables that map from local column
744 // indices of A, to local row indices of either B_local (the locally
745 // owned part of B), or B_remote (the "imported" remote part of B).
746 //
747 // For column index Aik in row i of A, if the corresponding row of B
748 // exists in the local part of B ("orig") (which I'll call B_local),
749 // then targetMapToOrigRow[Aik] is the local index of that row of B.
750 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
751 //
752 // For column index Aik in row i of A, if the corresponding row of B
753 // exists in the remote part of B ("Import") (which I'll call
754 // B_remote), then targetMapToImportRow[Aik] is the local index of
755 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
756 // (a flag value).
757
758 // Run through all the hash table lookups once and for all
759 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
760 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
761
762 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
763 GO aidx = Acolmap_local.getGlobalElement(i);
764 LO P_LID = Prowmap_local.getLocalElement(aidx);
765 if (P_LID != LO_INVALID) {
766 targetMapToOrigRow(i) = P_LID;
767 targetMapToImportRow(i) = LO_INVALID;
768 } else {
769 LO I_LID = Irowmap_local.getLocalElement(aidx);
770 targetMapToOrigRow(i) = LO_INVALID;
771 targetMapToImportRow(i) = I_LID;
772 }
773 });
774
775 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
776 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
777 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
778 mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
779 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
780 Ac, Cimport, label, params);
781 }
782
783 // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
784 template<class Scalar,
785 class LocalOrdinal,
786 class GlobalOrdinal,
787 class Node>
788 void mult_PT_A_P_reuse(
789 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
790 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
791 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
792 const std::string& label,
793 const Teuchos::RCP<Teuchos::ParameterList>& params)
794 {
795 using Teuchos::Array;
796 using Teuchos::ArrayRCP;
797 using Teuchos::ArrayView;
798 using Teuchos::RCP;
799 using Teuchos::rcp;
800
801 //typedef Scalar SC; // unused
802 typedef LocalOrdinal LO;
803 typedef GlobalOrdinal GO;
804 typedef Node NO;
805
806 typedef Import<LO,GO,NO> import_type;
807 typedef Map<LO,GO,NO> map_type;
808
809 // Kokkos typedefs
810 typedef typename map_type::local_map_type local_map_type;
812 typedef typename KCRS::StaticCrsGraphType graph_t;
813 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
814 typedef typename NO::execution_space execution_space;
815 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
816 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
817
818#ifdef HAVE_TPETRA_MMM_TIMINGS
819 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
820 using Teuchos::TimeMonitor;
821 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
822#endif
823 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
824
825 // Build the final importer / column map, hash table lookups for Ac
826 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
827 RCP<const map_type> Ccolmap = Ac.getColMap();
828 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
829 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
830 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
831 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
832 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
833 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
834 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
835 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
836
837 // Build the final importer / column map, hash table lookups for C
838 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
839 {
840 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
841 // So, column map of C may be a strict subset of the column map of B
842 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
843 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
844 });
845
846 if (!Pview.importMatrix.is_null()) {
847 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
848 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
849
850 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
851 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
852 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
853 });
854 }
855 }
856
857 // Run through all the hash table lookups once and for all
858 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
859 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
860 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
861 GO aidx = Acolmap_local.getGlobalElement(i);
862 LO B_LID = Prowmap_local.getLocalElement(aidx);
863 if (B_LID != LO_INVALID) {
864 targetMapToOrigRow(i) = B_LID;
865 targetMapToImportRow(i) = LO_INVALID;
866 } else {
867 LO I_LID = Irowmap_local.getLocalElement(aidx);
868 targetMapToOrigRow(i) = LO_INVALID;
869 targetMapToImportRow(i) = I_LID;
870
871 }
872 });
873
874 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
875 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
876 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
877 mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
878 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
879 Ac, Cimport, label, params);
880 }
881
882
883 /*********************************************************************************************************/
884 // RAP NewMatrix Kernel wrappers (Default non-threaded version)
885 // Computes R * A * P -> Ac using classic Gustavson approach
886 template<class Scalar,
887 class LocalOrdinal,
888 class GlobalOrdinal,
889 class Node,
890 class LocalOrdinalViewType>
891 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
892 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
893 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
894 const LocalOrdinalViewType & Acol2Prow_dev,
895 const LocalOrdinalViewType & Acol2PIrow_dev,
896 const LocalOrdinalViewType & Pcol2Accol_dev,
897 const LocalOrdinalViewType & PIcol2Accol_dev,
898 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
899 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
900 const std::string& label,
901 const Teuchos::RCP<Teuchos::ParameterList>& params) {
902#ifdef HAVE_TPETRA_MMM_TIMINGS
903 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
904 using Teuchos::TimeMonitor;
905 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
906#endif
907
908 using Teuchos::Array;
909 using Teuchos::ArrayRCP;
910 using Teuchos::ArrayView;
911 using Teuchos::RCP;
912 using Teuchos::rcp;
913
914 // Lots and lots of typedefs
916 typedef typename KCRS::StaticCrsGraphType graph_t;
917 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
918 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
919 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
920 typedef typename KCRS::values_type::non_const_type scalar_view_t;
921
922 typedef Scalar SC;
923 typedef LocalOrdinal LO;
924 typedef GlobalOrdinal GO;
925 typedef Node NO;
926 typedef Map<LO,GO,NO> map_type;
927 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
928 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
929 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
930
931 // Sizes
932 RCP<const map_type> Accolmap = Ac.getColMap();
933 size_t m = Rview.origMatrix->getNodeNumRows();
934 size_t n = Accolmap->getNodeNumElements();
935 size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
936
937 // Routine runs on host; have to put arguments on host, too
938 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
939 Acol2Prow_dev);
940 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
941 Acol2PIrow_dev);
942 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
943 Pcol2Accol_dev);
944 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
945 PIcol2Accol_dev);
946
947 // Grab the Kokkos::SparseCrsMatrices & inner stuff
948 const auto Amat = Aview.origMatrix->getLocalMatrixHost();
949 const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
950 const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
951
952 auto Arowptr = Amat.graph.row_map;
953 auto Prowptr = Pmat.graph.row_map;
954 auto Rrowptr = Rmat.graph.row_map;
955 const auto Acolind = Amat.graph.entries;
956 const auto Pcolind = Pmat.graph.entries;
957 const auto Rcolind = Rmat.graph.entries;
958 const auto Avals = Amat.values;
959 const auto Pvals = Pmat.values;
960 const auto Rvals = Rmat.values;
961
962 typename c_lno_view_t::HostMirror::const_type Irowptr;
963 typename lno_nnz_view_t::HostMirror Icolind;
964 typename scalar_view_t::HostMirror Ivals;
965 if(!Pview.importMatrix.is_null()) {
966 auto lclP = Pview.importMatrix->getLocalMatrixHost();
967 Irowptr = lclP.graph.row_map;
968 Icolind = lclP.graph.entries;
969 Ivals = lclP.values;
970 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
971 }
972
973#ifdef HAVE_TPETRA_MMM_TIMINGS
974 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
975#endif
976
977 // Classic csr assembly (low memory edition)
978 //
979 // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
980 // The method loops over rows of R, and may resize after processing
981 // each row. Chris Siefert says that this reflects experience in
982 // ML; for the non-threaded case, ML found it faster to spend less
983 // effort on estimation and risk an occasional reallocation.
984 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
985 typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
986 typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
987 typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
988
989 // mfh 27 Sep 2016: The ac_status array is an implementation detail
990 // of the local sparse matrix-matrix multiply routine.
991
992 // The status array will contain the index into colind where this entry was last deposited.
993 // ac_status[i] < nnz - not in the row yet
994 // ac_status[i] >= nnz - this is the entry where you can find the data
995 // We start with this filled with INVALID's indicating that there are no entries yet.
996 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
997 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
998 Array<size_t> ac_status(n, ST_INVALID);
999
1000 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1001 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1002 //
1003 // For column index Aik in row i of A, Acol2Prow[Aik] tells
1004 // you whether the corresponding row of P belongs to P_local
1005 // ("orig") or P_remote ("Import").
1006
1007 // For each row of R
1008 size_t nnz = 0, nnz_old = 0;
1009 for (size_t i = 0; i < m; i++) {
1010 // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1011 // on the calling process.
1012 Crowptr[i] = nnz;
1013
1014 // mfh 27 Sep 2016: For each entry of R in the current row of R
1015 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1016 LO k = Rcolind[kk]; // local column index of current entry of R
1017 const SC Rik = Rvals[kk]; // value of current entry of R
1018 if (Rik == SC_ZERO)
1019 continue; // skip explicitly stored zero values in R
1020 // For each entry of A in the current row of A
1021 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1022 LO l = Acolind[ll]; // local column index of current entry of A
1023 const SC Akl = Avals[ll]; // value of current entry of A
1024 if (Akl == SC_ZERO)
1025 continue; // skip explicitly stored zero values in A
1026
1027
1028 if (Acol2Prow[l] != LO_INVALID) {
1029 // mfh 27 Sep 2016: If the entry of Acol2Prow
1030 // corresponding to the current entry of A is populated, then
1031 // the corresponding row of P is in P_local (i.e., it lives on
1032 // the calling process).
1033
1034 // Local matrix
1035 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1036
1037 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1038 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1039 LO j = Pcolind[jj];
1040 LO Acj = Pcol2Accol[j];
1041 SC Plj = Pvals[jj];
1042
1043 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1044#ifdef HAVE_TPETRA_DEBUG
1045 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1046 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1047 std::runtime_error,
1048 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1049#endif
1050 // New entry
1051 ac_status[Acj] = nnz;
1052 Ccolind[nnz] = Acj;
1053 Cvals[nnz] = Rik*Akl*Plj;
1054 nnz++;
1055 } else {
1056 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1057 }
1058 }
1059 } else {
1060 // mfh 27 Sep 2016: If the entry of Acol2PRow
1061 // corresponding to the current entry of A is NOT populated (has
1062 // a flag "invalid" value), then the corresponding row of P is
1063 // in P_remote (i.e., it does not live on the calling process).
1064
1065 // Remote matrix
1066 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1067 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1068 LO j = Icolind[jj];
1069 LO Acj = PIcol2Accol[j];
1070 SC Plj = Ivals[jj];
1071
1072 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1073#ifdef HAVE_TPETRA_DEBUG
1074 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1075 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1076 std::runtime_error,
1077 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1078#endif
1079 // New entry
1080 ac_status[Acj] = nnz;
1081 Ccolind[nnz] = Acj;
1082 Cvals[nnz] = Rik*Akl*Plj;
1083 nnz++;
1084 } else {
1085 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1086 }
1087 }
1088 }
1089 }
1090 }
1091 // Resize for next pass if needed
1092 if (nnz + n > CSR_alloc) {
1093 CSR_alloc *= 2;
1094 Kokkos::resize(Ccolind,CSR_alloc);
1095 Kokkos::resize(Cvals,CSR_alloc);
1096 }
1097 nnz_old = nnz;
1098 }
1099
1100 Crowptr[m] = nnz;
1101
1102 // Downward resize
1103 Kokkos::resize(Ccolind,nnz);
1104 Kokkos::resize(Cvals,nnz);
1105
1106#ifdef HAVE_TPETRA_MMM_TIMINGS
1107 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1108#endif
1109 auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1110 typename KCRS::device_type(), Crowptr);
1111 auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1112 typename KCRS::device_type(), Ccolind);
1113 auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1114 typename KCRS::device_type(), Cvals);
1115
1116 // Final sort & set of CRS arrays
1117 if (params.is_null() || params->get("sort entries",true))
1118 Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1119 Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
1120
1121#ifdef HAVE_TPETRA_MMM_TIMINGS
1122 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1123#endif
1124
1125 // Final FillComplete
1126 //
1127 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1128 // Import (from domain Map to column Map) construction (which costs
1129 // lots of communication) by taking the previously constructed
1130 // Import object. We should be able to do this without interfering
1131 // with the implementation of the local part of sparse matrix-matrix
1132 // multply above.
1133 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1134 labelList->set("Timer Label",label);
1135 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1136 RCP<const Export<LO,GO,NO> > dummyExport;
1137 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1138 Rview.origMatrix->getRangeMap(),
1139 Acimport,
1140 dummyExport,
1141 labelList);
1142
1143 }
1144
  /*********************************************************************************************************/
  // RAP Reuse Kernel wrappers (Default non-threaded version)
  // Computes R * A * P -> Ac using reuse Gustavson.
  //
  // Reuse mode: Ac's graph (row pointers, column indices, column map) is
  // fixed from a prior newmatrix call; only Ac's values are recomputed.
  // Attempting to produce an entry outside Ac's existing sparsity pattern
  // throws std::runtime_error.
  template<class Scalar,
           class LocalOrdinal,
           class GlobalOrdinal,
           class Node,
           class LocalOrdinalViewType>
  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                                                                                                     CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                     CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                                                                     const LocalOrdinalViewType & Acol2Prow_dev,
                                                                                                                     const LocalOrdinalViewType & Acol2PIrow_dev,
                                                                                                                     const LocalOrdinalViewType & Pcol2Accol_dev,
                                                                                                                     const LocalOrdinalViewType & PIcol2Accol_dev,
                                                                                                                     CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                                                                     Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                                                                     const std::string& label,
                                                                                                                     const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
    using Teuchos::TimeMonitor;
    Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
#endif

    using Teuchos::Array;
    using Teuchos::ArrayRCP;
    using Teuchos::ArrayView;
    using Teuchos::RCP;
    using Teuchos::rcp;

    // Lots and lots of typedefs
    typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
    typedef typename KCRS::StaticCrsGraphType graph_t;
    typedef typename graph_t::row_map_type::const_type c_lno_view_t;
    typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
    typedef typename KCRS::values_type::non_const_type scalar_view_t;

    typedef Scalar SC;
    typedef LocalOrdinal LO;
    typedef GlobalOrdinal GO;
    typedef Node NO;
    typedef Map<LO,GO,NO> map_type;
    // NOTE(review): LO's invalid() is -1, which converts modulo 2^N to the
    // all-ones size_t value used as the "empty" marker in ac_status below.
    const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
    const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
    const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

    // Sizes
    RCP<const map_type> Accolmap = Ac.getColMap();
    size_t m = Rview.origMatrix->getNodeNumRows();
    size_t n = Accolmap->getNodeNumElements();
    // NOTE(review): p_max_nnz_per_row is computed but not read by this
    // serial kernel; presumably kept for parity with other variants.
    size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();

    // Routine runs on host; have to put arguments on host, too
    auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                         Acol2Prow_dev);
    auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                          Acol2PIrow_dev);
    auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                          Pcol2Accol_dev);
    auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                           PIcol2Accol_dev);

    // Grab the  Kokkos::SparseCrsMatrices & inner stuff
    const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
    const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
    const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
    const KCRS & Cmat = Ac.getLocalMatrixHost();

    c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
    const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
    const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
    // Only Ac's values are written; its graph stays fixed in reuse mode.
    scalar_view_t Cvals = Cmat.values;

    // Remote ("imported") part of P, if any
    c_lno_view_t  Irowptr;
    lno_nnz_view_t  Icolind;
    scalar_view_t  Ivals;
    if(!Pview.importMatrix.is_null()) {
      auto lclP = Pview.importMatrix->getLocalMatrixHost();
      Irowptr = lclP.graph.row_map;
      Icolind = lclP.graph.entries;
      Ivals   = lclP.values;
      p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
    }

#ifdef HAVE_TPETRA_MMM_TIMINGS
    RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
#endif

    // mfh 27 Sep 2016: The ac_status array is an implementation detail
    // of the local sparse matrix-matrix multiply routine.

    // The status array will contain the index into colind where this entry was last deposited.
    //   ac_status[i] <  nnz - not in the row yet
    //   ac_status[i] >= nnz - this is the entry where you can find the data
    // We start with this filled with INVALID's indicating that there are no entries yet.
    // Sadly, this complicates the code due to the fact that size_t's are unsigned.
    Array<size_t> ac_status(n, ST_INVALID);

    // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
    // routine.  The routine computes Ac := R * A * (P_local + P_remote).
    //
    // For column index Aik in row i of A, Acol2Prow[Aik] tells
    // you whether the corresponding row of P belongs to P_local
    // ("orig") or P_remote ("Import").

    // Necessary until following UVM host accesses are changed - for example Crowptr
    // Also probably needed in mult_R_A_P_newmatrix_kernel_wrapper - did not demonstrate this in test failure yet
    Kokkos::fence();

    // For each row of R
    size_t OLD_ip = 0, CSR_ip = 0;
    for (size_t i = 0; i < m; i++) {
      // First fill the c_status array w/ locations where we're allowed to
      // generate nonzeros for this row
      OLD_ip = Crowptr[i];
      CSR_ip = Crowptr[i+1];
      for (size_t k = OLD_ip; k < CSR_ip; k++) {
        ac_status[Ccolind[k]] = k;

        // Reset values in the row of C
        Cvals[k] = SC_ZERO;
      }

      // mfh 27 Sep 2016: For each entry of R in the current row of R
      for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
        LO k  = Rcolind[kk]; // local column index of current entry of R
        const SC Rik = Rvals[kk];   // value of current entry of R
        if (Rik == SC_ZERO)
          continue; // skip explicitly stored zero values in R
        // For each entry of A in the current row of A
        for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
          LO l = Acolind[ll];          // local column index of current entry of A
          const SC Akl = Avals[ll];    // value of current entry of A
          if (Akl == SC_ZERO)
            continue; // skip explicitly stored zero values in A


          if (Acol2Prow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2Prow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

            // mfh 27 Sep 2016: Go through all entries in that row of P_local.
            for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
              LO j = Pcolind[jj];
              LO Cij = Pcol2Accol[j];
              SC Plj = Pvals[jj];

              // Entry must already exist in Ac's static graph for this row
              TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
                                         std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
                                         "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

              Cvals[ac_status[Cij]] += Rik*Akl*Plj;
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
            for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
              LO j = Icolind[jj];
              LO Cij = PIcol2Accol[j];
              SC Plj = Ivals[jj];

              // Entry must already exist in Ac's static graph for this row
              TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
                                         std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
                                         "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

              Cvals[ac_status[Cij]] += Rik*Akl*Plj;
            }
          }
        }
      }
    }

#ifdef HAVE_TPETRA_MMM_TIMINGS
    auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
#endif

    Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());

}
1335
1336
1337 /*********************************************************************************************************/
1338 // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1339 // Computes P.T * A * P -> Ac
1340 template<class Scalar,
1341 class LocalOrdinal,
1342 class GlobalOrdinal,
1343 class Node,
1344 class LocalOrdinalViewType>
1345 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1346 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1347 const LocalOrdinalViewType & Acol2Prow,
1348 const LocalOrdinalViewType & Acol2PIrow,
1349 const LocalOrdinalViewType & Pcol2Accol,
1350 const LocalOrdinalViewType & PIcol2Accol,
1351 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1352 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1353 const std::string& label,
1354 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1355#ifdef HAVE_TPETRA_MMM_TIMINGS
1356 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1357 using Teuchos::TimeMonitor;
1358 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1359#endif
1360
1361 // We don't need a kernel-level PTAP, we just transpose here
1362 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1363 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1364
1365 using Teuchos::ParameterList;
1366 using Teuchos::RCP;
1367 RCP<ParameterList> transposeParams (new ParameterList);
1368 transposeParams->set ("sort", false);
1369
1370 if (! params.is_null ()) {
1371 transposeParams->set ("compute global constants",
1372 params->get ("compute global constants: temporaries",
1373 false));
1374 }
1375 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1376 transposer.createTransposeLocal (transposeParams);
1377 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1378 Rview.origMatrix = Ptrans;
1379
1380 mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1381 }
1382
1383 /*********************************************************************************************************/
1384 // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1385 // Computes P.T * A * P -> Ac
1386 template<class Scalar,
1387 class LocalOrdinal,
1388 class GlobalOrdinal,
1389 class Node,
1390 class LocalOrdinalViewType>
1391 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1392 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1393 const LocalOrdinalViewType & Acol2Prow,
1394 const LocalOrdinalViewType & Acol2PIrow,
1395 const LocalOrdinalViewType & Pcol2Accol,
1396 const LocalOrdinalViewType & PIcol2Accol,
1397 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1398 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1399 const std::string& label,
1400 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1401#ifdef HAVE_TPETRA_MMM_TIMINGS
1402 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1403 using Teuchos::TimeMonitor;
1404 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1405#endif
1406
1407 // We don't need a kernel-level PTAP, we just transpose here
1408 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1409 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1410
1411 using Teuchos::ParameterList;
1412 using Teuchos::RCP;
1413 RCP<ParameterList> transposeParams (new ParameterList);
1414 transposeParams->set ("sort", false);
1415
1416 if (! params.is_null ()) {
1417 transposeParams->set ("compute global constants",
1418 params->get ("compute global constants: temporaries",
1419 false));
1420 }
1421 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1422 transposer.createTransposeLocal (transposeParams);
1423 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1424 Rview.origMatrix = Ptrans;
1425
1426 mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1427 }
1428
1429 /*********************************************************************************************************/
1430 // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1431 // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1432 // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1433 // Currently, this implementation never gets called.
// Computes Ac = P^T * A * P by explicitly forming R = P^T locally and then
// running a two-pass, row-wise sparse triple product:
//   pass 1 (symbolic): build the sparsity pattern (graph) of Ac;
//   pass 2 (numeric):  accumulate the values into Ac's CSR arrays.
// The result is installed via setAllValues + expertStaticFillComplete.
//
// Arguments (as used by the code below):
//   Aview, Pview      - structs holding A, the locally owned rows of P
//                       (Pview.origMatrix), and optionally the imported
//                       remote rows of P (Pview.importMatrix).
//   Acol2PRow         - for each local column of A, the matching local row of
//                       P_local, or LO_INVALID if that row is remote.
//   Acol2PRowImport   - for each local column of A, the matching local row of
//                       P_remote (consulted when Acol2PRow is LO_INVALID).
//   Pcol2Accol        - maps local columns of P_local to local columns of Ac.
//   PIcol2Accol       - maps local columns of P_remote to local columns of Ac.
//   Ac                - output matrix; its row and column Maps must already
//                       be set (they are queried below).
//   Acimport          - Import passed to expertStaticFillComplete (may be null).
//   label, params     - timer label prefix and options such as
//                       "compute global constants".
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                                                            const Teuchos::Array<LocalOrdinal> & Acol2PRow,
                                                                                                            const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
                                                                                                            const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
                                                                                                            const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
                                                                                                            CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                                                            Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                                                            const std::string& label,
                                                                                                            const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
    using Teuchos::TimeMonitor;
    Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
#endif

    using Teuchos::Array;
    using Teuchos::ArrayRCP;
    using Teuchos::ArrayView;
    using Teuchos::RCP;
    using Teuchos::rcp;

    typedef Scalar SC;
    typedef LocalOrdinal LO;
    typedef GlobalOrdinal GO;
    typedef Node NO;
    typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
    const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
    const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

    // number of rows on the process of the fine matrix
    // size_t m = Pview.origMatrix->getNodeNumRows();
    // number of rows on the process of the coarse matrix
    size_t n = Ac.getRowMap()->getNodeNumElements();
    LO maxAccol = Ac.getColMap()->getMaxLocalIndex();

    // Get Data Pointers.  The *_RCP arrays own (or share) the CSR storage;
    // the ArrayViews further below are non-owning fast-access aliases.
    ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
    ArrayRCP<size_t> Acrowptr_RCP;
    ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
    ArrayRCP<LO> Accolind_RCP;
    ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
    ArrayRCP<SC> Acvals_RCP;

    // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
    // out of the CrsMatrix.  This code computes R * A * (P_local +
    // P_remote), where P_local contains the locally owned rows of P,
    // and P_remote the (previously Import'ed) remote rows of P.

    Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
    Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);

    if (!Pview.importMatrix.is_null())
      Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);

    // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
    // where Teuchos::ArrayRCP::operator[] may be slower than
    // Teuchos::ArrayView::operator[].

    // For efficiency
    ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
    ArrayView<const LO> Acolind, Pcolind, Icolind;
    ArrayView<const SC> Avals, Pvals, Ivals;
    ArrayView<size_t> Acrowptr;
    ArrayView<LO> Accolind;
    ArrayView<SC> Acvals;
    Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
    Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
    if (!Pview.importMatrix.is_null()) {
      Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
    }

    // In a first pass, determine the graph of Ac.

    // Get the graph of Ac.  This gets the local transpose of P,
    // then loops over R, A, P to get the graph of Ac.

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
#endif

    // Get the local transpose of the graph of P by locally transposing
    // all of P
    ArrayRCP<const size_t> Rrowptr_RCP;
    ArrayRCP<const LO> Rcolind_RCP;
    ArrayRCP<const Scalar> Rvals_RCP;
    ArrayView<const size_t> Rrowptr;
    ArrayView<const LO> Rcolind;
    ArrayView<const SC> Rvals;

    transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));

    using Teuchos::ParameterList;
    RCP<ParameterList> transposeParams (new ParameterList);
    transposeParams->set ("sort", false);
    if (! params.is_null ()) {
      // Only compute global constants for the temporary transpose if the
      // caller explicitly asked for it.
      transposeParams->set ("compute global constants",
                            params->get ("compute global constants: temporaries",
                                         false));
    }
    RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
      transposer.createTransposeLocal (transposeParams);

    Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
    Rrowptr = Rrowptr_RCP();
    Rcolind = Rcolind_RCP();
    Rvals = Rvals_RCP();

    // Construct graph (symbolic pass)

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
#endif

    // ac_status[c] records the position in Accolind at which column c of the
    // current row of Ac was first seen; ST_INVALID / "< nnz_old" means the
    // column has not yet appeared in this row.
    // NOTE(review): ST_INVALID is LO's invalid value converted to size_t —
    // presumably identical to OrdinalTraits<size_t>::invalid(); confirm.
    const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
    Array<size_t> ac_status(maxAccol + 1, ST_INVALID);

    // Initial nonzero allocation, grown geometrically below if needed.
    size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
    size_t nnzPerRowA = 100;
    if (Aview.origMatrix->getNodeNumEntries() > 0)
      nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
    Acrowptr_RCP.resize(n+1);
    Acrowptr = Acrowptr_RCP();
    Accolind_RCP.resize(nnz_alloc);
    Accolind = Accolind_RCP();

    size_t nnz = 0, nnz_old = 0;
    for (size_t i = 0; i < n; i++) {
      // mfh 27 Sep 2016: m is the number of rows in the input matrix R
      // on the calling process.
      Acrowptr[i] = nnz;

      // mfh 27 Sep 2016: For each entry of R in the current row of R
      for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
        LO k = Rcolind[kk]; // local column index of current entry of R
        // For each entry of A in the current row of A
        for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
          LO l = Acolind[ll]; // local column index of current entry of A

          if (Acol2PRow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);

            // mfh 27 Sep 2016: Go through all entries in that row of P_local.
            for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
              LO j = Pcolind[jj];
              LO Acj = Pcol2Accol[j];

              // Column not yet seen in this row of Ac: append it.
              if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
                // New entry
                ac_status[Acj] = nnz;
                Accolind[nnz] = Acj;
                nnz++;
              }
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
            for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
              LO j = Icolind[jj];
              LO Acj = PIcol2Accol[j];

              if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
                // New entry
                ac_status[Acj] = nnz;
                Accolind[nnz] = Acj;
                nnz++;
              }
            }
          }
        }
      }
      // Resize for next pass if needed
      // cag: Maybe we can do something more subtle here, and not double
      // the size right away.
      if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
        nnz_alloc *= 2;
        nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
        Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
        Acvals_RCP.resize(nnz_alloc);   Acvals = Acvals_RCP();
      }
      nnz_old = nnz;
    }
    Acrowptr[n] = nnz;

    // Downward resize
    Accolind_RCP.resize(nnz);
    Accolind = Accolind_RCP();

    // Allocate Acvals (zero-initialized; pass 2 accumulates into it)
    Acvals_RCP.resize(nnz, SC_ZERO);
    Acvals = Acvals_RCP();


    // In a second pass, enter the values into Acvals.
    // For each row k: Ac(i, j) += P(k, i) * A(k, l) * P(row(l), j),
    // where row(l) is the row of P matched to column l of A.

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
#endif

    // NOTE(review): this loop runs k over n (local rows of Ac) but indexes
    // Prowptr[k] and Arowptr[k], which are sized by the local row counts of
    // P and A — confirm those coincide with n for the intended callers.
    // (Per the header comment, this implementation is currently never called.)
    for (size_t k = 0; k < n; k++) {
      for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
        LO i = Pcolind[ii];
        const SC Pki = Pvals[ii];
        for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
          LO l = Acolind[ll];
          const SC Akl = Avals[ll];
          // Skip explicit zeros in A: they contribute nothing.
          if (Akl == 0.)
            continue;
          if (Acol2PRow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
            for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
              LO j = Pcolind[jj];
              LO Acj = Pcol2Accol[j];
              // Linear search within row i of Ac for column Acj.
              size_t pp;
              for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
                if (Accolind[pp] == Acj)
                  break;
              // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
              //                            std::runtime_error, "problem with Ac column indices");
              Acvals[pp] += Pki*Akl*Pvals[jj];
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
            for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
              LO j = Icolind[jj];
              LO Acj = PIcol2Accol[j];
              // Linear search within row i of Ac for column Acj.
              size_t pp;
              for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
                if (Accolind[pp] == Acj)
                  break;
              // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
              //                            std::runtime_error, "problem with Ac column indices");
              Acvals[pp] += Pki*Akl*Ivals[jj];
            }
          }
        }
      }
    }


#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
#endif

    // Final sort & set of CRS arrays
    //
    // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
    // matrix-matrix multiply routine sort the entries for us?
    Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());

    // mfh 27 Sep 2016: This just sets pointers.
    Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
#endif

    // Final FillComplete
    //
    // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
    // Import (from domain Map to column Map) construction (which costs
    // lots of communication) by taking the previously constructed
    // Import object.  We should be able to do this without interfering
    // with the implementation of the local part of sparse matrix-matrix
    // multiply above.
    RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
    labelList->set("Timer Label",label);
    // labelList->set("Sort column Map ghost GIDs")
    if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
    RCP<const Export<LO,GO,NO> > dummyExport;
    Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                                Pview.origMatrix->getDomainMap(),
                                Acimport,
                                dummyExport, labelList);
  }
1746
1747
1748
  } //End namespace MMdetails
1750
1751} //End namespace Tpetra
1752//
1753// Explicit instantiation macro
1754//
1755// Must be expanded from within the Tpetra namespace!
1756//
1757
// Expands to the explicit instantiation of TripleMatrixMultiply::MultiplyRAP
// for one (SCALAR, LO, GO, NODE) type combination.  Comments cannot appear
// inside the macro body because of the line continuations.
#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
                                                                \
  template                                                      \
  void TripleMatrixMultiply::MultiplyRAP(                       \
                                         const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
                                         bool transposeR,       \
                                         const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
                                         bool transposeA,       \
                                         const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
                                         bool transposeP,       \
                                         CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
                                         bool call_FillComplete_on_result, \
                                         const std::string & label, \
                                         const Teuchos::RCP<Teuchos::ParameterList>& params); \


#endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool isFillComplete() const override
Whether the matrix is fill complete.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.