Tpetra parallel linear algebra Version of the Day
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42#define TPETRA_MATRIXMATRIX_DEF_HPP
43#include "Tpetra_ConfigDefs.hpp"
45#include "Teuchos_VerboseObject.hpp"
46#include "Teuchos_Array.hpp"
47#include "Tpetra_Util.hpp"
48#include "Tpetra_CrsMatrix.hpp"
50#include "Tpetra_RowMatrixTransposer.hpp"
53#include "Tpetra_Details_makeColMap.hpp"
54#include "Tpetra_ConfigDefs.hpp"
55#include "Tpetra_Map.hpp"
56#include "Tpetra_Export.hpp"
59#include <algorithm>
60#include <type_traits>
61#include "Teuchos_FancyOStream.hpp"
62
63#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65
66#include "KokkosSparse_spgemm.hpp"
67#include "KokkosSparse_spadd.hpp"
68#include "Kokkos_Bitset.hpp"
69
71
79/*********************************************************************************************************/
80// Include the architecture-specific kernel partial specializations here
81// NOTE: This needs to be outside all namespaces
82#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83#include "TpetraExt_MatrixMatrix_Cuda.hpp"
84#include "TpetraExt_MatrixMatrix_HIP.hpp"
85
86namespace Tpetra {
87
88namespace MatrixMatrix{
89
90//
91// This method forms the matrix-matrix product C = op(A) * op(B), where
92// op(A) == A if transposeA is false,
93// op(A) == A^T if transposeA is true,
94// and similarly for op(B).
95//
// Forms the sparse matrix-matrix product C = op(A) * op(B) (see the comment
// block above for the meaning of op).  A and B must be fillComplete; C is
// resumed into active fill if needed, and fillComplete'd on exit when
// call_FillComplete_on_result is true.
//
// NOTE(review): this listing is a documentation extraction; original lines
// 100-101, 103 and 105 (the hyperlinked function name and the CrsMatrix
// A, B, C parameters) are missing below. Restore them from the repository
// before compiling.
96template <class Scalar,
97 class LocalOrdinal,
98 class GlobalOrdinal,
99 class Node>
102 bool transposeA,
104 bool transposeB,
106 bool call_FillComplete_on_result,
107 const std::string& label,
108 const Teuchos::RCP<Teuchos::ParameterList>& params)
109{
110 using Teuchos::null;
111 using Teuchos::RCP;
112 using Teuchos::rcp;
113 typedef Scalar SC;
114 typedef LocalOrdinal LO;
115 typedef GlobalOrdinal GO;
116 typedef Node NO;
117 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
118 typedef Import<LO,GO,NO> import_type;
119 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
120 typedef Map<LO,GO,NO> map_type;
121 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
122
123#ifdef HAVE_TPETRA_MMM_TIMINGS
124 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
125 using Teuchos::TimeMonitor;
126 //MM is used to time setup, and then multiply.
127
128 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
129#endif
130
131 const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
132
133 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
134
135 // The input matrices A and B must both be fillComplete.
136 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
137 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
138
139 // If transposeA is true, then Aprime will be the transpose of A
140 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
141 // will just be a pointer to A.
142 RCP<const crs_matrix_type> Aprime = null;
143 // If transposeB is true, then Bprime will be the transpose of B
144 // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
145 // will just be a pointer to B.
146 RCP<const crs_matrix_type> Bprime = null;
147
148 // Is this a "clean" matrix?
149 //
150 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
151 // locally nor globally indexed, then it was empty. I don't like
152 // this, because the most straightforward implementation presumes
153 // lazy allocation of indices. However, historical precedent
154 // demands that we keep around this predicate as a way to test
155 // whether the matrix is empty.
156 const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
157
158 // A^T * B into a "clean" C with fillComplete requested can use the
159 // dedicated mult_AT_B_newmatrix kernel, which handles the transpose
160 // itself (see below: the explicit transpose of A is skipped for it).
158 bool use_optimized_ATB = false;
159 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160 use_optimized_ATB = true;
161
162#ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163 use_optimized_ATB = false;
164#endif
165
166 using Teuchos::ParameterList;
167 RCP<ParameterList> transposeParams (new ParameterList);
168 // Build explicit transposes unsorted; presumably the downstream multiply
169 // kernels do not require sorted rows — TODO(review) confirm.
168 transposeParams->set ("sort", false);
169
170 if (!use_optimized_ATB && transposeA) {
171 transposer_type transposer (rcpFromRef (A));
172 Aprime = transposer.createTranspose (transposeParams);
173 }
174 else {
175 Aprime = rcpFromRef(A);
176 }
177
178 if (transposeB) {
179 transposer_type transposer (rcpFromRef (B));
180 Bprime = transposer.createTranspose (transposeParams);
181 }
182 else {
183 Bprime = rcpFromRef(B);
184 }
185
186 // Check size compatibility
187 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
188 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
189 global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
190 global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
191 global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
192 global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
193 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194 prefix << "ERROR, inner dimensions of op(A) and op(B) "
195 "must match for matrix-matrix product. op(A) is "
196 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
197
198 // The result matrix C must at least have a row-map that reflects the correct
199 // row-size. Don't check the number of columns because rectangular matrices
200 // which were constructed with only one map can still end up having the
201 // correct capacity and dimensions when filled.
202 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
203 prefix << "ERROR, dimensions of result C must "
204 "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
205 << " rows, should have at least " << Aouter << std::endl);
206
207 // It doesn't matter whether C is already Filled or not. If it is already
208 // Filled, it must have space allocated for the positions that will be
209 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
210 // we'll error out later when trying to store result values.
211
212 // CGB: However, matrix must be in active-fill
213 if (!C.isFillActive()) C.resumeFill();
214
215 // We're going to need to import remotely-owned sections of A and/or B if
216 // more than one processor is performing this run, depending on the scenario.
217 int numProcs = A.getComm()->getSize();
218
219 // Declare a couple of structs that will be used to hold views of the data
220 // of A and B, to be used for fast access during the matrix-multiplication.
221 crs_matrix_struct_type Aview;
222 crs_matrix_struct_type Bview;
223
224 RCP<const map_type> targetMap_A = Aprime->getRowMap();
225 RCP<const map_type> targetMap_B = Bprime->getRowMap();
226
227#ifdef HAVE_TPETRA_MMM_TIMINGS
228 {
229 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
230#endif
231
232 // Now import any needed remote rows and populate the Aview struct
233 // NOTE: We assert that an import isn't needed --- since we do the transpose
234 // above to handle that.
235 if (!use_optimized_ATB) {
236 RCP<const import_type> dummyImporter;
237 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
238 }
239
240 // We will also need local access to all rows of B that correspond to the
241 // column-map of op(A).
242 if (numProcs > 1)
243 targetMap_B = Aprime->getColMap();
244
245 // Import any needed remote rows and populate the Bview struct.
246 if (!use_optimized_ATB)
247 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
248
249#ifdef HAVE_TPETRA_MMM_TIMINGS
250 } //stop MM_importExtract here
251 //stop the setup timer, and start the multiply timer
252 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
253#endif
254
255 // Call the appropriate method to perform the actual multiplication.
256 if (use_optimized_ATB) {
257 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258
259 } else if (call_FillComplete_on_result && newFlag) {
260 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261
262 } else if (call_FillComplete_on_result) {
263 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
264
265 } else {
266 // mfh 27 Sep 2016: Is this the "slow" case? This
267 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
268 // thread-parallel inserts, but that may take some effort.
269 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
270
271 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
272 }
273
274#ifdef HAVE_TPETRA_MMM_TIMINGS
275 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
276#endif
277 if (call_FillComplete_on_result && !C.isFillComplete()) {
278 // We'll call FillComplete on the C matrix before we exit, and give it a
279 // domain-map and a range-map.
280 // The domain-map will be the domain-map of B, unless
281 // op(B)==transpose(B), in which case the range-map of B will be used.
282 // The range-map will be the range-map of A, unless op(A)==transpose(A),
283 // in which case the domain-map of A will be used.
284 C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
285 }
286}
287
288
// Forms a damped-Jacobi-style product from A, B, the inverse-diagonal vector
// Dinv and damping factor omega, writing the result into C (presumably
// C = (I - omega * Dinv * A) * B; confirm against jacobi_A_B_newmatrix).
// A and B must be fillComplete and C must already be in active fill.
//
// NOTE(review): this listing is a documentation extraction; original lines
// 294-297 (the hyperlinked Dinv vector and the CrsMatrix A, B, C
// parameters) are missing below. Restore them from the repository before
// compiling.
289template <class Scalar,
290 class LocalOrdinal,
291 class GlobalOrdinal,
292 class Node>
293void Jacobi(Scalar omega,
298 bool call_FillComplete_on_result,
299 const std::string& label,
300 const Teuchos::RCP<Teuchos::ParameterList>& params)
301{
302 using Teuchos::RCP;
303 typedef Scalar SC;
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
306 typedef Node NO;
307 typedef Import<LO,GO,NO> import_type;
308 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
309 typedef Map<LO,GO,NO> map_type;
310 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
311
312#ifdef HAVE_TPETRA_MMM_TIMINGS
313 std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
314 using Teuchos::TimeMonitor;
315 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
316#endif
317
318 const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
319
320 // A and B should already be Filled.
321 // Should we go ahead and call FillComplete() on them if necessary or error
322 // out? For now, we choose to error out.
323 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
324 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
325
326 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
328
329 // Now check size compatibility
330 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
331 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
332 global_size_t Aouter = A.getGlobalNumRows();
333 global_size_t Bouter = numBCols;
334 global_size_t Ainner = numACols;
335 global_size_t Binner = B.getGlobalNumRows();
336 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337 prefix << "ERROR, inner dimensions of op(A) and op(B) "
338 "must match for matrix-matrix product. op(A) is "
339 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
340
341 // The result matrix C must at least have a row-map that reflects the correct
342 // row-size. Don't check the number of columns because rectangular matrices
343 // which were constructed with only one map can still end up having the
344 // correct capacity and dimensions when filled.
345 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
346 prefix << "ERROR, dimensions of result C must "
347 "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
348 << " rows, should have at least "<< Aouter << std::endl);
349
350 // It doesn't matter whether C is already Filled or not. If it is already
351 // Filled, it must have space allocated for the positions that will be
352 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
353 // we'll error out later when trying to store result values.
354
355 // CGB: However, matrix must be in active-fill
356 // NOTE(review): unlike Multiply(), which calls C.resumeFill() when C is
356 // not fill-active, Jacobi() hard-errors here. Confirm whether the
356 // inconsistency is intentional.
356 TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
357
358 // We're going to need to import remotely-owned sections of A and/or B if
359 // more than one processor is performing this run, depending on the scenario.
360 int numProcs = A.getComm()->getSize();
361
362 // Declare a couple of structs that will be used to hold views of the data of
363 // A and B, to be used for fast access during the matrix-multiplication.
364 crs_matrix_struct_type Aview;
365 crs_matrix_struct_type Bview;
366
367 RCP<const map_type> targetMap_A = Aprime->getRowMap();
368 RCP<const map_type> targetMap_B = Bprime->getRowMap();
369
370#ifdef HAVE_TPETRA_MMM_TIMINGS
371 // NOTE(review): MM2 is declared *before* the opening brace, so it is not
371 // scoped by it and runs until function exit — Multiply() puts the brace
371 // first (its monitor stops at the matching close). Confirm intent.
371 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
372 {
373#endif
374
375 // Enable globalConstants by default
376 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
377 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
378 if(!params.is_null()) {
379 importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
380 int mm_optimization_core_count=0;
381 auto slist = params->sublist("matrixmatrix: kernel params",false);
382 mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
383 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
386 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
387 auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
388 ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389 ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
390 ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
391 }
392
393 //Now import any needed remote rows and populate the Aview struct.
394 RCP<const import_type> dummyImporter;
395 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
396
397 // We will also need local access to all rows of B that correspond to the
398 // column-map of op(A).
399 if (numProcs > 1)
400 targetMap_B = Aprime->getColMap();
401
402 // Now import any needed remote rows and populate the Bview struct.
403 // Enable globalConstants by default
404 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
405 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
406 if(!params.is_null()) {
407 importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
408
409 auto slist = params->sublist("matrixmatrix: kernel params",false);
410 int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
411 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
412 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
413 auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
414 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416 ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417 ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
418 ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
419 }
420
421 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
422
423#ifdef HAVE_TPETRA_MMM_TIMINGS
424 }
425 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
426#endif
427
428 // Now call the appropriate method to perform the actual multiplication.
429 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
430
431 // Is this a "clean" matrix
432 bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
433
434 if (call_FillComplete_on_result && newFlag) {
435 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436
437 } else if (call_FillComplete_on_result) {
438 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
439
440 } else {
441 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
442 // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
443 // commenting it out.
444// #ifdef HAVE_TPETRA_MMM_TIMINGS
445// MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
446// #endif
447 // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
448 // commenting it out.
449 // if (call_FillComplete_on_result) {
450 // //We'll call FillComplete on the C matrix before we exit, and give
451 // //it a domain-map and a range-map.
452 // //The domain-map will be the domain-map of B, unless
453 // //op(B)==transpose(B), in which case the range-map of B will be used.
454 // //The range-map will be the range-map of A, unless
455 // //op(A)==transpose(A), in which case the domain-map of A will be used.
456 // if (!C.isFillComplete()) {
457 // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
458 // }
459 // }
460 }
461}
462
463
// Computes B = scalarB * B + scalarA * op(A) in place: B is first scaled by
// scalarB, then scalarA-scaled rows of op(A) are accumulated into it row by
// row. A must be fillComplete; B must NOT be fillComplete (and must not have
// a static graph or be locally indexed).
//
// NOTE(review): this listing is a documentation extraction; original lines
// 469 and 472 (the hyperlinked CrsMatrix A and B parameters) are missing
// below. Restore them from the repository before compiling.
464template <class Scalar,
465 class LocalOrdinal,
466 class GlobalOrdinal,
467 class Node>
468void Add(
470 bool transposeA,
471 Scalar scalarA,
473 Scalar scalarB )
474{
475 using Teuchos::Array;
476 using Teuchos::RCP;
477 using Teuchos::null;
478 typedef Scalar SC;
479 typedef LocalOrdinal LO;
480 typedef GlobalOrdinal GO;
481 typedef Node NO;
482 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
483 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
484
485 const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
486
487 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
488 prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489 "(Result matrix B is not required to be isFillComplete()).");
490 TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
491 prefix << "ERROR, input matrix B must not be fill complete!");
492 TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
493 prefix << "ERROR, input matrix B must not have static graph!");
494 TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
495 prefix << "ERROR, input matrix B must not be locally indexed!");
496
497 using Teuchos::ParameterList;
498 RCP<ParameterList> transposeParams (new ParameterList);
499 transposeParams->set ("sort", false);
500
501 // Aprime is A^T (formed explicitly) when transposeA, else just A.
501 RCP<const crs_matrix_type> Aprime = null;
502 if (transposeA) {
503 transposer_type transposer (rcpFromRef (A));
504 Aprime = transposer.createTranspose (transposeParams);
505 }
506 else {
507 Aprime = rcpFromRef(A);
508 }
509
510 // Scratch buffers sized for the widest row of A, reused for every row copy.
510 size_t a_numEntries;
511 typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getNodeMaxNumRowEntries());
512 typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getNodeMaxNumRowEntries());
513 GO row;
514
515 if (scalarB != Teuchos::ScalarTraits<SC>::one())
516 B.scale(scalarB);
517
518 // NOTE(review): B.isFillComplete() was required to be false above (line
518 // 490), so bFilled is always false here and the sumIntoGlobalValues
518 // branch below is dead code.
518 bool bFilled = B.isFillComplete();
519 size_t numMyRows = B.getNodeNumRows();
520 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521 for (LO i = 0; (size_t)i < numMyRows; ++i) {
522 row = B.getRowMap()->getGlobalElement(i);
523 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
524
525 if (scalarA != Teuchos::ScalarTraits<SC>::one())
526 for (size_t j = 0; j < a_numEntries; ++j)
527 a_vals[j] *= scalarA;
528
529 if (bFilled)
530 B.sumIntoGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
531 else
532 B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
533 }
534 }
535}
536
// Convenience overload: allocates and returns a new, fillComplete'd matrix
// C = alpha * op(A) + beta * op(B) by delegating to the in-place add()
// below. If transposeB, B's explicit transpose is formed first so that C
// gets the correct row map. Passing 'Call fillComplete' = false in params
// is rejected, since this version always fill-completes the result.
//
// NOTE(review): this listing is a documentation extraction; original lines
// 544 and 547 (the CrsMatrix A and B parameters), 556 (the crs_matrix_type
// alias) and 571 (the transposer declaration) are missing below. Restore
// them from the repository before compiling.
537template <class Scalar,
538 class LocalOrdinal,
539 class GlobalOrdinal,
540 class Node>
541Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
542add (const Scalar& alpha,
543 const bool transposeA,
545 const Scalar& beta,
546 const bool transposeB,
548 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
549 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
550 const Teuchos::RCP<Teuchos::ParameterList>& params)
551{
552 using Teuchos::RCP;
553 using Teuchos::rcpFromRef;
554 using Teuchos::rcp;
555 using Teuchos::ParameterList;
557 if(!params.is_null())
558 {
559 TEUCHOS_TEST_FOR_EXCEPTION(
560 params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
561 std::invalid_argument,
562 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564 params->set("Call fillComplete", true);
565 }
566 //If transposeB, must compute B's explicit transpose to
567 //get the correct row map for C.
568 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
569 if(transposeB)
570 {
572 Brcp = transposer.createTranspose();
573 }
574 //Check that A,B are fillComplete before getting B's column map
575 TEUCHOS_TEST_FOR_EXCEPTION
576 (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578 RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
579 //this version of add() always fill completes the result, no matter what is in params on input
580 add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
581 return C;
582}
583
584//This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
585//but since the spadd() output is always packed there is no need for a separate
586//numRowEntries here.
587//
588template<class LO, class GO, class LOView, class GOView, class LocalMap>
589struct ConvertGlobalToLocalFunctor
590{
591 ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
592 : lids(lids_), gids(gids_), localColMap(localColMap_)
593 {}
594
595 KOKKOS_FUNCTION void operator() (const GO i) const
596 {
597 lids(i) = localColMap.getLocalElement(gids(i));
598 }
599
600 LOView lids;
601 const GOView gids;
602 const LocalMap localColMap;
603};
604
605template <class Scalar,
606 class LocalOrdinal,
607 class GlobalOrdinal,
608 class Node>
609void
610add (const Scalar& alpha,
611 const bool transposeA,
613 const Scalar& beta,
614 const bool transposeB,
617 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
618 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
619 const Teuchos::RCP<Teuchos::ParameterList>& params)
620{
621 using Teuchos::RCP;
622 using Teuchos::rcp;
623 using Teuchos::rcpFromRef;
624 using Teuchos::rcp_implicit_cast;
625 using Teuchos::rcp_dynamic_cast;
626 using Teuchos::TimeMonitor;
627 using SC = Scalar;
628 using LO = LocalOrdinal;
629 using GO = GlobalOrdinal;
630 using NO = Node;
631 using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
632 using crs_graph_type = CrsGraph<LO,GO,NO>;
633 using map_type = Map<LO,GO,NO>;
634 using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
635 using import_type = Import<LO,GO,NO>;
636 using export_type = Export<LO,GO,NO>;
637 using exec_space = typename crs_graph_type::execution_space;
638 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639 const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
640 constexpr bool debug = false;
641
642#ifdef HAVE_TPETRA_MMM_TIMINGS
643 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
644#endif
645
646 if (debug) {
647 std::ostringstream os;
648 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
649 << "TpetraExt::MatrixMatrix::add" << std::endl;
650 std::cerr << os.str ();
651 }
652
653 TEUCHOS_TEST_FOR_EXCEPTION
654 (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
655 prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
656 TEUCHOS_TEST_FOR_EXCEPTION
657 (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
658 prefix_mmm << "A and B must both be fill complete.");
659#ifdef HAVE_TPETRA_DEBUG
660 // The matrices don't have domain or range Maps unless they are fill complete.
661 if (A.isFillComplete () && B.isFillComplete ()) {
662 const bool domainMapsSame =
663 (! transposeA && ! transposeB &&
664 ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
665 (! transposeA && transposeB &&
666 ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
667 ( transposeA && ! transposeB &&
668 ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
669 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670 prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
671
672 const bool rangeMapsSame =
673 (! transposeA && ! transposeB &&
674 ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
675 (! transposeA && transposeB &&
676 ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
677 ( transposeA && ! transposeB &&
678 ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
679 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680 prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
681 }
682#endif // HAVE_TPETRA_DEBUG
683
684 using Teuchos::ParameterList;
685 // Form the explicit transpose of A if necessary.
686 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
687 if (transposeA) {
688 transposer_type transposer (Aprime);
689 Aprime = transposer.createTranspose ();
690 }
691
692#ifdef HAVE_TPETRA_DEBUG
693 TEUCHOS_TEST_FOR_EXCEPTION
694 (Aprime.is_null (), std::logic_error,
695 prefix_mmm << "Failed to compute Op(A). "
696 "Please report this bug to the Tpetra developers.");
697#endif // HAVE_TPETRA_DEBUG
698
699 // Form the explicit transpose of B if necessary.
700 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
701 if (transposeB) {
702 if (debug) {
703 std::ostringstream os;
704 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
705 << "Form explicit xpose of B" << std::endl;
706 std::cerr << os.str ();
707 }
708 transposer_type transposer (Bprime);
709 Bprime = transposer.createTranspose ();
710 }
711#ifdef HAVE_TPETRA_DEBUG
712 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713 prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714 TEUCHOS_TEST_FOR_EXCEPTION(
715 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716 prefix_mmm << "Aprime and Bprime must both be fill complete. "
717 "Please report this bug to the Tpetra developers.");
718#endif // HAVE_TPETRA_DEBUG
719 RCP<const map_type> CDomainMap = domainMap;
720 RCP<const map_type> CRangeMap = rangeMap;
721 if(CDomainMap.is_null())
722 {
723 CDomainMap = Bprime->getDomainMap();
724 }
725 if(CRangeMap.is_null())
726 {
727 CRangeMap = Bprime->getRangeMap();
728 }
729 assert(!(CDomainMap.is_null()));
730 assert(!(CRangeMap.is_null()));
731 typedef typename AddKern::values_array values_array;
732 typedef typename AddKern::row_ptrs_array row_ptrs_array;
733 typedef typename AddKern::col_inds_array col_inds_array;
734 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
736 values_array vals;
737 row_ptrs_array rowptrs;
738 col_inds_array colinds;
739#ifdef HAVE_TPETRA_MMM_TIMINGS
740 MM = Teuchos::null;
741 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
742#endif
743 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
744 {
745 //import Aprime into Bprime's row map so the local matrices have same # of rows
746 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
747 // cbl do not set
748 // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
749 // this import _may_ take the form of a transfer. In practice it would be unlikely,
750 // but the general case is not so forgiving.
751 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
752 }
753 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754 bool sorted = AGraphSorted && BGraphSorted;
755 RCP<const import_type> Cimport = Teuchos::null;
756 RCP<export_type> Cexport = Teuchos::null;
757 bool doFillComplete = true;
758 if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
759 {
760 doFillComplete = params->get<bool>("Call fillComplete");
761 }
762 auto Alocal = Aprime->getLocalMatrixDevice();
763 auto Blocal = Bprime->getLocalMatrixDevice();
764 LO numLocalRows = Alocal.numRows();
765 if(numLocalRows == 0)
766 {
767 //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
768 //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
769 //Handle this case now
770 //(without interfering with collective operations, since it's possible for
771 //some ranks to have 0 local rows and others not).
772 rowptrs = row_ptrs_array("C rowptrs", 0);
773 }
774 auto Acolmap = Aprime->getColMap();
775 auto Bcolmap = Bprime->getColMap();
776 if(!matchingColMaps)
777 {
778 using global_col_inds_array = typename AddKern::global_col_inds_array;
779#ifdef HAVE_TPETRA_MMM_TIMINGS
780 MM = Teuchos::null;
781 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
782#endif
783 //use kernel that converts col indices in both A and B to common domain map before adding
784 auto AlocalColmap = Acolmap->getLocalMap();
785 auto BlocalColmap = Bcolmap->getLocalMap();
786 global_col_inds_array globalColinds;
787 if (debug) {
788 std::ostringstream os;
789 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
790 << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791 std::cerr << os.str ();
792 }
793 AddKern::convertToGlobalAndAdd(
794 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795 vals, rowptrs, globalColinds);
796 if (debug) {
797 std::ostringstream os;
798 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
799 << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800 std::cerr << os.str ();
801 }
802#ifdef HAVE_TPETRA_MMM_TIMINGS
803 MM = Teuchos::null;
804 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
805#endif
806 RCP<const map_type> CcolMap;
807 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808 (CcolMap, CDomainMap, globalColinds);
809 C.replaceColMap(CcolMap);
810 col_inds_array localColinds("C colinds", globalColinds.extent(0));
811 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813 col_inds_array, global_col_inds_array,
814 typename map_type::local_map_type>
815 (localColinds, globalColinds, CcolMap->getLocalMap()));
816 KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
817 C.setAllValues(rowptrs, localColinds, vals);
818 C.fillComplete(CDomainMap, CRangeMap, params);
819 if(!doFillComplete)
820 C.resumeFill();
821 }
822 else
823 {
824 //Aprime, Bprime and C all have the same column maps
825 auto Avals = Alocal.values;
826 auto Bvals = Blocal.values;
827 auto Arowptrs = Alocal.graph.row_map;
828 auto Browptrs = Blocal.graph.row_map;
829 auto Acolinds = Alocal.graph.entries;
830 auto Bcolinds = Blocal.graph.entries;
831 if(sorted)
832 {
833 //use sorted kernel
834#ifdef HAVE_TPETRA_MMM_TIMINGS
835 MM = Teuchos::null;
836 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
837#endif
838 if (debug) {
839 std::ostringstream os;
840 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
841 << "Call AddKern::addSorted(...)" << std::endl;
842 std::cerr << os.str ();
843 }
844 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
845 }
846 else
847 {
848 //use unsorted kernel
849#ifdef HAVE_TPETRA_MMM_TIMINGS
850 MM = Teuchos::null;
851 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
852#endif
853 if (debug) {
854 std::ostringstream os;
855 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
856 << "Call AddKern::addUnsorted(...)" << std::endl;
857 std::cerr << os.str ();
858 }
859 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
860 }
861 //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
862 RCP<const map_type> Ccolmap = Bcolmap;
863 C.replaceColMap(Ccolmap);
864 C.setAllValues(rowptrs, colinds, vals);
865 if(doFillComplete)
866 {
867 #ifdef HAVE_TPETRA_MMM_TIMINGS
868 MM = Teuchos::null;
869 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
870 #endif
871 if(!CDomainMap->isSameAs(*Ccolmap))
872 {
873 if (debug) {
874 std::ostringstream os;
875 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
876 << "Create Cimport" << std::endl;
877 std::cerr << os.str ();
878 }
879 Cimport = rcp(new import_type(CDomainMap, Ccolmap));
880 }
881 if(!C.getRowMap()->isSameAs(*CRangeMap))
882 {
883 if (debug) {
884 std::ostringstream os;
885 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
886 << "Create Cexport" << std::endl;
887 std::cerr << os.str ();
888 }
889 Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
890 }
891
892 if (debug) {
893 std::ostringstream os;
894 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895 << "Call C->expertStaticFillComplete(...)" << std::endl;
896 std::cerr << os.str ();
897 }
898 C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
899 }
900 }
901}
902
/// \brief Compute C = scalarA*Op(A) + scalarB*Op(B), where Op(X) is either X
///   or its explicit transpose, as selected by transposeA / transposeB.
///
/// Visible preconditions checked below: both A and B must be fillComplete,
/// and the RCP to C must be nonnull on entry (the C == null path is known
/// not to work; see the first exception test).
/// NOTE(review): the parameter declarations and some typedefs of this
/// function are on lines not visible in this extract; the body below is
/// unchanged.
903template <class Scalar,
904 class LocalOrdinal,
905 class GlobalOrdinal,
906 class Node>
907void Add(
909 bool transposeA,
910 Scalar scalarA,
912 bool transposeB,
913 Scalar scalarB,
915{
916 using Teuchos::Array;
917 using Teuchos::ArrayRCP;
918 using Teuchos::ArrayView;
919 using Teuchos::RCP;
920 using Teuchos::rcp;
921 using Teuchos::rcp_dynamic_cast;
922 using Teuchos::rcpFromRef;
923 using Teuchos::tuple;
924 using std::endl;
925 // typedef typename ArrayView<const Scalar>::size_type size_type;
926 typedef Teuchos::ScalarTraits<Scalar> STS;
928 // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
929 // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
930 // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
933
934 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
935
936 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937 prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
938
939 TEUCHOS_TEST_FOR_EXCEPTION(
940 ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
941 prefix << "Both input matrices must be fill complete before calling this function.");
942
943#ifdef HAVE_TPETRA_DEBUG
 // Debug-only sanity checks: the domain and range Maps of Op(A) and Op(B)
 // must agree, with "Op" swapping domain/range when a transpose is requested.
944 {
945 const bool domainMapsSame =
946 (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
947 (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
948 (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
949 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950 prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
951
952 const bool rangeMapsSame =
953 (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
954 (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
955 (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
956 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957 prefix << "The range Maps of Op(A) and Op(B) are not the same.");
958 }
959#endif // HAVE_TPETRA_DEBUG
960
961 using Teuchos::ParameterList;
 // Transposes below are taken unsorted; sorting is not needed for the
 // row-by-row global-index accumulation performed in this routine.
962 RCP<ParameterList> transposeParams (new ParameterList);
963 transposeParams->set ("sort", false);
964
965 // Form the explicit transpose of A if necessary.
966 RCP<const crs_matrix_type> Aprime;
967 if (transposeA) {
968 transposer_type theTransposer (rcpFromRef (A));
969 Aprime = theTransposer.createTranspose (transposeParams);
970 }
971 else {
972 Aprime = rcpFromRef (A);
973 }
974
975#ifdef HAVE_TPETRA_DEBUG
976 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977 prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978#endif // HAVE_TPETRA_DEBUG
979
980 // Form the explicit transpose of B if necessary.
981 RCP<const crs_matrix_type> Bprime;
982 if (transposeB) {
983 transposer_type theTransposer (rcpFromRef (B));
984 Bprime = theTransposer.createTranspose (transposeParams);
985 }
986 else {
987 Bprime = rcpFromRef (B);
988 }
989
990#ifdef HAVE_TPETRA_DEBUG
991 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992 prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993#endif // HAVE_TPETRA_DEBUG
994
995 // Allocate or zero the entries of the result matrix.
996 if (! C.is_null ()) {
997 C->setAllToScalar (STS::zero ());
998 } else {
999 // FIXME (mfh 08 May 2013) When I first looked at this method, I
1000 // noticed that C was being given the row Map of Aprime (the
1001 // possibly transposed version of A). Is this what we want?
1002 C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1003 }
1004
1005#ifdef HAVE_TPETRA_DEBUG
1006 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007 prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009 prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011 prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1012#endif // HAVE_TPETRA_DEBUG
1013
 // Pair each operand with its coefficient so the accumulation below can be
 // written as one loop over {Op(A), Op(B)}.
1014 Array<RCP<const crs_matrix_type> > Mat =
1015 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1017
1018 // do a loop over each matrix to add: A reordering might be more efficient
1019 for (int k = 0; k < 2; ++k) {
 // Scratch views, resized per row below as needed.
1020 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1021 typename crs_matrix_type::nonconst_values_host_view_type Values;
1022
1023 // Loop over each locally owned row of the current matrix (either
1024 // Aprime or Bprime), and sum its entries into the corresponding
1025 // row of C. This works regardless of whether Aprime or Bprime
1026 // has the same row Map as C, because both sumIntoGlobalValues and
1027 // insertGlobalValues allow summing resp. inserting into nonowned
1028 // rows of C.
1029#ifdef HAVE_TPETRA_DEBUG
1030 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032#endif // HAVE_TPETRA_DEBUG
1033 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034#ifdef HAVE_TPETRA_DEBUG
1035 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037#endif // HAVE_TPETRA_DEBUG
1038
1039 const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040 for (size_t i = 0; i < localNumRows; ++i) {
1041 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043 if (numEntries > 0) {
 // Grow the scratch views to fit this row, then copy the row out.
1044 Kokkos::resize(Indices,numEntries);
1045 Kokkos::resize(Values,numEntries);
1046 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1047
 // Apply this operand's coefficient (skip the multiply when it is 1).
1048 if (scalar[k] != STS::one ()) {
1049 for (size_t j = 0; j < numEntries; ++j) {
1050 Values[j] *= scalar[k];
1051 }
1052 }
1053
 // A fillComplete C has a fixed graph, so only summing into existing
 // entries is allowed; otherwise new entries may be inserted.
1054 if (C->isFillComplete ()) {
1055 C->sumIntoGlobalValues (globalRow, numEntries,
1056 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1057 } else {
1058 C->insertGlobalValues (globalRow, numEntries,
1059 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1060 }
1061 }
1062 }
1063 }
1064}
1065
1066} //End namespace MatrixMatrix
1067
1068namespace MMdetails{
1069
1070/*********************************************************************************************************/
1071// Prints MMM-style statistics on communication done with an Import or Export object
1072template <class TransferType>
1073void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1074 if (Transfer.is_null())
1075 return;
1076
1077 const Distributor & Distor = Transfer->getDistributor();
1078 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1079
1080 size_t rows_send = Transfer->getNumExportIDs();
1081 size_t rows_recv = Transfer->getNumRemoteIDs();
1082
1083 size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1084 size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1085 size_t num_send_neighbors = Distor.getNumSends();
1086 size_t num_recv_neighbors = Distor.getNumReceives();
1087 size_t round2_send, round2_recv;
1088 Distor.getLastDoStatistics(round2_send,round2_recv);
1089
1090 int myPID = Comm->getRank();
1091 int NumProcs = Comm->getSize();
1092
1093 // Processor by processor statistics
1094 // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1095 // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1096
1097 // Global statistics
1098 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1099 size_t gstats_min[8], gstats_max[8];
1100
1101 double lstats_avg[8], gstats_avg[8];
1102 for(int i=0; i<8; i++)
1103 lstats_avg[i] = ((double)lstats[i])/NumProcs;
1104
1105 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1106 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1107 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1108
1109 if(!myPID) {
1110 printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1111 (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1112 (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1113 printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1114 (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1115 (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1116 }
1117}
1118
1119// Kernel method for computing the local portion of C = A*B
/// \brief Kernel for C = A^T * B, building C as a new matrix.
///
/// \param A [in] First operand; its explicit local transpose is formed here.
/// \param B [in] Second operand.
/// \param C [in/out] Result matrix, filled and (if needed) exported here.
/// \param label [in] Timer / diagnostic label prefix.
/// \param params [in] Optional tuning parameters (may be null).
///
/// Strategy: (1) locally transpose A, (2) extract views of Atrans and B and
/// run the non-transposed kernel mult_A_B_newmatrix, (3) if Atrans has an
/// Exporter, exportAndFillComplete the temporary product into C.
/// NOTE(review): two initializer lines of mm_optimization_core_count
/// (original lines 1187 and 1214) are not visible in this extract.
1120template<class Scalar,
1121 class LocalOrdinal,
1122 class GlobalOrdinal,
1123 class Node>
1124void mult_AT_B_newmatrix(
1125 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1126 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1127 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1128 const std::string & label,
1129 const Teuchos::RCP<Teuchos::ParameterList>& params)
1130{
1131 using Teuchos::RCP;
1132 using Teuchos::rcp;
1133 typedef Scalar SC;
1134 typedef LocalOrdinal LO;
1135 typedef GlobalOrdinal GO;
1136 typedef Node NO;
1137 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1138 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1139
1140#ifdef HAVE_TPETRA_MMM_TIMINGS
1141 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1142 using Teuchos::TimeMonitor;
1143 RCP<TimeMonitor> MM = rcp (new TimeMonitor
1144 (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1145#endif
1146
1147 /*************************************************************/
1148 /* 1) Local Transpose of A */
1149 /*************************************************************/
1150 transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1151
 // The transpose is taken unsorted; sortedness is not required by the
 // downstream mult_A_B_newmatrix call.
1152 using Teuchos::ParameterList;
1153 RCP<ParameterList> transposeParams (new ParameterList);
1154 transposeParams->set ("sort", false);
1155 if(! params.is_null ()) {
1156 transposeParams->set ("compute global constants",
1157 params->get ("compute global constants: temporaries",
1158 false));
1159 }
1160 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1161 transposer.createTransposeLocal (transposeParams);
1162
1163 /*************************************************************/
1164 /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1165 /*************************************************************/
1166#ifdef HAVE_TPETRA_MMM_TIMINGS
1167 MM = Teuchos::null;
1168 MM = rcp (new TimeMonitor
1169 (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1170#endif
1171
1172 // Get views, asserting that no import is required to speed up computation
1173 crs_matrix_struct_type Aview;
1174 crs_matrix_struct_type Bview;
1175 RCP<const Import<LO, GO, NO> > dummyImporter;
1176
1177 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1178 RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1179 if (! params.is_null ()) {
 // Copy the TAFC tuning options from the caller's list into the private
 // list used for extracting views of Atrans; take the smaller of the
 // sublist's and the top-level list's core-count setting.
1180 importParams1->set ("compute global constants",
1181 params->get ("compute global constants: temporaries",
1182 false));
1183 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1184 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1185 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1186 int mm_optimization_core_count =
1188 mm_optimization_core_count =
1189 slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190 int mm_optimization_core_count2 =
1191 params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1192 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1193 mm_optimization_core_count = mm_optimization_core_count2;
1194 }
1195 auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1196 sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1197 sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1198 sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1199 }
1200
1201 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1202 Aview, dummyImporter, true,
1203 label, importParams1);
1204
 // Same TAFC option propagation again, into the list used for B's views.
1205 RCP<ParameterList> importParams2 (new ParameterList);
1206 if (! params.is_null ()) {
1207 importParams2->set ("compute global constants",
1208 params->get ("compute global constants: temporaries",
1209 false));
1210 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1211 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1212 bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1213 int mm_optimization_core_count =
1215 mm_optimization_core_count =
1216 slist.get ("MM_TAFC_OptimizationCoreCount",
1217 mm_optimization_core_count);
1218 int mm_optimization_core_count2 =
1219 params->get ("MM_TAFC_OptimizationCoreCount",
1220 mm_optimization_core_count);
1221 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1222 mm_optimization_core_count = mm_optimization_core_count2;
1223 }
1224 auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1225 sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1226 sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1227 sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1228 }
1229
 // Extract B's views; prolongation ("true") is only valid when B's row Map
 // already matches Atrans's column Map.
1230 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1231 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1232 }
1233 else {
1234 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1235 }
1236
1237#ifdef HAVE_TPETRA_MMM_TIMINGS
1238 MM = Teuchos::null;
1239 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1240#endif
1241
1242 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1243
1244 // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1245 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1246 if (needs_final_export) {
1247 Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1248 }
1249 else {
 // Non-owning RCP: C is owned by the caller.
1250 Ctemp = rcp (&C, false);
1251 }
1252
1253 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1254
1255 /*************************************************************/
1256 /* 4) exportAndFillComplete matrix */
1257 /*************************************************************/
1258#ifdef HAVE_TPETRA_MMM_TIMINGS
1259 MM = Teuchos::null;
1260 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1261#endif
1262
1263 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1264
1265 if (needs_final_export) {
 // Build the label/parameter list forwarded to exportAndFillComplete,
 // propagating the same TAFC tuning knobs as above.
1266 ParameterList labelList;
1267 labelList.set("Timer Label", label);
1268 if(!params.is_null()) {
1269 ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1270 ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1271 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1272 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1273 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1274 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1275 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1276 bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1277 bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1278
1279 labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1280 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1281 labelList.set("compute global constants",params->get("compute global constants",true));
1282 labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1283 }
 // Move the temporary product onto C's distribution and fillComplete it
 // with B's domain Map and A's domain Map (the range of A^T).
1284 Ctemp->exportAndFillComplete (Crcp,
1285 *Ctemp->getGraph ()->getExporter (),
1286 B.getDomainMap (),
1287 A.getDomainMap (),
1288 rcp (&labelList, false));
1289 }
1290#ifdef HAVE_TPETRA_MMM_STATISTICS
1291 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1292#endif
1293}
1294
1295/*********************************************************************************************************/
1296// Kernel method for computing the local portion of C = A*B
/// \brief Serial kernel computing the local portion of C = A*B via row-wise
///   sparse accumulation.
///
/// \param Aview [in] Extracted view of A (row Map, column Map, CRS arrays).
/// \param Bview [in] Extracted view of B, possibly with an imported
///   ("ghosted") part holding remote rows of B needed by local rows of A.
/// \param C [in/out] Result wrapper; entries are summed (if fillComplete)
///   or inserted (if not) by global index.
///
/// For each local row i of A, the row C(i,:) is accumulated in a dense-ish
/// scratch pair (C_cols, C_row_i) addressed through the inverse lookup
/// c_index, in two passes: first over columns of A that map to local rows
/// of B, then over columns that map to rows of the imported part of B.
1297template<class Scalar,
1298 class LocalOrdinal,
1299 class GlobalOrdinal,
1300 class Node>
1301void mult_A_B(
1302 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1303 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1304 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1305 const std::string& /* label */,
1306 const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1307{
1308 using Teuchos::Array;
1309 using Teuchos::ArrayRCP;
1310 using Teuchos::ArrayView;
1311 using Teuchos::OrdinalTraits;
1312 using Teuchos::null;
1313
1314 typedef Teuchos::ScalarTraits<Scalar> STS;
1315 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
 // Local column-index ranges of B's column Map and (if present) the column
 // Map of B's imported part; these bound the scratch-array width below.
1316 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1317 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1318
1319 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1320 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1321
1322 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1323 ArrayView<const GlobalOrdinal> bcols_import = null;
1324 if (Bview.importColMap != null) {
1325 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1326 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1327
1328 bcols_import = Bview.importColMap->getNodeElementList();
1329 }
1330
 // Scratch width: max of the two local column ranges.
1331 size_t C_numCols = C_lastCol - C_firstCol +
1332 OrdinalTraits<LocalOrdinal>::one();
1333 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1334 OrdinalTraits<LocalOrdinal>::one();
1335
1336 if (C_numCols_import > C_numCols)
1337 C_numCols = C_numCols_import;
1338
 // Workspace: per-row values (C_row_i), their column ids (C_cols), and an
 // inverse lookup c_index mapping a column id to its slot in C_row_i.
1339 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1340 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1341 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1342
1343 Array<Scalar> C_row_i = dwork;
1344 Array<GlobalOrdinal> C_cols = iwork;
1345 Array<size_t> c_index = iwork2;
 // combined_* collect both passes' results so C is touched once per row.
1346 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1347 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1348
1349 size_t C_row_i_length, j, k, last_index;
1350
1351 // Run through all the hash table lookups once and for all
 // Acol2Brow / Acol2Irow translate a local column of A to a local row of
 // B's owned part resp. B's imported part (LO_INVALID if not applicable).
1352 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1353 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1354 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1355 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1356 // Maps are the same: Use local IDs as the hash
1357 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1358 Aview.colMap->getMaxLocalIndex(); i++)
1359 Acol2Brow[i]=i;
1360 }
1361 else {
1362 // Maps are not the same: Use the map's hash
1363 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1364 Aview.colMap->getMaxLocalIndex(); i++) {
1365 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1366 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1367 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1368 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1369 }
1370 }
1371
1372 // To form C = A*B we're going to execute this expression:
1373 //
1374 // C(i,j) = sum_k( A(i,k)*B(k,j) )
1375 //
1376 // Our goal, of course, is to navigate the data in A and B once, without
1377 // performing searches for column-indices, etc.
 // Raw CRS arrays of A, B, and (if present) B's imported part ("I").
1378 ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1379 ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1380 ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1381 ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1382 ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1383 ArrayView<const Scalar> Avals, Bvals, Ivals;
1384 Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1385 Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1386 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1387 Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1388 if(!Bview.importMatrix.is_null()) {
1389 Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1390 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1391 }
1392
1393 bool C_filled = C.isFillComplete();
1394
 // Invariant: between rows, every c_index slot holds "invalid".
1395 for (size_t i = 0; i < C_numCols; i++)
1396 c_index[i] = OrdinalTraits<size_t>::invalid();
1397
1398 // Loop over the rows of A.
1399 size_t Arows = Aview.rowMap->getNodeNumElements();
1400 for(size_t i=0; i<Arows; ++i) {
1401
1402 // Only navigate the local portion of Aview... which is, thankfully, all of
1403 // A since this routine doesn't do transpose modes
1404 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1405
1406 // Loop across the i-th row of A and for each corresponding row in B, loop
1407 // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1408 // quantities C_row_i. In other words, as we stride across B(k,:) we're
1409 // calculating updates for row i of the result matrix C.
1410 C_row_i_length = OrdinalTraits<size_t>::zero();
1411
 // Pass 1: columns of A that map to locally owned rows of B.
1412 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1413 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1414 const Scalar Aval = Avals[k];
1415 if (Aval == STS::zero())
1416 continue;
1417
 // This column of A corresponds to the imported part of B; pass 2
 // handles it.
1418 if (Ak == LO_INVALID)
1419 continue;
1420
1421 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1422 LocalOrdinal col = Bcolind[j];
1423 //assert(col >= 0 && col < C_numCols);
1424
1425 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1426 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1427 // This has to be a += so insertGlobalValue goes out
1428 C_row_i[C_row_i_length] = Aval*Bvals[j];
1429 C_cols[C_row_i_length] = col;
1430 c_index[col] = C_row_i_length;
1431 C_row_i_length++;
1432
1433 } else {
1434 C_row_i[c_index[col]] += Aval*Bvals[j];
1435 }
1436 }
1437 }
1438
 // Drain pass-1 results into combined_*, translating local column ids to
 // global ids via bcols, and restore the c_index "invalid" invariant.
1439 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1440 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1441 C_cols[ii] = bcols[C_cols[ii]];
1442 combined_index[ii] = C_cols[ii];
1443 combined_values[ii] = C_row_i[ii];
1444 }
1445 last_index = C_row_i_length;
1446
1447 //
1448 //Now put the C_row_i values into C.
1449 //
1450 // We might have to revamp this later.
1451 C_row_i_length = OrdinalTraits<size_t>::zero();
1452
 // Pass 2: columns of A that map to rows of B's imported part.
1453 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1454 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1455 const Scalar Aval = Avals[k];
1456 if (Aval == STS::zero())
1457 continue;
1458
 // Skip entries already handled by pass 1.
1459 if (Ak!=LO_INVALID) continue;
1460
1461 Ak = Acol2Irow[Acolind[k]];
1462 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1463 LocalOrdinal col = Icolind[j];
1464 //assert(col >= 0 && col < C_numCols);
1465
1466 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1467 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1468 // This has to be a += so insertGlobalValue goes out
1469 C_row_i[C_row_i_length] = Aval*Ivals[j];
1470 C_cols[C_row_i_length] = col;
1471 c_index[col] = C_row_i_length;
1472 C_row_i_length++;
1473
1474 } else {
1475 // This has to be a += so insertGlobalValue goes out
1476 C_row_i[c_index[col]] += Aval*Ivals[j];
1477 }
1478 }
1479 }
1480
 // Drain pass-2 results, translating via the import column Map's ids,
 // appending after the pass-1 entries (starting at last_index).
1481 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1482 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1483 C_cols[ii] = bcols_import[C_cols[ii]];
1484 combined_index[last_index] = C_cols[ii];
1485 combined_values[last_index] = C_row_i[ii];
1486 last_index++;
1487 }
1488
1489 // Now put the C_row_i values into C.
1490 // We might have to revamp this later.
 // sumInto when C's graph is fixed (fillComplete), insert otherwise.
1491 C_filled ?
1492 C.sumIntoGlobalValues(
1493 global_row,
1494 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1495 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1496 :
1497 C.insertGlobalValues(
1498 global_row,
1499 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1500 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1501
1502 }
1503}
1504
1505/*********************************************************************************************************/
1506template<class Scalar,
1507 class LocalOrdinal,
1508 class GlobalOrdinal,
1509 class Node>
1510void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1511 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1512 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1513
1514 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1515 Mview.maxNumRowEntries = Mview.indices[0].size();
1516
1517 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1518 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1519 Mview.maxNumRowEntries = Mview.indices[i].size();
1520 }
1521}
1522
1523/*********************************************************************************************************/
/// \brief Heuristic estimate of the nonzero count of the product C = A*B.
///
/// Follows the nonzero estimate in ML's ml_matmatmult.c: combine the mean
/// entries-per-row of A and B as (sqrt(a) + sqrt(b) - 1)^2, then scale by
/// A's local row count with a 0.75 discount and a constant pad of 100.
template<class CrsMatrixType>
size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
  // Default per-row density guess when a matrix looks degenerate.
  size_t meanEntriesA = 100;
  size_t meanEntriesB = 100;
  if (A.getNodeNumEntries() >= A.getNodeNumRows()) {
    meanEntriesA = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries() / A.getNodeNumRows() : 100;
  }
  if (B.getNodeNumEntries() >= B.getNodeNumRows()) {
    meanEntriesB = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries() / B.getNodeNumRows() : 100;
  }

  // Combined per-row estimate: truncate the root sum first, then square.
  size_t entriesPerRow = (size_t)(std::sqrt((double)meanEntriesA) + std::sqrt((double)meanEntriesB) - 1);
  entriesPerRow *= entriesPerRow;

  return (size_t)(A.getNodeNumRows()*entriesPerRow*0.75 + 100);
}
1538
1539
1540/*********************************************************************************************************/
1541// Kernel method for computing the local portion of C = A*B
1542//
1543// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1544// function, so this is probably the function we want to
1545// thread-parallelize.
1546template<class Scalar,
1547 class LocalOrdinal,
1548 class GlobalOrdinal,
1549 class Node>
1550void mult_A_B_newmatrix(
1551 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1552 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1553 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1554 const std::string& label,
1555 const Teuchos::RCP<Teuchos::ParameterList>& params)
1556{
1557 using Teuchos::Array;
1558 using Teuchos::ArrayRCP;
1559 using Teuchos::ArrayView;
1560 using Teuchos::RCP;
1561 using Teuchos::rcp;
1562
1563 // Tpetra typedefs
1564 typedef LocalOrdinal LO;
1565 typedef GlobalOrdinal GO;
1566 typedef Node NO;
1567 typedef Import<LO,GO,NO> import_type;
1568 typedef Map<LO,GO,NO> map_type;
1569
1570 // Kokkos typedefs
1571 typedef typename map_type::local_map_type local_map_type;
1573 typedef typename KCRS::StaticCrsGraphType graph_t;
1574 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1575 typedef typename NO::execution_space execution_space;
1576 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1577 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1578
1579#ifdef HAVE_TPETRA_MMM_TIMINGS
1580 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1581 using Teuchos::TimeMonitor;
1582 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1583#endif
1584 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1585
1586 // Build the final importer / column map, hash table lookups for C
1587 RCP<const import_type> Cimport;
1588 RCP<const map_type> Ccolmap;
1589 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1590 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1591 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1592 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1593 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1594 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1595 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1596 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1597
1598 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1599 // indices of B, to local column indices of C. (B and C have the
1600 // same number of columns.) The kernel uses this, instead of
1601 // copying the entire input matrix B and converting its column
1602 // indices to those of C.
1603 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1604
1605 if (Bview.importMatrix.is_null()) {
1606 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1607 Cimport = Bimport;
1608 Ccolmap = Bview.colMap;
1609 const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1610 // Bcol2Ccol is trivial
1611 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1612 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1613 KOKKOS_LAMBDA(const LO i) {
1614 Bcol2Ccol(i) = i;
1615 });
1616 }
1617 else {
1618 // mfh 27 Sep 2016: B has "remotes," so we need to build the
1619 // column Map of C, as well as C's Import object (from its domain
1620 // Map to its column Map). C's column Map is the union of the
1621 // column Maps of (the local part of) B, and the "remote" part of
1622 // B. Ditto for the Import. We have optimized this "setUnion"
1623 // operation on Import objects and Maps.
1624
1625 // Choose the right variant of setUnion
1626 if (!Bimport.is_null() && !Iimport.is_null()) {
1627 Cimport = Bimport->setUnion(*Iimport);
1628 }
1629 else if (!Bimport.is_null() && Iimport.is_null()) {
1630 Cimport = Bimport->setUnion();
1631 }
1632 else if (Bimport.is_null() && !Iimport.is_null()) {
1633 Cimport = Iimport->setUnion();
1634 }
1635 else {
1636 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1637 }
1638 Ccolmap = Cimport->getTargetMap();
1639
1640 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1641 // in general. We should get rid of it in order to reduce
1642 // communication costs of sparse matrix-matrix multiply.
1643 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1644 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1645
1646 // NOTE: This is not efficient and should be folded into setUnion
1647 //
1648 // mfh 27 Sep 2016: What the above comment means, is that the
1649 // setUnion operation on Import objects could also compute these
1650 // local index - to - local index look-up tables.
1651 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1652 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1653 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1654 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1655 });
1656 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1657 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1658 });
1659
1660 }
1661
1662 // Replace the column map
1663 //
1664 // mfh 27 Sep 2016: We do this because C was originally created
1665 // without a column Map. Now we have its column Map.
1666 C.replaceColMap(Ccolmap);
1667
1668 // mfh 27 Sep 2016: Construct tables that map from local column
1669 // indices of A, to local row indices of either B_local (the locally
1670 // owned part of B), or B_remote (the "imported" remote part of B).
1671 //
1672 // For column index Aik in row i of A, if the corresponding row of B
1673 // exists in the local part of B ("orig") (which I'll call B_local),
1674 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1675 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1676 //
1677 // For column index Aik in row i of A, if the corresponding row of B
1678 // exists in the remote part of B ("Import") (which I'll call
1679 // B_remote), then targetMapToImportRow[Aik] is the local index of
1680 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1681 // (a flag value).
1682
1683 // Run through all the hash table lookups once and for all
1684 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1685 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1686
1687 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1688 GO aidx = Acolmap_local.getGlobalElement(i);
1689 LO B_LID = Browmap_local.getLocalElement(aidx);
1690 if (B_LID != LO_INVALID) {
1691 targetMapToOrigRow(i) = B_LID;
1692 targetMapToImportRow(i) = LO_INVALID;
1693 } else {
1694 LO I_LID = Irowmap_local.getLocalElement(aidx);
1695 targetMapToOrigRow(i) = LO_INVALID;
1696 targetMapToImportRow(i) = I_LID;
1697
1698 }
1699 });
1700
1701 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1702 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1703 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1704
1705}
1706
1707
1708/*********************************************************************************************************/
1709// AB NewMatrix Kernel wrappers (Default non-threaded version)
// Serial (non-threaded) kernel computing the local part of C = A * (B_local + B_remote)
// into a brand-new matrix C, using classic low-memory CSR assembly.
//
// @param Aview/Bview            Local views of A and B (B may carry an imported remote part).
// @param targetMapToOrigRow     For each local column of A, the matching local row of
//                               B_local, or "invalid" if that row is remote.
// @param targetMapToImportRow   For each local column of A, the matching local row of
//                               B_remote, or "invalid" if that row is local.
// @param Bcol2Ccol / Icol2Ccol  Local-column translation tables from B / B_remote to C.
// @param C                      Output; filled via setAllValues + expertStaticFillComplete.
// @param Cimport                Precomputed Import handed to expertStaticFillComplete to
//                               avoid rebuilding it (communication-heavy).
// @param label, params          Timer label prefix and options ("sort entries",
//                               "compute global constants").
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                                                    const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                                                    const LocalOrdinalViewType & targetMapToImportRow,
                                                                                                                    const LocalOrdinalViewType & Bcol2Ccol,
                                                                                                                    const LocalOrdinalViewType & Icol2Ccol,
                                                                                                                    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                                                    Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
                                                                                                                    const std::string& label,
                                                                                                                    const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar            SC;
  typedef LocalOrdinal      LO;
  typedef GlobalOrdinal     GO;
  typedef Node              NO;
  typedef Map<LO,GO,NO>     map_type;
  // NOTE(review): ST_INVALID is LO's invalid value (-1) converted to size_t,
  // which yields SIZE_MAX; presumably identical to OrdinalTraits<size_t>::invalid()
  // used as INVALID below — confirm on platforms with unusual ordinal types.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();
  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();

  // Grab the  Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;

  // The imported ("remote") part of B, if any; its row-count bound is folded
  // into b_max_nnz_per_row so the reallocation heuristic below stays valid.
  c_lno_view_t  Irowptr;
  lno_nnz_view_t  Icolind;
  scalar_view_t  Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals   = lclB.values;
    b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
#endif

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
  // The method loops over rows of A, and may resize after processing
  // each row.  Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);

  // mfh 27 Sep 2016: The c_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  std::vector<size_t> c_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine.  The routine computes C := A * (B_local + B_remote).
  //
  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
  // you whether the corresponding row of B belongs to B_local
  // ("orig") or B_remote ("Import").

  // For each row of A/C
  // CSR_ip: next free slot in Ccolind/Cvals; OLD_ip: start of the current row,
  // used with c_status to decide "new entry" vs "accumulate into existing".
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix A
    // on the calling process.
    Crowptr[i] = CSR_ip;

    // mfh 27 Sep 2016: For each entry of A in the current row of A
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik  = Acolind[k]; // local column index of current entry of A
      const SC Aval = Avals[k];   // value of current entry of A
      if (Aval == SC_ZERO)
        continue; // skip explicitly stored zero values in A

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
        // corresponding to the current entry of A is populated, then
        // the corresponding row of B is in B_local (i.e., it lives on
        // the calling process).

        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        // mfh 27 Sep 2016: Go through all entries in that row of B_local.
        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = Aval*Bvals[j];
            CSR_ip++;

          } else {
            Cvals[c_status[Cij]] += Aval*Bvals[j];
          }
        }

      } else {
        // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
        // corresponding to the current entry of A NOT populated (has
        // a flag "invalid" value), then the corresponding row of B is
        // in B_local (i.e., it lives on the calling process).

        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = Aval*Ivals[j];
            CSR_ip++;
          } else {
            Cvals[c_status[Cij]] += Aval*Ivals[j];
          }
        }
      }
    }

    // Resize for next pass if needed
    // (Arowptr[i+2] is safe: i+1 < m implies i+2 <= m, and row_map has m+1 entries.)
    if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind,CSR_alloc);
      Kokkos::resize(Cvals,CSR_alloc);
    }
    OLD_ip = CSR_ip;
  }

  Crowptr[m] = CSR_ip;

  // Downward resize
  Kokkos::resize(Ccolind,CSR_ip);
  Kokkos::resize(Cvals,CSR_ip);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  {
    auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
#endif

    // Final sort & set of CRS arrays
    if (params.is_null() || params->get("sort entries",true))
      Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
    C.setAllValues(Crowptr,Ccolind, Cvals);


#ifdef HAVE_TPETRA_MMM_TIMINGS
  }
  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
  {
#endif

    // Final FillComplete
    //
    // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
    // Import (from domain Map to column Map) construction (which costs
    // lots of communication) by taking the previously constructed
    // Import object.  We should be able to do this without interfering
    // with the implementation of the local part of sparse matrix-matrix
    // multply above.
    RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
    labelList->set("Timer Label",label);
    if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
    RCP<const Export<LO,GO,NO> > dummyExport;
    C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
#ifdef HAVE_TPETRA_MMM_TIMINGS
  }
  MM2 = Teuchos::null;
#endif

}
1929
1930
1931
1932
1933/*********************************************************************************************************/
1934// Kernel method for computing the local portion of C = A*B (reuse)
1935template<class Scalar,
1936 class LocalOrdinal,
1937 class GlobalOrdinal,
1938 class Node>
1939void mult_A_B_reuse(
1940 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1941 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1942 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1943 const std::string& label,
1944 const Teuchos::RCP<Teuchos::ParameterList>& params)
1945{
1946 using Teuchos::Array;
1947 using Teuchos::ArrayRCP;
1948 using Teuchos::ArrayView;
1949 using Teuchos::RCP;
1950 using Teuchos::rcp;
1951
1952 // Tpetra typedefs
1953 typedef LocalOrdinal LO;
1954 typedef GlobalOrdinal GO;
1955 typedef Node NO;
1956 typedef Import<LO,GO,NO> import_type;
1957 typedef Map<LO,GO,NO> map_type;
1958
1959 // Kokkos typedefs
1960 typedef typename map_type::local_map_type local_map_type;
1962 typedef typename KCRS::StaticCrsGraphType graph_t;
1963 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1964 typedef typename NO::execution_space execution_space;
1965 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1966 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1967
1968#ifdef HAVE_TPETRA_MMM_TIMINGS
1969 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1970 using Teuchos::TimeMonitor;
1971 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1972#endif
1973 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1974
1975 // Grab all the maps
1976 RCP<const import_type> Cimport = C.getGraph()->getImporter();
1977 RCP<const map_type> Ccolmap = C.getColMap();
1978 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1979 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1980 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1981 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1982 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1983 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1984
1985 // Build the final importer / column map, hash table lookups for C
1986 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1987 {
1988 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1989 // So, column map of C may be a strict subset of the column map of B
1990 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1991 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1992 });
1993
1994 if (!Bview.importMatrix.is_null()) {
1995 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1996 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1997
1998 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1999 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2000 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2001 });
2002 }
2003 }
2004
2005 // Run through all the hash table lookups once and for all
2006 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2007 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2008 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2009 GO aidx = Acolmap_local.getGlobalElement(i);
2010 LO B_LID = Browmap_local.getLocalElement(aidx);
2011 if (B_LID != LO_INVALID) {
2012 targetMapToOrigRow(i) = B_LID;
2013 targetMapToImportRow(i) = LO_INVALID;
2014 } else {
2015 LO I_LID = Irowmap_local.getLocalElement(aidx);
2016 targetMapToOrigRow(i) = LO_INVALID;
2017 targetMapToImportRow(i) = I_LID;
2018
2019 }
2020 });
2021
2022 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2023 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2024 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2025}
2026
2027/*********************************************************************************************************/
// Serial (non-threaded) kernel computing C = A * (B_local + B_remote) when C
// already has a fill-complete static graph: only Cvals is rewritten, and any
// product entry that falls outside C's existing sparsity pattern is an error.
//
// @param Aview/Bview            Local views of A and B (B may carry an imported remote part).
// @param targetMapToOrigRow     For each local column of A, the matching local row of
//                               B_local, or "invalid" if that row is remote.
// @param targetMapToImportRow   For each local column of A, the matching local row of
//                               B_remote, or "invalid" if that row is local.
// @param Bcol2Ccol / Icol2Ccol  Local-column translation tables from B / B_remote to C.
// @param C                      Output; values reset and recomputed in place, then fillComplete'd.
// @throws std::runtime_error if a product entry is not present in C's static graph.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                                                const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                                                const LocalOrdinalViewType & targetMapToImportRow,
                                                                                                                const LocalOrdinalViewType & Bcol2Ccol,
                                                                                                                const LocalOrdinalViewType & Icol2Ccol,
                                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                                                Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
                                                                                                                const std::string& label,
                                                                                                                const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#else
  (void)label;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;


  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar            SC;
  typedef LocalOrdinal      LO;
  typedef GlobalOrdinal     GO;
  typedef Node              NO;
  typedef Map<LO,GO,NO>     map_type;
  // NOTE(review): ST_INVALID is LO's invalid value (-1) converted to size_t
  // (i.e., SIZE_MAX); it only serves as an out-of-range initial marker here.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the  Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = C.getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  // The imported ("remote") part of B, if any.
  c_lno_view_t  Irowptr;
  lno_nnz_view_t  Icolind;
  scalar_view_t  Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals   = lclB.values;
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
#endif

  // Classic csr assembly (low memory edition)
  // mfh 27 Sep 2016: The c_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C
  // Here [OLD_ip, CSR_ip) is the existing slot range of row i in C's static graph.
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {
    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    // Accumulate A(i,:) * B into the pre-allocated slots of row i of C.
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik  = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          // A slot outside [OLD_ip, CSR_ip) would be a new entry, which the
          // static graph cannot accept.
          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
            "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

          Cvals[c_status[Cij]] += Aval * Bvals[j];
        }

      } else {
        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
            "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

          Cvals[c_status[Cij]] += Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
#endif

  C.fillComplete(C.getDomainMap(), C.getRangeMap());
}
2169
2170
2171/*********************************************************************************************************/
2172// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2173template<class Scalar,
2174 class LocalOrdinal,
2175 class GlobalOrdinal,
2176 class Node>
2177void jacobi_A_B_newmatrix(
2178 Scalar omega,
2179 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2180 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2181 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2182 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2183 const std::string& label,
2184 const Teuchos::RCP<Teuchos::ParameterList>& params)
2185{
2186 using Teuchos::Array;
2187 using Teuchos::ArrayRCP;
2188 using Teuchos::ArrayView;
2189 using Teuchos::RCP;
2190 using Teuchos::rcp;
2191 // typedef Scalar SC;
2192 typedef LocalOrdinal LO;
2193 typedef GlobalOrdinal GO;
2194 typedef Node NO;
2195
2196 typedef Import<LO,GO,NO> import_type;
2197 typedef Map<LO,GO,NO> map_type;
2198 typedef typename map_type::local_map_type local_map_type;
2199
2200 // All of the Kokkos typedefs
2202 typedef typename KCRS::StaticCrsGraphType graph_t;
2203 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2204 typedef typename NO::execution_space execution_space;
2205 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2206 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2207
2208
2209#ifdef HAVE_TPETRA_MMM_TIMINGS
2210 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2211 using Teuchos::TimeMonitor;
2212 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2213#endif
2214 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2215
2216 // Build the final importer / column map, hash table lookups for C
2217 RCP<const import_type> Cimport;
2218 RCP<const map_type> Ccolmap;
2219 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2220 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2221 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2222 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2223 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2224 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2225 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2226 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2227
2228 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2229 // indices of B, to local column indices of C. (B and C have the
2230 // same number of columns.) The kernel uses this, instead of
2231 // copying the entire input matrix B and converting its column
2232 // indices to those of C.
2233 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2234
2235 if (Bview.importMatrix.is_null()) {
2236 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2237 Cimport = Bimport;
2238 Ccolmap = Bview.colMap;
    // Bcol2Ccol is trivial
2241
2242 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2243 Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2244 Bcol2Ccol(i) = static_cast<LO> (i);
2245 });
2246 } else {
2247 // mfh 27 Sep 2016: B has "remotes," so we need to build the
2248 // column Map of C, as well as C's Import object (from its domain
2249 // Map to its column Map). C's column Map is the union of the
2250 // column Maps of (the local part of) B, and the "remote" part of
2251 // B. Ditto for the Import. We have optimized this "setUnion"
2252 // operation on Import objects and Maps.
2253
2254 // Choose the right variant of setUnion
2255 if (!Bimport.is_null() && !Iimport.is_null()){
2256 Cimport = Bimport->setUnion(*Iimport);
2257 Ccolmap = Cimport->getTargetMap();
2258
2259 } else if (!Bimport.is_null() && Iimport.is_null()) {
2260 Cimport = Bimport->setUnion();
2261
2262 } else if(Bimport.is_null() && !Iimport.is_null()) {
2263 Cimport = Iimport->setUnion();
2264
2265 } else
2266 throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2267
2268 Ccolmap = Cimport->getTargetMap();
2269
2270 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2271 std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2272
2273 // NOTE: This is not efficient and should be folded into setUnion
2274 //
2275 // mfh 27 Sep 2016: What the above comment means, is that the
2276 // setUnion operation on Import objects could also compute these
2277 // local index - to - local index look-up tables.
2278 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2279 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2280 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2281 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2282 });
2283 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2284 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2285 });
2286
2287 }
2288
2289 // Replace the column map
2290 //
2291 // mfh 27 Sep 2016: We do this because C was originally created
2292 // without a column Map. Now we have its column Map.
2293 C.replaceColMap(Ccolmap);
2294
2295 // mfh 27 Sep 2016: Construct tables that map from local column
2296 // indices of A, to local row indices of either B_local (the locally
2297 // owned part of B), or B_remote (the "imported" remote part of B).
2298 //
2299 // For column index Aik in row i of A, if the corresponding row of B
2300 // exists in the local part of B ("orig") (which I'll call B_local),
2301 // then targetMapToOrigRow[Aik] is the local index of that row of B.
2302 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2303 //
  // For column index Aik in row i of A, if the corresponding row of B
  // exists in the remote part of B ("Import") (which I'll call
  // B_remote), then targetMapToImportRow[Aik] is the local index of
  // that row of B.  Otherwise, targetMapToImportRow[Aik] is "invalid"
  // (a flag value).
2309
2310 // Run through all the hash table lookups once and for all
2311 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2312 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2313 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2314 GO aidx = Acolmap_local.getGlobalElement(i);
2315 LO B_LID = Browmap_local.getLocalElement(aidx);
2316 if (B_LID != LO_INVALID) {
2317 targetMapToOrigRow(i) = B_LID;
2318 targetMapToImportRow(i) = LO_INVALID;
2319 } else {
2320 LO I_LID = Irowmap_local.getLocalElement(aidx);
2321 targetMapToOrigRow(i) = LO_INVALID;
2322 targetMapToImportRow(i) = I_LID;
2323
2324 }
2325 });
2326
2327 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2328 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2329 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2330
2331}
2332
2333
2334/*********************************************************************************************************/
2335// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2336// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2337
2338template<class Scalar,
2339 class LocalOrdinal,
2340 class GlobalOrdinal,
2341 class Node,
2342 class LocalOrdinalViewType>
2343void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2344 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2345 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2346 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2347 const LocalOrdinalViewType & targetMapToOrigRow,
2348 const LocalOrdinalViewType & targetMapToImportRow,
2349 const LocalOrdinalViewType & Bcol2Ccol,
2350 const LocalOrdinalViewType & Icol2Ccol,
2351 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2352 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2353 const std::string& label,
2354 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2355
2356#ifdef HAVE_TPETRA_MMM_TIMINGS
2357 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2358 using Teuchos::TimeMonitor;
2359 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2360#endif
2361
2362 using Teuchos::Array;
2363 using Teuchos::ArrayRCP;
2364 using Teuchos::ArrayView;
2365 using Teuchos::RCP;
2366 using Teuchos::rcp;
2367
2368 // Lots and lots of typedefs
2369 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2370 typedef typename KCRS::StaticCrsGraphType graph_t;
2371 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2372 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2373 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2374 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2375
2376 // Jacobi-specific
2377 typedef typename scalar_view_t::memory_space scalar_memory_space;
2378
2379 typedef Scalar SC;
2380 typedef LocalOrdinal LO;
2381 typedef GlobalOrdinal GO;
2382 typedef Node NO;
2383
2384 typedef Map<LO,GO,NO> map_type;
2385 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2386 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2387
2388 // Sizes
2389 RCP<const map_type> Ccolmap = C.getColMap();
2390 size_t m = Aview.origMatrix->getNodeNumRows();
2391 size_t n = Ccolmap->getNodeNumElements();
2392 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2393
2394 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2395 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2396 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2397
2398 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2399 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2400 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2401
2402 c_lno_view_t Irowptr;
2403 lno_nnz_view_t Icolind;
2404 scalar_view_t Ivals;
2405 if(!Bview.importMatrix.is_null()) {
2406 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2407 Irowptr = lclB.graph.row_map;
2408 Icolind = lclB.graph.entries;
2409 Ivals = lclB.values;
2410 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2411 }
2412
2413 // Jacobi-specific inner stuff
2414 auto Dvals =
2415 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2416
2417 // Teuchos::ArrayView::operator[].
2418 // The status array will contain the index into colind where this entry was last deposited.
2419 // c_status[i] < CSR_ip - not in the row yet.
2420 // c_status[i] >= CSR_ip, this is the entry where you can find the data
2421 // We start with this filled with INVALID's indicating that there are no entries yet.
2422 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2423 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2424 Array<size_t> c_status(n, ST_INVALID);
2425
2426 // Classic csr assembly (low memory edition)
2427 //
2428 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2429 // The method loops over rows of A, and may resize after processing
2430 // each row. Chris Siefert says that this reflects experience in
2431 // ML; for the non-threaded case, ML found it faster to spend less
2432 // effort on estimation and risk an occasional reallocation.
2433 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2434 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2435 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2436 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2437 size_t CSR_ip = 0, OLD_ip = 0;
2438
2439 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2440
2441 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2442 // routine. The routine computes
2443 //
2444 // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2445 //
2446 // This corresponds to one sweep of (weighted) Jacobi.
2447 //
2448 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2449 // you whether the corresponding row of B belongs to B_local
2450 // ("orig") or B_remote ("Import").
2451
2452 // For each row of A/C
2453 for (size_t i = 0; i < m; i++) {
2454 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2455 // on the calling process.
2456 Crowptr[i] = CSR_ip;
2457 SC minusOmegaDval = -omega*Dvals(i,0);
2458
2459 // Entries of B
2460 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2461 Scalar Bval = Bvals[j];
2462 if (Bval == SC_ZERO)
2463 continue;
2464 LO Bij = Bcolind[j];
2465 LO Cij = Bcol2Ccol[Bij];
2466
2467 // Assume no repeated entries in B
2468 c_status[Cij] = CSR_ip;
2469 Ccolind[CSR_ip] = Cij;
2470 Cvals[CSR_ip] = Bvals[j];
2471 CSR_ip++;
2472 }
2473
2474 // Entries of -omega * Dinv * A * B
2475 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2476 LO Aik = Acolind[k];
2477 const SC Aval = Avals[k];
2478 if (Aval == SC_ZERO)
2479 continue;
2480
2481 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2482 // Local matrix
2483 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2484
2485 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2486 LO Bkj = Bcolind[j];
2487 LO Cij = Bcol2Ccol[Bkj];
2488
2489 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2490 // New entry
2491 c_status[Cij] = CSR_ip;
2492 Ccolind[CSR_ip] = Cij;
2493 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2494 CSR_ip++;
2495
2496 } else {
2497 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2498 }
2499 }
2500
2501 } else {
2502 // Remote matrix
2503 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2504 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2505 LO Ikj = Icolind[j];
2506 LO Cij = Icol2Ccol[Ikj];
2507
2508 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2509 // New entry
2510 c_status[Cij] = CSR_ip;
2511 Ccolind[CSR_ip] = Cij;
2512 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2513 CSR_ip++;
2514 } else {
2515 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2516 }
2517 }
2518 }
2519 }
2520
2521 // Resize for next pass if needed
2522 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2523 CSR_alloc *= 2;
2524 Kokkos::resize(Ccolind,CSR_alloc);
2525 Kokkos::resize(Cvals,CSR_alloc);
2526 }
2527 OLD_ip = CSR_ip;
2528 }
2529 Crowptr[m] = CSR_ip;
2530
2531 // Downward resize
2532 Kokkos::resize(Ccolind,CSR_ip);
2533 Kokkos::resize(Cvals,CSR_ip);
2534
2535 {
2536#ifdef HAVE_TPETRA_MMM_TIMINGS
2537 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2538#endif
2539
2540 // Replace the column map
2541 //
2542 // mfh 27 Sep 2016: We do this because C was originally created
2543 // without a column Map. Now we have its column Map.
2544 C.replaceColMap(Ccolmap);
2545
2546 // Final sort & set of CRS arrays
2547 //
2548 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2549 // matrix-matrix multiply routine sort the entries for us?
2550 // Final sort & set of CRS arrays
2551 if (params.is_null() || params->get("sort entries",true))
2552 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2553 C.setAllValues(Crowptr,Ccolind, Cvals);
2554 }
2555 {
2556#ifdef HAVE_TPETRA_MMM_TIMINGS
2557 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2558#endif
2559
2560 // Final FillComplete
2561 //
2562 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2563 // Import (from domain Map to column Map) construction (which costs
2564 // lots of communication) by taking the previously constructed
2565 // Import object. We should be able to do this without interfering
2566 // with the implementation of the local part of sparse matrix-matrix
2567 // multply above
2568 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2569 labelList->set("Timer Label",label);
2570 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2571 RCP<const Export<LO,GO,NO> > dummyExport;
2572 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2573
2574 }
2575}
2576
2577
2578/*********************************************************************************************************/
2579// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2580template<class Scalar,
2581 class LocalOrdinal,
2582 class GlobalOrdinal,
2583 class Node>
2584void jacobi_A_B_reuse(
2585 Scalar omega,
2586 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2587 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2588 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2589 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2590 const std::string& label,
2591 const Teuchos::RCP<Teuchos::ParameterList>& params)
2592{
2593 using Teuchos::Array;
2594 using Teuchos::ArrayRCP;
2595 using Teuchos::ArrayView;
2596 using Teuchos::RCP;
2597 using Teuchos::rcp;
2598
2599 typedef LocalOrdinal LO;
2600 typedef GlobalOrdinal GO;
2601 typedef Node NO;
2602
2603 typedef Import<LO,GO,NO> import_type;
2604 typedef Map<LO,GO,NO> map_type;
2605
2606 // Kokkos typedefs
2607 typedef typename map_type::local_map_type local_map_type;
2609 typedef typename KCRS::StaticCrsGraphType graph_t;
2610 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2611 typedef typename NO::execution_space execution_space;
2612 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2613 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2614
2615#ifdef HAVE_TPETRA_MMM_TIMINGS
2616 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2617 using Teuchos::TimeMonitor;
2618 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2619#endif
2620 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2621
2622 // Grab all the maps
2623 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2624 RCP<const map_type> Ccolmap = C.getColMap();
2625 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2626 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2627 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2628 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2629 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2630 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2631
2632 // Build the final importer / column map, hash table lookups for C
2633 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2634 {
2635 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2636 // So, column map of C may be a strict subset of the column map of B
2637 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2638 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2639 });
2640
2641 if (!Bview.importMatrix.is_null()) {
2642 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2643 std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2644
2645 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2646 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2647 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2648 });
2649 }
2650 }
2651
2652 // Run through all the hash table lookups once and for all
2653 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2654 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2655 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2656 GO aidx = Acolmap_local.getGlobalElement(i);
2657 LO B_LID = Browmap_local.getLocalElement(aidx);
2658 if (B_LID != LO_INVALID) {
2659 targetMapToOrigRow(i) = B_LID;
2660 targetMapToImportRow(i) = LO_INVALID;
2661 } else {
2662 LO I_LID = Irowmap_local.getLocalElement(aidx);
2663 targetMapToOrigRow(i) = LO_INVALID;
2664 targetMapToImportRow(i) = I_LID;
2665
2666 }
2667 });
2668
2669#ifdef HAVE_TPETRA_MMM_TIMINGS
2670 MM = Teuchos::null;
2671#endif
2672
2673 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2674 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2675 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2676}
2677
2678
2679
2680/*********************************************************************************************************/
// Serial (non-threaded) "reuse" kernel: recomputes the values of
// C = (I - omega * D^{-1} * A) * (B_local + B_remote) in place, assuming C's
// graph (sparsity pattern) is fixed from a previous "newmatrix" call.  Any
// attempt to produce an entry outside C's existing pattern is an error.
// Cimport and params are unused here (the graph is static); they are kept in
// the signature for the KernelWrappers2 specialization interface.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
                                                                                               const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
                                                                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                               const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                               const LocalOrdinalViewType & targetMapToImportRow,
                                                                                               const LocalOrdinalViewType & Bcol2Ccol,
                                                                                               const LocalOrdinalViewType & Icol2Ccol,
                                                                                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                               Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
                                                                                               const std::string& label,
                                                                                               const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#else
  (void)label;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  // Memory space in which D^{-1}'s local data must live.
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // "No entry yet" flag for c_status.  NOTE: this converts LO's invalid (-1)
  // to size_t; that sign-extends to size_t's max, i.e. the same value as
  // OrdinalTraits<size_t>::invalid().
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes: m = local rows of A/C, n = local columns of C.
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = C.getLocalMatrixHost();

  // C's structure (Crowptr/Ccolind) is read-only here; only Cvals is written.
  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  // Views of the remote ("imported") part of B, if any; empty otherwise.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
  }

  // Jacobi-specific inner stuff: local (rank-2) view of the D^{-1} vector.
  auto Dvals =
       Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
#endif

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] < CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C.  Here [OLD_ip, CSR_ip) is row i's slot range in
  // C's existing (static) CRS arrays; c_status entries inside that window
  // mark columns that row i is allowed to touch.
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {

    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B: the identity term I*B contributes row i of B directly.
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // Column Cij must already be in row i's window; a miss means B gained
      // an entry C's static graph cannot hold.
      TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                 std::runtime_error, "Trying to insert a new entry into a static graph");

      Cvals[c_status[Cij]] = Bvals[j];
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix: row Aik of B lives in B_local.
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                     std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
        }

      } else {
        // Remote matrix: row Aik of B was imported into B_remote.
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                     std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
#endif

  // Values changed but the graph did not, so a plain fillComplete suffices.
  C.fillComplete(C.getDomainMap(), C.getRangeMap());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM = Teuchos::null;
#endif

}
2846
2847
2848
2849/*********************************************************************************************************/
2850template<class Scalar,
2851 class LocalOrdinal,
2852 class GlobalOrdinal,
2853 class Node>
2854void import_and_extract_views(
2855 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2856 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2857 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2858 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2859 bool userAssertsThereAreNoRemotes,
2860 const std::string& label,
2861 const Teuchos::RCP<Teuchos::ParameterList>& params)
2862{
2863 using Teuchos::Array;
2864 using Teuchos::ArrayView;
2865 using Teuchos::RCP;
2866 using Teuchos::rcp;
2867 using Teuchos::null;
2868
2869 typedef Scalar SC;
2870 typedef LocalOrdinal LO;
2871 typedef GlobalOrdinal GO;
2872 typedef Node NO;
2873
2874 typedef Map<LO,GO,NO> map_type;
2875 typedef Import<LO,GO,NO> import_type;
2876 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2877
2878#ifdef HAVE_TPETRA_MMM_TIMINGS
2879 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2880 using Teuchos::TimeMonitor;
2881 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2882#endif
2883 // The goal of this method is to populate the 'Aview' struct with views of the
2884 // rows of A, including all rows that correspond to elements in 'targetMap'.
2885 //
2886 // If targetMap includes local elements that correspond to remotely-owned rows
2887 // of A, then those remotely-owned rows will be imported into
2888 // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2889 Aview.deleteContents();
2890
2891 Aview.origMatrix = rcp(&A, false);
2892 Aview.origRowMap = A.getRowMap();
2893 Aview.rowMap = targetMap;
2894 Aview.colMap = A.getColMap();
2895 Aview.domainMap = A.getDomainMap();
2896 Aview.importColMap = null;
2897 RCP<const map_type> rowMap = A.getRowMap();
2898 const int numProcs = rowMap->getComm()->getSize();
2899
2900 // Short circuit if the user swears there are no remotes (or if we're in serial)
2901 if (userAssertsThereAreNoRemotes || numProcs < 2)
2902 return;
2903
2904 RCP<const import_type> importer;
2905 if (params != null && params->isParameter("importer")) {
2906 importer = params->get<RCP<const import_type> >("importer");
2907
2908 } else {
2909#ifdef HAVE_TPETRA_MMM_TIMINGS
2910 MM = Teuchos::null;
2911 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2912#endif
2913
2914 // Mark each row in targetMap as local or remote, and go ahead and get a view
2915 // for the local rows
2916 RCP<const map_type> remoteRowMap;
2917 size_t numRemote = 0;
2918 int mode = 0;
2919 if (!prototypeImporter.is_null() &&
2920 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2921 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2922 // We have a valid prototype importer --- ask it for the remotes
2923#ifdef HAVE_TPETRA_MMM_TIMINGS
2924 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
2925#endif
2926 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2927 numRemote = prototypeImporter->getNumRemoteIDs();
2928
2929 Array<GO> remoteRows(numRemote);
2930 for (size_t i = 0; i < numRemote; i++)
2931 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2932
2933 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2934 rowMap->getIndexBase(), rowMap->getComm()));
2935 mode = 1;
2936
2937 } else if (prototypeImporter.is_null()) {
2938 // No prototype importer --- count the remotes the hard way
2939#ifdef HAVE_TPETRA_MMM_TIMINGS
2940 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
2941#endif
2942 ArrayView<const GO> rows = targetMap->getNodeElementList();
2943 size_t numRows = targetMap->getNodeNumElements();
2944
2945 Array<GO> remoteRows(numRows);
2946 for(size_t i = 0; i < numRows; ++i) {
2947 const LO mlid = rowMap->getLocalElement(rows[i]);
2948
2949 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2950 remoteRows[numRemote++] = rows[i];
2951 }
2952 remoteRows.resize(numRemote);
2953 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2954 rowMap->getIndexBase(), rowMap->getComm()));
2955 mode = 2;
2956
2957 } else {
2958 // PrototypeImporter is bad. But if we're in serial that's OK.
2959 mode = 3;
2960 }
2961
2962 if (numProcs < 2) {
2963 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2964 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2965 // If only one processor we don't need to import any remote rows, so return.
2966 return;
2967 }
2968
2969 //
2970 // Now we will import the needed remote rows of A, if the global maximum
2971 // value of numRemote is greater than 0.
2972 //
2973#ifdef HAVE_TPETRA_MMM_TIMINGS
2974 MM = Teuchos::null;
2975 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2976#endif
2977
2978 global_size_t globalMaxNumRemote = 0;
2979 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2980
2981 if (globalMaxNumRemote > 0) {
2982#ifdef HAVE_TPETRA_MMM_TIMINGS
2983 MM = Teuchos::null;
2984 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2985#endif
2986 // Create an importer with target-map remoteRowMap and source-map rowMap.
2987 if (mode == 1)
2988 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2989 else if (mode == 2)
2990 importer = rcp(new import_type(rowMap, remoteRowMap));
2991 else
2992 throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2993 }
2994
2995 if (params != null)
2996 params->set("importer", importer);
2997 }
2998
2999 if (importer != null) {
3000#ifdef HAVE_TPETRA_MMM_TIMINGS
3001 MM = Teuchos::null;
3002 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3003#endif
3004
3005 // Now create a new matrix into which we can import the remote rows of A that we need.
3006 Teuchos::ParameterList labelList;
3007 labelList.set("Timer Label", label);
3008 auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3009
3010 bool isMM = true;
3011 bool overrideAllreduce = false;
3012 int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3013 // Minor speedup tweak - avoid computing the global constants
3014 Teuchos::ParameterList params_sublist;
3015 if(!params.is_null()) {
3016 labelList.set("compute global constants", params->get("compute global constants",false));
3017 params_sublist = params->sublist("matrixmatrix: kernel params",false);
3018 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3019 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3020 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3021 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3022 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3023 }
3024 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3025 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3026 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3027
3028 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3029 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3030
3031#if 0
3032 // Disabled code for dumping input matrices
3033 static int count=0;
3034 char str[80];
3035 sprintf(str,"import_matrix.%d.dat",count);
3037 count++;
3038#endif
3039
3040#ifdef HAVE_TPETRA_MMM_STATISTICS
3041 printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3042#endif
3043
3044
3045#ifdef HAVE_TPETRA_MMM_TIMINGS
3046 MM = Teuchos::null;
3047 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3048#endif
3049
3050 // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3051 Aview.importColMap = Aview.importMatrix->getColMap();
3052#ifdef HAVE_TPETRA_MMM_TIMINGS
3053 MM = Teuchos::null;
3054#endif
3055 }
3056}
3057
3058
3059
3060
3061
/*********************************************************************************************************/
// This only merges matrices that look like B & Bimport, aka, they have no overlapping rows.
// The merged local matrix has one row per local column of A (see merge_numrows
// below), with column indices already remapped into C's column map via
// Bcol2Ccol / Icol2Ccol.  Acol2Brow / Acol2Irow map a local column of A to the
// owning local row of B / Bimport; an invalid entry in Acol2Brow selects the
// Bimport row.  If there is no Bimport and A's colMap object is B's rowMap
// object, B's local matrix is returned unchanged (the "easy case" at the end).
template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
               const LocalOrdinalViewType & Acol2Brow,
               const LocalOrdinalViewType & Acol2Irow,
               const LocalOrdinalViewType & Bcol2Ccol,
               const LocalOrdinalViewType & Icol2Ccol,
               const size_t mergedNodeNumCols) {

  using Teuchos::RCP;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  // Grab the Kokkos::SparseCrsMatrices
  const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();

  // We need to do this dance if either (a) we have a Bimport, or (b) we don't
  // have a Bimport but A's colMap is not the same object as B's rowMap.
  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
    // We do have a Bimport
    // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
    // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
    RCP<const KCRS> Ik_;
    // NOTE(review): if getLocalMatrixDevice() returns by value, rcpFromRef
    // binds to a temporary and Ik_/Iks below could read a dead object —
    // verify against this Tpetra version's CrsMatrix API.
    if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
    const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
    // Iks is a by-value copy so the device lambdas below can capture it.
    KCRS Iks;
    if(Ik!=0) Iks = *Ik;
    // One merged row per local column of A.
    size_t merge_numrows = Ak.numCols();
    // The last entry of this, at least, needs to be initialized; the scan
    // below fills in the rest.
    lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);

    const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();

    // Use a Kokkos::parallel_scan to build the rowptr: an exclusive prefix sum
    // of per-row entry counts, taken from B when Acol2Brow(i) is valid and
    // from Bimport (Iks) otherwise.  The final pass also writes the last entry.
    typedef typename Node::execution_space execution_space;
    typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
    Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
      KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
        if(final) Mrowptr(i) = update;
        // Get the row count
        size_t ct=0;
        if(Acol2Brow(i)!=LO_INVALID)
          ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
        else
          ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
        update+=ct;

        if(final && i+1==merge_numrows)
          Mrowptr(i+1)=update;
      });

    // Allocate nnz (total entry count is the last rowptr entry, read on host).
    size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
    lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
    scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);

    // Use a Kokkos::parallel_for to fill the colind/values arrays: copy each
    // merged row's entries from B (or Bimport), remapping column indices into
    // C's column map as we go.
    typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
    Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
        if(Acol2Brow(i)!=LO_INVALID) {
          size_t row = Acol2Brow(i);
          size_t start = Bk.graph.row_map(row);
          for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
            Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
            Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
          }
        }
        else {
          size_t row = Acol2Irow(i);
          size_t start = Iks.graph.row_map(row);
          for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
            Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
            Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
          }
        }
      });

    // Assemble the merged local matrix from the three CRS arrays.
    KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
    return newmat;
  }
  else {
    // We don't have a Bimport (the easy case): B's local matrix is already
    // indexed compatibly, so return it as-is.
    return Bk;
  }
}//end merge_matrices
3152
3153template<typename SC, typename LO, typename GO, typename NO>
3154void AddKernels<SC, LO, GO, NO>::
3155addSorted(
3156 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3157 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3158 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3159 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3160 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3161 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3162 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3163 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3164 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3165 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3166 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3167{
3168 using Teuchos::TimeMonitor;
3169 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3170 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3171 auto nrows = Arowptrs.extent(0) - 1;
3172 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3173 typename AddKern::KKH handle;
3174 handle.create_spadd_handle(true);
3175 auto addHandle = handle.get_spadd_handle();
3176#ifdef HAVE_TPETRA_MMM_TIMINGS
3177 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3178#endif
3179 KokkosSparse::Experimental::spadd_symbolic
3180 <KKH,
3181 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3182 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3183 row_ptrs_array, col_inds_array>
3184 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3185 //KokkosKernels requires values to be zeroed
3186 Cvals = values_array("C values", addHandle->get_c_nnz());
3187 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3188#ifdef HAVE_TPETRA_MMM_TIMINGS
3189 MM = Teuchos::null;
3190 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3191#endif
3192 KokkosSparse::Experimental::spadd_numeric(&handle,
3193 Arowptrs, Acolinds, Avals, scalarA,
3194 Browptrs, Bcolinds, Bvals, scalarB,
3195 Crowptrs, Ccolinds, Cvals);
3196}
3197
3198template<typename SC, typename LO, typename GO, typename NO>
3199void AddKernels<SC, LO, GO, NO>::
3200addUnsorted(
3201 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3202 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3203 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3204 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3205 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3206 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3207 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3208 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3209 GO /* numGlobalCols */,
3210 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3211 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3212 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3213{
3214 using Teuchos::TimeMonitor;
3215 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3216 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3217 auto nrows = Arowptrs.extent(0) - 1;
3218 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3219 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3220 typename AddKern::KKH handle;
3221 handle.create_spadd_handle(false);
3222 auto addHandle = handle.get_spadd_handle();
3223#ifdef HAVE_TPETRA_MMM_TIMINGS
3224 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3225#endif
3226 KokkosSparse::Experimental::spadd_symbolic
3227 <KKH,
3228 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3229 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3230 row_ptrs_array, col_inds_array>
3231 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3232 //Cvals must be zeroed out
3233 Cvals = values_array("C values", addHandle->get_c_nnz());
3234 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3235#ifdef HAVE_TPETRA_MMM_TIMINGS
3236 MM = Teuchos::null;
3237 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3238#endif
3239 KokkosSparse::Experimental::spadd_numeric(&handle,
3240 Arowptrs, Acolinds, Avals, scalarA,
3241 Browptrs, Bcolinds, Bvals, scalarB,
3242 Crowptrs, Ccolinds, Cvals);
3243}
3244
3245template<typename GO,
3246 typename LocalIndicesType,
3247 typename GlobalIndicesType,
3248 typename ColMapType>
3249struct ConvertLocalToGlobalFunctor
3250{
3251 ConvertLocalToGlobalFunctor(
3252 const LocalIndicesType& colindsOrig_,
3253 const GlobalIndicesType& colindsConverted_,
3254 const ColMapType& colmap_) :
3255 colindsOrig (colindsOrig_),
3256 colindsConverted (colindsConverted_),
3257 colmap (colmap_)
3258 {}
3259 KOKKOS_INLINE_FUNCTION void
3260 operator() (const GO i) const
3261 {
3262 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3263 }
3264 LocalIndicesType colindsOrig;
3265 GlobalIndicesType colindsConverted;
3266 ColMapType colmap;
3267};
3268
// Compute the local sum C = scalarA*A + scalarB*B when A and B have different
// column maps: first translate both matrices' local column indices to global
// indices (via AcolMap / BcolMap), then run the unsorted KokkosKernels spadd
// on the global-index arrays.  Consequently C's column indices are *global*
// (Ccolinds is a global_col_inds_array).
//
//   A, B              - local CRS matrices to add (read only)
//   scalarA, scalarB  - coefficients applied to A's / B's values
//   AcolMap, BcolMap  - local column maps used for index translation
//   Cvals/Crowptrs/Ccolinds - [out] CRS arrays of the sum, allocated here
template<typename SC, typename LO, typename GO, typename NO>
void MMdetails::AddKernels<SC, LO, GO, NO>::
convertToGlobalAndAdd(
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
{
  using Teuchos::TimeMonitor;
  //Need to use a different KokkosKernelsHandle type than other versions,
  //since the ordinals are now GO
  using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
              typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;

  // Shallow copies of the input CRS arrays.
  const values_array Avals = A.values;
  const values_array Bvals = B.values;
  const col_inds_array Acolinds = A.graph.entries;
  const col_inds_array Bcolinds = B.graph.entries;
  auto Arowptrs = A.graph.row_map;
  auto Browptrs = B.graph.row_map;
  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
#ifdef HAVE_TPETRA_MMM_TIMINGS
  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
#endif
  // Translate A's and B's local column indices to global, in parallel.
  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
  KKH_GO handle;
  // "false": the converted global indices are not assumed sorted within rows.
  handle.create_spadd_handle(false);
  auto addHandle = handle.get_spadd_handle();
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
#endif
  auto nrows = Arowptrs.extent(0) - 1;
  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
  // Symbolic phase: fills Crowptrs and determines C's entry count.
  KokkosSparse::Experimental::spadd_symbolic
    <KKH_GO, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
    (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
  // Cvals is value-initialized (zeroed) here — a KokkosKernels requirement.
  Cvals = values_array("C values", addHandle->get_c_nnz());
  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
#endif
  // Numeric phase: fills Ccolinds and Cvals with the scaled sum.
  KokkosSparse::Experimental::spadd_numeric(&handle,
    Arowptrs, AcolindsConverted, Avals, scalarA,
    Browptrs, BcolindsConverted, Bvals, scalarB,
    Crowptrs, Ccolinds, Cvals);
}
3326
3327
} //End namespace MMdetails
3329
3330} //End namespace Tpetra
3331
3332/*********************************************************************************************************/
3333//
3334// Explicit instantiation macro
3335//
3336// Must be expanded from within the Tpetra namespace!
3337//
namespace Tpetra {

// Explicit instantiation macro for the TpetraExt MatrixMatrix functions.
// For one (SCALAR, LO, GO, NODE) combination it instantiates:
//   - MatrixMatrix::Multiply
//   - MatrixMatrix::Jacobi
//   - both MatrixMatrix::Add overloads
//   - both MatrixMatrix::add overloads (RCP-returning and in-place)
//   - the MMdetails::AddKernels helper struct
// NOTE: must be expanded from within the Tpetra namespace (see above).
#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
  template \
  void MatrixMatrix::Multiply( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    bool transposeB, \
    CrsMatrix< SCALAR , LO , GO , NODE >& C, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
template \
  void MatrixMatrix::Jacobi( \
    SCALAR omega, \
    const Vector< SCALAR, LO, GO, NODE > & Dinv, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    CrsMatrix< SCALAR , LO , GO , NODE >& C, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void MatrixMatrix::Add( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    SCALAR scalarA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    bool transposeB, \
    SCALAR scalarB, \
    Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
\
  template \
  void MatrixMatrix::Add( \
    const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
    bool transposeA, \
    SCALAR scalarA, \
    CrsMatrix<SCALAR, LO, GO, NODE>& B, \
    SCALAR scalarB ); \
\
  template \
  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
   (const SCALAR & alpha, \
    const bool transposeA, \
    const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
    const SCALAR & beta, \
    const bool transposeB, \
    const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
    const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
    const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void \
  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
   (const SCALAR & alpha, \
    const bool transposeA, \
    const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
    const SCALAR& beta, \
    const bool transposeB, \
    const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
    CrsMatrix< SCALAR , LO, GO , NODE >& C, \
    const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
    const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;

} //End namespace Tpetra
3411
3412#endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed dense vector.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.