MueLu Version of the Day
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48
49#ifdef HAVE_MUELU_KOKKOS_REFACTOR
50#include <Kokkos_Core.hpp>
51#include <KokkosSparse_CrsMatrix.hpp>
52
53#include "Xpetra_Matrix.hpp"
54
56
57#include "MueLu_AmalgamationInfo_kokkos.hpp"
58#include "MueLu_Exceptions.hpp"
59#include "MueLu_Level.hpp"
60#include "MueLu_LWGraph_kokkos.hpp"
61#include "MueLu_MasterList.hpp"
62#include "MueLu_Monitor.hpp"
63#include "MueLu_Utilities_kokkos.hpp"
64
65namespace MueLu {
66
67
68 namespace CoalesceDrop_Kokkos_Details { // anonymous
69
70 template<class LO, class RowType>
71 class ScanFunctor {
72 public:
73 ScanFunctor(RowType rows_) : rows(rows_) { }
74
75 KOKKOS_INLINE_FUNCTION
76 void operator()(const LO i, LO& upd, const bool& final) const {
77 upd += rows(i);
78 if (final)
79 rows(i) = upd;
80 }
81
82 private:
83 RowType rows;
84 };
85
86 template<class LO, class GhostedViewType>
87 class ClassicalDropFunctor {
88 private:
89 typedef typename GhostedViewType::value_type SC;
90 typedef Kokkos::ArithTraits<SC> ATS;
91 typedef typename ATS::magnitudeType magnitudeType;
92
93 GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94 magnitudeType eps;
95
96 public:
97 ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold) :
98 diag(ghostedDiag),
99 eps(threshold)
100 { }
101
102 // Return true if we drop, false if not
103 KOKKOS_FORCEINLINE_FUNCTION
104 bool operator()(LO row, LO col, SC val) const {
105 // We avoid square root by using squared values
106 auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
107 auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
108
109 return (aij2 <= eps*eps * aiiajj);
110 }
111 };
112
113 template<class LO, class CoordsType>
114 class DistanceFunctor {
115 private:
116 typedef typename CoordsType::value_type SC;
117 typedef Kokkos::ArithTraits<SC> ATS;
118 typedef typename ATS::magnitudeType magnitudeType;
119
120 public:
121 typedef SC value_type;
122
123 public:
124 DistanceFunctor(CoordsType coords_) : coords(coords_) { }
125
126 KOKKOS_INLINE_FUNCTION
127 magnitudeType distance2(LO row, LO col) const {
128 SC d = ATS::zero(), s;
129 for (size_t j = 0; j < coords.extent(1); j++) {
130 s = coords(row,j) - coords(col,j);
131 d += s*s;
132 }
133 return ATS::magnitude(d);
134 }
135 private:
136 CoordsType coords;
137 };
138
139 template<class LO, class GhostedViewType, class DistanceFunctor>
140 class DistanceLaplacianDropFunctor {
141 private:
142 typedef typename GhostedViewType::value_type SC;
143 typedef Kokkos::ArithTraits<SC> ATS;
144 typedef typename ATS::magnitudeType magnitudeType;
145
146 public:
147 DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold) :
148 diag(ghostedLaplDiag),
149 distFunctor(distFunctor_),
150 eps(threshold)
151 { }
152
153 // Return true if we drop, false if not
154 KOKKOS_INLINE_FUNCTION
155 bool operator()(LO row, LO col, SC /* val */) const {
156 // We avoid square root by using squared values
157
158 // We ignore incoming value of val as we operate on an auxiliary
159 // distance Laplacian matrix
160 typedef typename DistanceFunctor::value_type dSC;
161 typedef Kokkos::ArithTraits<dSC> dATS;
162 auto fval = dATS::one() / distFunctor.distance2(row, col);
163
164 auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
165 auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
166
167 return (aij2 <= eps*eps * aiiajj);
168 }
169
170 private:
171 GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
172 DistanceFunctor distFunctor;
173 magnitudeType eps;
174 };
175
176 template<class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177 class ScalarFunctor {
178 private:
179 typedef typename MatrixType::StaticCrsGraphType graph_type;
180 typedef typename graph_type::row_map_type rows_type;
181 typedef typename graph_type::entries_type cols_type;
182 typedef typename MatrixType::values_type vals_type;
183 typedef Kokkos::ArithTraits<SC> ATS;
184 typedef typename ATS::magnitudeType magnitudeType;
185
186 public:
187 ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
188 typename rows_type::non_const_type rows_,
189 typename cols_type::non_const_type colsAux_,
190 typename vals_type::non_const_type valsAux_,
191 bool reuseGraph_, bool lumping_, SC /* threshold_ */) :
192 A(A_),
193 bndNodes(bndNodes_),
194 dropFunctor(dropFunctor_),
195 rows(rows_),
196 colsAux(colsAux_),
197 valsAux(valsAux_),
198 reuseGraph(reuseGraph_),
199 lumping(lumping_)
200 {
201 rowsA = A.graph.row_map;
202 zero = ATS::zero();
203 }
204
205 KOKKOS_INLINE_FUNCTION
206 void operator()(const LO row, LO& nnz) const {
207 auto rowView = A.rowConst(row);
208 auto length = rowView.length;
209 auto offset = rowsA(row);
210
211 SC diag = zero;
212 LO rownnz = 0;
213 LO diagID = -1;
214 for (decltype(length) colID = 0; colID < length; colID++) {
215 LO col = rowView.colidx(colID);
216 SC val = rowView.value (colID);
217
218 if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
219 colsAux(offset+rownnz) = col;
220
221 LO valID = (reuseGraph ? colID : rownnz);
222 valsAux(offset+valID) = val;
223 if (row == col)
224 diagID = valID;
225
226 rownnz++;
227
228 } else {
229 // Rewrite with zeros (needed for reuseGraph)
230 valsAux(offset+colID) = zero;
231 diag += val;
232 }
233 }
234 // How to assert on the device?
235 // assert(diagIndex != -1);
236 rows(row+1) = rownnz;
237 // if (lumping && diagID != -1) {
238 if (lumping) {
239 // Add diag to the diagonal
240
241 // NOTE_KOKKOS: valsAux was allocated with
242 // ViewAllocateWithoutInitializing. This is not a problem here
243 // because we explicitly set this value above.
244 valsAux(offset+diagID) += diag;
245 }
246
247 // If the only element remaining after filtering is diagonal, mark node as boundary
248 // FIXME: this should really be replaced by the following
249 // if (indices.size() == 1 && indices[0] == row)
250 // boundaryNodes[row] = true;
251 // We do not do it this way now because there is no framework for distinguishing isolated
252 // and boundary nodes in the aggregation algorithms
253 bndNodes(row) = (rownnz == 1);
254
255 nnz += rownnz;
256 }
257
258 private:
259 MatrixType A;
260 BndViewType bndNodes;
261 DropFunctorType dropFunctor;
262
263 rows_type rowsA;
264
265 typename rows_type::non_const_type rows;
266 typename cols_type::non_const_type colsAux;
267 typename vals_type::non_const_type valsAux;
268
269 bool reuseGraph;
270 bool lumping;
271 SC zero;
272 };
273
274 // collect number nonzeros of blkSize rows in nnz_(row+1)
275 template<class MatrixType, class NnzType, class blkSizeType>
276 class Stage1aVectorFunctor {
277 private:
278 typedef typename MatrixType::ordinal_type LO;
279
280 public:
281 Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_) :
282 kokkosMatrix(kokkosMatrix_),
283 nnz(nnz_),
284 blkSize(blkSize_) { }
285
286 KOKKOS_INLINE_FUNCTION
287 void operator()(const LO row, LO& totalnnz) const {
288
289 // the following code is more or less what MergeRows is doing
290 // count nonzero entries in all dof rows associated with node row
291 LO nodeRowMaxNonZeros = 0;
292 for (LO j = 0; j < blkSize; j++) {
293 auto rowView = kokkosMatrix.row(row * blkSize + j);
294 nodeRowMaxNonZeros += rowView.length;
295 }
296 nnz(row + 1) = nodeRowMaxNonZeros;
297 totalnnz += nodeRowMaxNonZeros;
298 }
299
300
301 private:
302 MatrixType kokkosMatrix; //< local matrix part
303 NnzType nnz; //< View containing number of nonzeros for current row
304 blkSizeType blkSize; //< block size (or partial block size in strided maps)
305 };
306
307
308 // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
309 // the DofIds may not be sorted.
310 template<class MatrixType, class NnzType, class blkSizeType, class ColDofType>
311 class Stage1bVectorFunctor {
312 private:
313 typedef typename MatrixType::ordinal_type LO;
314 //typedef typename MatrixType::value_type SC;
315 //typedef typename MatrixType::device_type NO;
316
317 private:
318 MatrixType kokkosMatrix; //< local matrix part
319 NnzType nnz; //< View containing dof offsets for dof columns
320 blkSizeType blkSize; //< block size (or partial block size in strided maps)
321 ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
322
323 public:
324 Stage1bVectorFunctor(MatrixType kokkosMatrix_,
325 NnzType nnz_,
326 blkSizeType blkSize_,
327 ColDofType coldofs_) :
328 kokkosMatrix(kokkosMatrix_),
329 nnz(nnz_),
330 blkSize(blkSize_),
331 coldofs(coldofs_) {
332 }
333
334 KOKKOS_INLINE_FUNCTION
335 void operator()(const LO rowNode) const {
336
337 LO pos = nnz(rowNode);
338 for (LO j = 0; j < blkSize; j++) {
339 auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
340 auto numIndices = rowView.length;
341
342 for (decltype(numIndices) k = 0; k < numIndices; k++) {
343 auto dofID = rowView.colidx(k);
344 coldofs(pos) = dofID;
345 pos ++;
346 }
347 }
348 }
349
350 };
351
352 // sort column ids
353 // translate them into (unique) node ids
354 // count the node column ids per node row
355 template<class MatrixType, class ColDofNnzType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeType>
356 class Stage1cVectorFunctor {
357 private:
358 typedef typename MatrixType::ordinal_type LO;
359
360 private:
361 ColDofNnzType coldofnnz; //< view containing start and stop indices for subviews
362 ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
363 Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
364 ColDofNnzType colnodennz; //< view containing number of column nodes for each node row
365 BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
366
367 public:
368 Stage1cVectorFunctor(ColDofNnzType coldofnnz_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, ColDofNnzType colnodennz_, BdryNodeType bdrynode_) :
369 coldofnnz(coldofnnz_),
370 coldofs(coldofs_),
371 dof2node(dof2node_),
372 colnodennz(colnodennz_),
373 bdrynode(bdrynode_) {
374 }
375
376 KOKKOS_INLINE_FUNCTION
377 void operator()(const LO rowNode, LO& nnz) const {
378 LO begin = coldofnnz(rowNode);
379 LO end = coldofnnz(rowNode+1);
380 LO n = end - begin;
381 for (LO i = 0; i < (n-1); i++) {
382 for (LO j = 0; j < (n-i-1); j++) {
383 if (coldofs(j+begin) > coldofs(j+begin+1)) {
384 LO temp = coldofs(j+begin);
385 coldofs(j+begin) = coldofs(j+begin+1);
386 coldofs(j+begin+1) = temp;
387 }
388 }
389 }
390 size_t cnt = 0;
391 LO lastNodeID = -1;
392 for (LO i = 0; i < n; i++) {
393 LO dofID = coldofs(begin + i);
394 LO nodeID = dof2node(dofID);
395 if(nodeID != lastNodeID) {
396 lastNodeID = nodeID;
397 coldofs(begin+cnt) = nodeID;
398 cnt++;
399 }
400 }
401 if(cnt == 1)
402 bdrynode(rowNode) = true;
403 else
404 bdrynode(rowNode) = false;
405 colnodennz(rowNode+1) = cnt;
406 nnz += cnt;
407 }
408
409 };
410
411 // fill column node id view
412 template<class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
413 class Stage1dVectorFunctor {
414 private:
415 typedef typename MatrixType::ordinal_type LO;
416 typedef typename MatrixType::value_type SC;
417
418 private:
419 ColDofType coldofs; //< view containing mixed node and dof indices (only input)
420 ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
421 ColNodeType colnodes; //< view containing the local node ids associated with columns
422 ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
423
424 public:
425 Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_) :
426 coldofs(coldofs_),
427 coldofnnz(coldofnnz_),
428 colnodes(colnodes_),
429 colnodennz(colnodennz_) {
430 }
431
432 KOKKOS_INLINE_FUNCTION
433 void operator()(const LO rowNode) const {
434 auto dofbegin = coldofnnz(rowNode);
435 auto nodebegin = colnodennz(rowNode);
436 auto nodeend = colnodennz(rowNode+1);
437 auto n = nodeend - nodebegin;
438
439 for (decltype(nodebegin) i = 0; i < n; i++) {
440 colnodes(nodebegin + i) = coldofs(dofbegin + i);
441 }
442 }
443 };
444
445
446 } // namespace
447
448 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
449 RCP<const ParameterList> CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
450 RCP<ParameterList> validParamList = rcp(new ParameterList());
451
452#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
453 SET_VALID_ENTRY("aggregation: drop tol");
454 SET_VALID_ENTRY("aggregation: Dirichlet threshold");
455 SET_VALID_ENTRY("aggregation: drop scheme");
456 SET_VALID_ENTRY("filtered matrix: use lumping");
457 SET_VALID_ENTRY("filtered matrix: reuse graph");
458 SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
459 {
460 typedef Teuchos::StringToIntegralParameterEntryValidator<int> validatorType;
461 validParamList->getEntry("aggregation: drop scheme").setValidator(
462 rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian"), "aggregation: drop scheme")));
463 }
464#undef SET_VALID_ENTRY
465 validParamList->set< bool > ("lightweight wrap", true, "Experimental option for lightweight graph access");
466
467 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
468 validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
469 validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory for Coordinates");
470
471 return validParamList;
472 }
473
474 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
475 void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level &currentLevel) const {
476 Input(currentLevel, "A");
477 Input(currentLevel, "UnAmalgamationInfo");
478
479 const ParameterList& pL = GetParameterList();
480 if (pL.get<bool>("lightweight wrap") == true) {
481 if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
482 Input(currentLevel, "Coordinates");
483 }
484 }
485
486 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
487 void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
488 Build(Level& currentLevel) const {
489 FactoryMonitor m(*this, "Build", currentLevel);
490
491 typedef Teuchos::ScalarTraits<SC> STS;
492 const SC zero = STS::zero();
493
494 auto A = Get< RCP<Matrix> >(currentLevel, "A");
495 LO blkSize = A->GetFixedBlockSize();
496
497 auto amalInfo = Get< RCP<AmalgamationInfo_kokkos> >(currentLevel, "UnAmalgamationInfo");
498
499 const ParameterList& pL = GetParameterList();
500
501 bool doLightWeightWrap = pL.get<bool>("lightweight wrap");
502 GetOStream(Warnings0) << "lightweight wrap is deprecated" << std::endl;
503 TEUCHOS_TEST_FOR_EXCEPTION(!doLightWeightWrap, Exceptions::RuntimeError,
504 "MueLu KokkosRefactor only supports \"lightweight wrap\"=\"true\"");
505
506 std::string algo = pL.get<std::string>("aggregation: drop scheme");
507
508 double threshold = pL.get<double>("aggregation: drop tol");
509 GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
510 << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
511
512 const typename STS::magnitudeType dirichletThreshold =
513 STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
514
515 GO numDropped = 0, numTotal = 0;
516
517 RCP<LWGraph_kokkos> graph;
518 LO dofsPerNode = -1;
519
520 typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
521 boundary_nodes_type boundaryNodes;
522
523 RCP<Matrix> filteredA;
524 if (blkSize == 1 && threshold == zero) {
525 // Scalar problem without dropping
526
527 // Detect and record rows that correspond to Dirichlet boundary conditions
528 boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
529
530 // Trivial LWGraph construction
531 graph = rcp(new LWGraph_kokkos(A->getLocalMatrixDevice().graph, A->getRowMap(), A->getColMap(), "graph of A"));
532 graph->SetBoundaryNodeMap(boundaryNodes);
533
534 numTotal = A->getNodeNumEntries();
535 dofsPerNode = 1;
536
537 filteredA = A;
538
539 } else if (blkSize == 1 && threshold != zero) {
540 // Scalar problem with dropping
541
542 typedef typename Matrix::local_matrix_type local_matrix_type;
543 typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
544 typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
545 typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
546 typedef typename local_matrix_type::values_type::non_const_type vals_type;
547
548 LO numRows = A->getNodeNumRows();
549 local_matrix_type kokkosMatrix = A->getLocalMatrixDevice();
550 auto nnzA = kokkosMatrix.nnz();
551 auto rowsA = kokkosMatrix.graph.row_map;
552
553
554 typedef Kokkos::ArithTraits<SC> ATS;
555
556 bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
557 bool lumping = pL.get<bool>("filtered matrix: use lumping");
558 if (lumping)
559 GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
560
561 // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
562 rows_type rows ("FA_rows", numRows+1);
563 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
564 vals_type valsAux;
565 if (reuseGraph) {
566 SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
567
568 // Share graph with the original matrix
569 filteredA = MatrixFactory::Build(A->getCrsGraph());
570
571 // Do a no-op fill-complete
572 RCP<ParameterList> fillCompleteParams(new ParameterList);
573 fillCompleteParams->set("No Nonlocal Changes", true);
574 filteredA->fillComplete(fillCompleteParams);
575
576 // No need to reuseFill, just modify in place
577 valsAux = filteredA->getLocalMatrixDevice().values;
578
579 } else {
580 // Need an extra array to compress
581 valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
582 }
583
584 typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
585
586 LO nnzFA = 0;
587 {
588 if (algo == "classical") {
589 // Construct overlapped matrix diagonal
590 RCP<Vector> ghostedDiag;
591 {
592 kokkosMatrix = local_matrix_type();
593 SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
594 ghostedDiag = Utilities_kokkos::GetMatrixOverlappedDiagonal(*A);
595 kokkosMatrix=A->getLocalMatrixDevice();
596 }
597
598 // Filter out entries
599 {
600 SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
601
602 auto ghostedDiagView = ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
603
604 CoalesceDrop_Kokkos_Details::ClassicalDropFunctor<LO, decltype(ghostedDiagView)> dropFunctor(ghostedDiagView, threshold);
605 CoalesceDrop_Kokkos_Details::ScalarFunctor<typename ATS::val_type, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
606 scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
607
608 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
609 scalarFunctor, nnzFA);
610 }
611
612 } else if (algo == "distance laplacian") {
613 typedef Xpetra::MultiVector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO> doubleMultiVector;
614 auto coords = Get<RCP<doubleMultiVector> >(currentLevel, "Coordinates");
615
616 auto uniqueMap = A->getRowMap();
617 auto nonUniqueMap = A->getColMap();
618
619 // Construct ghosted coordinates
620 RCP<const Import> importer;
621 {
622 SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
623 importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
624 }
625 RCP<doubleMultiVector> ghostedCoords;
626 {
627 SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
628 ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO>::Build(nonUniqueMap, coords->getNumVectors());
629 ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
630 }
631
632 auto ghostedCoordsView = ghostedCoords->getDeviceLocalView(Xpetra::Access::ReadWrite);
633 CoalesceDrop_Kokkos_Details::DistanceFunctor<LO, decltype(ghostedCoordsView)> distFunctor(ghostedCoordsView);
634
635 // Construct Laplacian diagonal
636 RCP<Vector> localLaplDiag;
637 {
638 SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
639
640 localLaplDiag = VectorFactory::Build(uniqueMap);
641
642 auto localLaplDiagView = localLaplDiag->getDeviceLocalView(Xpetra::Access::OverwriteAll);
643 auto kokkosGraph = kokkosMatrix.graph;
644
645 Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0,numRows),
646 KOKKOS_LAMBDA(const LO row) {
647 auto rowView = kokkosGraph.rowConst(row);
648 auto length = rowView.length;
649
650 SC d = ATS::zero();
651 for (decltype(length) colID = 0; colID < length; colID++) {
652 auto col = rowView(colID);
653 if (row != col)
654 d += ATS::one()/distFunctor.distance2(row, col);
655 }
656 localLaplDiagView(row,0) = d;
657 });
658 }
659
660 // Construct ghosted Laplacian diagonal
661 RCP<Vector> ghostedLaplDiag;
662 {
663 SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
664 ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
665 ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
666 }
667
668 // Filter out entries
669 {
670 SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
671
672 auto ghostedLaplDiagView = ghostedLaplDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
673
674 CoalesceDrop_Kokkos_Details::DistanceLaplacianDropFunctor<LO, decltype(ghostedLaplDiagView), decltype(distFunctor)>
675 dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
676 CoalesceDrop_Kokkos_Details::ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
677 scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
678
679 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
680 scalarFunctor, nnzFA);
681 }
682 }
683
684 }
685 numDropped = nnzA - nnzFA;
686
687 boundaryNodes = bndNodes;
688
689 {
690 SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
691
692 // parallel_scan (exclusive)
693 Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0,numRows+1),
694 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
695 update += rows(i);
696 if (final_pass)
697 rows(i) = update;
698 });
699 }
700
701 // Compress cols (and optionally vals)
702 // We use a trick here: we moved all remaining elements to the beginning
703 // of the original row in the main loop, so we don't need to check for
704 // INVALID here, and just stop when achieving the new number of elements
705 // per row.
706 cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
707 vals_type vals;
708 if (reuseGraph) {
709 GetOStream(Runtime1) << "reuse matrix graph for filtering (compress matrix columns only)" << std::endl;
710 // Only compress cols
711 SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
712
713 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
714 KOKKOS_LAMBDA(const LO i) {
715 // Is there Kokkos memcpy?
716 LO rowStart = rows(i);
717 LO rowAStart = rowsA(i);
718 size_t rownnz = rows(i+1) - rows(i);
719 for (size_t j = 0; j < rownnz; j++)
720 cols(rowStart+j) = colsAux(rowAStart+j);
721 });
722 } else {
723 // Compress cols and vals
724 GetOStream(Runtime1) << "new matrix graph for filtering (compress matrix columns and values)" << std::endl;
725 SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
726
727 vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
728
729 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
730 KOKKOS_LAMBDA(const LO i) {
731 LO rowStart = rows(i);
732 LO rowAStart = rowsA(i);
733 size_t rownnz = rows(i+1) - rows(i);
734 for (size_t j = 0; j < rownnz; j++) {
735 cols(rowStart+j) = colsAux(rowAStart+j);
736 vals(rowStart+j) = valsAux(rowAStart+j);
737 }
738 });
739 }
740
741 kokkos_graph_type kokkosGraph(cols, rows);
742
743 {
744 SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
745
746 graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
747 graph->SetBoundaryNodeMap(boundaryNodes);
748 }
749
750 numTotal = A->getNodeNumEntries();
751
752 dofsPerNode = 1;
753
754 if (!reuseGraph) {
755 SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
756
757 local_matrix_type localFA = local_matrix_type("A", numRows, A->getLocalMatrixDevice().numCols(), nnzFA, vals, rows, cols);
758 auto filteredACrs = CrsMatrixFactory::Build(localFA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap(),
759 A->getCrsGraph()->getImporter(), A->getCrsGraph()->getExporter());
760 filteredA = rcp(new CrsMatrixWrap(filteredACrs));
761 }
762
763 filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
764
765 if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
766 // Reuse max eigenvalue from A
767 // It is unclear what eigenvalue is the best for the smoothing, but we already may have
768 // the D^{-1}A estimate in A, may as well use it.
769 // NOTE: ML does that too
770 filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
771 } else {
772 filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
773 }
774
775 } else if (blkSize > 1 && threshold == zero) {
776 // Case 3: block problem without filtering
777 //
778 // FIXME_KOKKOS: this code is completely unoptimized. It really should do
779 // a very simple thing: merge rows and produce nodal graph. But the code
780 // seems very complicated. Can we do better?
781
782 TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getNodeNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getNodeNumElements() << " but should be a multiply of " << blkSize);
783
784 const RCP<const Map> rowMap = A->getRowMap();
785 const RCP<const Map> colMap = A->getColMap();
786
787 // build a node row map (uniqueMap = non-overlapping) and a node column map
788 // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
789 // stored in the AmalgamationInfo class container contain the local node id
790 // given a local dof id. The data is calculated in the AmalgamationFactory and
791 // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
792 const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
793 const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
794 Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
795 Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
796
798 rowTranslationView(rowTranslationArray.getRawPtr(),rowTranslationArray.size() );
800 colTranslationView(colTranslationArray.getRawPtr(),colTranslationArray.size() );
801
802 // get number of local nodes
803 LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getNodeNumElements());
804 typedef typename Kokkos::View<LocalOrdinal*, DeviceType> id_translation_type;
805 id_translation_type rowTranslation("dofId2nodeId",rowTranslationArray.size());
806 id_translation_type colTranslation("ov_dofId2nodeId",colTranslationArray.size());
807 Kokkos::deep_copy(rowTranslation, rowTranslationView);
808 Kokkos::deep_copy(colTranslation, colTranslationView);
809
810 // extract striding information
811 blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
812 LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
813 LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
814 if(A->IsView("stridedMaps") == true) {
815 const RCP<const Map> myMap = A->getRowMap("stridedMaps");
816 const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
817 TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
818 blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
819 blkId = strMap->getStridedBlockId();
820 if (blkId > -1)
821 blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
822 }
823 auto kokkosMatrix = A->getLocalMatrixDevice(); // access underlying kokkos data
824
825 //
826 typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
827 typedef typename kokkos_graph_type::row_map_type row_map_type;
828 //typedef typename row_map_type::HostMirror row_map_type_h;
829 typedef typename kokkos_graph_type::entries_type entries_type;
830
831 // Stage 1c: get number of dof-nonzeros per blkSize node rows
832 typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
833 LO numDofCols = 0;
834 CoalesceDrop_Kokkos_Details::Stage1aVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize)> stage1aFunctor(kokkosMatrix, dofNnz, blkPartSize);
835 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0,numNodes), stage1aFunctor, numDofCols);
836 // parallel_scan (exclusive)
837 CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(dofNnz)> scanFunctor(dofNnz);
838 Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanFunctor);
839
840 typename entries_type::non_const_type dofcols("dofcols", numDofCols/*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
841 CoalesceDrop_Kokkos_Details::Stage1bVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols)> stage1bFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols);
842 Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1b", range_type(0,numNodes), stage1bFunctor);
843
844 // we have dofcols and dofids from Stage1dVectorFunctor
845 LO numNodeCols = 0;
846 typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
847 typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
848 CoalesceDrop_Kokkos_Details::Stage1cVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(colTranslation), decltype(bndNodes)> stage1cFunctor(dofNnz, dofcols, colTranslation,rows,bndNodes);
849 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1cFunctor,numNodeCols);
850
851 // parallel_scan (exclusive)
852 CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(rows)> scanNodeFunctor(rows);
853 Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanNodeFunctor);
854
855 // create column node view
856 typename entries_type::non_const_type cols("nodecols", numNodeCols);
857
858
859 CoalesceDrop_Kokkos_Details::Stage1dVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(rows), decltype(cols)> stage1dFunctor(dofcols, dofNnz, cols, rows);
860 Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1dFunctor);
861 kokkos_graph_type kokkosGraph(cols, rows);
862
863 // create LW graph
864 graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
865
866 boundaryNodes = bndNodes;
867 graph->SetBoundaryNodeMap(boundaryNodes);
868 numTotal = A->getNodeNumEntries();
869
870 dofsPerNode = blkSize;
871
872 filteredA = A;
873
874 } else {
875 TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
876 }
877
878 if (GetVerbLevel() & Statistics1) {
879 GO numLocalBoundaryNodes = 0;
880 GO numGlobalBoundaryNodes = 0;
881
882 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
883 KOKKOS_LAMBDA(const LO i, GO& n) {
884 if (boundaryNodes(i))
885 n++;
886 }, numLocalBoundaryNodes);
887
888 auto comm = A->getRowMap()->getComm();
889 MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
890 GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
891 }
892
893 if ((GetVerbLevel() & Statistics1) && threshold != zero) {
894 auto comm = A->getRowMap()->getComm();
895
896 GO numGlobalTotal, numGlobalDropped;
897 MueLu_sumAll(comm, numTotal, numGlobalTotal);
898 MueLu_sumAll(comm, numDropped, numGlobalDropped);
899
900 if (numGlobalTotal != 0) {
901 GetOStream(Statistics1) << "Number of dropped entries: "
902 << numGlobalDropped << "/" << numGlobalTotal
903 << " (" << 100*Teuchos::as<double>(numGlobalDropped)/Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
904 }
905 }
906
907 Set(currentLevel, "DofsPerNode", dofsPerNode);
908 Set(currentLevel, "Graph", graph);
909 Set(currentLevel, "A", filteredA);
910 }
911}
912#endif // HAVE_MUELU_KOKKOS_REFACTOR
913#endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
#define SET_VALID_ENTRY(name)
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
Exception throws to report errors in the internal logical of the program.
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics1
Print more statistics.
@ Runtime0
One-liner description of what is happening.
@ Runtime1
Description of what is happening (more verbose)
Kokkos::View< bool *, typename NO::device_type > DetectDirichletRows(const Xpetra::Matrix< SC, LO, GO, NO > &A, const typename Teuchos::ScalarTraits< SC >::magnitudeType &tol, const bool count_twos_as_dirichlet)