MueLu Version of the Day
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48
49#ifdef HAVE_MUELU_KOKKOS_REFACTOR
50
52
54
55#include "MueLu_Aggregates_kokkos.hpp"
56#include "MueLu_AmalgamationFactory_kokkos.hpp"
57#include "MueLu_AmalgamationInfo_kokkos.hpp"
58#include "MueLu_CoarseMapFactory_kokkos.hpp"
59#include "MueLu_MasterList.hpp"
60#include "MueLu_NullspaceFactory_kokkos.hpp"
61#include "MueLu_PerfUtils.hpp"
62#include "MueLu_Monitor.hpp"
63#include "MueLu_Utilities_kokkos.hpp"
64
65namespace MueLu {
66
67 namespace { // anonymous
68
69 template<class LocalOrdinal, class View>
70 class ReduceMaxFunctor{
71 public:
72 ReduceMaxFunctor(View view) : view_(view) { }
73
74 KOKKOS_INLINE_FUNCTION
75 void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
76 if (vmax < view_(i))
77 vmax = view_(i);
78 }
79
80 KOKKOS_INLINE_FUNCTION
81 void join (volatile LocalOrdinal& dst, const volatile LocalOrdinal& src) const {
82 if (dst < src) {
83 dst = src;
84 }
85 }
86
87 KOKKOS_INLINE_FUNCTION
88 void init (LocalOrdinal& dst) const {
89 dst = 0;
90 }
91 private:
92 View view_;
93 };
94
95 // local QR decomposition
96 template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
97 class LocalQRDecompFunctor {
98 private:
99 typedef LOType LO;
100 typedef GOType GO;
101 typedef SCType SC;
102
103 typedef typename DeviceType::execution_space execution_space;
104 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
105 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
106 typedef typename impl_ATS::magnitudeType Magnitude;
107
110
111 private:
112
113 NspType fineNS;
114 NspType coarseNS;
115 aggRowsType aggRows;
116 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
117 agg2RowMapLOType agg2RowMapLO;
118 statusType statusAtomic;
119 rowsType rows;
120 rowsAuxType rowsAux;
121 colsAuxType colsAux;
122 valsAuxType valsAux;
123 bool doQRStep;
124 public:
125 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
126 fineNS(fineNS_),
127 coarseNS(coarseNS_),
128 aggRows(aggRows_),
129 maxAggDofSize(maxAggDofSize_),
130 agg2RowMapLO(agg2RowMapLO_),
131 statusAtomic(statusAtomic_),
132 rows(rows_),
133 rowsAux(rowsAux_),
134 colsAux(colsAux_),
135 valsAux(valsAux_),
136 doQRStep(doQRStep_)
137 { }
138
139 KOKKOS_INLINE_FUNCTION
140 void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
141 auto agg = thread.league_rank();
142
143 // size of aggregate: number of DOFs in aggregate
144 auto aggSize = aggRows(agg+1) - aggRows(agg);
145
146 const impl_SC one = impl_ATS::one();
147 const impl_SC two = one + one;
148 const impl_SC zero = impl_ATS::zero();
149 const auto zeroM = impl_ATS::magnitude(zero);
150
151 int m = aggSize;
152 int n = fineNS.extent(1);
153
154 // calculate row offset for coarse nullspace
155 Xpetra::global_size_t offset = agg * n;
156
157 if (doQRStep) {
158
159 // Extract the piece of the nullspace corresponding to the aggregate
160 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
161 for (int j = 0; j < n; j++)
162 for (int k = 0; k < m; k++)
163 r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
164#if 0
165 printf("A\n");
166 for (int i = 0; i < m; i++) {
167 for (int j = 0; j < n; j++)
168 printf(" %5.3lf ", r(i,j));
169 printf("\n");
170 }
171#endif
172
173 // Calculate QR decomposition (standard)
174 shared_matrix q(thread.team_shmem(), m, m); // Q
175 if (m >= n) {
176 bool isSingular = false;
177
178 // Initialize Q^T
179 auto qt = q;
180 for (int i = 0; i < m; i++) {
181 for (int j = 0; j < m; j++)
182 qt(i,j) = zero;
183 qt(i,i) = one;
184 }
185
186 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
187 // FIXME_KOKKOS: use team
188 Magnitude s = zeroM, norm, norm_x;
189 for (int i = k+1; i < m; i++)
190 s += pow(impl_ATS::magnitude(r(i,k)), 2);
191 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
192
193 if (norm == zero) {
194 isSingular = true;
195 break;
196 }
197
198 r(k,k) -= norm*one;
199
200 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
201 if (norm_x == zeroM) {
202 // We have a single diagonal element in the column.
203 // No reflections required. Just need to restor r(k,k).
204 r(k,k) = norm*one;
205 continue;
206 }
207
208 // FIXME_KOKKOS: use team
209 for (int i = k; i < m; i++)
210 r(i,k) /= norm_x;
211
212 // Update R(k:m,k+1:n)
213 for (int j = k+1; j < n; j++) {
214 // FIXME_KOKKOS: use team in the loops
215 impl_SC si = zero;
216 for (int i = k; i < m; i++)
217 si += r(i,k) * r(i,j);
218 for (int i = k; i < m; i++)
219 r(i,j) -= two*si * r(i,k);
220 }
221
222 // Update Q^T (k:m,k:m)
223 for (int j = k; j < m; j++) {
224 // FIXME_KOKKOS: use team in the loops
225 impl_SC si = zero;
226 for (int i = k; i < m; i++)
227 si += r(i,k) * qt(i,j);
228 for (int i = k; i < m; i++)
229 qt(i,j) -= two*si * r(i,k);
230 }
231
232 // Fix R(k:m,k)
233 r(k,k) = norm*one;
234 for (int i = k+1; i < m; i++)
235 r(i,k) = zero;
236 }
237
238#if 0
239 // Q = (Q^T)^T
240 for (int i = 0; i < m; i++)
241 for (int j = 0; j < i; j++) {
242 impl_SC tmp = qt(i,j);
243 qt(i,j) = qt(j,i);
244 qt(j,i) = tmp;
245 }
246#endif
247
248 // Build coarse nullspace using the upper triangular part of R
249 for (int j = 0; j < n; j++)
250 for (int k = 0; k <= j; k++)
251 coarseNS(offset+k,j) = r(k,j);
252
253 if (isSingular) {
254 statusAtomic(1) = true;
255 return;
256 }
257
258 } else {
259 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
260
261 // The local QR decomposition is not possible in the "overconstrained"
262 // case (i.e. number of columns in qr > number of rowsAux), which
263 // corresponds to #DOFs in Aggregate < n. For usual problems this
264 // is only possible for single node aggregates in structural mechanics.
265 // (Similar problems may arise in discontinuous Galerkin problems...)
266 // We bypass the QR decomposition and use an identity block in the
267 // tentative prolongator for the single node aggregate and transfer the
268 // corresponding fine level null space information 1-to-1 to the coarse
269 // level null space part.
270
271 // NOTE: The resulting tentative prolongation operator has
272 // (m*DofsPerNode-n) zero columns leading to a singular
273 // coarse level operator A. To deal with that one has the following
274 // options:
275 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
276 // false) to add some identity block to the diagonal of the zero rowsAux
277 // in the coarse level operator A, such that standard level smoothers
278 // can be used again.
279 // - Use special (projection-based) level smoothers, which can deal
280 // with singular matrices (very application specific)
281 // - Adapt the code below to avoid zero columns. However, we do not
282 // support a variable number of DOFs per node in MueLu/Xpetra which
283 // makes the implementation really hard.
284 //
285 // FIXME: do we need to check for singularity here somehow? Zero
286 // columns would be easy but linear dependency would require proper QR.
287
288 // R = extended (by adding identity rowsAux) qr
289 for (int j = 0; j < n; j++)
290 for (int k = 0; k < n; k++)
291 if (k < m)
292 coarseNS(offset+k,j) = r(k,j);
293 else
294 coarseNS(offset+k,j) = (k == j ? one : zero);
295
296 // Q = I (rectangular)
297 for (int i = 0; i < m; i++)
298 for (int j = 0; j < n; j++)
299 q(i,j) = (j == i ? one : zero);
300 }
301
302 // Process each row in the local Q factor and fill helper arrays to assemble P
303 for (int j = 0; j < m; j++) {
304 LO localRow = agg2RowMapLO(aggRows(agg)+j);
305 size_t rowStart = rowsAux(localRow);
306 size_t lnnz = 0;
307 for (int k = 0; k < n; k++) {
308 // skip zeros
309 if (q(j,k) != zero) {
310 colsAux(rowStart+lnnz) = offset + k;
311 valsAux(rowStart+lnnz) = q(j,k);
312 lnnz++;
313 }
314 }
315 rows(localRow+1) = lnnz;
316 nnz += lnnz;
317 }
318
319#if 0
320 printf("R\n");
321 for (int i = 0; i < m; i++) {
322 for (int j = 0; j < n; j++)
323 printf(" %5.3lf ", coarseNS(i,j));
324 printf("\n");
325 }
326
327 printf("Q\n");
328 for (int i = 0; i < aggSize; i++) {
329 for (int j = 0; j < aggSize; j++)
330 printf(" %5.3lf ", q(i,j));
331 printf("\n");
332 }
333#endif
334 } else {
336 // "no-QR" option //
338 // Local Q factor is just the fine nullspace support over the current aggregate.
339 // Local R factor is the identity.
340 // TODO I have not implemented any special handling for aggregates that are too
341 // TODO small to locally support the nullspace, as is done in the standard QR
342 // TODO case above.
343
344 for (int j = 0; j < m; j++) {
345 LO localRow = agg2RowMapLO(aggRows(agg)+j);
346 size_t rowStart = rowsAux(localRow);
347 size_t lnnz = 0;
348 for (int k = 0; k < n; k++) {
349 const impl_SC qr_jk = fineNS(localRow,k);
350 // skip zeros
351 if (qr_jk != zero) {
352 colsAux(rowStart+lnnz) = offset + k;
353 valsAux(rowStart+lnnz) = qr_jk;
354 lnnz++;
355 }
356 }
357 rows(localRow+1) = lnnz;
358 nnz += lnnz;
359 }
360
361 for (int j = 0; j < n; j++)
362 coarseNS(offset+j,j) = one;
363
364 }
365
366 }
367
368 // amount of shared memory
369 size_t team_shmem_size( int /* team_size */ ) const {
370 if (doQRStep) {
371 int m = maxAggDofSize;
372 int n = fineNS.extent(1);
373 return shared_matrix::shmem_size(m, n) + // r
374 shared_matrix::shmem_size(m, m); // q
375 } else
376 return 0;
377 }
378 };
379
380 } // namespace anonymous
381
382 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
383 RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
384 RCP<ParameterList> validParamList = rcp(new ParameterList());
385
386#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
387 SET_VALID_ENTRY("tentative: calculate qr");
388 SET_VALID_ENTRY("tentative: build coarse coordinates");
389#undef SET_VALID_ENTRY
390
391 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
392 validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
393 validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
394 validParamList->set< RCP<const FactoryBase> >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
395 validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
396 validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
397 validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
398
399 // Make sure we don't recursively validate options for the matrixmatrix kernels
400 ParameterList norecurse;
401 norecurse.disableRecursiveValidation();
402 validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
403
404 return validParamList;
405 }
406
407 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
408 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& /* coarseLevel */) const {
409
410 const ParameterList& pL = GetParameterList();
411 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
412 std::string nspName = "Nullspace";
413 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
414
415 Input(fineLevel, "A");
416 Input(fineLevel, "Aggregates");
417 Input(fineLevel, nspName);
418 Input(fineLevel, "UnAmalgamationInfo");
419 Input(fineLevel, "CoarseMap");
420 if( fineLevel.GetLevelID() == 0 &&
421 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
422 pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
423 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
424 Input(fineLevel, "Coordinates");
425 } else if (bTransferCoordinates_) {
426 Input(fineLevel, "Coordinates");
427 }
428 }
429
430 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
431 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel) const {
432 return BuildP(fineLevel, coarseLevel);
433 }
434
435 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
436 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
437 FactoryMonitor m(*this, "Build", coarseLevel);
438
439 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
440 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
441 const ParameterList& pL = GetParameterList();
442 std::string nspName = "Nullspace";
443 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
444
445 auto A = Get< RCP<Matrix> > (fineLevel, "A");
446 auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel, "Aggregates");
447 auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel, "UnAmalgamationInfo");
448 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
449 auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
450 RCP<RealValuedMultiVector> fineCoords;
451 if(bTransferCoordinates_) {
452 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
453 }
454
455 RCP<Matrix> Ptentative;
456 RCP<MultiVector> coarseNullspace;
457 RCP<RealValuedMultiVector> coarseCoords;
458
459 if(bTransferCoordinates_) {
460 ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
461 GO indexBase = coarseMap->getIndexBase();
462
463 LO blkSize = 1;
464 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
465 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
466
467 Array<GO> elementList;
468 ArrayView<const GO> elementListView;
469 if (blkSize == 1) {
470 // Scalar system
471 // No amalgamation required
472 elementListView = elementAList;
473
474 } else {
475 auto numElements = elementAList.size() / blkSize;
476
477 elementList.resize(numElements);
478
479 // Amalgamate the map
480 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
481 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
482
483 elementListView = elementList;
484 }
485
486 auto uniqueMap = fineCoords->getMap();
487 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
488 elementListView, indexBase, coarseMap->getComm());
489 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
490
491 // Create overlapped fine coordinates to reduce global communication
492 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
493 if (aggregates->AggregatesCrossProcessors()) {
494 auto nonUniqueMap = aggregates->GetMap();
495 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
496
497 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
498 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
499 }
500
501 // The good new is that his graph has already been constructed for the
502 // TentativePFactory and was cached in Aggregates. So this is a no-op.
503 auto aggGraph = aggregates->GetGraph();
504 auto numAggs = aggGraph.numRows();
505
506 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
507 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
508
509 // Fill in coarse coordinates
510 {
511 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
512
513 const auto dim = fineCoords->getNumVectors();
514
515 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
516 for (size_t j = 0; j < dim; j++) {
517 Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
518 KOKKOS_LAMBDA(const LO i) {
519 // A row in this graph represents all node ids in the aggregate
520 // Therefore, averaging is very easy
521
522 auto aggregate = aggGraph.rowConst(i);
523
524 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
525 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
526 sum += fineCoordsRandomView(aggregate(colID),j);
527
528 coarseCoordsView(i,j) = sum / aggregate.length;
529 });
530 }
531 }
532 }
533
534 if (!aggregates->AggregatesCrossProcessors())
535 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
536 else
537 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
538
539 // If available, use striding information of fine level matrix A for range
540 // map and coarseMap as domain map; otherwise use plain range map of
541 // Ptent = plain range map of A for range map and coarseMap as domain map.
542 // NOTE:
543 // The latter is not really safe, since there is no striding information
544 // for the range map. This is not really a problem, since striding
545 // information is always available on the intermedium levels and the
546 // coarsest levels.
547 if (A->IsView("stridedMaps") == true)
548 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
549 else
550 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
551
552 if(bTransferCoordinates_) {
553 Set(coarseLevel, "Coordinates", coarseCoords);
554 }
555 Set(coarseLevel, "Nullspace", coarseNullspace);
556 Set(coarseLevel, "P", Ptentative);
557
558 if (IsPrint(Statistics2)) {
559 RCP<ParameterList> params = rcp(new ParameterList());
560 params->set("printLoadBalancingInfo", true);
561 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
562 }
563 }
564
565 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
566 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
567 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
568 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
569 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
570 RCP<MultiVector>& coarseNullspace, const int levelID) const {
571 auto rowMap = A->getRowMap();
572 auto colMap = A->getColMap();
573
574 const size_t numRows = rowMap->getNodeNumElements();
575 const size_t NSDim = fineNullspace->getNumVectors();
576
577 typedef Kokkos::ArithTraits<SC> ATS;
578 using impl_SC = typename ATS::val_type;
579 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
580 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
581
582 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
583
584 typename Aggregates_kokkos::local_graph_type aggGraph;
585 {
586 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
587 aggGraph = aggregates->GetGraph();
588 }
589 auto aggRows = aggGraph.row_map;
590 auto aggCols = aggGraph.entries;
591
592 // Aggregates map is based on the amalgamated column map
593 // We can skip global-to-local conversion if LIDs in row map are
594 // same as LIDs in column map
595 bool goodMap;
596 {
597 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
598 goodMap = isGoodMap(*rowMap, *colMap);
599 }
600 // FIXME_KOKKOS: need to proofread later code for bad maps
601 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
602 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
603 "(i.e. \"matching\" row and column maps)");
604
605 // STEP 1: do unamalgamation
606 // The non-kokkos version uses member functions from the AmalgamationInfo
607 // container class to unamalgamate the data. In contrast, the kokkos
608 // version of TentativePFactory does the unamalgamation here and only uses
609 // the data of the AmalgamationInfo container class
610
611 // Extract information for unamalgamation
612 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
613 GO indexBase;
614 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
615 GO globalOffset = amalgInfo->GlobalOffset();
616
617 // Extract aggregation info (already in Kokkos host views)
618 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
619 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
620 const size_t numAggregates = aggregates->GetNumAggregates();
621
622 int myPID = aggregates->GetMap()->getComm()->getRank();
623
624 // Create Kokkos::View (on the device) to store the aggreate dof sizes
625 // Later used to get aggregate dof offsets
626 // NOTE: This zeros itself on construction
627 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
628 AggSizeType aggDofSizes;
629
630 if (stridedBlockSize == 1) {
631 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
632
633 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
634 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
635
636 auto sizesConst = aggregates->ComputeAggregateSizes();
637 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
638
639 } else {
640 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
641
642 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
643 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
644
645 auto nodeMap = aggregates->GetMap()->getLocalMap();
646 auto dofMap = colMap->getLocalMap();
647
648 Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
649 KOKKOS_LAMBDA(const LO agg) {
650 auto aggRowView = aggGraph.rowConst(agg);
651
652 size_t size = 0;
653 for (LO colID = 0; colID < aggRowView.length; colID++) {
654 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
655
656 for (LO k = 0; k < stridedBlockSize; k++) {
657 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
658
659 if (dofMap.getLocalElement(dofGID) != INVALID)
660 size++;
661 }
662 }
663 aggDofSizes(agg+1) = size;
664 });
665 }
666
667 // Find maximum dof size for aggregates
668 // Later used to reserve enough scratch space for local QR decompositions
669 LO maxAggSize = 0;
670 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
671 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
672
673 // parallel_scan (exclusive)
674 // The aggDofSizes View then contains the aggregate dof offsets
675 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
676 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
677 update += aggDofSizes(i);
678 if (final_pass)
679 aggDofSizes(i) = update;
680 });
681
682 // Create Kokkos::View on the device to store mapping
683 // between (local) aggregate id and row map ids (LIDs)
684 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
685 {
686 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
687
688 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
689 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
690
691 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
692 KOKKOS_LAMBDA(const LO lnode) {
693 if (procWinner(lnode, 0) == myPID) {
694 // No need for atomics, it's one-to-one
695 auto aggID = vertex2AggId(lnode,0);
696
697 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
698 // FIXME: I think this may be wrong
699 // We unconditionally add the whole block here. When we calculated
700 // aggDofSizes, we did the isLocalElement check. Something's fishy.
701 for (LO k = 0; k < stridedBlockSize; k++)
702 agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
703 }
704 });
705 }
706
707 // STEP 2: prepare local QR decomposition
708 // Reserve memory for tentative prolongation operator
709 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
710
711 // Pull out the nullspace vectors so that we can have random access (on the device)
712 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
713 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
714
715 size_t nnz = 0; // actual number of nnz
716
717 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
718 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
719 typedef typename local_matrix_type::index_type::non_const_type cols_type;
720 typedef typename local_matrix_type::values_type::non_const_type vals_type;
721
722
723 // Device View for status (error messages...)
724 typedef Kokkos::View<int[10], DeviceType> status_type;
725 status_type status("status");
726
727 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
728 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
729
730 const ParameterList& pL = GetParameterList();
731 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
732 if (!doQRStep) {
733 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
734 if (NSDim>1)
735 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
736 }
737
738 size_t nnzEstimate = numRows * NSDim;
739 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows+1);
740 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
741 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
742 rows_type rows("Ptent_rows", numRows+1);
743 {
744 // Stage 0: fill in views.
745 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
746
747 // The main thing to notice is initialization of vals with INVALID. These
748 // values will later be used to compress the arrays
749 Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
750 KOKKOS_LAMBDA(const LO row) {
751 rowsAux(row) = row*NSDim;
752 });
753 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
754 KOKKOS_LAMBDA(const LO j) {
755 colsAux(j) = INVALID;
756 });
757 }
758
759 if (NSDim == 1) {
760 // 1D is special, as it is the easiest. We don't even need to the QR,
761 // just normalize an array. Plus, no worries abot small aggregates. In
762 // addition, we do not worry about compression. It is unlikely that
763 // nullspace will have zeros. If it does, a prolongator row would be
764 // zero and we'll get singularity anyway.
765 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
766
767 // Set up team policy with numAggregates teams and one thread per team.
768 // Each team handles a slice of the data associated with one aggregate
769 // and performs a local QR decomposition (in this case real QR is
770 // unnecessary).
771 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
772
773 if (doQRStep) {
774 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
775 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
776 auto agg = thread.league_rank();
777
778 // size of the aggregate (number of DOFs in aggregate)
779 LO aggSize = aggRows(agg+1) - aggRows(agg);
780
781 // Extract the piece of the nullspace corresponding to the aggregate, and
782 // put it in the flat array, "localQR" (in column major format) for the
783 // QR routine. Trivial in 1D.
784 auto norm = impl_ATS::magnitude(zero);
785
786 // Calculate QR by hand
787 // FIXME: shouldn't there be stridedblock here?
788 // FIXME_KOKKOS: shouldn't there be stridedblock here?
789 for (decltype(aggSize) k = 0; k < aggSize; k++) {
790 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
791 norm += dnorm*dnorm;
792 }
793 norm = sqrt(norm);
794
795 if (norm == zero) {
796 // zero column; terminate the execution
797 statusAtomic(1) = true;
798 return;
799 }
800
801 // R = norm
802 coarseNS(agg, 0) = norm;
803
804 // Q = localQR(:,0)/norm
805 for (decltype(aggSize) k = 0; k < aggSize; k++) {
806 LO localRow = agg2RowMapLO(aggRows(agg)+k);
807 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
808
809 rows(localRow+1) = 1;
810 colsAux(localRow) = agg;
811 valsAux(localRow) = localVal;
812
813 }
814 });
815
816 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
817 Kokkos::deep_copy(statusHost, status);
818 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
819 if (statusHost(i)) {
820 std::ostringstream oss;
821 oss << "MueLu::TentativePFactory::MakeTentative: ";
822 switch (i) {
823 case 0: oss << "!goodMap is not implemented"; break;
824 case 1: oss << "fine level NS part has a zero column"; break;
825 }
826 throw Exceptions::RuntimeError(oss.str());
827 }
828
829 } else {
830 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
831 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
832 auto agg = thread.league_rank();
833
834 // size of the aggregate (number of DOFs in aggregate)
835 LO aggSize = aggRows(agg+1) - aggRows(agg);
836
837 // R = norm
838 coarseNS(agg, 0) = one;
839
840 // Q = localQR(:,0)/norm
841 for (decltype(aggSize) k = 0; k < aggSize; k++) {
842 LO localRow = agg2RowMapLO(aggRows(agg)+k);
843 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
844
845 rows(localRow+1) = 1;
846 colsAux(localRow) = agg;
847 valsAux(localRow) = localVal;
848
849 }
850 });
851 }
852
853 Kokkos::parallel_reduce("MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
854 KOKKOS_LAMBDA(const LO i, size_t &nnz_count) {
855 nnz_count += rows(i);
856 }, nnz);
857
858 } else { // NSdim > 1
859 // FIXME_KOKKOS: This code branch is completely unoptimized.
860 // Work to do:
861 // - Optimize QR decomposition
862 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
863 // packing new values in the beginning of each row
864 // We do use auxilary view in this case, so keep a second rows view for
865 // counting nonzeros in rows
866
867 {
868 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
869 // Set up team policy with numAggregates teams and one thread per team.
870 // Each team handles a slice of the data associated with one aggregate
871 // and performs a local QR decomposition
872 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
873 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
874 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
875 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
876 decltype(valsAux)>
877 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
878 rows, rowsAux, colsAux, valsAux, doQRStep);
879 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
880 }
881
882 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
883 Kokkos::deep_copy(statusHost, status);
884 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
885 if (statusHost(i)) {
886 std::ostringstream oss;
887 oss << "MueLu::TentativePFactory::MakeTentative: ";
888 switch(i) {
889 case 0: oss << "!goodMap is not implemented"; break;
890 case 1: oss << "fine level NS part has a zero column"; break;
891 }
892 throw Exceptions::RuntimeError(oss.str());
893 }
894 }
895
896 // Compress the cols and vals by ignoring INVALID column entries that correspond
897 // to 0 in QR.
898
899 // The real cols and vals are constructed using calculated (not estimated) nnz
900 cols_type cols;
901 vals_type vals;
902
903 if (nnz != nnzEstimate) {
904 {
905 // Stage 2: compress the arrays
906 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
907
908 Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
909 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
910 upd += rows(i);
911 if (final)
912 rows(i) = upd;
913 });
914 }
915
916 {
917 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
918
919 cols = cols_type("Ptent_cols", nnz);
920 vals = vals_type("Ptent_vals", nnz);
921
922 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
923 // to the beginning of rows. See CoalesceDropFactory_kokkos for
924 // example.
925 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
926 KOKKOS_LAMBDA(const LO i) {
927 LO rowStart = rows(i);
928
929 size_t lnnz = 0;
930 for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
931 if (colsAux(j) != INVALID) {
932 cols(rowStart+lnnz) = colsAux(j);
933 vals(rowStart+lnnz) = valsAux(j);
934 lnnz++;
935 }
936 });
937 }
938
939 } else {
940 rows = rowsAux;
941 cols = colsAux;
942 vals = valsAux;
943 }
944
945 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
946
947 {
948 // Stage 3: construct Xpetra::Matrix
949 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
950
951 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
952
953 // Managing labels & constants for ESFC
954 RCP<ParameterList> FCparams;
955 if (pL.isSublist("matrixmatrix: kernel params"))
956 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
957 else
958 FCparams = rcp(new ParameterList);
959
960 // By default, we don't need global constants for TentativeP
961 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
962 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
963
964 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
965 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
966 }
967 }
968
969 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
970 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
971 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */,
972 RCP<AmalgamationInfo_kokkos> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
973 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
974 RCP<MultiVector>& /* coarseNullspace */) const {
975 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
976 }
977
978 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
979 bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
980 isGoodMap(const Map& rowMap, const Map& colMap) const {
981 auto rowLocalMap = rowMap.getLocalMap();
982 auto colLocalMap = colMap.getLocalMap();
983
984 const size_t numRows = rowLocalMap.getNodeNumElements();
985 const size_t numCols = colLocalMap.getNodeNumElements();
986
987 if (numCols < numRows)
988 return false;
989
990 size_t numDiff = 0;
991 Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
992 KOKKOS_LAMBDA(const LO i, size_t &diff) {
993 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
994 }, numDiff);
995
996 return (numDiff == 0);
997 }
998
999} //namespace MueLu
1000
1001#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1002#endif // HAVE_MUELU_KOKKOS_REFACTOR
1003#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
View
#define SET_VALID_ENTRY(name)
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.