// Tpetra parallel linear algebra, "Version of the Day" snapshot.
// Tpetra_BlockCrsMatrix_def.hpp
// (Doxygen page header retained as a comment so the file stays valid C++.)
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
42
// NOTE(review): the file-level \file/\brief documentation block (original
// lines 43-48) was lost in the documentation extraction.
49#include "Tpetra_BlockMultiVector.hpp"
50#include "Tpetra_BlockView.hpp"
51
52#include "Teuchos_TimeMonitor.hpp"
53#ifdef HAVE_TPETRA_DEBUG
54# include <set>
55#endif // HAVE_TPETRA_DEBUG
56
57//
58// mfh 25 May 2016: Temporary fix for #393.
59//
60// Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
61// 4.7.2 compiler bug ("internal compiler error") when compiling them.
62// Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
63// use them in that case, either.
64//
65// mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
66// compiler error") too. Ditto for GCC 5.1. I'll just disable the
67// thing for any GCC version.
68//
69#if defined(__CUDACC__)
70 // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
71# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
73# endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74
75#elif defined(__GNUC__)
76
77# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
79# endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80
81#else // some other compiler
82
83 // Optimistically assume that other compilers aren't broken.
84# if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85# define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
86# endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87#endif // defined(__CUDACC__), defined(__GNUC__)
88
89
90namespace Tpetra {
91
92namespace Impl {
93
94 template<typename T>
95 struct BlockCrsRowStruct {
96 T totalNumEntries, totalNumBytes, maxRowLength;
97 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
98 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
99 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
100 : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
101 };
102
103 template<typename T>
104 static
105 KOKKOS_INLINE_FUNCTION
106 void operator+=(volatile BlockCrsRowStruct<T> &a,
107 volatile const BlockCrsRowStruct<T> &b) {
108 a.totalNumEntries += b.totalNumEntries;
109 a.totalNumBytes += b.totalNumBytes;
110 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
111 }
112
113 template<typename T>
114 static
115 KOKKOS_INLINE_FUNCTION
116 void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
117 a.totalNumEntries += b.totalNumEntries;
118 a.totalNumBytes += b.totalNumBytes;
119 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
120 }
121
122 template<typename T, typename ExecSpace>
123 struct BlockCrsReducer {
124 typedef BlockCrsReducer reducer;
125 typedef T value_type;
126 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
127 value_type *value;
128
129 KOKKOS_INLINE_FUNCTION
130 BlockCrsReducer(value_type &val) : value(&val) {}
131
132 KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
133 KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const { dst += src; }
134 KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
135 KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
136 KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
137 };
138
139 template<class AlphaCoeffType,
140 class GraphType,
141 class MatrixValuesType,
142 class InVecType,
143 class BetaCoeffType,
144 class OutVecType,
145 bool IsBuiltInType>
146 class BcrsApplyNoTransFunctor {
147 private:
148 static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
149 "MatrixValuesType must be a Kokkos::View.");
150 static_assert (Kokkos::Impl::is_view<OutVecType>::value,
151 "OutVecType must be a Kokkos::View.");
152 static_assert (Kokkos::Impl::is_view<InVecType>::value,
153 "InVecType must be a Kokkos::View.");
154 static_assert (std::is_same<MatrixValuesType,
155 typename MatrixValuesType::const_type>::value,
156 "MatrixValuesType must be a const Kokkos::View.");
157 static_assert (std::is_same<OutVecType,
158 typename OutVecType::non_const_type>::value,
159 "OutVecType must be a nonconst Kokkos::View.");
160 static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
161 "InVecType must be a const Kokkos::View.");
162 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
163 "MatrixValuesType must be a rank-1 Kokkos::View.");
164 static_assert (static_cast<int> (InVecType::rank) == 1,
165 "InVecType must be a rank-1 Kokkos::View.");
166 static_assert (static_cast<int> (OutVecType::rank) == 1,
167 "OutVecType must be a rank-1 Kokkos::View.");
168 typedef typename MatrixValuesType::non_const_value_type scalar_type;
169
170 public:
171 typedef typename GraphType::device_type device_type;
172
174 typedef typename std::remove_const<typename GraphType::data_type>::type
176
183 void setX (const InVecType& X) { X_ = X; }
184
191 void setY (const OutVecType& Y) { Y_ = Y; }
192
193 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
194 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
195
197 BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
198 const GraphType& graph,
199 const MatrixValuesType& val,
200 const local_ordinal_type blockSize,
201 const InVecType& X,
202 const beta_coeff_type& beta,
203 const OutVecType& Y) :
204 alpha_ (alpha),
205 ptr_ (graph.row_map),
206 ind_ (graph.entries),
207 val_ (val),
208 blockSize_ (blockSize),
209 X_ (X),
210 beta_ (beta),
211 Y_ (Y)
212 {}
213
214 // Dummy team version
215 KOKKOS_INLINE_FUNCTION void
216 operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
217 {
218 Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
219 }
220
221 // Range Policy for non built-in types
222 KOKKOS_INLINE_FUNCTION void
223 operator () (const local_ordinal_type& lclRow) const
224 {
229 using Kokkos::Details::ArithTraits;
230 // I'm not writing 'using Kokkos::make_pair;' here, because that
231 // may break builds for users who make the mistake of putting
232 // 'using namespace std;' in the global namespace. Please don't
233 // ever do that! But just in case you do, I'll take this
234 // precaution.
235 using Kokkos::parallel_for;
236 using Kokkos::subview;
237 typedef typename decltype (ptr_)::non_const_value_type offset_type;
238 typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
239 Kokkos::LayoutRight,
240 device_type,
241 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
242 little_block_type;
243
244 const offset_type Y_ptBeg = lclRow * blockSize_;
245 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
246 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
247
248 // This version of the code does not use temporary storage.
249 // Each thread writes to its own block of the target vector.
250 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
251 FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
252 }
253 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
254 SCAL (beta_, Y_cur);
255 }
256
257 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
258 const offset_type blkBeg = ptr_[lclRow];
259 const offset_type blkEnd = ptr_[lclRow+1];
260 // Precompute to save integer math in the inner loop.
261 const offset_type bs2 = blockSize_ * blockSize_;
262 for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
263 ++absBlkOff) {
264 little_block_type A_cur (val_.data () + absBlkOff * bs2,
265 blockSize_, blockSize_);
266 const offset_type X_blkCol = ind_[absBlkOff];
267 const offset_type X_ptBeg = X_blkCol * blockSize_;
268 const offset_type X_ptEnd = X_ptBeg + blockSize_;
269 auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
270
271 GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
272 } // for each entry in current local block row of matrix
273 }
274 }
275
276 private:
277 alpha_coeff_type alpha_;
278 typename GraphType::row_map_type::const_type ptr_;
279 typename GraphType::entries_type::const_type ind_;
280 MatrixValuesType val_;
281 local_ordinal_type blockSize_;
282 InVecType X_;
283 beta_coeff_type beta_;
284 OutVecType Y_;
285 };
286
287 template<class AlphaCoeffType,
288 class GraphType,
289 class MatrixValuesType,
290 class InVecType,
291 class BetaCoeffType,
292 class OutVecType>
293 class BcrsApplyNoTransFunctor<AlphaCoeffType,
294 GraphType,
295 MatrixValuesType,
296 InVecType,
297 BetaCoeffType,
298 OutVecType,
299 true> {
300 private:
301 static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
302 "MatrixValuesType must be a Kokkos::View.");
303 static_assert (Kokkos::Impl::is_view<OutVecType>::value,
304 "OutVecType must be a Kokkos::View.");
305 static_assert (Kokkos::Impl::is_view<InVecType>::value,
306 "InVecType must be a Kokkos::View.");
307 static_assert (std::is_same<MatrixValuesType,
308 typename MatrixValuesType::const_type>::value,
309 "MatrixValuesType must be a const Kokkos::View.");
310 static_assert (std::is_same<OutVecType,
311 typename OutVecType::non_const_type>::value,
312 "OutVecType must be a nonconst Kokkos::View.");
313 static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
314 "InVecType must be a const Kokkos::View.");
315 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
316 "MatrixValuesType must be a rank-1 Kokkos::View.");
317 static_assert (static_cast<int> (InVecType::rank) == 1,
318 "InVecType must be a rank-1 Kokkos::View.");
319 static_assert (static_cast<int> (OutVecType::rank) == 1,
320 "OutVecType must be a rank-1 Kokkos::View.");
321 typedef typename MatrixValuesType::non_const_value_type scalar_type;
322
323 public:
324 typedef typename GraphType::device_type device_type;
325
327 typedef typename std::remove_const<typename GraphType::data_type>::type
329
336 void setX (const InVecType& X) { X_ = X; }
337
344 void setY (const OutVecType& Y) { Y_ = Y; }
345
346 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
347 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
348
350 BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
351 const GraphType& graph,
352 const MatrixValuesType& val,
353 const local_ordinal_type blockSize,
354 const InVecType& X,
355 const beta_coeff_type& beta,
356 const OutVecType& Y) :
357 alpha_ (alpha),
358 ptr_ (graph.row_map),
359 ind_ (graph.entries),
360 val_ (val),
361 blockSize_ (blockSize),
362 X_ (X),
363 beta_ (beta),
364 Y_ (Y)
365 {}
366
367 // dummy Range version
368 KOKKOS_INLINE_FUNCTION void
369 operator () (const local_ordinal_type& lclRow) const
370 {
371 Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
372 }
373
374 // Team policy for built-in types
375 KOKKOS_INLINE_FUNCTION void
376 operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
377 {
378 const local_ordinal_type lclRow = member.league_rank();
379
380 using Kokkos::Details::ArithTraits;
381 // I'm not writing 'using Kokkos::make_pair;' here, because that
382 // may break builds for users who make the mistake of putting
383 // 'using namespace std;' in the global namespace. Please don't
384 // ever do that! But just in case you do, I'll take this
385 // precaution.
386 using Kokkos::parallel_for;
387 using Kokkos::subview;
388 typedef typename decltype (ptr_)::non_const_value_type offset_type;
389 typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
390 Kokkos::LayoutRight,
391 device_type,
392 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
393 little_block_type;
394
395 const offset_type Y_ptBeg = lclRow * blockSize_;
396 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
397 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
398
399 // This version of the code does not use temporary storage.
400 // Each thread writes to its own block of the target vector.
401 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
402 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
403 [&](const local_ordinal_type &i) {
404 Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
405 });
406 }
407 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
408 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
409 [&](const local_ordinal_type &i) {
410 Y_cur(i) *= beta_;
411 });
412 }
413 member.team_barrier();
414
415 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
416 const offset_type blkBeg = ptr_[lclRow];
417 const offset_type blkEnd = ptr_[lclRow+1];
418 // Precompute to save integer math in the inner loop.
419 const offset_type bs2 = blockSize_ * blockSize_;
420 little_block_type A_cur (val_.data (), blockSize_, blockSize_);
421 auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
422 Kokkos::parallel_for
423 (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
424 [&](const local_ordinal_type &absBlkOff) {
425 A_cur.assign_data(val_.data () + absBlkOff * bs2);
426 const offset_type X_blkCol = ind_[absBlkOff];
427 const offset_type X_ptBeg = X_blkCol * blockSize_;
428 X_cur.assign_data(&X_(X_ptBeg));
429
430 Kokkos::parallel_for
431 (Kokkos::ThreadVectorRange(member, blockSize_),
432 [&](const local_ordinal_type &k0) {
433 scalar_type val(0);
434 for (local_ordinal_type k1=0;k1<blockSize_;++k1)
435 val += A_cur(k0,k1)*X_cur(k1);
436#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
437 // host space team size is always 1
438 Y_cur(k0) += alpha_*val;
439#else
440 // cuda space team size can be larger than 1
441 // atomic is not allowed for sacado type;
442 // thus this needs to be specialized or
443 // sacado atomic should be supported.
444 Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
445#endif
446 });
447 }); // for each entry in current local block row of matrix
448 }
449 }
450
451 private:
452 alpha_coeff_type alpha_;
453 typename GraphType::row_map_type::const_type ptr_;
454 typename GraphType::entries_type::const_type ind_;
455 MatrixValuesType val_;
456 local_ordinal_type blockSize_;
457 InVecType X_;
458 beta_coeff_type beta_;
459 OutVecType Y_;
460 };
461
462
463 template<class AlphaCoeffType,
464 class GraphType,
465 class MatrixValuesType,
466 class InMultiVecType,
467 class BetaCoeffType,
468 class OutMultiVecType>
469 void
470 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
471 const GraphType& graph,
472 const MatrixValuesType& val,
473 const typename std::remove_const<typename GraphType::data_type>::type blockSize,
474 const InMultiVecType& X,
475 const BetaCoeffType& beta,
476 const OutMultiVecType& Y
477 )
478 {
479 static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
480 "MatrixValuesType must be a Kokkos::View.");
481 static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
482 "OutMultiVecType must be a Kokkos::View.");
483 static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
484 "InMultiVecType must be a Kokkos::View.");
485 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
486 "MatrixValuesType must be a rank-1 Kokkos::View.");
487 static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
488 "OutMultiVecType must be a rank-2 Kokkos::View.");
489 static_assert (static_cast<int> (InMultiVecType::rank) == 2,
490 "InMultiVecType must be a rank-2 Kokkos::View.");
491
492 typedef typename GraphType::device_type::execution_space execution_space;
493 typedef typename MatrixValuesType::const_type matrix_values_type;
494 typedef typename OutMultiVecType::non_const_type out_multivec_type;
495 typedef typename InMultiVecType::const_type in_multivec_type;
496 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
497 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
498 typedef typename std::remove_const<typename GraphType::data_type>::type LO;
499
500 constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
501
502 const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
503 static_cast<LO> (0) :
504 static_cast<LO> (graph.row_map.extent (0) - 1);
505 const LO numVecs = Y.extent (1);
506 if (numLocalMeshRows == 0 || numVecs == 0) {
507 return; // code below doesn't handle numVecs==0 correctly
508 }
509
510 // These assignments avoid instantiating the functor extra times
511 // unnecessarily, e.g., for X const vs. nonconst. We only need the
512 // X const case, so only instantiate for that case.
513 in_multivec_type X_in = X;
514 out_multivec_type Y_out = Y;
515
516 // The functor only knows how to handle one vector at a time, and it
517 // expects 1-D Views. Thus, we need to know the type of each column
518 // of X and Y.
519 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
520 typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
521 typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
522 matrix_values_type, in_vec_type, beta_type, out_vec_type,
523 is_builtin_type_enabled> functor_type;
524
525 auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
526 auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
527
528 // Compute the first column of Y.
529 if (is_builtin_type_enabled) {
530 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
531 // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
532 typedef Kokkos::TeamPolicy<execution_space> policy_type;
533 policy_type policy(1,1);
534#if defined(KOKKOS_ENABLE_CUDA)
535 constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
536#else
537 constexpr bool is_cuda = false;
538#endif // defined(KOKKOS_ENABLE_CUDA)
539 if (is_cuda) {
540 LO team_size = 8, vector_size = 1;
541 if (blockSize <= 5) vector_size = 4;
542 else if (blockSize <= 9) vector_size = 8;
543 else if (blockSize <= 12) vector_size = 12;
544 else if (blockSize <= 20) vector_size = 20;
545 else vector_size = 20;
546 policy = policy_type(numLocalMeshRows, team_size, vector_size);
547 } else {
548 policy = policy_type(numLocalMeshRows, 1, 1);
549 }
550 Kokkos::parallel_for (policy, functor);
551
552 // Compute the remaining columns of Y.
553 for (LO j = 1; j < numVecs; ++j) {
554 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
555 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
556 functor.setX (X_j);
557 functor.setY (Y_j);
558 Kokkos::parallel_for (policy, functor);
559 }
560 } else {
561 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
562 Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
563 Kokkos::parallel_for (policy, functor);
564 for (LO j = 1; j < numVecs; ++j) {
565 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
566 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
567 functor.setX (X_j);
568 functor.setY (Y_j);
569 Kokkos::parallel_for (policy, functor);
570 }
571 }
572 }
573} // namespace Impl
574
575namespace { // (anonymous)
576
577// Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
578// version that takes two Kokkos::View arguments).
579template<class Scalar, class LO, class GO, class Node>
580class GetLocalDiagCopy {
581public:
582 typedef typename Node::device_type device_type;
583 typedef size_t diag_offset_type;
584 typedef Kokkos::View<const size_t*, device_type,
585 Kokkos::MemoryUnmanaged> diag_offsets_type;
586 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
587 typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
588 typedef typename local_graph_device_type::row_map_type row_offsets_type;
589 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
590 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
591 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
592
593 // Constructor
594 GetLocalDiagCopy (const diag_type& diag,
595 const values_type& val,
596 const diag_offsets_type& diagOffsets,
597 const row_offsets_type& ptr,
598 const LO blockSize) :
599 diag_ (diag),
600 diagOffsets_ (diagOffsets),
601 ptr_ (ptr),
602 blockSize_ (blockSize),
603 offsetPerBlock_ (blockSize_*blockSize_),
604 val_(val)
605 {}
606
607 KOKKOS_INLINE_FUNCTION void
608 operator() (const LO& lclRowInd) const
609 {
610 using Kokkos::ALL;
611
612 // Get row offset
613 const size_t absOffset = ptr_[lclRowInd];
614
615 // Get offset relative to start of row
616 const size_t relOffset = diagOffsets_[lclRowInd];
617
618 // Get the total offset
619 const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
620
621 // Get a view of the block. BCRS currently uses LayoutRight
622 // regardless of the device.
623 typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
624 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
625 const_little_block_type;
626 const_little_block_type D_in (val_.data () + pointOffset,
627 blockSize_, blockSize_);
628 auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
629 COPY (D_in, D_out);
630 }
631
632 private:
633 diag_type diag_;
634 diag_offsets_type diagOffsets_;
635 row_offsets_type ptr_;
636 LO blockSize_;
637 LO offsetPerBlock_;
638 values_type val_;
639 };
640} // namespace (anonymous)
641
642 template<class Scalar, class LO, class GO, class Node>
643 std::ostream&
644 BlockCrsMatrix<Scalar, LO, GO, Node>::
645 markLocalErrorAndGetStream ()
646 {
647 * (this->localError_) = true;
648 if ((*errs_).is_null ()) {
649 *errs_ = Teuchos::rcp (new std::ostringstream ());
650 }
651 return **errs_;
652 }
653
654 template<class Scalar, class LO, class GO, class Node>
657 dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
658 graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
659 blockSize_ (static_cast<LO> (0)),
660 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
661 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
662 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
663 offsetPerBlock_ (0),
664 localError_ (new bool (false)),
665 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
666 {
667 }
668
669 template<class Scalar, class LO, class GO, class Node>
671 BlockCrsMatrix (const crs_graph_type& graph,
672 const LO blockSize) :
673 dist_object_type (graph.getMap ()),
674 graph_ (graph),
675 rowMeshMap_ (* (graph.getRowMap ())),
676 blockSize_ (blockSize),
677 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
678 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
679 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
680 offsetPerBlock_ (blockSize * blockSize),
681 localError_ (new bool (false)),
682 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
683 {
685 TEUCHOS_TEST_FOR_EXCEPTION(
686 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
687 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
688 "rows (isSorted() is false). This class assumes sorted rows.");
689
690 graphRCP_ = Teuchos::rcpFromRef(graph_);
691
692 // Trick to test whether LO is nonpositive, without a compiler
693 // warning in case LO is unsigned (which is generally a bad idea
694 // anyway). I don't promise that the trick works, but it
695 // generally does with gcc at least, in my experience.
696 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
697 TEUCHOS_TEST_FOR_EXCEPTION(
698 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
699 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
700 " <= 0. The block size must be positive.");
701
702 domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
703 rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
704
705 {
706 // These are rcp
707 const auto domainPointMap = getDomainMap();
708 const auto colPointMap = Teuchos::rcp
709 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
710 *pointImporter_ = Teuchos::rcp
711 (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
712 }
713 {
714 auto local_graph_h = graph.getLocalGraphHost ();
715 auto ptr_h = local_graph_h.row_map;
716 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
717 Kokkos::deep_copy(ptrHost_, ptr_h);
718
719 auto ind_h = local_graph_h.entries;
720 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
721 Kokkos::deep_copy (indHost_, ind_h);
722
723 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
724 val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
725 }
726 }
727
728 template<class Scalar, class LO, class GO, class Node>
730 BlockCrsMatrix (const crs_graph_type& graph,
731 const map_type& domainPointMap,
732 const map_type& rangePointMap,
733 const LO blockSize) :
734 dist_object_type (graph.getMap ()),
735 graph_ (graph),
736 rowMeshMap_ (* (graph.getRowMap ())),
737 domainPointMap_ (domainPointMap),
738 rangePointMap_ (rangePointMap),
739 blockSize_ (blockSize),
740 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
741 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
742 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
743 offsetPerBlock_ (blockSize * blockSize),
744 localError_ (new bool (false)),
745 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
746 {
747 TEUCHOS_TEST_FOR_EXCEPTION(
748 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
749 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
750 "rows (isSorted() is false). This class assumes sorted rows.");
751
752 graphRCP_ = Teuchos::rcpFromRef(graph_);
753
754 // Trick to test whether LO is nonpositive, without a compiler
755 // warning in case LO is unsigned (which is generally a bad idea
756 // anyway). I don't promise that the trick works, but it
757 // generally does with gcc at least, in my experience.
758 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
759 TEUCHOS_TEST_FOR_EXCEPTION(
760 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
761 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
762 " <= 0. The block size must be positive.");
763 {
764 // These are rcp
765 const auto rcpDomainPointMap = getDomainMap();
766 const auto colPointMap = Teuchos::rcp
767 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
768 *pointImporter_ = Teuchos::rcp
769 (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
770 }
771 {
772 auto local_graph_h = graph.getLocalGraphHost ();
773 auto ptr_h = local_graph_h.row_map;
774 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
775 Kokkos::deep_copy(ptrHost_, ptr_h);
776
777 auto ind_h = local_graph_h.entries;
778 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
779 Kokkos::deep_copy (indHost_, ind_h);
780
781 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
782 val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
783 }
784 }
785
786 template<class Scalar, class LO, class GO, class Node>
787 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
789 getDomainMap () const
790 { // Copy constructor of map_type does a shallow copy.
791 // We're only returning by RCP for backwards compatibility.
792 return Teuchos::rcp (new map_type (domainPointMap_));
793 }
794
795 template<class Scalar, class LO, class GO, class Node>
796 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
798 getRangeMap () const
799 { // Copy constructor of map_type does a shallow copy.
800 // We're only returning by RCP for backwards compatibility.
801 return Teuchos::rcp (new map_type (rangePointMap_));
802 }
803
804 template<class Scalar, class LO, class GO, class Node>
805 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
807 getRowMap () const
808 {
809 return graph_.getRowMap();
810 }
811
812 template<class Scalar, class LO, class GO, class Node>
813 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
815 getColMap () const
816 {
817 return graph_.getColMap();
818 }
819
820 template<class Scalar, class LO, class GO, class Node>
823 getGlobalNumRows() const
824 {
825 return graph_.getGlobalNumRows();
826 }
827
828 template<class Scalar, class LO, class GO, class Node>
829 size_t
831 getNodeNumRows() const
832 {
833 return graph_.getNodeNumRows();
834 }
835
836 template<class Scalar, class LO, class GO, class Node>
837 size_t
840 {
841 return graph_.getNodeMaxNumRowEntries();
842 }
843
844 template<class Scalar, class LO, class GO, class Node>
845 void
847 apply (const mv_type& X,
848 mv_type& Y,
849 Teuchos::ETransp mode,
850 Scalar alpha,
851 Scalar beta) const
852 {
854 TEUCHOS_TEST_FOR_EXCEPTION(
855 mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
856 std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
857 "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
858 "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
859
860 BMV X_view;
861 BMV Y_view;
862 const LO blockSize = getBlockSize ();
863 try {
864 X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
865 Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
866 }
867 catch (std::invalid_argument& e) {
868 TEUCHOS_TEST_FOR_EXCEPTION(
869 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
870 "apply: Either the input MultiVector X or the output MultiVector Y "
871 "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
872 "graph. BlockMultiVector's constructor threw the following exception: "
873 << e.what ());
874 }
875
876 try {
877 // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
878 // either that or mark fields of this class as 'mutable'. The
879 // problem is that applyBlock wants to do lazy initialization of
880 // temporary block multivectors.
881 const_cast<this_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
882 } catch (std::invalid_argument& e) {
883 TEUCHOS_TEST_FOR_EXCEPTION(
884 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
885 "apply: The implementation method applyBlock complained about having "
886 "an invalid argument. It reported the following: " << e.what ());
887 } catch (std::logic_error& e) {
888 TEUCHOS_TEST_FOR_EXCEPTION(
889 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
890 "apply: The implementation method applyBlock complained about a "
891 "possible bug in its implementation. It reported the following: "
892 << e.what ());
893 } catch (std::exception& e) {
894 TEUCHOS_TEST_FOR_EXCEPTION(
895 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
896 "apply: The implementation method applyBlock threw an exception which "
897 "is neither std::invalid_argument nor std::logic_error, but is a "
898 "subclass of std::exception. It reported the following: "
899 << e.what ());
900 } catch (...) {
901 TEUCHOS_TEST_FOR_EXCEPTION(
902 true, std::logic_error, "Tpetra::BlockCrsMatrix::"
903 "apply: The implementation method applyBlock threw an exception which "
904 "is not an instance of a subclass of std::exception. This probably "
905 "indicates a bug. Please report this to the Tpetra developers.");
906 }
907 }
908
909 template<class Scalar, class LO, class GO, class Node>
910 void
914 Teuchos::ETransp mode,
915 const Scalar alpha,
916 const Scalar beta)
917 {
918 TEUCHOS_TEST_FOR_EXCEPTION(
919 X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
920 "Tpetra::BlockCrsMatrix::applyBlock: "
921 "X and Y have different block sizes. X.getBlockSize() = "
922 << X.getBlockSize () << " != Y.getBlockSize() = "
923 << Y.getBlockSize () << ".");
924
925 if (mode == Teuchos::NO_TRANS) {
926 applyBlockNoTrans (X, Y, alpha, beta);
927 } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
928 applyBlockTrans (X, Y, mode, alpha, beta);
929 } else {
930 TEUCHOS_TEST_FOR_EXCEPTION(
931 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
932 "applyBlock: Invalid 'mode' argument. Valid values are "
933 "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
934 }
935 }
936
937 template<class Scalar, class LO, class GO, class Node>
938 void
940 setAllToScalar (const Scalar& alpha)
941 {
942 auto val_d = val_.getDeviceView(Access::OverwriteAll);
943 Kokkos::deep_copy(val_d, alpha);
944 }
945
946 template<class Scalar, class LO, class GO, class Node>
947 LO
949 replaceLocalValues (const LO localRowInd,
950 const LO colInds[],
951 const Scalar vals[],
952 const LO numColInds) const
953 {
954 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
955 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
956 ptrdiff_t * offsets = offsets_host_view.data();
957 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
958 const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
959 return validCount;
960 }
961
962 template <class Scalar, class LO, class GO, class Node>
963 void
965 getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
966 Kokkos::MemoryUnmanaged>& offsets) const
967 {
968 graph_.getLocalDiagOffsets (offsets);
969 }
970
971 template <class Scalar, class LO, class GO, class Node>
972 void
974 getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
975 Kokkos::MemoryUnmanaged>& diag,
976 const Kokkos::View<const size_t*, device_type,
977 Kokkos::MemoryUnmanaged>& offsets) const
978 {
979 using Kokkos::parallel_for;
980 const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
981
982 const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
983 const LO blockSize = this->getBlockSize ();
984 TEUCHOS_TEST_FOR_EXCEPTION
985 (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
986 static_cast<LO> (diag.extent (1)) < blockSize ||
987 static_cast<LO> (diag.extent (2)) < blockSize,
988 std::invalid_argument, prefix <<
989 "The input Kokkos::View is not big enough to hold all the data.");
990 TEUCHOS_TEST_FOR_EXCEPTION
991 (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
992 prefix << "offsets.size() = " << offsets.size () << " < local number of "
993 "diagonal blocks " << lclNumMeshRows << ".");
994
995 typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
996 typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
997
998 // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
999 // we reserve the right to do lazy allocation of device data. (We
1000 // don't plan to do lazy allocation for host data; the host
1001 // version of the data always exists.)
1002 auto val_d = val_.getDeviceView(Access::ReadOnly);
1003 parallel_for (policy_type (0, lclNumMeshRows),
1004 functor_type (diag, val_d, offsets,
1005 graph_.getLocalGraphDevice ().row_map, blockSize_));
1006 }
1007
1008#ifdef TPETRA_ENABLE_DEPRECATED_CODE
1009 template <class Scalar, class LO, class GO, class Node>
1010 void
1012 getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1013 Kokkos::MemoryUnmanaged>& diag,
1014 const Teuchos::ArrayView<const size_t>& offsets) const
1015 {
1016 auto offsets_view_host = Kokkos::View<size_t*,Kokkos::HostSpace>(const_cast<size_t*>(offsets.getRawPtr()), offsets.size());
1017 auto offsets_view_device = Kokkos::create_mirror_view_and_copy(typename device_type::memory_space(), offsets_view_host);
1018 getLocalDiagCopy(diag, offsets_view_device);
1019 Kokkos::deep_copy(offsets_view_host, offsets_view_device);
1020 }
1021#endif
1022
1023 template<class Scalar, class LO, class GO, class Node>
1024 LO
1026 absMaxLocalValues (const LO localRowInd,
1027 const LO colInds[],
1028 const Scalar vals[],
1029 const LO numColInds) const
1030 {
1031 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1032 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1033 ptrdiff_t * offsets = offsets_host_view.data();
1034 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1035 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1036 return validCount;
1037 }
1038
1039
1040 template<class Scalar, class LO, class GO, class Node>
1041 LO
1043 sumIntoLocalValues (const LO localRowInd,
1044 const LO colInds[],
1045 const Scalar vals[],
1046 const LO numColInds) const
1047 {
1048 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1049 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1050 ptrdiff_t * offsets = offsets_host_view.data();
1051 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1052 const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1053 return validCount;
1054 }
1055#ifdef TPETRA_ENABLE_DEPRECATED_CODE
1056 template<class Scalar, class LO, class GO, class Node>
1057 LO
1059 getLocalRowView (const LO localRowInd,
1060 const LO*& colInds,
1061 Scalar*& vals,
1062 LO& numInds) const
1063 {
1064
1065 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1066 colInds = NULL;
1067 vals = NULL;
1068 numInds = 0;
1069 return Teuchos::OrdinalTraits<LO>::invalid ();
1070 }
1071 else {
1072 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1073 colInds = indHost_.data () + absBlockOffsetStart;
1074
1075 auto vals_host_out = getValuesHost (localRowInd);
1076 impl_scalar_type* vals_host_out_raw = const_cast<impl_scalar_type*>(vals_host_out.data ());
1077 vals = reinterpret_cast<Scalar*> (vals_host_out_raw);
1078 numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1079 return 0; // indicates no error
1080 }
1081 }
1082#endif
1083
1084 template<class Scalar, class LO, class GO, class Node>
1085 void
1087 getLocalRowCopy (LO LocalRow,
1088 nonconst_local_inds_host_view_type &Indices,
1089 nonconst_values_host_view_type &Values,
1090 size_t& NumEntries) const
1091 {
1092 auto vals = getValuesHost(LocalRow);
1093 const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
1094 if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
1095 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1096 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1097 << numInds << " row entries");
1098 }
1099 const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
1100 for (LO i=0; i<numInds; ++i) {
1101 Indices[i] = colInds[i];
1102 }
1103 for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1104 Values[i] = vals[i];
1105 }
1106 NumEntries = numInds;
1107 }
1108
1109#ifdef TPETRA_ENABLE_DEPRECATED_CODE
1110 template<class Scalar, class LO, class GO, class Node>
1111 void
1113 getLocalRowCopy (LO LocalRow,
1114 const Teuchos::ArrayView<LO>& Indices,
1115 const Teuchos::ArrayView<Scalar>& Values,
1116 size_t &NumEntries) const
1117 {
1118 auto vals = getValuesHost(LocalRow);
1119 const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
1120 if (numInds > (LO)Indices.size() || numInds*blockSize_*blockSize_ > (LO)Values.size()) {
1121 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1122 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1123 << numInds << " row entries");
1124 }
1125 const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
1126 for (LO i=0; i<numInds; ++i) {
1127 Indices[i] = colInds[i];
1128 }
1129 for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1130 Values[i] = vals[i];
1131 }
1132 NumEntries = numInds;
1133 }
1134#endif
1135
1136 template<class Scalar, class LO, class GO, class Node>
1137 LO
1139 getLocalRowOffsets (const LO localRowInd,
1140 ptrdiff_t offsets[],
1141 const LO colInds[],
1142 const LO numColInds) const
1143 {
1144 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1145 // We got no offsets, because the input local row index is
1146 // invalid on the calling process. That may not be an error, if
1147 // numColInds is zero anyway; it doesn't matter. This is the
1148 // advantage of returning the number of valid indices.
1149 return static_cast<LO> (0);
1150 }
1151
1152 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1153 LO hint = 0; // Guess for the relative offset into the current row
1154 LO validCount = 0; // number of valid column indices in colInds
1155
1156 for (LO k = 0; k < numColInds; ++k) {
1157 const LO relBlockOffset =
1158 this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1159 if (relBlockOffset != LINV) {
1160 offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1161 hint = relBlockOffset + 1;
1162 ++validCount;
1163 }
1164 }
1165 return validCount;
1166 }
1167
1168
1169 template<class Scalar, class LO, class GO, class Node>
1170 LO
1172 replaceLocalValuesByOffsets (const LO localRowInd,
1173 const ptrdiff_t offsets[],
1174 const Scalar vals[],
1175 const LO numOffsets) const
1176 {
1177 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1178 // We modified no values, because the input local row index is
1179 // invalid on the calling process. That may not be an error, if
1180 // numColInds is zero anyway; it doesn't matter. This is the
1181 // advantage of returning the number of valid indices.
1182 return static_cast<LO> (0);
1183 }
1184 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1186 auto val_out = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
1187 impl_scalar_type* vOut = val_out.data();
1188
1189 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1190 const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
1191 size_t pointOffset = 0; // Current offset into input values
1192 LO validCount = 0; // number of valid offsets
1193
1194 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1195 const size_t blockOffset = offsets[k]*perBlockSize;
1196 if (offsets[k] != STINV) {
1197 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1198 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1199 COPY (A_new, A_old);
1200 ++validCount;
1201 }
1202 }
1203 return validCount;
1204 }
1205
1206
1207 template<class Scalar, class LO, class GO, class Node>
1208 LO
1210 absMaxLocalValuesByOffsets (const LO localRowInd,
1211 const ptrdiff_t offsets[],
1212 const Scalar vals[],
1213 const LO numOffsets) const
1214 {
1215 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1216 // We modified no values, because the input local row index is
1217 // invalid on the calling process. That may not be an error, if
1218 // numColInds is zero anyway; it doesn't matter. This is the
1219 // advantage of returning the number of valid indices.
1220 return static_cast<LO> (0);
1221 }
1222 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1223 auto val_out = getValuesHost(localRowInd);
1224 impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
1225
1226 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1227 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1228 size_t pointOffset = 0; // Current offset into input values
1229 LO validCount = 0; // number of valid offsets
1230
1231 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1232 const size_t blockOffset = offsets[k]*perBlockSize;
1233 if (blockOffset != STINV) {
1234 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1235 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1236 ::Tpetra::Impl::absMax (A_old, A_new);
1237 ++validCount;
1238 }
1239 }
1240 return validCount;
1241 }
1242
1243
1244 template<class Scalar, class LO, class GO, class Node>
1245 LO
1247 sumIntoLocalValuesByOffsets (const LO localRowInd,
1248 const ptrdiff_t offsets[],
1249 const Scalar vals[],
1250 const LO numOffsets) const
1251 {
1252 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1253 // We modified no values, because the input local row index is
1254 // invalid on the calling process. That may not be an error, if
1255 // numColInds is zero anyway; it doesn't matter. This is the
1256 // advantage of returning the number of valid indices.
1257 return static_cast<LO> (0);
1258 }
1259 const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1260 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1262 auto val_out = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
1263 impl_scalar_type* vOut = val_out.data();
1264
1265 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1266 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1267 size_t pointOffset = 0; // Current offset into input values
1268 LO validCount = 0; // number of valid offsets
1269
1270 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1271 const size_t blockOffset = offsets[k]*perBlockSize;
1272 if (blockOffset != STINV) {
1273 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1274 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1275 AXPY (ONE, A_new, A_old);
1276 ++validCount;
1277 }
1278 }
1279 return validCount;
1280 }
1281
1282 template<class Scalar, class LO, class GO, class Node>
1284 impl_scalar_type_dualview::t_host::const_type
1286 getValuesHost () const
1287 {
1288 return val_.getHostView(Access::ReadOnly);
1289 }
1290
1291 template<class Scalar, class LO, class GO, class Node>
1292 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1293 impl_scalar_type_dualview::t_dev::const_type
1294 BlockCrsMatrix<Scalar, LO, GO, Node>::
1295 getValuesDevice () const
1296 {
1297 return val_.getDeviceView(Access::ReadOnly);
1298 }
1299
1300 template<class Scalar, class LO, class GO, class Node>
1301 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1302 impl_scalar_type_dualview::t_host
1304 getValuesHostNonConst () const
1305 {
1306 return val_.getHostView(Access::ReadWrite);
1307 }
1308
1309 template<class Scalar, class LO, class GO, class Node>
1311 impl_scalar_type_dualview::t_dev
1314 {
1315 return val_.getDeviceView(Access::ReadWrite);
1316 }
1317
1318 template<class Scalar, class LO, class GO, class Node>
1319 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1320 impl_scalar_type_dualview::t_host::const_type
1322 getValuesHost (const LO& lclRow) const
1323 {
1324 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1325 auto val = val_.getHostView(Access::ReadOnly);
1326 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1327 return r_val;
1328 }
1329
1330 template<class Scalar, class LO, class GO, class Node>
1332 impl_scalar_type_dualview::t_dev::const_type
1334 getValuesDevice (const LO& lclRow) const
1335 {
1336 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1337 auto val = val_.getDeviceView(Access::ReadOnly);
1338 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1339 return r_val;
1340 }
1341
1342 template<class Scalar, class LO, class GO, class Node>
1345 getValuesHostNonConst (const LO& lclRow)
1346 {
1347 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1348 auto val = val_.getHostView(Access::ReadWrite);
1349 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1350 return r_val;
1351 }
1352
1353 template<class Scalar, class LO, class GO, class Node>
1356 getValuesDeviceNonConst (const LO& lclRow)
1357 {
1358 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1359 auto val = val_.getDeviceView(Access::ReadWrite);
1360 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1361 return r_val;
1362 }
1363
1364 template<class Scalar, class LO, class GO, class Node>
1365 size_t
1367 getNumEntriesInLocalRow (const LO localRowInd) const
1368 {
1369 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1370 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1371 return static_cast<LO> (0); // the calling process doesn't have that row
1372 } else {
1373 return static_cast<LO> (numEntInGraph);
1374 }
1375 }
1376
1377 template<class Scalar, class LO, class GO, class Node>
1378 void
1382 const Teuchos::ETransp mode,
1383 const Scalar alpha,
1384 const Scalar beta)
1385 {
1386 (void) X;
1387 (void) Y;
1388 (void) mode;
1389 (void) alpha;
1390 (void) beta;
1391
1392 TEUCHOS_TEST_FOR_EXCEPTION(
1393 true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
1394 "transpose and conjugate transpose modes are not yet implemented.");
1395 }
1396
1397 template<class Scalar, class LO, class GO, class Node>
1398 void
1399 BlockCrsMatrix<Scalar, LO, GO, Node>::
1400 applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1401 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1402 const Scalar alpha,
1403 const Scalar beta)
1404 {
1405 using Teuchos::RCP;
1406 using Teuchos::rcp;
1407 typedef ::Tpetra::Import<LO, GO, Node> import_type;
1408 typedef ::Tpetra::Export<LO, GO, Node> export_type;
1409 const Scalar zero = STS::zero ();
1410 const Scalar one = STS::one ();
1411 RCP<const import_type> import = graph_.getImporter ();
1412 // "export" is a reserved C++ keyword, so we can't use it.
1413 RCP<const export_type> theExport = graph_.getExporter ();
1414 const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1415
1416 if (alpha == zero) {
1417 if (beta == zero) {
1418 Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1419 }
1420 else if (beta != one) {
1421 Y.scale (beta);
1422 }
1423 }
1424 else { // alpha != 0
1425 const BMV* X_colMap = NULL;
1426 if (import.is_null ()) {
1427 try {
1428 X_colMap = &X;
1429 }
1430 catch (std::exception& e) {
1431 TEUCHOS_TEST_FOR_EXCEPTION
1432 (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1433 "operator= threw an exception: " << std::endl << e.what ());
1434 }
1435 }
1436 else {
1437 // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1438 // Y_rowMap_ below. This lets us do lazy initialization
1439 // correctly with view semantics of BlockCrsMatrix. All views
1440 // of this BlockCrsMatrix have the same outer pointer. That
1441 // way, we can set the inner pointer in one view, and all
1442 // other views will see it.
1443 if ((*X_colMap_).is_null () ||
1444 (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1445 (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1446 *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1447 static_cast<LO> (X.getNumVectors ())));
1448 }
1449 (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1450 **pointImporter_,
1452 try {
1453 X_colMap = &(**X_colMap_);
1454 }
1455 catch (std::exception& e) {
1456 TEUCHOS_TEST_FOR_EXCEPTION
1457 (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1458 "operator= threw an exception: " << std::endl << e.what ());
1459 }
1460 }
1461
1462 BMV* Y_rowMap = NULL;
1463 if (theExport.is_null ()) {
1464 Y_rowMap = &Y;
1465 }
1466 else if ((*Y_rowMap_).is_null () ||
1467 (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1468 (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1469 *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1470 static_cast<LO> (X.getNumVectors ())));
1471 try {
1472 Y_rowMap = &(**Y_rowMap_);
1473 }
1474 catch (std::exception& e) {
1475 TEUCHOS_TEST_FOR_EXCEPTION(
1476 true, std::logic_error, prefix << "Tpetra::MultiVector::"
1477 "operator= threw an exception: " << std::endl << e.what ());
1478 }
1479 }
1480
1481 try {
1482 localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1483 }
1484 catch (std::exception& e) {
1485 TEUCHOS_TEST_FOR_EXCEPTION
1486 (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1487 "an exception: " << e.what ());
1488 }
1489 catch (...) {
1490 TEUCHOS_TEST_FOR_EXCEPTION
1491 (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1492 "an exception not a subclass of std::exception.");
1493 }
1494
1495 if (! theExport.is_null ()) {
1496 Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1497 }
1498 }
1499 }
1500
1501 template<class Scalar, class LO, class GO, class Node>
1502 void
1503 BlockCrsMatrix<Scalar, LO, GO, Node>::
1504 localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1505 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1506 const Scalar alpha,
1507 const Scalar beta)
1508 {
1509 using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1510
1511 const impl_scalar_type alpha_impl = alpha;
1512 const auto graph = this->graph_.getLocalGraphDevice ();
1513 const impl_scalar_type beta_impl = beta;
1514 const LO blockSize = this->getBlockSize ();
1515
1516 auto X_mv = X.getMultiVectorView ();
1517 auto Y_mv = Y.getMultiVectorView ();
1518
1519 //auto X_lcl = X_mv.template getLocalView<device_type> (Access::ReadOnly);
1520 //auto Y_lcl = Y_mv.template getLocalView<device_type> (Access::ReadWrite);
1521 auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
1522 auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
1523 auto val = val_.getDeviceView(Access::ReadWrite);
1524
1525 bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1526 beta_impl, Y_lcl);
1527 }
1528
1529 template<class Scalar, class LO, class GO, class Node>
1530 LO
1531 BlockCrsMatrix<Scalar, LO, GO, Node>::
1532 findRelOffsetOfColumnIndex (const LO localRowIndex,
1533 const LO colIndexToFind,
1534 const LO hint) const
1535 {
1536 const size_t absStartOffset = ptrHost_[localRowIndex];
1537 const size_t absEndOffset = ptrHost_[localRowIndex+1];
1538 const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1539 // Amortize pointer arithmetic over the search loop.
1540 const LO* const curInd = indHost_.data () + absStartOffset;
1541
1542 // If the hint was correct, then the hint is the offset to return.
1543 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1544 // Always return the offset relative to the current row.
1545 return hint;
1546 }
1547
1548 // The hint was wrong, so we must search for the given column
1549 // index in the column indices for the given row.
1550 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1551
1552 // We require that the graph have sorted rows. However, binary
1553 // search only pays if the current row is longer than a certain
1554 // amount. We set this to 32, but you might want to tune this.
1555 const LO maxNumEntriesForLinearSearch = 32;
1556 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1557 // Use binary search. It would probably be better for us to
1558 // roll this loop by hand. If we wrote it right, a smart
1559 // compiler could perhaps use conditional loads and avoid
1560 // branches (according to Jed Brown on May 2014).
1561 const LO* beg = curInd;
1562 const LO* end = curInd + numEntriesInRow;
1563 std::pair<const LO*, const LO*> p =
1564 std::equal_range (beg, end, colIndexToFind);
1565 if (p.first != p.second) {
1566 // offset is relative to the current row
1567 relOffset = static_cast<LO> (p.first - beg);
1568 }
1569 }
1570 else { // use linear search
1571 for (LO k = 0; k < numEntriesInRow; ++k) {
1572 if (colIndexToFind == curInd[k]) {
1573 relOffset = k; // offset is relative to the current row
1574 break;
1575 }
1576 }
1577 }
1578
1579 return relOffset;
1580 }
1581
1582 template<class Scalar, class LO, class GO, class Node>
1583 LO
1584 BlockCrsMatrix<Scalar, LO, GO, Node>::
1585 offsetPerBlock () const
1586 {
1587 return offsetPerBlock_;
1588 }
1589
1590 template<class Scalar, class LO, class GO, class Node>
1592 BlockCrsMatrix<Scalar, LO, GO, Node>::
1593 getConstLocalBlockFromInput (const impl_scalar_type* val,
1594 const size_t pointOffset) const
1595 {
1596 // Row major blocks
1597 const LO rowStride = blockSize_;
1598 return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1599 }
1600
1601 template<class Scalar, class LO, class GO, class Node>
1603 BlockCrsMatrix<Scalar, LO, GO, Node>::
1604 getNonConstLocalBlockFromInput (impl_scalar_type* val,
1605 const size_t pointOffset) const
1606 {
1607 // Row major blocks
1608 const LO rowStride = blockSize_;
1609 return little_block_type (val + pointOffset, blockSize_, rowStride);
1610 }
1611
1612 template<class Scalar, class LO, class GO, class Node>
1613 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1614 BlockCrsMatrix<Scalar, LO, GO, Node>::
1615 getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1616 const size_t pointOffset) const
1617 {
1618 // Row major blocks
1619 const LO rowStride = blockSize_;
1620 const size_t bs2 = blockSize_ * blockSize_;
1621 return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1622 }
1623
1624#ifdef TPETRA_ENABLE_DEPRECATED_CODE
1625 template<class Scalar, class LO, class GO, class Node>
1627 BlockCrsMatrix<Scalar, LO, GO, Node>::
1628 getLocalBlock (const LO localRowInd, const LO localColInd) const
1629 {
1630 using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1631
1632 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1633 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1634
1635 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1636 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1637 auto vals = const_cast<this_type&>(*this).getValuesDeviceNonConst();
1638 return getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1639 }
1640 else {
1641 return little_block_type ();
1642 }
1643 }
1644#endif
1645
1646 template<class Scalar, class LO, class GO, class Node>
1648 BlockCrsMatrix<Scalar, LO, GO, Node>::
1649 getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1650 {
1651 using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1652
1653 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1654 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1655 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1656 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1657 auto vals = const_cast<this_type&>(*this).getValuesDeviceNonConst();
1658 auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1659 return r_val;
1660 }
1661 else {
1662 return little_block_type ();
1663 }
1664 }
1665
1666 template<class Scalar, class LO, class GO, class Node>
1667 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1668 BlockCrsMatrix<Scalar, LO, GO, Node>::
1669 getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1670 {
1671 using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1672
1673 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1674 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1675 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1676 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1677 auto vals = const_cast<this_type&>(*this).getValuesHostNonConst();
1678 auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1679 return r_val;
1680 }
1681 else {
1682 return little_block_host_type ();
1683 }
1684 }
1685
1686
1687 template<class Scalar, class LO, class GO, class Node>
1688 bool
1689 BlockCrsMatrix<Scalar, LO, GO, Node>::
1690 checkSizes (const ::Tpetra::SrcDistObject& source)
1691 {
1692 using std::endl;
1693 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
1694 const this_type* src = dynamic_cast<const this_type* > (&source);
1695
1696 if (src == NULL) {
1697 std::ostream& err = markLocalErrorAndGetStream ();
1698 err << "checkSizes: The source object of the Import or Export "
1699 "must be a BlockCrsMatrix with the same template parameters as the "
1700 "target object." << endl;
1701 }
1702 else {
1703 // Use a string of ifs, not if-elseifs, because we want to know
1704 // all the errors.
1705 if (src->getBlockSize () != this->getBlockSize ()) {
1706 std::ostream& err = markLocalErrorAndGetStream ();
1707 err << "checkSizes: The source and target objects of the Import or "
1708 << "Export must have the same block sizes. The source's block "
1709 << "size = " << src->getBlockSize () << " != the target's block "
1710 << "size = " << this->getBlockSize () << "." << endl;
1711 }
1712 if (! src->graph_.isFillComplete ()) {
1713 std::ostream& err = markLocalErrorAndGetStream ();
1714 err << "checkSizes: The source object of the Import or Export is "
1715 "not fill complete. Both source and target objects must be fill "
1716 "complete." << endl;
1717 }
1718 if (! this->graph_.isFillComplete ()) {
1719 std::ostream& err = markLocalErrorAndGetStream ();
1720 err << "checkSizes: The target object of the Import or Export is "
1721 "not fill complete. Both source and target objects must be fill "
1722 "complete." << endl;
1723 }
1724 if (src->graph_.getColMap ().is_null ()) {
1725 std::ostream& err = markLocalErrorAndGetStream ();
1726 err << "checkSizes: The source object of the Import or Export does "
1727 "not have a column Map. Both source and target objects must have "
1728 "column Maps." << endl;
1729 }
1730 if (this->graph_.getColMap ().is_null ()) {
1731 std::ostream& err = markLocalErrorAndGetStream ();
1732 err << "checkSizes: The target object of the Import or Export does "
1733 "not have a column Map. Both source and target objects must have "
1734 "column Maps." << endl;
1735 }
1736 }
1737
1738 return ! (* (this->localError_));
1739 }
1740
1741 template<class Scalar, class LO, class GO, class Node>
1742 void
1745 (const ::Tpetra::SrcDistObject& source,
1746 const size_t numSameIDs,
1747 const Kokkos::DualView<const local_ordinal_type*,
1748 buffer_device_type>& permuteToLIDs,
1749 const Kokkos::DualView<const local_ordinal_type*,
1750 buffer_device_type>& permuteFromLIDs,
1751 const CombineMode /*CM*/)
1752 {
1753 using ::Tpetra::Details::Behavior;
1755 using ::Tpetra::Details::ProfilingRegion;
1756 using std::endl;
1758
1759 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1760 const bool debug = Behavior::debug();
1761 const bool verbose = Behavior::verbose();
1762
1763 // Define this function prefix
1764 std::string prefix;
1765 {
1766 std::ostringstream os;
1767 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1768 os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1769 prefix = os.str();
1770 }
1771
1772 // check if this already includes a local error
1773 if (* (this->localError_)) {
1774 std::ostream& err = this->markLocalErrorAndGetStream ();
1775 err << prefix
1776 << "The target object of the Import or Export is already in an error state."
1777 << endl;
1778 return;
1779 }
1780
1781 //
1782 // Verbose input dual view status
1783 //
1784 if (verbose) {
1785 std::ostringstream os;
1786 os << prefix << endl
1787 << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1788 << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1789 std::cerr << os.str ();
1790 }
1791
1795 if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1796 std::ostream& err = this->markLocalErrorAndGetStream ();
1797 err << prefix
1798 << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1799 << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1800 << "." << endl;
1801 return;
1802 }
1803 if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1804 std::ostream& err = this->markLocalErrorAndGetStream ();
1805 err << prefix
1806 << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1807 << endl;
1808 return;
1809 }
1810
1811 const this_type* src = dynamic_cast<const this_type* > (&source);
1812 if (src == NULL) {
1813 std::ostream& err = this->markLocalErrorAndGetStream ();
1814 err << prefix
1815 << "The source (input) object of the Import or "
1816 "Export is either not a BlockCrsMatrix, or does not have the right "
1817 "template parameters. checkSizes() should have caught this. "
1818 "Please report this bug to the Tpetra developers." << endl;
1819 return;
1820 }
1821
1822 bool lclErr = false;
1823#ifdef HAVE_TPETRA_DEBUG
1824 std::set<LO> invalidSrcCopyRows;
1825 std::set<LO> invalidDstCopyRows;
1826 std::set<LO> invalidDstCopyCols;
1827 std::set<LO> invalidDstPermuteCols;
1828 std::set<LO> invalidPermuteFromRows;
1829#endif // HAVE_TPETRA_DEBUG
1830
1831 // Copy the initial sequence of rows that are the same.
1832 //
1833 // The two graphs might have different column Maps, so we need to
1834 // do this using global column indices. This is purely local, so
1835 // we only need to check for local sameness of the two column
1836 // Maps.
1837
1838#ifdef HAVE_TPETRA_DEBUG
1839 const map_type& srcRowMap = * (src->graph_.getRowMap ());
1840#endif // HAVE_TPETRA_DEBUG
1841 const map_type& dstRowMap = * (this->graph_.getRowMap ());
1842 const map_type& srcColMap = * (src->graph_.getColMap ());
1843 const map_type& dstColMap = * (this->graph_.getColMap ());
1844 const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1845
1846 const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1847 if (verbose) {
1848 std::ostringstream os;
1849 os << prefix
1850 << "canUseLocalColumnIndices: "
1851 << (canUseLocalColumnIndices ? "true" : "false")
1852 << ", numPermute: " << numPermute
1853 << endl;
1854 std::cerr << os.str ();
1855 }
1856
1857 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1858 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1859
1860 if (canUseLocalColumnIndices) {
1861 // Copy local rows that are the "same" in both source and target.
1862 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1863#ifdef HAVE_TPETRA_DEBUG
1864 if (! srcRowMap.isNodeLocalElement (localRow)) {
1865 lclErr = true;
1866 invalidSrcCopyRows.insert (localRow);
1867 continue; // skip invalid rows
1868 }
1869#endif // HAVE_TPETRA_DEBUG
1870
1871 local_inds_host_view_type lclSrcCols;
1872 values_host_view_type vals;
1873 LO numEntries;
1874 // If this call fails, that means the mesh row local index is
1875 // invalid. That means the Import or Export is invalid somehow.
1876 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1877 if (numEntries > 0) {
1878 LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1879 if (err != numEntries) {
1880 lclErr = true;
1881 if (! dstRowMap.isNodeLocalElement (localRow)) {
1882#ifdef HAVE_TPETRA_DEBUG
1883 invalidDstCopyRows.insert (localRow);
1884#endif // HAVE_TPETRA_DEBUG
1885 }
1886 else {
1887 // Once there's an error, there's no sense in saving
1888 // time, so we check whether the column indices were
1889 // invalid. However, only remember which ones were
1890 // invalid in a debug build, because that might take a
1891 // lot of space.
1892 for (LO k = 0; k < numEntries; ++k) {
1893 if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1894 lclErr = true;
1895#ifdef HAVE_TPETRA_DEBUG
1896 (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1897#endif // HAVE_TPETRA_DEBUG
1898 }
1899 }
1900 }
1901 }
1902 }
1903 } // for each "same" local row
1904
1905 // Copy the "permute" local rows.
1906 for (size_t k = 0; k < numPermute; ++k) {
1907 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1908 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1909
1910 local_inds_host_view_type lclSrcCols;
1911 values_host_view_type vals;
1912 LO numEntries;
1913 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1914 if (numEntries > 0) {
1915 LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1916 if (err != numEntries) {
1917 lclErr = true;
1918#ifdef HAVE_TPETRA_DEBUG
1919 for (LO c = 0; c < numEntries; ++c) {
1920 if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1921 invalidDstPermuteCols.insert (lclSrcCols[c]);
1922 }
1923 }
1924#endif // HAVE_TPETRA_DEBUG
1925 }
1926 }
1927 }
1928 }
1929 else { // must convert column indices to global
1930 // Reserve space to store the destination matrix's local column indices.
1931 const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
1932 Teuchos::Array<LO> lclDstCols (maxNumEnt);
1933
1934 // Copy local rows that are the "same" in both source and target.
1935 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1936 local_inds_host_view_type lclSrcCols;
1937 values_host_view_type vals;
1938 LO numEntries;
1939
1940 // If this call fails, that means the mesh row local index is
1941 // invalid. That means the Import or Export is invalid somehow.
1942 try {
1943 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1944 } catch (std::exception& e) {
1945 if (debug) {
1946 std::ostringstream os;
1947 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1948 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1949 << localRow << ", src->getLocalRowView() threw an exception: "
1950 << e.what ();
1951 std::cerr << os.str ();
1952 }
1953 throw e;
1954 }
1955
1956 if (numEntries > 0) {
1957 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1958 lclErr = true;
1959 if (debug) {
1960 std::ostringstream os;
1961 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1962 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1963 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1964 << maxNumEnt << endl;
1965 std::cerr << os.str ();
1966 }
1967 }
1968 else {
1969 // Convert the source matrix's local column indices to the
1970 // destination matrix's local column indices.
1971 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1972 for (LO j = 0; j < numEntries; ++j) {
1973 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1974 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1975 lclErr = true;
1976#ifdef HAVE_TPETRA_DEBUG
1977 invalidDstCopyCols.insert (lclDstColsView[j]);
1978#endif // HAVE_TPETRA_DEBUG
1979 }
1980 }
1981 LO err(0);
1982 try {
1983 err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1984 } catch (std::exception& e) {
1985 if (debug) {
1986 std::ostringstream os;
1987 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1988 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1989 << localRow << ", this->replaceLocalValues() threw an exception: "
1990 << e.what ();
1991 std::cerr << os.str ();
1992 }
1993 throw e;
1994 }
1995 if (err != numEntries) {
1996 lclErr = true;
1997 if (debug) {
1998 std::ostringstream os;
1999 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2000 os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2001 "localRow " << localRow << ", this->replaceLocalValues "
2002 "returned " << err << " instead of numEntries = "
2003 << numEntries << endl;
2004 std::cerr << os.str ();
2005 }
2006 }
2007 }
2008 }
2009 }
2010
2011 // Copy the "permute" local rows.
2012 for (size_t k = 0; k < numPermute; ++k) {
2013 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2014 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2015
2016 local_inds_host_view_type lclSrcCols;
2017 values_host_view_type vals;
2018 LO numEntries;
2019
2020 try {
2021 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
2022 } catch (std::exception& e) {
2023 if (debug) {
2024 std::ostringstream os;
2025 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2026 os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2027 "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2028 << ", src->getLocalRowView() threw an exception: " << e.what ();
2029 std::cerr << os.str ();
2030 }
2031 throw e;
2032 }
2033
2034 if (numEntries > 0) {
2035 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2036 lclErr = true;
2037 }
2038 else {
2039 // Convert the source matrix's local column indices to the
2040 // destination matrix's local column indices.
2041 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2042 for (LO j = 0; j < numEntries; ++j) {
2043 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2044 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2045 lclErr = true;
2046#ifdef HAVE_TPETRA_DEBUG
2047 invalidDstPermuteCols.insert (lclDstColsView[j]);
2048#endif // HAVE_TPETRA_DEBUG
2049 }
2050 }
2051 LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
2052 if (err != numEntries) {
2053 lclErr = true;
2054 }
2055 }
2056 }
2057 }
2058 }
2059
2060 if (lclErr) {
2061 std::ostream& err = this->markLocalErrorAndGetStream ();
2062#ifdef HAVE_TPETRA_DEBUG
2063 err << "copyAndPermute: The graph structure of the source of the "
2064 "Import or Export must be a subset of the graph structure of the "
2065 "target. ";
2066 err << "invalidSrcCopyRows = [";
2067 for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2068 it != invalidSrcCopyRows.end (); ++it) {
2069 err << *it;
2070 typename std::set<LO>::const_iterator itp1 = it;
2071 itp1++;
2072 if (itp1 != invalidSrcCopyRows.end ()) {
2073 err << ",";
2074 }
2075 }
2076 err << "], invalidDstCopyRows = [";
2077 for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2078 it != invalidDstCopyRows.end (); ++it) {
2079 err << *it;
2080 typename std::set<LO>::const_iterator itp1 = it;
2081 itp1++;
2082 if (itp1 != invalidDstCopyRows.end ()) {
2083 err << ",";
2084 }
2085 }
2086 err << "], invalidDstCopyCols = [";
2087 for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2088 it != invalidDstCopyCols.end (); ++it) {
2089 err << *it;
2090 typename std::set<LO>::const_iterator itp1 = it;
2091 itp1++;
2092 if (itp1 != invalidDstCopyCols.end ()) {
2093 err << ",";
2094 }
2095 }
2096 err << "], invalidDstPermuteCols = [";
2097 for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2098 it != invalidDstPermuteCols.end (); ++it) {
2099 err << *it;
2100 typename std::set<LO>::const_iterator itp1 = it;
2101 itp1++;
2102 if (itp1 != invalidDstPermuteCols.end ()) {
2103 err << ",";
2104 }
2105 }
2106 err << "], invalidPermuteFromRows = [";
2107 for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2108 it != invalidPermuteFromRows.end (); ++it) {
2109 err << *it;
2110 typename std::set<LO>::const_iterator itp1 = it;
2111 itp1++;
2112 if (itp1 != invalidPermuteFromRows.end ()) {
2113 err << ",";
2114 }
2115 }
2116 err << "]" << endl;
2117#else
2118 err << "copyAndPermute: The graph structure of the source of the "
2119 "Import or Export must be a subset of the graph structure of the "
2120 "target." << endl;
2121#endif // HAVE_TPETRA_DEBUG
2122 }
2123
2124 if (debug) {
2125 std::ostringstream os;
2126 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2127 const bool lclSuccess = ! (* (this->localError_));
2128 os << "*** Proc " << myRank << ": copyAndPermute "
2129 << (lclSuccess ? "succeeded" : "FAILED");
2130 if (lclSuccess) {
2131 os << endl;
2132 } else {
2133 os << ": error messages: " << this->errorMessages (); // comes w/ endl
2134 }
2135 std::cerr << os.str ();
2136 }
2137 }
2138
2139 namespace { // (anonymous)
2140
2159 template<class LO, class GO>
2160 size_t
2161 packRowCount (const size_t numEnt,
2162 const size_t numBytesPerValue,
2163 const size_t blkSize)
2164 {
2165 using ::Tpetra::Details::PackTraits;
2166
2167 if (numEnt == 0) {
2168 // Empty rows always take zero bytes, to ensure sparsity.
2169 return 0;
2170 }
2171 else {
2172 // We store the number of entries as a local index (LO).
2173 LO numEntLO = 0; // packValueCount wants this.
2174 GO gid {};
2175 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2176 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2177 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2178 return numEntLen + gidsLen + valsLen;
2179 }
2180 }
2181
2192 template<class ST, class LO, class GO>
2193 size_t
2194 unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2195 const size_t offset,
2196 const size_t numBytes,
2197 const size_t /* numBytesPerValue */)
2198 {
2199 using Kokkos::subview;
2200 using ::Tpetra::Details::PackTraits;
2201
2202 if (numBytes == 0) {
2203 // Empty rows always take zero bytes, to ensure sparsity.
2204 return static_cast<size_t> (0);
2205 }
2206 else {
2207 LO numEntLO = 0;
2208 const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
2209 TEUCHOS_TEST_FOR_EXCEPTION
2210 (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2211 "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2212 << ".");
2213 const char* const inBuf = imports.data () + offset;
2214 const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2215 TEUCHOS_TEST_FOR_EXCEPTION
2216 (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2217 "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2218 << ".");
2219 return static_cast<size_t> (numEntLO);
2220 }
2221 }
2222
2226 template<class ST, class LO, class GO>
2227 size_t
2228 packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2229 const size_t offset,
2230 const size_t numEnt,
2231 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2232 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2233 const size_t numBytesPerValue,
2234 const size_t blockSize)
2235 {
2236 using Kokkos::subview;
2237 using ::Tpetra::Details::PackTraits;
2238
2239 if (numEnt == 0) {
2240 // Empty rows always take zero bytes, to ensure sparsity.
2241 return 0;
2242 }
2243 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2244
2245 const GO gid = 0; // packValueCount wants this
2246 const LO numEntLO = static_cast<size_t> (numEnt);
2247
2248 const size_t numEntBeg = offset;
2249 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2250 const size_t gidsBeg = numEntBeg + numEntLen;
2251 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2252 const size_t valsBeg = gidsBeg + gidsLen;
2253 const size_t valsLen = numScalarEnt * numBytesPerValue;
2254
2255 char* const numEntOut = exports.data () + numEntBeg;
2256 char* const gidsOut = exports.data () + gidsBeg;
2257 char* const valsOut = exports.data () + valsBeg;
2258
2259 size_t numBytesOut = 0;
2260 int errorCode = 0;
2261 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2262
2263 {
2264 Kokkos::pair<int, size_t> p;
2265 p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2266 errorCode += p.first;
2267 numBytesOut += p.second;
2268
2269 p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2270 errorCode += p.first;
2271 numBytesOut += p.second;
2272 }
2273
2274 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2275 TEUCHOS_TEST_FOR_EXCEPTION
2276 (numBytesOut != expectedNumBytes, std::logic_error,
2277 "packRowForBlockCrs: numBytesOut = " << numBytesOut
2278 << " != expectedNumBytes = " << expectedNumBytes << ".");
2279
2280 TEUCHOS_TEST_FOR_EXCEPTION
2281 (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2282 "PackTraits::packArray returned a nonzero error code");
2283
2284 return numBytesOut;
2285 }
2286
  // Return the number of bytes actually read / used.
  //
  // Inverse of packRowForBlockCrs: decode one packed block row
  // ([entry count as LO][global column ids][scalar values]) from
  // 'imports' starting at byte 'offset', writing the column indices
  // into gidsOut and the values into valsOut.  'numEnt' is the
  // expected entry count (from unpackRowCount); 'numBytes' is the
  // row's total byte length.  Throws std::logic_error on any layout
  // mismatch, std::runtime_error if PackTraits reports an error.
  template<class ST, class LO, class GO>
  size_t
  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
                        const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
                        const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
                        const size_t offset,
                        const size_t numBytes,
                        const size_t numEnt,
                        const size_t numBytesPerValue,
                        const size_t blockSize)
  {
    using ::Tpetra::Details::PackTraits;

    if (numBytes == 0) {
      // Rows with zero bytes always have zero entries.
      return 0;
    }
    // Each block entry carries blockSize x blockSize scalar values.
    const size_t numScalarEnt = numEnt * blockSize * blockSize;
    // Bounds checks: the packed row must lie entirely within 'imports'.
    TEUCHOS_TEST_FOR_EXCEPTION
      (static_cast<size_t> (imports.extent (0)) <= offset,
       std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
       << imports.extent (0) << " <= offset = " << offset << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
       std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
       << imports.extent (0) << " < offset + numBytes = "
       << (offset + numBytes) << ".");

    const GO gid = 0; // packValueCount wants this
    const LO lid = 0; // packValueCount wants this

    // Recompute the same byte layout that packRowForBlockCrs used.
    const size_t numEntBeg = offset;
    const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
    const size_t gidsBeg = numEntBeg + numEntLen;
    const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
    const size_t valsBeg = gidsBeg + gidsLen;
    const size_t valsLen = numScalarEnt * numBytesPerValue;

    const char* const numEntIn = imports.data () + numEntBeg;
    const char* const gidsIn = imports.data () + gidsBeg;
    const char* const valsIn = imports.data () + valsBeg;

    size_t numBytesOut = 0;
    int errorCode = 0;
    LO numEntOut;
    numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
    // The stored entry count must agree with the caller's expectation.
    TEUCHOS_TEST_FOR_EXCEPTION
      (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
       "unpackRowForBlockCrs: Expected number of entries " << numEnt
       << " != actual number of entries " << numEntOut << ".");

    {
      Kokkos::pair<int, size_t> p;
      p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
      errorCode += p.first;
      numBytesOut += p.second;

      p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
      errorCode += p.first;
      numBytesOut += p.second;
    }

    // Self-checks: bytes consumed must match both the caller-supplied
    // length and the recomputed layout.
    TEUCHOS_TEST_FOR_EXCEPTION
      (numBytesOut != numBytes, std::logic_error,
       "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
       << " != numBytes = " << numBytes << ".");

    const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
    TEUCHOS_TEST_FOR_EXCEPTION
      (numBytesOut != expectedNumBytes, std::logic_error,
       "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
       << " != expectedNumBytes = " << expectedNumBytes << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
       "PackTraits::unpackArray returned a nonzero error code");

    return numBytesOut;
  }
2367 } // namespace (anonymous)
2368
2369 template<class Scalar, class LO, class GO, class Node>
2370 void
2373 (const ::Tpetra::SrcDistObject& source,
2374 const Kokkos::DualView<const local_ordinal_type*,
2375 buffer_device_type>& exportLIDs,
2376 Kokkos::DualView<packet_type*,
2377 buffer_device_type>& exports, // output
2378 Kokkos::DualView<size_t*,
2379 buffer_device_type> numPacketsPerLID, // output
2380 size_t& constantNumPackets)
2381 {
2382 using ::Tpetra::Details::Behavior;
2384 using ::Tpetra::Details::ProfilingRegion;
2385 using ::Tpetra::Details::PackTraits;
2386
2387 typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2388
2390
2391 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2392
2393 const bool debug = Behavior::debug();
2394 const bool verbose = Behavior::verbose();
2395
2396 // Define this function prefix
2397 std::string prefix;
2398 {
2399 std::ostringstream os;
2400 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2401 os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2402 prefix = os.str();
2403 }
2404
2405 // check if this already includes a local error
2406 if (* (this->localError_)) {
2407 std::ostream& err = this->markLocalErrorAndGetStream ();
2408 err << prefix
2409 << "The target object of the Import or Export is already in an error state."
2410 << std::endl;
2411 return;
2412 }
2413
2414 //
2415 // Verbose input dual view status
2416 //
2417 if (verbose) {
2418 std::ostringstream os;
2419 os << prefix << std::endl
2420 << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2421 << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2422 << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2423 std::cerr << os.str ();
2424 }
2425
2429 if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2430 std::ostream& err = this->markLocalErrorAndGetStream ();
2431 err << prefix
2432 << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2433 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2434 << "." << std::endl;
2435 return;
2436 }
2437 if (exportLIDs.need_sync_host ()) {
2438 std::ostream& err = this->markLocalErrorAndGetStream ();
2439 err << prefix << "exportLIDs be sync'd to host." << std::endl;
2440 return;
2441 }
2442
2443 const this_type* src = dynamic_cast<const this_type* > (&source);
2444 if (src == NULL) {
2445 std::ostream& err = this->markLocalErrorAndGetStream ();
2446 err << prefix
2447 << "The source (input) object of the Import or "
2448 "Export is either not a BlockCrsMatrix, or does not have the right "
2449 "template parameters. checkSizes() should have caught this. "
2450 "Please report this bug to the Tpetra developers." << std::endl;
2451 return;
2452 }
2453
2454 // Graphs and matrices are allowed to have a variable number of
2455 // entries per row. We could test whether all rows have the same
2456 // number of entries, but DistObject can only use this
2457 // optimization if all rows on _all_ processes have the same
2458 // number of entries. Rather than do the all-reduce necessary to
2459 // test for this unlikely case, we tell DistObject (by setting
2460 // constantNumPackets to zero) to assume that different rows may
2461 // have different numbers of entries.
2462 constantNumPackets = 0;
2463
2464 // const values
2465 const crs_graph_type& srcGraph = src->graph_;
2466 const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2467 const size_t numExportLIDs = exportLIDs.extent (0);
2468 size_t numBytesPerValue(0);
2469 {
2470 auto val_host = val_.getHostView(Access::ReadOnly);
2471 numBytesPerValue =
2472 PackTraits<impl_scalar_type>
2473 ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2474 }
2475
2476 // Compute the number of bytes ("packets") per row to pack. While
2477 // we're at it, compute the total # of block entries to send, and
2478 // the max # of block entries in any of the rows we're sending.
2479
2480 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2481
2482 // Graph information is on host; let's do this on host parallel reduce
2483 auto exportLIDsHost = exportLIDs.view_host();
2484 auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2485 numPacketsPerLID.modify_host ();
2486 {
2487 using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2488 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2489 Kokkos::parallel_reduce
2490 (policy,
2491 [=](const size_t &i, typename reducer_type::value_type &update) {
2492 const LO lclRow = exportLIDsHost(i);
2493 size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2494 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2495
2496 const size_t numBytes =
2497 packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2498 numPacketsPerLIDHost(i) = numBytes;
2499 update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2500 }, rowReducerStruct);
2501 }
2502
2503 // Compute the number of bytes ("packets") per row to pack. While
2504 // we're at it, compute the total # of block entries to send, and
2505 // the max # of block entries in any of the rows we're sending.
2506 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2507 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2508 const size_t maxRowLength = rowReducerStruct.maxRowLength;
2509
2510 if (verbose) {
2511 std::ostringstream os;
2512 os << prefix
2513 << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2514 << std::endl;
2515 std::cerr << os.str ();
2516 }
2517
2518 // We use a "struct of arrays" approach to packing each row's
2519 // entries. All the column indices (as global indices) go first,
2520 // then all their owning process ranks, and then the values.
2521 if(exports.extent(0) != totalNumBytes)
2522 {
2523 const std::string oldLabel = exports.d_view.label ();
2524 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2525 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2526 }
2527 if (totalNumEntries > 0) {
2528 // Current position (in bytes) in the 'exports' output array.
2529 Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2530 {
2531 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2532 Kokkos::parallel_scan
2533 (policy,
2534 [=](const size_t &i, size_t &update, const bool &final) {
2535 if (final) offset(i) = update;
2536 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2537 });
2538 }
2539 if (offset(numExportLIDs) != totalNumBytes) {
2540 std::ostream& err = this->markLocalErrorAndGetStream ();
2541 err << prefix
2542 << "At end of method, the final offset (in bytes) "
2543 << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2544 << totalNumBytes << ". "
2545 << "Please report this bug to the Tpetra developers." << std::endl;
2546 return;
2547 }
2548
2549 // For each block row of the matrix owned by the calling
2550 // process, pack that block row's column indices and values into
2551 // the exports array.
2552
2553 // Source matrix's column Map. We verified in checkSizes() that
2554 // the column Map exists (is not null).
2555 const map_type& srcColMap = * (srcGraph.getColMap ());
2556
2557 // Pack the data for each row to send, into the 'exports' buffer.
2558 // exports will be modified on host.
2559 exports.modify_host();
2560 {
2561 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2562 const auto policy =
2563 policy_type(numExportLIDs, 1, 1)
2564 .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2565 Kokkos::parallel_for
2566 (policy,
2567 [=](const typename policy_type::member_type &member) {
2568 const size_t i = member.league_rank();
2569 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2570 gblColInds(member.team_scratch(0), maxRowLength);
2571
2572 const LO lclRowInd = exportLIDsHost(i);
2573 local_inds_host_view_type lclColInds;
2574 values_host_view_type vals;
2575
2576 // It's OK to ignore the return value, since if the calling
2577 // process doesn't own that local row, then the number of
2578 // entries in that row on the calling process is zero.
2579 src->getLocalRowView (lclRowInd, lclColInds, vals);
2580 const size_t numEnt = lclColInds.extent(0);
2581
2582 // Convert column indices from local to global.
2583 for (size_t j = 0; j < numEnt; ++j)
2584 gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2585
2586 // Kyungjoo: additional wrapping scratch view is necessary
2587 // the following function interface need the same execution space
2588 // host scratch space somehow is not considered same as the host_exec
2589 // Copy the row's data into the current spot in the exports array.
2590 const size_t numBytes =
2591 packRowForBlockCrs<impl_scalar_type, LO, GO>
2592 (exports.view_host(),
2593 offset(i),
2594 numEnt,
2595 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2596 vals,
2597 numBytesPerValue,
2598 blockSize);
2599
2600 // numBytes should be same as the difference between offsets
2601 if (debug) {
2602 const size_t offsetDiff = offset(i+1) - offset(i);
2603 if (numBytes != offsetDiff) {
2604 std::ostringstream os;
2605 os << prefix
2606 << "numBytes computed from packRowForBlockCrs is different from "
2607 << "precomputed offset values, LID = " << i << std::endl;
2608 std::cerr << os.str ();
2609 }
2610 }
2611 }); // for each LID (of a row) to send
2612 }
2613 } // if totalNumEntries > 0
2614
2615 if (debug) {
2616 std::ostringstream os;
2617 const bool lclSuccess = ! (* (this->localError_));
2618 os << prefix
2619 << (lclSuccess ? "succeeded" : "FAILED")
2620 << std::endl;
2621 std::cerr << os.str ();
2622 }
2623 }
2624
// Unpack the received import buffer and combine the incoming block rows
// into this matrix on the calling process, per the given CombineMode.
// All work happens on host (see the FIXME/NOTE comments below); errors
// are recorded via markLocalErrorAndGetStream() instead of being thrown,
// because this runs inside DistObject's doTransfer path.
2625 template<class Scalar, class LO, class GO, class Node>
2626 void
2629 (const Kokkos::DualView<const local_ordinal_type*,
2630 buffer_device_type>& importLIDs,
2631 Kokkos::DualView<packet_type*,
2632 buffer_device_type> imports,
2633 Kokkos::DualView<size_t*,
2634 buffer_device_type> numPacketsPerLID,
2635 const size_t /* constantNumPackets */,
2636 const CombineMode combineMode)
2637 {
2638 using ::Tpetra::Details::Behavior;
2640 using ::Tpetra::Details::ProfilingRegion;
2641 using ::Tpetra::Details::PackTraits;
2642 using std::endl;
2643 using host_exec =
2644 typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2645 
2646 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2647 const bool verbose = Behavior::verbose ();
2648 
2649 // Define this function prefix
2650 std::string prefix;
2651 {
2652 std::ostringstream os;
2653 auto map = this->graph_.getRowMap ();
2654 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2655 const int myRank = comm.is_null () ? -1 : comm->getRank ();
2656 os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2657 prefix = os.str ();
2658 if (verbose) {
2659 os << "Start" << endl;
2660 std::cerr << os.str ();
2661 }
2662 }
2663 
2664 // check if this already includes a local error
2665 if (* (this->localError_)) {
2666 std::ostream& err = this->markLocalErrorAndGetStream ();
2667 std::ostringstream os;
2668 os << prefix << "{Im/Ex}port target is already in error." << endl;
2669 if (verbose) {
2670 std::cerr << os.str ();
2671 }
2672 err << os.str ();
2673 return;
2674 }
2675 
// Input sanity check: one packet count per import LID.
2679 if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2680 std::ostream& err = this->markLocalErrorAndGetStream ();
2681 std::ostringstream os;
2682 os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2683 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2684 << "." << endl;
2685 if (verbose) {
2686 std::cerr << os.str ();
2687 }
2688 err << os.str ();
2689 return;
2690 }
2691 
2692 if (combineMode != ADD && combineMode != INSERT &&
2693 combineMode != REPLACE && combineMode != ABSMAX &&
2694 combineMode != ZERO) {
2695 std::ostream& err = this->markLocalErrorAndGetStream ();
2696 std::ostringstream os;
2697 os << prefix
2698 << "Invalid CombineMode value " << combineMode << ". Valid "
2699 << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2700 << std::endl;
2701 if (verbose) {
2702 std::cerr << os.str ();
2703 }
2704 err << os.str ();
2705 return;
2706 }
2707 
2708 if (this->graph_.getColMap ().is_null ()) {
2709 std::ostream& err = this->markLocalErrorAndGetStream ();
2710 std::ostringstream os;
2711 os << prefix << "Target matrix's column Map is null." << endl;
2712 if (verbose) {
2713 std::cerr << os.str ();
2714 }
2715 err << os.str ();
2716 return;
2717 }
2718 
2719 // Target matrix's column Map. Use to convert the global column
2720 // indices in the receive buffer to local indices. We verified in
2721 // checkSizes() that the column Map exists (is not null).
2722 const map_type& tgtColMap = * (this->graph_.getColMap ());
2723 
2724 // Const values
2725 const size_t blockSize = this->getBlockSize ();
2726 const size_t numImportLIDs = importLIDs.extent(0);
2727 // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2728 // default-constructed instance could have a different size than
2729 // other instances. (We assume all nominally constructed
2730 // instances have the same size; that's not the issue here.) This
2731 // could be bad if the calling process has no entries, but other
2732 // processes have entries that they want to send to us.
2733 size_t numBytesPerValue(0);
2734 {
2735 auto val_host = val_.getHostView(Access::ReadOnly);
2736 numBytesPerValue =
2737 PackTraits<impl_scalar_type>::packValueCount
2738 (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2739 }
2740 const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
2741 const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2742 
2743 if (verbose) {
2744 std::ostringstream os;
2745 os << prefix << "combineMode: "
2746 << ::Tpetra::combineModeToString (combineMode)
2747 << ", blockSize: " << blockSize
2748 << ", numImportLIDs: " << numImportLIDs
2749 << ", numBytesPerValue: " << numBytesPerValue
2750 << ", maxRowNumEnt: " << maxRowNumEnt
2751 << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2752 std::cerr << os.str ();
2753 }
2754 
2755 if (combineMode == ZERO || numImportLIDs == 0) {
2756 if (verbose) {
2757 std::ostringstream os;
2758 os << prefix << "Nothing to unpack. Done!" << std::endl;
2759 std::cerr << os.str ();
2760 }
2761 return;
2762 }
2763 
2764 // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2765 // we can remove this sync.
2766 if (imports.need_sync_host ()) {
2767 imports.sync_host ();
2768 }
2769 
2770 // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2771 // sync'd numPacketsPerLID to host, since it needs to do that in
2772 // order to post MPI_Irecv messages with the correct lengths. A
2773 // hypothetical device-specific boundary exchange implementation
2774 // could possibly receive data without sync'ing lengths to host,
2775 // but we don't need to design for that nonexistent thing yet.
2776 
2777 if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2778 importLIDs.need_sync_host ()) {
2779 std::ostream& err = this->markLocalErrorAndGetStream ();
2780 std::ostringstream os;
2781 os << prefix << "All input DualViews must be sync'd to host by now. "
2782 << "imports_nc.need_sync_host()="
2783 << (imports.need_sync_host () ? "true" : "false")
2784 << ", numPacketsPerLID.need_sync_host()="
2785 << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2786 << ", importLIDs.need_sync_host()="
2787 << (importLIDs.need_sync_host () ? "true" : "false")
2788 << "." << endl;
2789 if (verbose) {
2790 std::cerr << os.str ();
2791 }
2792 err << os.str ();
2793 return;
2794 }
2795 
2796 const auto importLIDsHost = importLIDs.view_host ();
2797 const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2798 
2799 // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2800 // loop, by only unpacking on final==true (when we know the
2801 // current offset's value).
2802 
// Exclusive prefix sum of per-LID byte counts: offset(i) is the byte
// offset of LID i's data in the imports buffer; offset(numImportLIDs)
// is the total byte count.
2803 Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2804 {
2805 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2806 Kokkos::parallel_scan
2807 ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2808 [=] (const size_t &i, size_t &update, const bool &final) {
2809 if (final) offset(i) = update;
2810 update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2811 });
2812 }
2813 
2814 // this variable does not matter with a race condition (just error flag)
2815 //
2816 // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2817 // or atomics on bool, so we use int instead. (I know we're not
2818 // launching a CUDA loop here, but we could in the future, and I'd
2819 // like to avoid potential trouble.)
2820 Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2821 errorDuringUnpack ("errorDuringUnpack");
2822 errorDuringUnpack () = 0;
2823 {
2824 using policy_type = Kokkos::TeamPolicy<host_exec>;
// One team per import LID; per-team scratch holds the unpacked GIDs,
// their local translations, and the row's scalar values.
2825 const auto policy = policy_type (numImportLIDs, 1, 1)
2826 .set_scratch_size (0, Kokkos::PerTeam (sizeof (GO) * maxRowNumEnt +
2827 sizeof (LO) * maxRowNumEnt +
2828 numBytesPerValue * maxRowNumScalarEnt));
2829 using host_scratch_space = typename host_exec::scratch_memory_space;
2830 using pair_type = Kokkos::pair<size_t, size_t>;
2831 Kokkos::parallel_for
2832 ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2833 [=] (const typename policy_type::member_type& member) {
2834 const size_t i = member.league_rank();
2835 
2836 Kokkos::View<GO*, host_scratch_space> gblColInds
2837 (member.team_scratch (0), maxRowNumEnt);
2838 Kokkos::View<LO*, host_scratch_space> lclColInds
2839 (member.team_scratch (0), maxRowNumEnt);
2840 Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2841 (member.team_scratch (0), maxRowNumScalarEnt);
2842 
2843 const size_t offval = offset(i);
2844 const LO lclRow = importLIDsHost(i);
2845 const size_t numBytes = numPacketsPerLIDHost(i);
2846 const size_t numEnt =
2847 unpackRowCount<impl_scalar_type, LO, GO>
2848 (imports.view_host (), offval, numBytes, numBytesPerValue);
2849 
2850 if (numBytes > 0) {
2851 if (numEnt > maxRowNumEnt) {
2852 errorDuringUnpack() = 1;
2853 if (verbose) {
2854 std::ostringstream os;
2855 os << prefix
2856 << "At i = " << i << ", numEnt = " << numEnt
2857 << " > maxRowNumEnt = " << maxRowNumEnt
2858 << std::endl;
2859 std::cerr << os.str ();
2860 }
2861 return;
2862 }
2863 }
2864 const size_t numScalarEnt = numEnt*blockSize*blockSize;
2865 auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2866 auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2867 auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2868 
2869 // Kyungjoo: additional wrapping scratch view is necessary
2870 // the following function interface need the same execution space
2871 // host scratch space somehow is not considered same as the host_exec
2872 size_t numBytesOut = 0;
2873 try {
2874 numBytesOut =
2875 unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2876 (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2877 Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2878 imports.view_host(),
2879 offval, numBytes, numEnt,
2880 numBytesPerValue, blockSize);
2881 }
2882 catch (std::exception& e) {
2883 errorDuringUnpack() = 1;
2884 if (verbose) {
2885 std::ostringstream os;
2886 os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2887 << e.what () << endl;
2888 std::cerr << os.str ();
2889 }
2890 return;
2891 }
2892 
2893 if (numBytes != numBytesOut) {
2894 errorDuringUnpack() = 1;
2895 if (verbose) {
2896 std::ostringstream os;
2897 os << prefix
2898 << "At i = " << i << ", numBytes = " << numBytes
2899 << " != numBytesOut = " << numBytesOut << "."
2900 << std::endl;
2901 std::cerr << os.str();
2902 }
2903 return;
2904 }
2905 
2906 // Convert incoming global indices to local indices.
2907 for (size_t k = 0; k < numEnt; ++k) {
2908 lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2909 if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
// NOTE(review): unlike every other failure path in this kernel, this
// branch returns WITHOUT setting errorDuringUnpack() = 1, so a GID
// missing from the target column Map is reported only in verbose
// mode and the transfer is not marked as failed -- confirm whether
// the error flag should be set here as well.
2910 if (verbose) {
2911 std::ostringstream os;
2912 os << prefix
2913 << "At i = " << i << ", GID " << gidsOut(k)
2914 << " is not owned by the calling process."
2915 << std::endl;
2916 std::cerr << os.str();
2917 }
2918 return;
2919 }
2920 }
2921 
2922 // Combine the incoming data with the matrix's current data.
2923 LO numCombd = 0;
2924 const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2925 const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2926 (const_cast<const impl_scalar_type*> (valsOut.data ()));
2927 if (combineMode == ADD) {
2928 numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2929 } else if (combineMode == INSERT || combineMode == REPLACE) {
2930 numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2931 } else if (combineMode == ABSMAX) {
2932 numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2933 }
2934 
2935 if (static_cast<LO> (numEnt) != numCombd) {
2936 errorDuringUnpack() = 1;
2937 if (verbose) {
2938 std::ostringstream os;
2939 os << prefix << "At i = " << i << ", numEnt = " << numEnt
2940 << " != numCombd = " << numCombd << "."
2941 << endl;
2942 std::cerr << os.str ();
2943 }
2944 return;
2945 }
2946 }); // for each import LID i
2947 }
2948 
2949 if (errorDuringUnpack () != 0) {
2950 std::ostream& err = this->markLocalErrorAndGetStream ();
2951 err << prefix << "Unpacking failed.";
2952 if (! verbose) {
2953 err << " Please run again with the environment variable setting "
2954 "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2955 }
2956 err << endl;
2957 }
2958 
2959 if (verbose) {
2960 std::ostringstream os;
2961 const bool lclSuccess = ! (* (this->localError_));
2962 os << prefix
2963 << (lclSuccess ? "succeeded" : "FAILED")
2964 << std::endl;
2965 std::cerr << os.str ();
2966 }
2967 }
2968
2969 template<class Scalar, class LO, class GO, class Node>
2970 std::string
2972 {
2973 using Teuchos::TypeNameTraits;
2974 std::ostringstream os;
2975 os << "\"Tpetra::BlockCrsMatrix\": { "
2976 << "Template parameters: { "
2977 << "Scalar: " << TypeNameTraits<Scalar>::name ()
2978 << "LO: " << TypeNameTraits<LO>::name ()
2979 << "GO: " << TypeNameTraits<GO>::name ()
2980 << "Node: " << TypeNameTraits<Node>::name ()
2981 << " }"
2982 << ", Label: \"" << this->getObjectLabel () << "\""
2983 << ", Global dimensions: ["
2984 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2985 << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2986 << ", Block size: " << getBlockSize ()
2987 << " }";
2988 return os.str ();
2989 }
2990
2991
// Print a description of this matrix to the given FancyOStream at the
// requested verbosity.  At VERB_EXTREME each process formats its rows
// into a local string, then those strings are funneled to Process 0
// one rank at a time (size message, then data message), so the output
// appears in rank order.  Message ordering below is load-bearing.
2992 template<class Scalar, class LO, class GO, class Node>
2993 void
2995 describe (Teuchos::FancyOStream& out,
2996 const Teuchos::EVerbosityLevel verbLevel) const
2997 {
2998 using Teuchos::ArrayRCP;
2999 using Teuchos::CommRequest;
3000 using Teuchos::FancyOStream;
3001 using Teuchos::getFancyOStream;
3002 using Teuchos::ireceive;
3003 using Teuchos::isend;
3004 using Teuchos::outArg;
3005 using Teuchos::TypeNameTraits;
3006 using Teuchos::VERB_DEFAULT;
3007 using Teuchos::VERB_NONE;
3008 using Teuchos::VERB_LOW;
3009 using Teuchos::VERB_MEDIUM;
3010 // using Teuchos::VERB_HIGH;
3011 using Teuchos::VERB_EXTREME;
3012 using Teuchos::RCP;
3013 using Teuchos::wait;
3014 using std::endl;
3015 
3016 // Set default verbosity if applicable.
3017 const Teuchos::EVerbosityLevel vl =
3018 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3019 
3020 if (vl == VERB_NONE) {
3021 return; // print nothing
3022 }
3023 
3024 // describe() always starts with a tab before it prints anything.
3025 Teuchos::OSTab tab0 (out);
3026 
3027 out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3028 Teuchos::OSTab tab1 (out);
3029 
3030 out << "Template parameters:" << endl;
3031 {
3032 Teuchos::OSTab tab2 (out);
3033 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3034 << "LO: " << TypeNameTraits<LO>::name () << endl
3035 << "GO: " << TypeNameTraits<GO>::name () << endl
3036 << "Node: " << TypeNameTraits<Node>::name () << endl;
3037 }
3038 out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3039 << "Global dimensions: ["
3040 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3041 << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3042 
3043 const LO blockSize = getBlockSize ();
3044 out << "Block size: " << blockSize << endl;
3045 
3046 // constituent objects
3047 if (vl >= VERB_MEDIUM) {
3048 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3049 const int myRank = comm.getRank ();
3050 if (myRank == 0) {
3051 out << "Row Map:" << endl;
3052 }
3053 getRowMap()->describe (out, vl);
3054 
3055 if (! getColMap ().is_null ()) {
3056 if (getColMap() == getRowMap()) {
3057 if (myRank == 0) {
3058 out << "Column Map: same as row Map" << endl;
3059 }
3060 }
3061 else {
3062 if (myRank == 0) {
3063 out << "Column Map:" << endl;
3064 }
3065 getColMap ()->describe (out, vl);
3066 }
3067 }
3068 if (! getDomainMap ().is_null ()) {
3069 if (getDomainMap () == getRowMap ()) {
3070 if (myRank == 0) {
3071 out << "Domain Map: same as row Map" << endl;
3072 }
3073 }
3074 else if (getDomainMap () == getColMap ()) {
3075 if (myRank == 0) {
3076 out << "Domain Map: same as column Map" << endl;
3077 }
3078 }
3079 else {
3080 if (myRank == 0) {
3081 out << "Domain Map:" << endl;
3082 }
3083 getDomainMap ()->describe (out, vl);
3084 }
3085 }
3086 if (! getRangeMap ().is_null ()) {
3087 if (getRangeMap () == getDomainMap ()) {
3088 if (myRank == 0) {
3089 out << "Range Map: same as domain Map" << endl;
3090 }
3091 }
3092 else if (getRangeMap () == getRowMap ()) {
3093 if (myRank == 0) {
3094 out << "Range Map: same as row Map" << endl;
3095 }
3096 }
3097 else {
3098 if (myRank == 0) {
3099 out << "Range Map: " << endl;
3100 }
3101 getRangeMap ()->describe (out, vl);
3102 }
3103 }
3104 }
3105 
3106 if (vl >= VERB_EXTREME) {
3107 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3108 const int myRank = comm.getRank ();
3109 const int numProcs = comm.getSize ();
3110 
3111 // Print the calling process' data to the given output stream.
3112 RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3113 RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3114 FancyOStream& os = *osPtr;
3115 os << "Process " << myRank << ":" << endl;
3116 Teuchos::OSTab tab2 (os);
3117 
3118 const map_type& meshRowMap = * (graph_.getRowMap ());
3119 const map_type& meshColMap = * (graph_.getColMap ());
3120 for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3121 meshLclRow <= meshRowMap.getMaxLocalIndex ();
3122 ++meshLclRow) {
3123 const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3124 os << "Row " << meshGblRow << ": {";
3125 
3126 local_inds_host_view_type lclColInds;
3127 values_host_view_type vals;
3128 LO numInds = 0;
3129 this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
3130 
3131 for (LO k = 0; k < numInds; ++k) {
3132 const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3133 
// Each block is printed row-major: entries within a block row are
// separated by ", ", block rows by "; ".
3134 os << "Col " << gblCol << ": [";
3135 for (LO i = 0; i < blockSize; ++i) {
3136 for (LO j = 0; j < blockSize; ++j) {
3137 os << vals[blockSize*blockSize*k + i*blockSize + j];
3138 if (j + 1 < blockSize) {
3139 os << ", ";
3140 }
3141 }
3142 if (i + 1 < blockSize) {
3143 os << "; ";
3144 }
3145 }
3146 os << "]";
3147 if (k + 1 < numInds) {
3148 os << ", ";
3149 }
3150 }
3151 os << "}" << endl;
3152 }
3153 
3154 // Print data on Process 0. This will automatically respect the
3155 // current indentation level.
3156 if (myRank == 0) {
3157 out << lclOutStrPtr->str ();
3158 lclOutStrPtr = Teuchos::null; // clear it to save space
3159 }
3160 
3161 const int sizeTag = 1337;
3162 const int dataTag = 1338;
3163 
3164 ArrayRCP<char> recvDataBuf; // only used on Process 0
3165 
3166 // Send string sizes and data from each process in turn to
3167 // Process 0, and print on that process.
3168 for (int p = 1; p < numProcs; ++p) {
3169 if (myRank == 0) {
3170 // Receive the incoming string's length.
3171 ArrayRCP<size_t> recvSize (1);
3172 recvSize[0] = 0;
3173 RCP<CommRequest<int> > recvSizeReq =
3174 ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3175 wait<int> (comm, outArg (recvSizeReq));
3176 const size_t numCharsToRecv = recvSize[0];
3177 
3178 // Allocate space for the string to receive. Reuse receive
3179 // buffer space if possible. We can do this because in the
3180 // current implementation, we only have one receive in
3181 // flight at a time. Leave space for the '\0' at the end,
3182 // in case the sender doesn't send it.
3183 if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3184 recvDataBuf.resize (numCharsToRecv + 1);
3185 }
3186 ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3187 // Post the receive of the actual string data.
3188 RCP<CommRequest<int> > recvDataReq =
3189 ireceive<int, char> (recvData, p, dataTag, comm);
3190 wait<int> (comm, outArg (recvDataReq));
3191 
3192 // Print the received data. This will respect the current
3193 // indentation level. Make sure that the string is
3194 // null-terminated.
3195 recvDataBuf[numCharsToRecv] = '\0';
3196 out << recvDataBuf.getRawPtr ();
3197 }
3198 else if (myRank == p) { // if I am not Process 0, and my rank is p
3199 // This deep-copies the string at most twice, depending on
3200 // whether std::string reference counts internally (it
3201 // generally does, so this won't deep-copy at all).
3202 const std::string stringToSend = lclOutStrPtr->str ();
3203 lclOutStrPtr = Teuchos::null; // clear original to save space
3204 
3205 // Send the string's length to Process 0.
3206 const size_t numCharsToSend = stringToSend.size ();
3207 ArrayRCP<size_t> sendSize (1);
3208 sendSize[0] = numCharsToSend;
3209 RCP<CommRequest<int> > sendSizeReq =
3210 isend<int, size_t> (sendSize, 0, sizeTag, comm);
3211 wait<int> (comm, outArg (sendSizeReq));
3212 
3213 // Send the actual string to Process 0. We know that the
3214 // string has length > 0, so it's safe to take the address
3215 // of the first entry. Make a nonowning ArrayRCP to hold
3216 // the string. Process 0 will add a null termination
3217 // character at the end of the string, after it receives the
3218 // message.
3219 ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3220 RCP<CommRequest<int> > sendDataReq =
3221 isend<int, char> (sendData, 0, dataTag, comm);
3222 wait<int> (comm, outArg (sendDataReq));
3223 }
3224 } // for each process rank p other than 0
3225 } // extreme verbosity level (print the whole matrix)
3226 }
3227
// The communicator over which this matrix is distributed.
// Delegates to the underlying mesh graph (graph_).
3228 template<class Scalar, class LO, class GO, class Node>
3229 Teuchos::RCP<const Teuchos::Comm<int> >
3231 getComm() const
3232 {
3233 return graph_.getComm();
3234 }
3235
3236
// Global number of (mesh) columns; delegates to the graph.
3237 template<class Scalar, class LO, class GO, class Node>
3240 getGlobalNumCols() const
3241 {
3242 return graph_.getGlobalNumCols();
3243 }
3244
// Number of (mesh) columns owned by the calling process; delegates to the graph.
3245 template<class Scalar, class LO, class GO, class Node>
3246 size_t
3248 getNodeNumCols() const
3249 {
3250 return graph_.getNodeNumCols();
3251 }
3252
// Global index base of this matrix's Maps; delegates to the graph.
3253 template<class Scalar, class LO, class GO, class Node>
3254 GO
3256 getIndexBase() const
3257 {
3258 return graph_.getIndexBase();
3259 }
3260
// Global number of stored (mesh) entries; delegates to the graph.
3261 template<class Scalar, class LO, class GO, class Node>
3265 {
3266 return graph_.getGlobalNumEntries();
3267 }
3268
// Number of stored (mesh) entries on the calling process; delegates to the graph.
3269 template<class Scalar, class LO, class GO, class Node>
3270 size_t
3272 getNodeNumEntries() const
3273 {
3274 return graph_.getNodeNumEntries();
3275 }
3276
// Number of (mesh) entries in the given global row; delegates to the graph.
3277 template<class Scalar, class LO, class GO, class Node>
3278 size_t
3280 getNumEntriesInGlobalRow (GO globalRow) const
3281 {
3282 return graph_.getNumEntriesInGlobalRow(globalRow);
3283 }
3284
3285
// Maximum number of (mesh) entries in any row over all processes; delegates to the graph.
3286 template<class Scalar, class LO, class GO, class Node>
3287 size_t
3290 {
3291 return graph_.getGlobalMaxNumRowEntries();
3292 }
3293
// Whether this matrix has a well-defined (mesh) column Map; delegates to the graph.
3294 template<class Scalar, class LO, class GO, class Node>
3295 bool
3297 hasColMap() const
3298 {
3299 return graph_.hasColMap();
3300 }
3301
3302
// Whether the underlying graph stores local column indices.
3303 template<class Scalar, class LO, class GO, class Node>
3304 bool
3306 isLocallyIndexed() const
3307 {
3308 return graph_.isLocallyIndexed();
3309 }
3310
// Whether the underlying graph stores global column indices.
3311 template<class Scalar, class LO, class GO, class Node>
3312 bool
3314 isGloballyIndexed() const
3315 {
3316 return graph_.isGloballyIndexed();
3317 }
3318
// Whether fillComplete() has been called on the underlying graph.
3319 template<class Scalar, class LO, class GO, class Node>
3320 bool
3322 isFillComplete() const
3323 {
3324 return graph_.isFillComplete ();
3325 }
3326
// BlockCrsMatrix does not support the RowMatrix row-view interface
// (its rows are blocks, not point entries), so this always returns false.
3327 template<class Scalar, class LO, class GO, class Node>
3328 bool
3330 supportsRowViews() const
3331 {
3332 return false;
3333 }
3334
// Not implemented: BlockCrsMatrix does not support global matrix
// indexing, so this stub always throws std::logic_error.
3335 template<class Scalar, class LO, class GO, class Node>
3336 void
3338 getGlobalRowCopy (GO /*GlobalRow*/,
3339 nonconst_global_inds_host_view_type &/*Indices*/,
3340 nonconst_values_host_view_type &/*Values*/,
3341 size_t& /*NumEntries*/) const
3342 {
3343 TEUCHOS_TEST_FOR_EXCEPTION(
3344 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3345 "This class doesn't support global matrix indexing.");
3346 
3347 }
3348
// Deprecated Teuchos::ArrayView overload; also always throws, since
// global matrix indexing is unsupported.
3349#ifdef TPETRA_ENABLE_DEPRECATED_CODE
3350 template<class Scalar, class LO, class GO, class Node>
3351 void
3353 getGlobalRowCopy (GO /* GlobalRow */,
3354 const Teuchos::ArrayView<GO> &/* Indices */,
3355 const Teuchos::ArrayView<Scalar> &/* Values */,
3356 size_t &/* NumEntries */) const
3357 {
3358 TEUCHOS_TEST_FOR_EXCEPTION(
3359 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3360 "This class doesn't support global matrix indexing.");
3361 
3362 }
3363#endif
3364
// Deprecated Teuchos::ArrayView row-view overloads.  Both always
// throw: this class supports neither global indexing nor the
// ArrayView-based local row-view interface.
3365#ifdef TPETRA_ENABLE_DEPRECATED_CODE
3366 template<class Scalar, class LO, class GO, class Node>
3367 void
3369 getGlobalRowView (GO /* GlobalRow */,
3370 Teuchos::ArrayView<const GO> &/* indices */,
3371 Teuchos::ArrayView<const Scalar> &/* values */) const
3372 {
3373 TEUCHOS_TEST_FOR_EXCEPTION(
3374 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3375 "This class doesn't support global matrix indexing.");
3376 
3377 }
3378 
3379 template<class Scalar, class LO, class GO, class Node>
3380 void
3382 getLocalRowView (LO /* LocalRow */,
3383 Teuchos::ArrayView<const LO>& /* indices */,
3384 Teuchos::ArrayView<const Scalar>& /* values */) const
3385 {
3386 TEUCHOS_TEST_FOR_EXCEPTION(
3387 true, std::logic_error, "Tpetra::BlockCrsMatrix::getLocalRowView: "
3388 "This class doesn't support local matrix indexing.");
3389 }
3390#endif
3391
// Not implemented: global matrix indexing is unsupported; always throws.
3392 template<class Scalar, class LO, class GO, class Node>
3393 void
3395 getGlobalRowView (GO /* GlobalRow */,
3396 global_inds_host_view_type &/* indices */,
3397 values_host_view_type &/* values */) const
3398 {
3399 TEUCHOS_TEST_FOR_EXCEPTION(
3400 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3401 "This class doesn't support global matrix indexing.");
3402 
3403 }
3404
// Constant, nonpersisting view of the given local (mesh) row's column
// indices and block values.  If the calling process does not own the
// row, both output views are set to empty (extent 0) views.
3405 template<class Scalar, class LO, class GO, class Node>
3406 void
3408 getLocalRowView (LO localRowInd,
3409 local_inds_host_view_type &colInds,
3410 values_host_view_type &vals) const
3411 {
3412 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3413 colInds = local_inds_host_view_type();
3414 vals = values_host_view_type();
3415 }
3416 else {
// ptrHost_ holds the row offsets into the (host) index array indHost_.
3417 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3418 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3419 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3420 
3421 vals = getValuesHost (localRowInd);
3422 }
3423 }
3424
// Like getLocalRowView, but hands back a nonconst view of the values so
// the caller may modify them in place.  Unowned rows yield empty views.
3425 template<class Scalar, class LO, class GO, class Node>
3426 void
3428 getLocalRowViewNonConst (LO localRowInd,
3429 local_inds_host_view_type &colInds,
3430 nonconst_values_host_view_type &vals) const
3431 {
3432 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
// NOTE(review): colInds is declared with the const view type above, yet
// this branch assigns a nonconst_local_inds_host_view_type() -- this
// relies on an implicit view conversion; confirm it is intentional
// (cf. getLocalRowView, which assigns local_inds_host_view_type()).
3433 colInds = nonconst_local_inds_host_view_type();
3434 vals = nonconst_values_host_view_type();
3435 }
3436 else {
3437 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3438 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3439 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3440 
// const_cast is needed because this member function is const but the
// returned values view is writable.
3442 vals = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
3443 }
3444 }
3445
// Copy the point-entry diagonal of this block matrix into the given
// (point) vector: for each local mesh row r and block-local index b,
// the (b,b) entry of r's diagonal block is written to point row r*bs+b.
// Runs on host over a host mirror of the diagonal offsets.
3446 template<class Scalar, class LO, class GO, class Node>
3447 void
3450 {
3451 const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3452 
3453 Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3454 graph_.getLocalDiagOffsets (diagOffsets);
3455 
3456 // The code below works on host, so use a host View.
3457 auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3458 Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3459 
3460 auto vals_host_out = val_.getHostView(Access::ReadOnly);
3461 const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
3462 
3463 // TODO amk: This is a temporary measure to make the code run with Ifpack2
3464 size_t rowOffset = 0;
3465 size_t offset = 0;
3466 LO bs = getBlockSize();
3467 for(size_t r=0; r<getNodeNumRows(); r++)
3468 {
3469 // move pointer to start of diagonal block
3470 offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3471 for(int b=0; b<bs; b++)
3472 {
// b*(bs+1) steps along the diagonal of a bs-by-bs block stored contiguously.
3473 diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3474 }
3475 // move pointer to start of next block row
3476 rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3477 }
3478 
3479 //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3480 }
3481
// Not implemented for BlockCrsMatrix; always throws std::logic_error.
3482 template<class Scalar, class LO, class GO, class Node>
3483 void
3485 leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3486 {
3487 TEUCHOS_TEST_FOR_EXCEPTION(
3488 true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3489 "not implemented.");
3490 
3491 }
3492
// Not implemented for BlockCrsMatrix; always throws std::logic_error.
3493 template<class Scalar, class LO, class GO, class Node>
3494 void
3496 rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3497 {
3498 TEUCHOS_TEST_FOR_EXCEPTION(
3499 true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3500 "not implemented.");
3501 
3502 }
3503
// The (mesh) graph of this matrix, as a RowGraph, via the cached RCP member.
3504 template<class Scalar, class LO, class GO, class Node>
3505 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3507 getGraph() const
3508 {
3509 return graphRCP_;
3510 }
3511
// Not implemented for BlockCrsMatrix; always throws std::logic_error
// (the TEUCHOS_TEST_FOR_EXCEPTION condition is unconditionally true,
// so control never reaches the end of this non-void function).
3512 template<class Scalar, class LO, class GO, class Node>
3513 typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3515 getFrobeniusNorm () const
3516 {
3517 TEUCHOS_TEST_FOR_EXCEPTION(
3518 true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3519 "not implemented.");
3520 }
3521
3522} // namespace Tpetra
3523
3524//
3525// Explicit instantiation macro
3526//
3527// Must be expanded from within the Tpetra namespace!
3528//
3529#define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3530 template class BlockCrsMatrix< S, LO, GO, NODE >;
3531
3532#endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpack them.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual size_t getNodeNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Teuchos::RCP< const map_type > getColMap() const override
Get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
Get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Kokkos::View< const impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos::View.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
size_t getNodeNumRows() const override
Get the local number of block rows.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; the type (a single byte, char) of each packet used when packing and unpacking this object's data for communication.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
global_size_t getGlobalNumRows() const override
Get the global number of block rows.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
virtual size_t getNodeNumCols() const override
The number of columns needed to apply the forward operator on this node.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distribution.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
One or more distributed dense vectors.
A distributed dense vector.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
int local_ordinal_type
Default value of the LocalOrdinal template parameter.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
size_t global_size_t
Global size_t object.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dimensions.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.