Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1/*@HEADER
2// ***********************************************************************
3//
4// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ***********************************************************************
40//@HEADER
41*/
42
43#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45
46#include <Teuchos_Details_MpiTypeTraits.hpp>
47
48#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
49#include <Tpetra_Distributor.hpp>
50#include <Tpetra_BlockMultiVector.hpp>
51
52#include <Kokkos_ArithTraits.hpp>
53#include <KokkosBatched_Util.hpp>
54#include <KokkosBatched_Vector.hpp>
55#include <KokkosBatched_Copy_Decl.hpp>
56#include <KokkosBatched_Copy_Impl.hpp>
57#include <KokkosBatched_AddRadial_Decl.hpp>
58#include <KokkosBatched_AddRadial_Impl.hpp>
59#include <KokkosBatched_SetIdentity_Decl.hpp>
60#include <KokkosBatched_SetIdentity_Impl.hpp>
61#include <KokkosBatched_Gemm_Decl.hpp>
62#include <KokkosBatched_Gemm_Serial_Impl.hpp>
63#include <KokkosBatched_Gemm_Team_Impl.hpp>
64#include <KokkosBatched_Gemv_Decl.hpp>
65#include <KokkosBatched_Gemv_Serial_Impl.hpp>
66#include <KokkosBatched_Gemv_Team_Impl.hpp>
67#include <KokkosBatched_Trsm_Decl.hpp>
68#include <KokkosBatched_Trsm_Serial_Impl.hpp>
69#include <KokkosBatched_Trsm_Team_Impl.hpp>
70#include <KokkosBatched_Trsv_Decl.hpp>
71#include <KokkosBatched_Trsv_Serial_Impl.hpp>
72#include <KokkosBatched_Trsv_Team_Impl.hpp>
73#include <KokkosBatched_LU_Decl.hpp>
74#include <KokkosBatched_LU_Serial_Impl.hpp>
75#include <KokkosBatched_LU_Team_Impl.hpp>
76
77#include <KokkosBlas1_nrm1.hpp>
78#include <KokkosBlas1_nrm2.hpp>
79
80#include <memory>
81
82// need to interface this into cmake variable (or only use this flag when it is necessary)
83//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
84//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
85#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
86#include "cuda_profiler_api.h"
87#endif
88
89// I am not 100% sure about the mpi 3 on cuda
90#if MPI_VERSION >= 3
91#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
92#endif
93
94// ::: Experiments :::
95// define either pinned memory or cudamemory for mpi
96// if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
97// if defined, this use pinned memory instead of device pointer
98// by default, we enable pinned memory
99#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
100//#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
101
102// if defined, all views are allocated on cuda space intead of cuda uvm space
103#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
104
105// if defined, btdm_scalar_type is used (if impl_scala_type is double, btdm_scalar_type is float)
106#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
107#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
108#endif
109
110// if defined, it uses multiple execution spaces
111#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
112
113namespace Ifpack2 {
114
115 namespace BlockTriDiContainerDetails {
116
117 namespace KB = KokkosBatched;
118
122 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
123
124 template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
125 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
126 MemoryTraitsType::is_random_access |
127 flag>;
128
129 template <typename ViewType>
130 using Unmanaged = Kokkos::View<typename ViewType::data_type,
131 typename ViewType::array_layout,
132 typename ViewType::device_type,
133 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
134 template <typename ViewType>
135 using Atomic = Kokkos::View<typename ViewType::data_type,
136 typename ViewType::array_layout,
137 typename ViewType::device_type,
138 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
139 template <typename ViewType>
140 using Const = Kokkos::View<typename ViewType::const_data_type,
141 typename ViewType::array_layout,
142 typename ViewType::device_type,
143 typename ViewType::memory_traits>;
144 template <typename ViewType>
145 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
146
147 template <typename ViewType>
148 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
149
150 template <typename ViewType>
151 using Unmanaged = Kokkos::View<typename ViewType::data_type,
152 typename ViewType::array_layout,
153 typename ViewType::device_type,
154 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
155
156
157 template <typename ViewType>
158 using Scratch = Kokkos::View<typename ViewType::data_type,
159 typename ViewType::array_layout,
160 typename ViewType::execution_space::scratch_memory_space,
161 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
162
166 template<typename T> struct BlockTridiagScalarType { typedef T type; };
167#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
168 template<> struct BlockTridiagScalarType<double> { typedef float type; };
169 //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
170#endif
171
175 template<typename T> struct is_cuda { enum : bool { value = false }; };
176#if defined(KOKKOS_ENABLE_CUDA)
177 template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
178#endif
179
183 template<typename T> struct is_hip { enum : bool { value = false }; };
184#if defined(KOKKOS_ENABLE_HIP)
185 template<> struct is_hip<Kokkos::Experimental::HIP> { enum : bool { value = true }; };
186#endif
187
191 template<typename T>
193 static void createInstance(T &exec_instance) {
194 exec_instance = T();
195 }
196#if defined(KOKKOS_ENABLE_CUDA)
197 static void createInstance(const cudaStream_t &s, T &exec_instance) {
198 exec_instance = T();
199 }
200#endif
201 };
202
203#if defined(KOKKOS_ENABLE_CUDA)
204 template<>
205 struct ExecutionSpaceFactory<Kokkos::Cuda> {
206 static void createInstance(Kokkos::Cuda &exec_instance) {
207 exec_instance = Kokkos::Cuda();
208 }
209 static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
210 exec_instance = Kokkos::Cuda(s);
211 }
212 };
213#endif
214
215#if defined(KOKKOS_ENABLE_HIP)
216 template<>
217 struct ExecutionSpaceFactory<Kokkos::Experimental::HIP> {
218 static void createInstance(Kokkos::Experimental::HIP &exec_instance) {
219 exec_instance = Kokkos::Experimental::HIP();
220 }
221 };
222#endif
223
227 template<typename CommPtrType>
228 std::string get_msg_prefix (const CommPtrType &comm) {
229 const auto rank = comm->getRank();
230 const auto nranks = comm->getSize();
231 std::stringstream ss;
232 ss << "Rank " << rank << " of " << nranks << ": ";
233 return ss.str();
234 }
235
239 template<typename T, int N>
241 T v[N];
242 KOKKOS_INLINE_FUNCTION
244 for (int i=0;i<N;++i)
245 this->v[i] = 0;
246 }
247 KOKKOS_INLINE_FUNCTION
249 for (int i=0;i<N;++i)
250 this->v[i] = b.v[i];
251 }
252 };
253 template<typename T, int N>
254 static
255 KOKKOS_INLINE_FUNCTION
256 void
257 operator+=(volatile ArrayValueType<T,N> &a,
258 volatile const ArrayValueType<T,N> &b) {
259 for (int i=0;i<N;++i)
260 a.v[i] += b.v[i];
261 }
262 template<typename T, int N>
263 static
264 KOKKOS_INLINE_FUNCTION
265 void
266 operator+=(ArrayValueType<T,N> &a,
267 const ArrayValueType<T,N> &b) {
268 for (int i=0;i<N;++i)
269 a.v[i] += b.v[i];
270 }
271
275 template<typename T, int N, typename ExecSpace>
276 struct SumReducer {
277 typedef SumReducer reducer;
279 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
280 value_type *value;
281
282 KOKKOS_INLINE_FUNCTION
283 SumReducer(value_type &val) : value(&val) {}
284
285 KOKKOS_INLINE_FUNCTION
286 void join(value_type &dst, value_type &src) const {
287 for (int i=0;i<N;++i)
288 dst.v[i] += src.v[i];
289 }
290 KOKKOS_INLINE_FUNCTION
291 void join(volatile value_type &dst, const volatile value_type &src) const {
292 for (int i=0;i<N;++i)
293 dst.v[i] += src.v[i];
294 }
295 KOKKOS_INLINE_FUNCTION
296 void init(value_type &val) const {
297 for (int i=0;i<N;++i)
298 val.v[i] = Kokkos::reduction_identity<T>::sum();
299 }
300 KOKKOS_INLINE_FUNCTION
301 value_type& reference() {
302 return *value;
303 }
304 KOKKOS_INLINE_FUNCTION
305 result_view_type view() const {
306 return result_view_type(value);
307 }
308 };
309
310#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
311#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
312#else
313#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
314#endif
315
316#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
317#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
318 CUDA_SAFE_CALL(cudaProfilerStart());
319
320#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
321 { CUDA_SAFE_CALL( cudaProfilerStop() ); }
322#else
324#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
325#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
326#endif
327
331 template <typename MatrixType>
332 struct ImplType {
336 typedef size_t size_type;
337 typedef typename MatrixType::scalar_type scalar_type;
338 typedef typename MatrixType::local_ordinal_type local_ordinal_type;
339 typedef typename MatrixType::global_ordinal_type global_ordinal_type;
340 typedef typename MatrixType::node_type node_type;
341
345 typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
346 typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
347
348 typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
349 typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
350
354 typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
355
359 typedef typename node_type::device_type node_device_type;
360 typedef typename node_device_type::execution_space node_execution_space;
361 typedef typename node_device_type::memory_space node_memory_space;
362
363#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
365 typedef node_execution_space execution_space;
366 typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
367 Kokkos::CudaSpace,
368 node_memory_space>::type memory_space;
369 typedef Kokkos::Device<execution_space,memory_space> device_type;
370#else
371 typedef node_device_type device_type;
372 typedef node_execution_space execution_space;
373 typedef node_memory_space memory_space;
374#endif
375
376 typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
377 typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
378 typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
379 typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
380 typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
381 typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
382 typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
383 typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
384
388 template<typename T, int l> using Vector = KB::Vector<T,l>;
389 template<typename T> using SIMD = KB::SIMD<T>;
390 template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
391 template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
392
393 static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
394 static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
395 typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
396 typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
397
401 typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
402 typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
403 // tpetra block crs values
404 typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
405 typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
406
407 // tpetra multivector values (layout left): may need to change the typename more explicitly
408 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
409 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
410
411 // packed data always use layout right
412 typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
413 typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
414 typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
415 typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
416 typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
417 typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
418 };
419
423 template<typename MatrixType>
424 typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
425 createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
426 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
427 using impl_type = ImplType<MatrixType>;
428 using tpetra_map_type = typename impl_type::tpetra_map_type;
429 using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
430 using tpetra_import_type = typename impl_type::tpetra_import_type;
431
432 const auto g = A->getCrsGraph(); // tpetra crs graph object
433 const auto blocksize = A->getBlockSize();
434 const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
435 const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
436
437 return Teuchos::rcp(new tpetra_import_type(src, tgt));
438 }
439
440 // Partial replacement for forward-mode MultiVector::doImport.
441 // Permits overlapped communication and computation, but also supports sync'ed.
442 // I'm finding that overlapped comm/comp can give quite poor performance on some
443 // platforms, so we can't just use it straightforwardly always.
444
445 template<typename MatrixType>
446 struct AsyncableImport {
447 public:
448 using impl_type = ImplType<MatrixType>;
449
450 private:
454#if !defined(HAVE_IFPACK2_MPI)
455 typedef int MPI_Request;
456 typedef int MPI_Comm;
457#endif
460 using scalar_type = typename impl_type::scalar_type;
461
462 static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
463#ifdef HAVE_IFPACK2_MPI
464 MPI_Request ureq;
465 int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
466 if (ireq == NULL) MPI_Request_free(&ureq);
467 return ret;
468#else
469 return 0;
470#endif
471 }
472
473 static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
474#ifdef HAVE_IFPACK2_MPI
475 MPI_Request ureq;
476 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
477 if (ireq == NULL) MPI_Request_free(&ureq);
478 return ret;
479#else
480 return 0;
481#endif
482 }
483
484 static int waitany(int count, MPI_Request* reqs, int* index) {
485#ifdef HAVE_IFPACK2_MPI
486 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
487#else
488 return 0;
489#endif
490 }
491
492 static int waitall(int count, MPI_Request* reqs) {
493#ifdef HAVE_IFPACK2_MPI
494 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
495#else
496 return 0;
497#endif
498 }
499
500 public:
501 using tpetra_map_type = typename impl_type::tpetra_map_type;
502 using tpetra_import_type = typename impl_type::tpetra_import_type;
503
504 using local_ordinal_type = typename impl_type::local_ordinal_type;
505 using global_ordinal_type = typename impl_type::global_ordinal_type;
506 using size_type = typename impl_type::size_type;
507 using impl_scalar_type = typename impl_type::impl_scalar_type;
508
509 using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
510 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
511
512 using execution_space = typename impl_type::execution_space;
513 using memory_space = typename impl_type::memory_space;
514 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
515 using size_type_1d_view = typename impl_type::size_type_1d_view;
516 using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
517
518#if defined(KOKKOS_ENABLE_CUDA)
519 using impl_scalar_type_1d_view =
520 typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
521# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
522 Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
523# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
524 Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
525# else // no experimental macros are defined
526 typename impl_type::impl_scalar_type_1d_view,
527# endif
528 typename impl_type::impl_scalar_type_1d_view>::type;
529#else
530 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
531#endif
532 using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
533 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
534
535#ifdef HAVE_IFPACK2_MPI
536 MPI_Comm comm;
537#endif
538
539 impl_scalar_type_2d_view_tpetra remote_multivector;
540 local_ordinal_type blocksize;
541
542 template<typename T>
543 struct SendRecvPair {
544 T send, recv;
545 };
546
547 // (s)end and (r)eceive data:
548 SendRecvPair<int_1d_view_host> pids; // mpi ranks
549 SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
550 SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
551 SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
552 SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
553 SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
554
555 local_ordinal_type_1d_view dm2cm; // permutation
556
557#if defined(KOKKOS_ENABLE_CUDA)
558 using cuda_stream_1d_std_vector = std::vector<cudaStream_t>;
559 cuda_stream_1d_std_vector stream;
560
561 using exec_instance_1d_std_vector = std::vector<execution_space>;
562 exec_instance_1d_std_vector exec_instances;
563#endif
564
565 // for cuda
566 public:
567 void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
568 const size_type_1d_view &offs) {
569 // wrap lens to kokkos view and deep copy to device
570 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
571 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
572
573 // exclusive scan
574 const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
575 const local_ordinal_type lens_size = lens_device.extent(0);
576 Kokkos::parallel_scan
577 ("AsyncableImport::RangePolicy::setOffsetValues",
578 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
579 if (final)
580 offs(i) = update;
581 update += (i < lens_size ? lens_device[i] : 0);
582 });
583 }
584
585 void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
586 const size_type_1d_view_host &offs) {
587 // wrap lens to kokkos view and deep copy to device
588 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
589 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
590
591 // exclusive scan
592 offs(0) = 0;
593 for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
594 offs(i) = offs(i-1) + lens[i-1];
595 }
596 }
597
598 private:
599 void createMpiRequests(const tpetra_import_type &import) {
600 Tpetra::Distributor &distributor = import.getDistributor();
601
602 // copy pids from distributor
603 const auto pids_from = distributor.getProcsFrom();
604 pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
605 memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
606
607 const auto pids_to = distributor.getProcsTo();
608 pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
609 memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
610
611 // mpi requests
612 reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
613 reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
614
615 // construct offsets
616#if 0
617 const auto lengths_to = distributor.getLengthsTo();
618 offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
619
620 const auto lengths_from = distributor.getLengthsFrom();
621 offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
622
623 setOffsetValues(lengths_to, offset.send);
624 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
625
626 setOffsetValues(lengths_from, offset.recv);
627 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
628#else
629 const auto lengths_to = distributor.getLengthsTo();
630 offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
631
632 const auto lengths_from = distributor.getLengthsFrom();
633 offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
634
635 setOffsetValuesHost(lengths_to, offset_host.send);
636 //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
637
638 setOffsetValuesHost(lengths_from, offset_host.recv);
639 //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
640#endif
641 }
642
643 void createSendRecvIDs(const tpetra_import_type &import) {
644 // For each remote PID, the list of LIDs to receive.
645 const auto remote_lids = import.getRemoteLIDs();
646 const local_ordinal_type_1d_view_host
647 remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
648 lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
649 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
650
651 // For each export PID, the list of LIDs to send.
652 auto epids = import.getExportPIDs();
653 auto elids = import.getExportLIDs();
654 TEUCHOS_ASSERT(epids.size() == elids.size());
655 lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
656 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
657
658 // naive search (not sure if pids or epids are sorted)
659 for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
660 const auto pid_send_value = pids.send[i];
661 for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
662 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
663#if !defined(__CUDA_ARCH__)
664 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
665#endif
666 }
667 Kokkos::deep_copy(lids.send, lids_send_host);
668 }
669
670 void createExecutionSpaceInstances() {
671#if defined(KOKKOS_ENABLE_CUDA)
672 const local_ordinal_type num_streams = 8;
673 {
674 stream.clear();
675 stream.resize(num_streams);
676 exec_instances.clear();
677 exec_instances.resize(num_streams);
678 for (local_ordinal_type i=0;i<num_streams;++i) {
679 CUDA_SAFE_CALL(cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking));
680 ExecutionSpaceFactory<execution_space>::createInstance(stream[i], exec_instances[i]);
681 }
682 }
683#endif
684 }
685
686 void destroyExecutionSpaceInstances() {
687#if defined(KOKKOS_ENABLE_CUDA)
688 {
689 const local_ordinal_type num_streams = stream.size();
690 for (local_ordinal_type i=0;i<num_streams;++i)
691 CUDA_SAFE_CALL(cudaStreamDestroy(stream[i]));
692 }
693 stream.clear();
694 exec_instances.clear();
695#endif
696 }
697
698 public:
699 // for cuda, all tag types are public
700 struct ToBuffer {};
701 struct ToMultiVector {};
702
703 AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
704 const Teuchos::RCP<const tpetra_map_type>& tgt_map,
705 const local_ordinal_type blocksize_,
706 const local_ordinal_type_1d_view dm2cm_) {
707 blocksize = blocksize_;
708 dm2cm = dm2cm_;
709
710#ifdef HAVE_IFPACK2_MPI
711 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
712#endif
713 const tpetra_import_type import(src_map, tgt_map);
714
715 createMpiRequests(import);
716 createSendRecvIDs(import);
717 createExecutionSpaceInstances();
718 }
719
720 ~AsyncableImport() {
721 destroyExecutionSpaceInstances();
722 }
723
724 void createDataBuffer(const local_ordinal_type &num_vectors) {
725 const size_type extent_0 = lids.recv.extent(0)*blocksize;
726 const size_type extent_1 = num_vectors;
727 if (remote_multivector.extent(0) == extent_0 &&
728 remote_multivector.extent(1) == extent_1) {
729 // skip
730 } else {
731 remote_multivector =
732 impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
733
734 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
735 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
736
737 buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
738 buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
739 }
740 }
741
742 void cancel () {
743#ifdef HAVE_IFPACK2_MPI
744 waitall(reqs.recv.size(), reqs.recv.data());
745 waitall(reqs.send.size(), reqs.send.data());
746#endif
747 }
748
749 // ======================================================================
750 // Async version using cuda stream
751 // - cuda only with kokkos develop branch
752 // ======================================================================
753
754#if defined(KOKKOS_ENABLE_CUDA)
755 template<typename PackTag>
756 static
757 void copy(const local_ordinal_type_1d_view &lids_,
758 const impl_scalar_type_1d_view &buffer_,
759 const local_ordinal_type ibeg_,
760 const local_ordinal_type iend_,
761 const impl_scalar_type_2d_view_tpetra &multivector_,
762 const local_ordinal_type blocksize_,
763 const execution_space &exec_instance_) {
764 const local_ordinal_type num_vectors = multivector_.extent(1);
765 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
766 const local_ordinal_type idiff = iend_ - ibeg_;
767 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
768
769 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
770 local_ordinal_type vector_size(0);
771 if (blocksize_ <= 4) vector_size = 4;
772 else if (blocksize_ <= 8) vector_size = 8;
773 else if (blocksize_ <= 16) vector_size = 16;
774 else vector_size = 32;
775
776 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
777 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
778 Kokkos::parallel_for
779 (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
780 Kokkos::Experimental::require(policy, work_item_property),
781 KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
782 const local_ordinal_type i = member.league_rank();
783 Kokkos::parallel_for
784 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
785 auto aptr = abase + blocksize_*(i + idiff*j);
786 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
787 if (std::is_same<PackTag,ToBuffer>::value)
788 Kokkos::parallel_for
789 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
790 aptr[k] = bptr[k];
791 });
792 else
793 Kokkos::parallel_for
794 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
795 bptr[k] = aptr[k];
796 });
797 });
798 });
799 }
800
801 void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
802 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
803
804#ifdef HAVE_IFPACK2_MPI
805 // constants and reallocate data buffers if necessary
806 const local_ordinal_type num_vectors = mv.extent(1);
807 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
808
809 // 0. post receive async
810 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
811 irecv(comm,
812 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
813 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
814 pids.recv[i],
815 42,
816 &reqs.recv[i]);
817 }
818
820 execution_space().fence();
821
822 // 1. async memcpy
823 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
824 // 1.0. enqueue pack buffer
825 if (i<8) exec_instances[i%8].fence();
826 copy<ToBuffer>(lids.send, buffer.send,
827 offset_host.send(i), offset_host.send(i+1),
828 mv, blocksize,
829 //execution_space());
830 exec_instances[i%8]);
831
832 }
834 //execution_space().fence();
835 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
836 // 1.1. sync the stream and isend
837 if (i<8) exec_instances[i%8].fence();
838 isend(comm,
839 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
840 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
841 pids.send[i],
842 42,
843 &reqs.send[i]);
844 }
845
846 // 2. poke communication
847 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
848 int flag;
849 MPI_Status stat;
850 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
851 }
852#endif // HAVE_IFPACK2_MPI
853 }
854
855 void syncRecvVar1() {
856 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
857#ifdef HAVE_IFPACK2_MPI
858 // 0. wait for receive async.
859 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
860 local_ordinal_type idx = i;
861
862 // 0.0. wait any
863 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
864
865 // 0.1. unpack data after data is moved into a device
866 copy<ToMultiVector>(lids.recv, buffer.recv,
867 offset_host.recv(idx), offset_host.recv(idx+1),
868 remote_multivector, blocksize,
869 exec_instances[idx%8]);
870 }
871
872 // 1. fire up all cuda events
873 Kokkos::fence();
874
875 // 2. cleanup all open comm
876 waitall(reqs.send.size(), reqs.send.data());
877#endif // HAVE_IFPACK2_MPI
878 }
879#endif //defined(KOKKOS_ENABLE_CUDA)
880
881 // ======================================================================
882 // Generic version without using cuda stream
883 // - only difference between cuda and host architecture is on using team
884 // or range policies.
885 // ======================================================================
886 template<typename PackTag>
887 static
888 void copy(const local_ordinal_type_1d_view &lids_,
889 const impl_scalar_type_1d_view &buffer_,
890 const local_ordinal_type &ibeg_,
891 const local_ordinal_type &iend_,
892 const impl_scalar_type_2d_view_tpetra &multivector_,
893 const local_ordinal_type blocksize_) {
894 const local_ordinal_type num_vectors = multivector_.extent(1);
895 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
896 const local_ordinal_type idiff = iend_ - ibeg_;
897 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
898 if (is_cuda<execution_space>::value || is_hip<execution_space>::value) {
899#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
900 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
901 local_ordinal_type vector_size(0);
902 if (blocksize_ <= 4) vector_size = 4;
903 else if (blocksize_ <= 8) vector_size = 8;
904 else if (blocksize_ <= 16) vector_size = 16;
905 else vector_size = 32;
906 const team_policy_type policy(idiff, 1, vector_size);
907 Kokkos::parallel_for
908 ("AsyncableImport::TeamPolicy::copy",
909 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
910 const local_ordinal_type i = member.league_rank();
911 Kokkos::parallel_for
912 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
913 auto aptr = abase + blocksize_*(i + idiff*j);
914 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
915 if (std::is_same<PackTag,ToBuffer>::value)
916 Kokkos::parallel_for
917 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
918 aptr[k] = bptr[k];
919 });
920 else
921 Kokkos::parallel_for
922 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
923 bptr[k] = aptr[k];
924 });
925 });
926 });
927#endif
928 } else {
929#if defined(__CUDA_ARCH__)
930 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
931#else
932 {
933 const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
934 Kokkos::parallel_for
935 ("AsyncableImport::RangePolicy::copy",
936 policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
937 const local_ordinal_type i = ij%idiff;
938 const local_ordinal_type j = ij/idiff;
939 auto aptr = abase + blocksize_*(i + idiff*j);
940 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
941 auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
942 auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
943 memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
944 });
945 }
946#endif
947 }
948 }
949
950
954 void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
955 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
956
957#ifdef HAVE_IFPACK2_MPI
958 // constants and reallocate data buffers if necessary
959 const local_ordinal_type num_vectors = mv.extent(1);
960 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
961
962 // receive async
963 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
964 irecv(comm,
965 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
966 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
967 pids.recv[i],
968 42,
969 &reqs.recv[i]);
970 }
971
972 // send async
973 for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
974 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
975 mv, blocksize);
976 Kokkos::fence();
977 isend(comm,
978 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
979 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
980 pids.send[i],
981 42,
982 &reqs.send[i]);
983 }
984
985 // I find that issuing an Iprobe seems to nudge some MPIs into action,
986 // which helps with overlapped comm/comp performance.
987 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
988 int flag;
989 MPI_Status stat;
990 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
991 }
992#endif
993 }
994
995 void syncRecvVar0() {
996 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
997#ifdef HAVE_IFPACK2_MPI
998 // receive async.
999 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
1000 local_ordinal_type idx = i;
1001 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
1002 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
1003 remote_multivector, blocksize);
1004 }
1005 // wait on the sends to match all Isends with a cleanup operation.
1006 waitall(reqs.send.size(), reqs.send.data());
1007#endif
1008 }
1009
1013 void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
1014#if defined(KOKKOS_ENABLE_CUDA)
1015#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
1016 asyncSendRecvVar1(mv);
1017#else
1018 asyncSendRecvVar0(mv);
1019#endif
1020#else
1021 asyncSendRecvVar0(mv);
1022#endif
1023 }
1024 void syncRecv() {
1025#if defined(KOKKOS_ENABLE_CUDA)
1026#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
1027 syncRecvVar1();
1028#else
1029 syncRecvVar0();
1030#endif
1031#else
1032 syncRecvVar0();
1033#endif
1034 }
1035
1036 void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
1037 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
1038 asyncSendRecv(mv);
1039 syncRecv();
1040 }
1041
1042 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
1043 };
1044
1048 template<typename MatrixType>
1049 Teuchos::RCP<AsyncableImport<MatrixType> >
1050 createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
1051 using impl_type = ImplType<MatrixType>;
1052 using tpetra_map_type = typename impl_type::tpetra_map_type;
1053 using local_ordinal_type = typename impl_type::local_ordinal_type;
1054 using global_ordinal_type = typename impl_type::global_ordinal_type;
1055 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1056
1057 const auto g = A->getCrsGraph(); // tpetra crs graph object
1058 const auto blocksize = A->getBlockSize();
1059 const auto domain_map = g.getDomainMap();
1060 const auto column_map = g.getColMap();
1061
1062 std::vector<global_ordinal_type> gids;
1063 bool separate_remotes = true, found_first = false, need_owned_permutation = false;
1064 for (size_t i=0;i<column_map->getNodeNumElements();++i) {
1065 const global_ordinal_type gid = column_map->getGlobalElement(i);
1066 if (!domain_map->isNodeGlobalElement(gid)) {
1067 found_first = true;
1068 gids.push_back(gid);
1069 } else if (found_first) {
1070 separate_remotes = false;
1071 break;
1072 }
1073 if (!need_owned_permutation &&
1074 domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
1075 // The owned part of the domain and column maps are different
1076 // orderings. We *could* do a super efficient impl of this case in the
1077 // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
1078 // really, if a caller cares about speed, they wouldn't make different
1079 // local permutations like this. So we punt on the best impl and go for
1080 // a pretty good one: the permutation is done in place in
1081 // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
1082 // is the presumably worse memory access pattern of the input vector.
1083 need_owned_permutation = true;
1084 }
1085 }
1086
1087 if (separate_remotes) {
1088 const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
1089 const auto parsimonious_col_map
1090 = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
1091 if (parsimonious_col_map->getGlobalNumElements() > 0) {
1092 // make the importer only if needed.
1093 local_ordinal_type_1d_view dm2cm;
1094 if (need_owned_permutation) {
1095 dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getNodeNumElements());
1096 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
1097 for (size_t i=0;i<domain_map->getNodeNumElements();++i)
1098 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
1099 Kokkos::deep_copy(dm2cm, dm2cm_host);
1100 }
1101 return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
1102 }
1103 }
1104 return Teuchos::null;
1105 }
1106
1107 template<typename MatrixType>
1108 struct PartInterface {
1109 using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
1110 using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
1111
1112 PartInterface() = default;
1113 PartInterface(const PartInterface &b) = default;
1114
1115 // Some terms:
1116 // The matrix A is split as A = D + R, where D is the matrix of tridiag
1117 // blocks and R is the remainder.
1118 // A part is roughly a synonym for a tridiag. The distinction is that a part
1119 // is the set of rows belonging to one tridiag and, equivalently, the off-diag
1120 // rows in R associated with that tridiag. In contrast, the term tridiag is
1121 // used to refer specifically to tridiag data, such as the pointer into the
1122 // tridiag data array.
1123 // Local (lcl) row arge the LIDs. lclrow lists the LIDs belonging to each
1124 // tridiag, and partptr points to the beginning of each tridiag. This is the
1125 // LID space.
1126 // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
1127 // by this ordinal. This is the 'index' space.
1128 // A flat index is the mathematical index into an array. A pack index
1129 // accounts for SIMD packing.
1130
1131 // Local row LIDs. Permutation from caller's index space to tridiag index
1132 // space.
1133 local_ordinal_type_1d_view lclrow;
1134 // partptr_ is the pointer array into lclrow_.
1135 local_ordinal_type_1d_view partptr; // np+1
1136 // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
1137 // is the start of the i'th pack.
1138 local_ordinal_type_1d_view packptr; // npack+1
1139 // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
1140 // an alias of partptr_ in the case of no overlap.
1141 local_ordinal_type_1d_view part2rowidx0; // np+1
1142 // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
1143 // it's the same as part2rowidx0_; if it's > 1, then the value is combined
1144 // with i % vector_length to get the location in the packed data.
1145 local_ordinal_type_1d_view part2packrowidx0; // np+1
1146 local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
1147 // rowidx2part_ maps the row index to the part index.
1148 local_ordinal_type_1d_view rowidx2part; // nr
1149 // True if lcl{row|col} is at most a constant away from row{idx|col}. In
1150 // practice, this knowledge is not particularly useful, as packing for batched
1151 // processing is done at the same time as the permutation from LID to index
1152 // space. But it's easy to detect, so it's recorded in case an optimization
1153 // can be made based on it.
1154 bool row_contiguous;
1155
1156 local_ordinal_type max_partsz;
1157 };
1158
1162 template<typename MatrixType>
1163 PartInterface<MatrixType>
1164 createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1165 const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
1166 using impl_type = ImplType<MatrixType>;
1167 using local_ordinal_type = typename impl_type::local_ordinal_type;
1168 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1169
1170 constexpr int vector_length = impl_type::vector_length;
1171
1172 const auto comm = A->getRowMap()->getComm();
1173
1174 PartInterface<MatrixType> interf;
1175
1176 const bool jacobi = partitions.size() == 0;
1177 const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
1178 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1179
1180#if defined(BLOCKTRIDICONTAINER_DEBUG)
1181 local_ordinal_type nrows = 0;
1182 if (jacobi)
1183 nrows = nparts;
1184 else
1185 for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1186
1187 TEUCHOS_TEST_FOR_EXCEPT_MSG
1188 (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1189 << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
1190#endif
1191
1192 // permutation vector
1193 std::vector<local_ordinal_type> p;
1194 if (jacobi) {
1195 interf.max_partsz = 1;
1196 } else {
1197 // reorder parts to maximize simd packing efficiency
1198 p.resize(nparts);
1199
1200 typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1201 std::vector<size_idx_pair_type> partsz(nparts);
1202 for (local_ordinal_type i=0;i<nparts;++i)
1203 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1204 std::sort(partsz.begin(), partsz.end(),
1205 [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1206 return x.first > y.first;
1207 });
1208 for (local_ordinal_type i=0;i<nparts;++i)
1209 p[i] = partsz[i].second;
1210
1211 interf.max_partsz = partsz[0].first;
1212 }
1213
1214 // allocate parts
1215 interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1216 interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1217 interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1218 interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1219 interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1220
1221 // mirror to host and compute on host execution space
1222 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1223 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1224 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1225 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1226 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1227
1228 // Determine parts.
1229 interf.row_contiguous = true;
1230 partptr(0) = 0;
1231 part2rowidx0(0) = 0;
1232 part2packrowidx0(0) = 0;
1233 local_ordinal_type pack_nrows = 0;
1234 if (jacobi) {
1235 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1236 const local_ordinal_type ipnrows = 1;
1237 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1238 get_msg_prefix(comm)
1239 << "partition " << p[ip]
1240 << " is empty, which is not allowed.");
1241 //assume No overlap.
1242 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1243 // Since parts are ordered in nonincreasing size, the size of the first
1244 // part in a pack is the size for all parts in the pack.
1245 if (ip % vector_length == 0) pack_nrows = ipnrows;
1246 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1247 const local_ordinal_type os = partptr(ip);
1248 for (local_ordinal_type i=0;i<ipnrows;++i) {
1249 const auto lcl_row = ip;
1250 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1251 get_msg_prefix(comm)
1252 << "partitions[" << p[ip] << "]["
1253 << i << "] = " << lcl_row
1254 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1255 << "].");
1256 lclrow(os+i) = lcl_row;
1257 rowidx2part(os+i) = ip;
1258 if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1259 interf.row_contiguous = false;
1260 }
1261 partptr(ip+1) = os + ipnrows;
1262 }
1263 } else {
1264 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1265 const auto* part = &partitions[p[ip]];
1266 const local_ordinal_type ipnrows = part->size();
1267 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1268 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1269 get_msg_prefix(comm)
1270 << "partition " << p[ip]
1271 << " is empty, which is not allowed.");
1272 //assume No overlap.
1273 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1274 // Since parts are ordered in nonincreasing size, the size of the first
1275 // part in a pack is the size for all parts in the pack.
1276 if (ip % vector_length == 0) pack_nrows = ipnrows;
1277 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1278 const local_ordinal_type os = partptr(ip);
1279 for (local_ordinal_type i=0;i<ipnrows;++i) {
1280 const auto lcl_row = (*part)[i];
1281 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1282 get_msg_prefix(comm)
1283 << "partitions[" << p[ip] << "]["
1284 << i << "] = " << lcl_row
1285 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1286 << "].");
1287 lclrow(os+i) = lcl_row;
1288 rowidx2part(os+i) = ip;
1289 if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1290 interf.row_contiguous = false;
1291 }
1292 partptr(ip+1) = os + ipnrows;
1293 }
1294 }
1295#if defined(BLOCKTRIDICONTAINER_DEBUG)
1296 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1297#endif
1298 if (lclrow(0) != 0) interf.row_contiguous = false;
1299
1300 Kokkos::deep_copy(interf.partptr, partptr);
1301 Kokkos::deep_copy(interf.lclrow, lclrow);
1302
1303 //assume No overlap. Thus:
1304 interf.part2rowidx0 = interf.partptr;
1305 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1306
1307 interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
1308 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1309
1310 { // Fill packptr.
1311 local_ordinal_type npacks = 0;
1312 for (local_ordinal_type ip=1;ip<=nparts;++ip)
1313 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1314 ++npacks;
1315 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1316 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1317 packptr(0) = 0;
1318 for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1319 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1320 packptr(k++) = ip;
1321 Kokkos::deep_copy(interf.packptr, packptr);
1322 }
1323
1324 return interf;
1325 }
1326
1330 template <typename MatrixType>
1333 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1334 using size_type_1d_view = typename impl_type::size_type_1d_view;
1335 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1336
1337 // flat_td_ptr(i) is the index into flat-array values of the start of the
1338 // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1339 // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1340 // vector_length is the position in the pack.
1341 size_type_1d_view flat_td_ptr, pack_td_ptr;
1342 // List of local column indices into A from which to grab
1343 // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1344 local_ordinal_type_1d_view A_colindsub;
1345 // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1346 // tridiag's pack, and i % vector_length gives the position in the pack.
1347 vector_type_3d_view values;
1348
1349 bool is_diagonal_only;
1350
1351 BlockTridiags() = default;
1352 BlockTridiags(const BlockTridiags &b) = default;
1353
1354 // Index into row-major block of a tridiag.
1355 template <typename idx_type>
1356 static KOKKOS_FORCEINLINE_FUNCTION
1357 idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1358 // Given a row of a row-major tridiag, return the index of the first block
1359 // in that row.
1360 template <typename idx_type>
1361 static KOKKOS_FORCEINLINE_FUNCTION
1362 idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1363 // Number of blocks in a tridiag having a given number of rows.
1364 template <typename idx_type>
1365 static KOKKOS_FORCEINLINE_FUNCTION
1366 idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1367 };
1368
1369
1373 template<typename MatrixType>
1375 createBlockTridiags(const PartInterface<MatrixType> &interf) {
1376 using impl_type = ImplType<MatrixType>;
1377 using execution_space = typename impl_type::execution_space;
1378 using local_ordinal_type = typename impl_type::local_ordinal_type;
1379 using size_type = typename impl_type::size_type;
1380 using size_type_1d_view = typename impl_type::size_type_1d_view;
1381
1382 constexpr int vector_length = impl_type::vector_length;
1383
1385
1386 const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
1387
1388 { // construct the flat index pointers into the tridiag values array.
1389 btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
1390 const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
1391 Kokkos::parallel_scan
1392 ("createBlockTridiags::RangePolicy::flat_td_ptr",
1393 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1394 if (final)
1395 btdm.flat_td_ptr(i) = update;
1396 if (i < ntridiags) {
1397 const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
1398 update += btdm.NumBlocks(nrows);
1399 }
1400 });
1401
1402 const auto nblocks = Kokkos::create_mirror_view_and_copy
1403 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
1404 btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1405 }
1406
1407 // And the packed index pointers.
1408 if (vector_length == 1) {
1409 btdm.pack_td_ptr = btdm.flat_td_ptr;
1410 } else {
1411 const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
1412 btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
1413 const Kokkos::RangePolicy<execution_space> policy(0,npacks);
1414 Kokkos::parallel_scan
1415 ("createBlockTridiags::RangePolicy::pack_td_ptr",
1416 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1417 const local_ordinal_type parti = interf.packptr(i);
1418 const local_ordinal_type parti_next = interf.packptr(i+1);
1419 if (final) {
1420 const size_type nblks = update;
1421 for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1422 btdm.pack_td_ptr(pti) = nblks;
1423 const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1424 // last one
1425 if (i == npacks-1)
1426 btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1427 }
1428 {
1429 const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1430 update += btdm.NumBlocks(nrows);
1431 }
1432 });
1433 }
1434
1435 // values and A_colindsub are created in the symbolic phase
1436
1437 return btdm;
1438 }
1439
1440 // Set the tridiags to be I to the full pack block size. That way, if a
1441 // tridiag within a pack is shorter than the longest one, the extra blocks are
1442 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1443 // in the packed multvector are 0, and the tridiag LU reflects the extra I
1444 // blocks, then the solve proceeds as though the extra blocks aren't
1445 // present. Since this extra work is part of the SIMD calls, it's not actually
1446 // extra work. Instead, it means we don't have to put checks or masks in, or
1447 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1448 // since the numeric phase fills in only the used entries, leaving these I
1449 // blocks intact.
1450 template<typename MatrixType>
1451 void
1452 setTridiagsToIdentity
1453 (const BlockTridiags<MatrixType>& btdm,
1454 const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1455 {
1456 using impl_type = ImplType<MatrixType>;
1457 using execution_space = typename impl_type::execution_space;
1458 using local_ordinal_type = typename impl_type::local_ordinal_type;
1459 using size_type_1d_view = typename impl_type::size_type_1d_view;
1460
1461 const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1462 const local_ordinal_type blocksize = btdm.values.extent(1);
1463
1464 {
1465 const int vector_length = impl_type::vector_length;
1466 const int internal_vector_length = impl_type::internal_vector_length;
1467
1468 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1469 using internal_vector_type = typename impl_type::internal_vector_type;
1470 using internal_vector_type_4d_view =
1471 typename impl_type::internal_vector_type_4d_view;
1472
1473 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1474 const internal_vector_type_4d_view values
1475 (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1476 btdm.values.extent(0),
1477 btdm.values.extent(1),
1478 btdm.values.extent(2),
1479 vector_length/internal_vector_length);
1480 const local_ordinal_type vector_loop_size = values.extent(3);
1481#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1482 local_ordinal_type total_team_size(0);
1483 if (blocksize <= 5) total_team_size = 32;
1484 else if (blocksize <= 9) total_team_size = 64;
1485 else if (blocksize <= 12) total_team_size = 96;
1486 else if (blocksize <= 16) total_team_size = 128;
1487 else if (blocksize <= 20) total_team_size = 160;
1488 else total_team_size = 160;
1489 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1490 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1491#elif defined(KOKKOS_ENABLE_HIP)
1492 // FIXME: HIP
1493 // These settings might be completely wrong
1494 // will have to do some experiments to decide
1495 // what makes sense on AMD GPUs
1496 local_ordinal_type total_team_size(0);
1497 if (blocksize <= 5) total_team_size = 32;
1498 else if (blocksize <= 9) total_team_size = 64;
1499 else if (blocksize <= 12) total_team_size = 96;
1500 else if (blocksize <= 16) total_team_size = 128;
1501 else if (blocksize <= 20) total_team_size = 160;
1502 else total_team_size = 160;
1503 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1504 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1505#else // Host architecture: team size is always one
1506 const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1507#endif
1508 Kokkos::parallel_for
1509 ("setTridiagsToIdentity::TeamPolicy",
1510 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1511 const local_ordinal_type k = member.league_rank();
1512 const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1513 const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1514 const local_ordinal_type diff = iend - ibeg;
1515 const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1516 const btdm_scalar_type one(1);
1517 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1518 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1519 const local_ordinal_type i = ibeg + ii*3;
1520 for (local_ordinal_type j=0;j<blocksize;++j)
1521 values(i,j,j,v) = one;
1522 });
1523 });
1524 });
1525 }
1526 }
1527
1531 template <typename MatrixType>
1532 struct AmD {
1534 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1535 using size_type_1d_view = typename impl_type::size_type_1d_view;
1536 using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
1537 // rowptr points to the start of each row of A_colindsub.
1538 size_type_1d_view rowptr, rowptr_remote;
1539 // Indices into A's rows giving the blocks to extract. rowptr(i) points to
1540 // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
1541 // where g is A's graph, are the columns AmD uses. If seq_method_, then
1542 // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
1543 // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
1544 // contains the remote ones.
1545 local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
1546
1547 // Currently always true.
1548 bool is_tpetra_block_crs;
1549
1550 // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1551 impl_scalar_type_1d_view_tpetra tpetra_values;
1552
1553 AmD() = default;
1554 AmD(const AmD &b) = default;
1555 };
1556
1560 template<typename MatrixType>
1561 void
1562 performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1563 const PartInterface<MatrixType> &interf,
1565 AmD<MatrixType> &amd,
1566 const bool overlap_communication_and_computation) {
1567 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
1568
1569 using impl_type = ImplType<MatrixType>;
1570 using node_memory_space = typename impl_type::node_memory_space;
1571 using host_execution_space = typename impl_type::host_execution_space;
1572
1573 using local_ordinal_type = typename impl_type::local_ordinal_type;
1574 using global_ordinal_type = typename impl_type::global_ordinal_type;
1575 using size_type = typename impl_type::size_type;
1576 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1577 using size_type_1d_view = typename impl_type::size_type_1d_view;
1578 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1579 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1580
1581 constexpr int vector_length = impl_type::vector_length;
1582
1583 const auto comm = A->getRowMap()->getComm();
1584 const auto& g = A->getCrsGraph();
1585 const auto blocksize = A->getBlockSize();
1586
1587 // mirroring to host
1588 const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1589 const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1590 const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1591 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1592 const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1593
1594 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1595
1596 // find column to row map on host
1597 Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
1598 Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1599 {
1600 const auto rowmap = g.getRowMap();
1601 const auto colmap = g.getColMap();
1602 const auto dommap = g.getDomainMap();
1603 TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1604
1605#if !defined(__CUDA_ARCH__)
1606 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1607 Kokkos::parallel_for
1608 ("performSymbolicPhase::RangePolicy::col2row",
1609 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1610 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1611 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1612 if (dommap->isNodeGlobalElement(gid)) {
1613 const local_ordinal_type lc = colmap->getLocalElement(gid);
1614# if defined(BLOCKTRIDICONTAINER_DEBUG)
1615 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1616 get_msg_prefix(comm) << "GID " << gid
1617 << " gives an invalid local column.");
1618# endif
1619 col2row(lc) = lr;
1620 }
1621 });
1622#endif
1623 }
1624
1625 // construct the D and R graphs in A = D + R.
1626 {
1627 const auto local_graph = g.getLocalGraphHost();
1628 const auto local_graph_rowptr = local_graph.row_map;
1629 TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1630 const auto local_graph_colidx = local_graph.entries;
1631
1632 //assume no overlap.
1633
1634 Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1635 {
1636 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1637 Kokkos::parallel_for
1638 ("performSymbolicPhase::RangePolicy::lclrow2idx",
1639 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1640 lclrow2idx[lclrow(i)] = i;
1641 });
1642 }
1643
1644 // count (block) nnzs in D and R.
1645 typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1646 typename sum_reducer_type::value_type sum_reducer_value;
1647 {
1648 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1649 Kokkos::parallel_reduce
1650 // profiling interface does not work
1651 (//"performSymbolicPhase::RangePolicy::count_nnz",
1652 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1653 // LID -> index.
1654 const local_ordinal_type ri0 = lclrow2idx[lr];
1655 const local_ordinal_type pi0 = rowidx2part(ri0);
1656 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1657 const local_ordinal_type lc = local_graph_colidx(j);
1658 const local_ordinal_type lc2r = col2row[lc];
1659 bool incr_R = false;
1660 do { // breakable
1661 if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1662 incr_R = true;
1663 break;
1664 }
1665 const local_ordinal_type ri = lclrow2idx[lc2r];
1666 const local_ordinal_type pi = rowidx2part(ri);
1667 if (pi != pi0) {
1668 incr_R = true;
1669 break;
1670 }
1671 // Test for being in the tridiag. This is done in index space. In
1672 // LID space, tridiag LIDs in a row are not necessarily related by
1673 // {-1, 0, 1}.
1674 if (ri0 + 1 >= ri && ri0 <= ri + 1)
1675 ++update.v[0]; // D_nnz
1676 else
1677 incr_R = true;
1678 } while (0);
1679 if (incr_R) {
1680 if (lc < nrows) ++update.v[1]; // R_nnz_owned
1681 else ++update.v[2]; // R_nnz_remote
1682 }
1683 }
1684 }, sum_reducer_type(sum_reducer_value));
1685 }
1686 size_type D_nnz = sum_reducer_value.v[0];
1687 size_type R_nnz_owned = sum_reducer_value.v[1];
1688 size_type R_nnz_remote = sum_reducer_value.v[2];
1689
1690 if (!overlap_communication_and_computation) {
1691 R_nnz_owned += R_nnz_remote;
1692 R_nnz_remote = 0;
1693 }
1694
1695 // construct the D graph.
1696 {
1697 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1698
1699 btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1700 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1701
1702#if defined(BLOCKTRIDICONTAINER_DEBUG)
1703 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1704#endif
1705
1706 const local_ordinal_type nparts = partptr.extent(0) - 1;
1707 {
1708 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1709 Kokkos::parallel_for
1710 ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1711 policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1712 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1713 local_ordinal_type offset = 0;
1714 for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1715 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1716 offset = 1;
1717 const local_ordinal_type lr0 = lclrow(ri0);
1718 const size_type j0 = local_graph_rowptr(lr0);
1719 for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1720 const local_ordinal_type lc = local_graph_colidx(j);
1721 const local_ordinal_type lc2r = col2row[lc];
1722 if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
1723 const local_ordinal_type ri = lclrow2idx[lc2r];
1724 const local_ordinal_type pi = rowidx2part(ri);
1725 if (pi != pi0) continue;
1726 if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1727 const local_ordinal_type row_entry = j - j0;
1728 D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1729 }
1730 }
1731 });
1732 }
1733#if defined(BLOCKTRIDICONTAINER_DEBUG)
1734 for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1735 TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1736#endif
1737 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1738
1739 // Allocate values.
1740 {
1741 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1742 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1743 btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1744 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1745 }
1746 }
1747
1748 // Construct the R graph.
1749 {
1750 amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1751 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
1752
1753 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1754 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1755
1756 amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1757 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
1758
1759 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1760 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1761
1762 {
1763 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1764 Kokkos::parallel_for
1765 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1766 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1767 const local_ordinal_type ri0 = lclrow2idx[lr];
1768 const local_ordinal_type pi0 = rowidx2part(ri0);
1769 const size_type j0 = local_graph_rowptr(lr);
1770 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1771 const local_ordinal_type lc = local_graph_colidx(j);
1772 const local_ordinal_type lc2r = col2row[lc];
1773 if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1774 const local_ordinal_type ri = lclrow2idx[lc2r];
1775 const local_ordinal_type pi = rowidx2part(ri);
1776 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1777 continue;
1778 }
1779 }
1780 // exclusive scan will be performed later
1781 if (!overlap_communication_and_computation || lc < nrows) {
1782 ++R_rowptr(lr);
1783 } else {
1784 ++R_rowptr_remote(lr);
1785 }
1786 }
1787 });
1788 }
1789
1790 // exclusive scan
1791 typedef ArrayValueType<size_type,2> update_type;
1792 {
1793 Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1794 Kokkos::parallel_scan
1795 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1796 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1797 update_type &update,
1798 const bool &final) {
1799 update_type val;
1800 val.v[0] = R_rowptr(lr);
1801 if (overlap_communication_and_computation)
1802 val.v[1] = R_rowptr_remote(lr);
1803
1804 if (final) {
1805 R_rowptr(lr) = update.v[0];
1806 if (overlap_communication_and_computation)
1807 R_rowptr_remote(lr) = update.v[1];
1808
1809 if (lr < nrows) {
1810 const local_ordinal_type ri0 = lclrow2idx[lr];
1811 const local_ordinal_type pi0 = rowidx2part(ri0);
1812
1813 size_type cnt_rowptr = R_rowptr(lr);
1814 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1815
1816 const size_type j0 = local_graph_rowptr(lr);
1817 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1818 const local_ordinal_type lc = local_graph_colidx(j);
1819 const local_ordinal_type lc2r = col2row[lc];
1820 if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1821 const local_ordinal_type ri = lclrow2idx[lc2r];
1822 const local_ordinal_type pi = rowidx2part(ri);
1823 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1824 continue;
1825 }
1826 const local_ordinal_type row_entry = j - j0;
1827 if (!overlap_communication_and_computation || lc < nrows)
1828 R_A_colindsub(cnt_rowptr++) = row_entry;
1829 else
1830 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1831 }
1832 }
1833 }
1834 update += val;
1835 });
1836 }
1837 TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1838 Kokkos::deep_copy(amd.rowptr, R_rowptr);
1839 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1840 if (overlap_communication_and_computation) {
1841 TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1842 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1843 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1844 }
1845
1846 // Allocate or view values.
1847 amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
1848
1849 }
1850 }
1851 }
1852
1853
1857 template<typename ArgActiveExecutionMemorySpace>
1859
1860 template<>
1861 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1862 typedef KB::Mode::Serial mode_type;
1863#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1864 typedef KB::Algo::Level3::CompactMKL algo_type;
1865#else
1866 typedef KB::Algo::Level3::Blocked algo_type;
1867#endif
1868 static int recommended_team_size(const int /* blksize */,
1869 const int /* vector_length */,
1870 const int /* internal_vector_length */) {
1871 return 1;
1872 }
1873
1874 };
1875
1876#if defined(KOKKOS_ENABLE_CUDA)
1877 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1878 const int vector_length,
1879 const int internal_vector_length) {
1880 const int vector_size = vector_length/internal_vector_length;
1881 int total_team_size(0);
1882 if (blksize <= 5) total_team_size = 32;
1883 else if (blksize <= 9) total_team_size = 32; // 64
1884 else if (blksize <= 12) total_team_size = 96;
1885 else if (blksize <= 16) total_team_size = 128;
1886 else if (blksize <= 20) total_team_size = 160;
1887 else total_team_size = 160;
1888 return 2*total_team_size/vector_size;
1889 }
1890 template<>
1891 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1892 typedef KB::Mode::Team mode_type;
1893 typedef KB::Algo::Level3::Unblocked algo_type;
1894 static int recommended_team_size(const int blksize,
1895 const int vector_length,
1896 const int internal_vector_length) {
1897 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1898 }
1899 };
1900 template<>
1901 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1902 typedef KB::Mode::Team mode_type;
1903 typedef KB::Algo::Level3::Unblocked algo_type;
1904 static int recommended_team_size(const int blksize,
1905 const int vector_length,
1906 const int internal_vector_length) {
1907 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1908 }
1909 };
1910#endif
1911
1912#if defined(KOKKOS_ENABLE_HIP)
1913 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
1914 const int vector_length,
1915 const int internal_vector_length) {
1916 const int vector_size = vector_length/internal_vector_length;
1917 int total_team_size(0);
1918 if (blksize <= 5) total_team_size = 32;
1919 else if (blksize <= 9) total_team_size = 32; // 64
1920 else if (blksize <= 12) total_team_size = 96;
1921 else if (blksize <= 16) total_team_size = 128;
1922 else if (blksize <= 20) total_team_size = 160;
1923 else total_team_size = 160;
1924 return 2*total_team_size/vector_size;
1925 }
1926 template<>
1927 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
1928 typedef KB::Mode::Team mode_type;
1929 typedef KB::Algo::Level3::Unblocked algo_type;
1930 static int recommended_team_size(const int blksize,
1931 const int vector_length,
1932 const int internal_vector_length) {
1933 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1934 }
1935 };
1936 template<>
1937 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
1938 typedef KB::Mode::Team mode_type;
1939 typedef KB::Algo::Level3::Unblocked algo_type;
1940 static int recommended_team_size(const int blksize,
1941 const int vector_length,
1942 const int internal_vector_length) {
1943 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1944 }
1945 };
1946#endif
1947
1948 template<typename MatrixType>
1949 struct ExtractAndFactorizeTridiags {
1950 public:
1951 using impl_type = ImplType<MatrixType>;
1952 // a functor cannot have both device_type and execution_space; specialization error in kokkos
1953 using execution_space = typename impl_type::execution_space;
1954 using memory_space = typename impl_type::memory_space;
1956 using local_ordinal_type = typename impl_type::local_ordinal_type;
1957 using size_type = typename impl_type::size_type;
1958 using impl_scalar_type = typename impl_type::impl_scalar_type;
1959 using magnitude_type = typename impl_type::magnitude_type;
1961 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1963 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1964 using size_type_1d_view = typename impl_type::size_type_1d_view;
1965 using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
1967 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1968 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
1969 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1970 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
1971 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
1972 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
1973 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
1974
1975 using internal_vector_type = typename impl_type::internal_vector_type;
1976 static constexpr int vector_length = impl_type::vector_length;
1977 static constexpr int internal_vector_length = impl_type::internal_vector_length;
1978
1980 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1981 using member_type = typename team_policy_type::member_type;
1982
1983 private:
1984 // part interface
1985 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1986 const local_ordinal_type max_partsz;
1987 // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
1988 using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
1989 const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
1990 const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
1991 // block tridiags
1992 const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1993 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1994 const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
1995 const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
1996 // shared information
1997 const local_ordinal_type blocksize, blocksize_square;
1998 // diagonal safety
1999 const magnitude_type tiny;
2000 const local_ordinal_type vector_loop_size;
2001 const local_ordinal_type vector_length_value;
2002
2003 public:
2004 ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2005 const PartInterface<MatrixType> &interf_,
2006 const Teuchos::RCP<const block_crs_matrix_type> &A_,
2007 const magnitude_type& tiny_) :
2008 // interface
2009 partptr(interf_.partptr),
2010 lclrow(interf_.lclrow),
2011 packptr(interf_.packptr),
2012 max_partsz(interf_.max_partsz),
2013 // block crs matrix
2014 A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
2015 A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
2016 // block tridiags
2017 pack_td_ptr(btdm_.pack_td_ptr),
2018 flat_td_ptr(btdm_.flat_td_ptr),
2019 A_colindsub(btdm_.A_colindsub),
2020 internal_vector_values((internal_vector_type*)btdm_.values.data(),
2021 btdm_.values.extent(0),
2022 btdm_.values.extent(1),
2023 btdm_.values.extent(2),
2024 vector_length/internal_vector_length),
2025 scalar_values((btdm_scalar_type*)btdm_.values.data(),
2026 btdm_.values.extent(0),
2027 btdm_.values.extent(1),
2028 btdm_.values.extent(2),
2029 vector_length),
2030 blocksize(btdm_.values.extent(1)),
2031 blocksize_square(blocksize*blocksize),
2032 // diagonal weight to avoid zero pivots
2033 tiny(tiny_),
2034 vector_loop_size(vector_length/internal_vector_length),
2035 vector_length_value(vector_length) {}
2036
2037 private:
2038
2039 KOKKOS_INLINE_FUNCTION
2040 void
2041 extract(local_ordinal_type partidx,
2042 local_ordinal_type npacks) const {
2043 const size_type kps = pack_td_ptr(partidx);
2044 local_ordinal_type kfs[vector_length] = {};
2045 local_ordinal_type ri0[vector_length] = {};
2046 local_ordinal_type nrows[vector_length] = {};
2047
2048 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2049 kfs[vi] = flat_td_ptr(partidx);
2050 ri0[vi] = partptr(partidx);
2051 nrows[vi] = partptr(partidx+1) - ri0[vi];
2052 }
2053 for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
2054 for (local_ordinal_type e=0;e<3;++e) {
2055 const impl_scalar_type* block[vector_length] = {};
2056 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2057 const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2058 block[vi] = &A_values(Aj*blocksize_square);
2059 }
2060 const size_type pi = kps + j;
2061 ++j;
2062 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2063 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2064 const auto idx = ii*blocksize + jj;
2065 auto& v = internal_vector_values(pi, ii, jj, 0);
2066 for (local_ordinal_type vi=0;vi<npacks;++vi)
2067 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2068 }
2069 }
2070
2071 if (nrows[0] == 1) break;
2072 if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2073 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2074 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2075 npacks = vi;
2076 break;
2077 }
2078 }
2079 }
2080 }
2081 }
2082
2083 KOKKOS_INLINE_FUNCTION
2084 void
2085 extract(const member_type &member,
2086 const local_ordinal_type &partidxbeg,
2087 const local_ordinal_type &npacks,
2088 const local_ordinal_type &vbeg) const {
2089 local_ordinal_type kfs_vals[internal_vector_length] = {};
2090 local_ordinal_type ri0_vals[internal_vector_length] = {};
2091 local_ordinal_type nrows_vals[internal_vector_length] = {};
2092
2093 const size_type kps = pack_td_ptr(partidxbeg);
2094 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2095 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
2096 ri0_vals[vi] = partptr(partidxbeg+vi);
2097 nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
2098 }
2099
2100 local_ordinal_type j_vals[internal_vector_length] = {};
2101 for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
2102 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2103 const local_ordinal_type nrows = nrows_vals[vi];
2104 if (tr < nrows) {
2105 auto &j = j_vals[vi];
2106 const local_ordinal_type kfs = kfs_vals[vi];
2107 const local_ordinal_type ri0 = ri0_vals[vi];
2108 const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
2109 const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
2110 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2111 const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2112 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2113 const size_type pi = kps + j;
2114 Kokkos::parallel_for
2115 (Kokkos::TeamThreadRange(member,blocksize),
2116 [&](const local_ordinal_type &ii) {
2117 for (local_ordinal_type jj=0;jj<blocksize;++jj)
2118 scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[ii*blocksize + jj]);
2119 });
2120 }
2121 }
2122 }
2123 }
2124 }
2125
2126 template<typename AAViewType,
2127 typename WWViewType>
2128 KOKKOS_INLINE_FUNCTION
2129 void
2130 factorize(const member_type &member,
2131 const local_ordinal_type &i0,
2132 const local_ordinal_type &nrows,
2133 const local_ordinal_type &v,
2134 const AAViewType &AA,
2135 const WWViewType &WW) const {
2136 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
2137 <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2138 typedef default_mode_and_algo_type::mode_type default_mode_type;
2139 typedef default_mode_and_algo_type::algo_type default_algo_type;
2140
2141 // constant
2142 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2143
2144 // subview pattern
2145 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2146 KB::LU<member_type,
2147 default_mode_type,KB::Algo::LU::Unblocked>
2148 ::invoke(member, A , tiny);
2149
2150 if (nrows > 1) {
2151 auto B = A;
2152 auto C = A;
2153 local_ordinal_type i = i0;
2154 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2155 B.assign_data( &AA(i+1,0,0,v) );
2156 KB::Trsm<member_type,
2157 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2158 default_mode_type,default_algo_type>
2159 ::invoke(member, one, A, B);
2160 C.assign_data( &AA(i+2,0,0,v) );
2161 KB::Trsm<member_type,
2162 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2163 default_mode_type,default_algo_type>
2164 ::invoke(member, one, A, C);
2165 A.assign_data( &AA(i+3,0,0,v) );
2166
2167 member.team_barrier();
2168 KB::Gemm<member_type,
2169 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2170 default_mode_type,default_algo_type>
2171 ::invoke(member, -one, C, B, one, A);
2172 KB::LU<member_type,
2173 default_mode_type,KB::Algo::LU::Unblocked>
2174 ::invoke(member, A, tiny);
2175 }
2176 } else {
2177 // for block jacobi invert a matrix here
2178 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2179 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2180 ::invoke(member, A, W);
2181 KB::SetIdentity<member_type,default_mode_type>
2182 ::invoke(member, A);
2183 member.team_barrier();
2184 KB::Trsm<member_type,
2185 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2186 default_mode_type,default_algo_type>
2187 ::invoke(member, one, W, A);
2188 KB::Trsm<member_type,
2189 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2190 default_mode_type,default_algo_type>
2191 ::invoke(member, one, W, A);
2192 }
2193 }
2194
2195 public:
2196
2197 struct ExtractAndFactorizeTag {};
2198
2199 KOKKOS_INLINE_FUNCTION
2200 void
2201 operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
2202 // btdm is packed and sorted from largest one
2203 const local_ordinal_type packidx = member.league_rank();
2204
2205 const local_ordinal_type partidx = packptr(packidx);
2206 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2207 const local_ordinal_type i0 = pack_td_ptr(partidx);
2208 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2209
2210 internal_vector_scratch_type_3d_view
2211 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
2212 if (vector_loop_size == 1) {
2213 extract(partidx, npacks);
2214 factorize(member, i0, nrows, 0, internal_vector_values, WW);
2215 } else {
2216 Kokkos::parallel_for
2217 (Kokkos::ThreadVectorRange(member, vector_loop_size),
2218 [&](const local_ordinal_type &v) {
2219 const local_ordinal_type vbeg = v*internal_vector_length;
2220 if (vbeg < npacks)
2221 extract(member, partidx+vbeg, npacks, vbeg);
2222 // this is not safe if vector loop size is different from vector size of
2223 // the team policy. we always make sure this when constructing the team policy
2224 member.team_barrier();
2225 factorize(member, i0, nrows, v, internal_vector_values, WW);
2226 });
2227 }
2228 }
2229
2230 void run() {
2231 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2232 const local_ordinal_type team_size =
2233 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2234 recommended_team_size(blocksize, vector_length, internal_vector_length);
2235 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
2236 shmem_size(blocksize, blocksize, vector_loop_size);
2237
2238 Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
2239 policy(packptr.extent(0)-1, team_size, vector_loop_size);
2240#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2241 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2242 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
2243#else
2244 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
2245 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2246 policy, *this);
2247#endif
2248 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2249 }
2250
2251 };
2252
2256 template<typename MatrixType>
2257 void
2258 performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
2259 const PartInterface<MatrixType> &interf,
2261 const typename ImplType<MatrixType>::magnitude_type tiny) {
2262 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
2263 ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
2264 function.run();
2265 }
2266
2270 template<typename MatrixType>
2272 public:
2274 using execution_space = typename impl_type::execution_space;
2275 using memory_space = typename impl_type::memory_space;
2276
2277 using local_ordinal_type = typename impl_type::local_ordinal_type;
2278 using impl_scalar_type = typename impl_type::impl_scalar_type;
2279 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2280 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
2281 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2282 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2283 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2284 using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
2285 static constexpr int vector_length = impl_type::vector_length;
2286
2287 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2288
2289 private:
2290 // part interface
2291 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2292 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2293 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2294 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2295 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2296 const local_ordinal_type blocksize;
2297 const local_ordinal_type num_vectors;
2298
2299 // packed multivector output (or input)
2300 vector_type_3d_view packed_multivector;
2301 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
2302
2303 template<typename TagType>
2304 KOKKOS_INLINE_FUNCTION
2305 void copy_multivectors(const local_ordinal_type &j,
2306 const local_ordinal_type &vi,
2307 const local_ordinal_type &pri,
2308 const local_ordinal_type &ri0) const {
2309 for (local_ordinal_type col=0;col<num_vectors;++col)
2310 for (local_ordinal_type i=0;i<blocksize;++i)
2311 packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2312 }
2313
2314 public:
2315
2316 MultiVectorConverter(const PartInterface<MatrixType> &interf,
2317 const vector_type_3d_view &pmv)
2318 : partptr(interf.partptr),
2319 packptr(interf.packptr),
2320 part2packrowidx0(interf.part2packrowidx0),
2321 part2rowidx0(interf.part2rowidx0),
2322 lclrow(interf.lclrow),
2323 blocksize(pmv.extent(1)),
2324 num_vectors(pmv.extent(2)),
2325 packed_multivector(pmv) {}
2326
2327 // TODO:: modify this routine similar to the team level functions
2328 // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
2329 KOKKOS_INLINE_FUNCTION
2330 void
2331 operator() (const local_ordinal_type &packidx) const {
2332 local_ordinal_type partidx = packptr(packidx);
2333 local_ordinal_type npacks = packptr(packidx+1) - partidx;
2334 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2335
2336 local_ordinal_type ri0[vector_length] = {};
2337 local_ordinal_type nrows[vector_length] = {};
2338 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
2339 ri0[v] = part2rowidx0(partidx);
2340 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
2341 }
2342 for (local_ordinal_type j=0;j<nrows[0];++j) {
2343 local_ordinal_type cnt = 1;
2344 for (;cnt<npacks && j!= nrows[cnt];++cnt);
2345 npacks = cnt;
2346 const local_ordinal_type pri = pri0 + j;
2347 for (local_ordinal_type col=0;col<num_vectors;++col)
2348 for (local_ordinal_type i=0;i<blocksize;++i)
2349 for (local_ordinal_type v=0;v<npacks;++v)
2350 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
2351 }
2352 }
2353
2354 KOKKOS_INLINE_FUNCTION
2355 void
2356 operator() (const member_type &member) const {
2357 const local_ordinal_type packidx = member.league_rank();
2358 const local_ordinal_type partidx_begin = packptr(packidx);
2359 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
2360 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
2361 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
2362 const local_ordinal_type partidx = partidx_begin + v;
2363 const local_ordinal_type ri0 = part2rowidx0(partidx);
2364 const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
2365
2366 if (nrows == 1) {
2367 const local_ordinal_type pri = pri0;
2368 for (local_ordinal_type col=0;col<num_vectors;++col) {
2369 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
2370 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
2371 });
2372 }
2373 } else {
2374 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
2375 const local_ordinal_type pri = pri0 + j;
2376 for (local_ordinal_type col=0;col<num_vectors;++col)
2377 for (local_ordinal_type i=0;i<blocksize;++i)
2378 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2379 });
2380 }
2381 });
2382 }
2383
2384 void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
2385 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2386 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
2387
2388 scalar_multivector = scalar_multivector_;
2390#if defined(KOKKOS_ENABLE_CUDA)
2391 const local_ordinal_type vl = vector_length;
2392 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2393 Kokkos::parallel_for
2394 ("MultiVectorConverter::TeamPolicy", policy, *this);
2395#endif
2397#if defined(KOKKOS_ENABLE_HIP)
2398 const local_ordinal_type vl = vector_length;
2399 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2400 Kokkos::parallel_for
2401 ("MultiVectorConverter::TeamPolicy", policy, *this);
2402#endif
2403 } else {
2404#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
2405 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
2406#else
2407 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
2408 Kokkos::parallel_for
2409 ("MultiVectorConverter::RangePolicy", policy, *this);
2410#endif
2411 }
2412 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2413 }
2414 };
2415
2419 template<typename ArgActiveExecutionMemorySpace>
2421
2422 template<>
2423 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2424 typedef KB::Mode::Serial mode_type;
2425 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2426#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2427 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
2428#else
2429 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
2430#endif
2431 static int recommended_team_size(const int /* blksize */,
2432 const int /* vector_length */,
2433 const int /* internal_vector_length */) {
2434 return 1;
2435 }
2436 };
2437
2438#if defined(KOKKOS_ENABLE_CUDA)
2439 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2440 const int vector_length,
2441 const int internal_vector_length) {
2442 const int vector_size = vector_length/internal_vector_length;
2443 int total_team_size(0);
2444 if (blksize <= 5) total_team_size = 32;
2445 else if (blksize <= 9) total_team_size = 32; // 64
2446 else if (blksize <= 12) total_team_size = 96;
2447 else if (blksize <= 16) total_team_size = 128;
2448 else if (blksize <= 20) total_team_size = 160;
2449 else total_team_size = 160;
2450 return total_team_size/vector_size;
2451 }
2452
2453 template<>
2454 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2455 typedef KB::Mode::Team mode_type;
2456 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2457 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2458 static int recommended_team_size(const int blksize,
2459 const int vector_length,
2460 const int internal_vector_length) {
2461 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2462 }
2463 };
2464 template<>
2465 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2466 typedef KB::Mode::Team mode_type;
2467 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2468 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2469 static int recommended_team_size(const int blksize,
2470 const int vector_length,
2471 const int internal_vector_length) {
2472 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2473 }
2474 };
2475#endif
2476
2477#if defined(KOKKOS_ENABLE_HIP)
2478 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
2479 const int vector_length,
2480 const int internal_vector_length) {
2481 const int vector_size = vector_length/internal_vector_length;
2482 int total_team_size(0);
2483 if (blksize <= 5) total_team_size = 32;
2484 else if (blksize <= 9) total_team_size = 32; // 64
2485 else if (blksize <= 12) total_team_size = 96;
2486 else if (blksize <= 16) total_team_size = 128;
2487 else if (blksize <= 20) total_team_size = 160;
2488 else total_team_size = 160;
2489 return total_team_size/vector_size;
2490 }
2491
2492 template<>
2493 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
2494 typedef KB::Mode::Team mode_type;
2495 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2496 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2497 static int recommended_team_size(const int blksize,
2498 const int vector_length,
2499 const int internal_vector_length) {
2500 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2501 }
2502 };
2503 template<>
2504 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
2505 typedef KB::Mode::Team mode_type;
2506 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2507 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2508 static int recommended_team_size(const int blksize,
2509 const int vector_length,
2510 const int internal_vector_length) {
2511 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2512 }
2513 };
2514#endif
2515
2516 template<typename MatrixType>
2517 struct SolveTridiags {
2518 public:
2519 using impl_type = ImplType<MatrixType>;
2520 using execution_space = typename impl_type::execution_space;
2521
2522 using local_ordinal_type = typename impl_type::local_ordinal_type;
2523 using size_type = typename impl_type::size_type;
2524 using impl_scalar_type = typename impl_type::impl_scalar_type;
2525 using magnitude_type = typename impl_type::magnitude_type;
2526 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2527 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2529 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2530 using size_type_1d_view = typename impl_type::size_type_1d_view;
2532 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2533 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2534 //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2535
2536 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2537
2538 using internal_vector_type =typename impl_type::internal_vector_type;
2539 static constexpr int vector_length = impl_type::vector_length;
2540 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2541
2543 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2544 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2545
2547 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2548 using member_type = typename team_policy_type::member_type;
2549
2550 private:
2551 // part interface
2552 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2553 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2554 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2555 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2556
2557 // block tridiags
2558 const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2559
2560 // block tridiags values
2561 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2562 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2563
2564 const local_ordinal_type vector_loop_size;
2565
2566 // copy to multivectors : damping factor and Y_scalar_multivector
2567 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
2568#if defined(__CUDA_ARCH__)
2569 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2570#else
2571 /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2572#endif
2573 const impl_scalar_type df;
2574 const bool compute_diff;
2575
2576 public:
2577 SolveTridiags(const PartInterface<MatrixType> &interf,
2578 const BlockTridiags<MatrixType> &btdm,
2579 const vector_type_3d_view &pmv,
2580 const impl_scalar_type damping_factor,
2581 const bool is_norm_manager_active)
2582 :
2583 // interface
2584 partptr(interf.partptr),
2585 packptr(interf.packptr),
2586 part2packrowidx0(interf.part2packrowidx0),
2587 lclrow(interf.lclrow),
2588 // block tridiags and multivector
2589 pack_td_ptr(btdm.pack_td_ptr),
2590 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2591 btdm.values.extent(0),
2592 btdm.values.extent(1),
2593 btdm.values.extent(2),
2594 vector_length/internal_vector_length),
2595 X_internal_vector_values((internal_vector_type*)pmv.data(),
2596 pmv.extent(0),
2597 pmv.extent(1),
2598 pmv.extent(2),
2599 vector_length/internal_vector_length),
2600 vector_loop_size(vector_length/internal_vector_length),
2601 Y_scalar_multivector(),
2602 Z_scalar_vector(),
2603 df(damping_factor),
2604 compute_diff(is_norm_manager_active)
2605 {}
2606
2607 public:
2608
2610 KOKKOS_INLINE_FUNCTION
2611 void
2612 copyToFlatMultiVector(const member_type &member,
2613 const local_ordinal_type partidxbeg, // partidx for v = 0
2614 const local_ordinal_type npacks,
2615 const local_ordinal_type pri0,
2616 const local_ordinal_type v, // index with a loop of vector_loop_size
2617 const local_ordinal_type blocksize,
2618 const local_ordinal_type num_vectors) const {
2619 const local_ordinal_type vbeg = v*internal_vector_length;
2620 if (vbeg < npacks) {
2621 local_ordinal_type ri0_vals[internal_vector_length] = {};
2622 local_ordinal_type nrows_vals[internal_vector_length] = {};
2623 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2624 const local_ordinal_type partidx = partidxbeg+vv;
2625 ri0_vals[vi] = partptr(partidx);
2626 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2627 }
2628
2629 impl_scalar_type z_partial_sum(0);
2630 if (nrows_vals[0] == 1) {
2631 const local_ordinal_type j=0, pri=pri0;
2632 {
2633 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2634 const local_ordinal_type ri0 = ri0_vals[vi];
2635 const local_ordinal_type nrows = nrows_vals[vi];
2636 if (j < nrows) {
2637 Kokkos::parallel_for
2638 (Kokkos::TeamThreadRange(member, blocksize),
2639 [&](const local_ordinal_type &i) {
2640 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2641 for (local_ordinal_type col=0;col<num_vectors;++col) {
2642 impl_scalar_type &y = Y_scalar_multivector(row,col);
2643 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2644 y += df*yd;
2645
2646 {//if (compute_diff) {
2647 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2648 z_partial_sum += yd_abs*yd_abs;
2649 }
2650 }
2651 });
2652 }
2653 }
2654 }
2655 } else {
2656 Kokkos::parallel_for
2657 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2658 [&](const local_ordinal_type &j) {
2659 const local_ordinal_type pri = pri0 + j;
2660 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2661 const local_ordinal_type ri0 = ri0_vals[vi];
2662 const local_ordinal_type nrows = nrows_vals[vi];
2663 if (j < nrows) {
2664 for (local_ordinal_type col=0;col<num_vectors;++col) {
2665 for (local_ordinal_type i=0;i<blocksize;++i) {
2666 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2667 impl_scalar_type &y = Y_scalar_multivector(row,col);
2668 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2669 y += df*yd;
2670
2671 {//if (compute_diff) {
2672 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2673 z_partial_sum += yd_abs*yd_abs;
2674 }
2675 }
2676 }
2677 }
2678 }
2679 });
2680 }
2681 //if (compute_diff)
2682 Z_scalar_vector(member.league_rank()) += z_partial_sum;
2683 }
2684 }
2685
2689 template<typename WWViewType>
2690 KOKKOS_INLINE_FUNCTION
2691 void
2692 solveSingleVector(const member_type &member,
2693 const local_ordinal_type &blocksize,
2694 const local_ordinal_type &i0,
2695 const local_ordinal_type &r0,
2696 const local_ordinal_type &nrows,
2697 const local_ordinal_type &v,
2698 const WWViewType &WW) const {
2699 typedef SolveTridiagsDefaultModeAndAlgo
2700 <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2701 typedef default_mode_and_algo_type::mode_type default_mode_type;
2702 typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2703
2704 // base pointers
2705 auto A = D_internal_vector_values.data();
2706 auto X = X_internal_vector_values.data();
2707
2708 // constant
2709 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2710 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2711 //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2712
2713 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2714 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2715 const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2716 const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2717 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2718 const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2719
2720 // for multiple rhs
2721 //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2722 //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2723
2724 // move to starting point
2725 A += i0*astep + v;
2726 X += r0*xstep + v;
2727
2728 //for (local_ordinal_type col=0;col<num_vectors;++col)
2729 if (nrows > 1) {
2730 // solve Lx = x
2731 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2732 (default_mode_type,default_algo_type,
2733 member,
2734 KB::Diag::Unit,
2735 blocksize,blocksize,
2736 one,
2737 A, as0, as1,
2738 X, xs0);
2739
2740 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2741 member.team_barrier();
2742 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2743 (default_mode_type,default_algo_type,
2744 member,
2745 blocksize, blocksize,
2746 -one,
2747 A+2*astep, as0, as1,
2748 X, xs0,
2749 one,
2750 X+1*xstep, xs0);
2751 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2752 (default_mode_type,default_algo_type,
2753 member,
2754 KB::Diag::Unit,
2755 blocksize,blocksize,
2756 one,
2757 A+3*astep, as0, as1,
2758 X+1*xstep, xs0);
2759
2760 A += 3*astep;
2761 X += 1*xstep;
2762 }
2763
2764 // solve Ux = x
2765 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2766 (default_mode_type,default_algo_type,
2767 member,
2768 KB::Diag::NonUnit,
2769 blocksize, blocksize,
2770 one,
2771 A, as0, as1,
2772 X, xs0);
2773
2774 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2775 A -= 3*astep;
2776 member.team_barrier();
2777 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2778 (default_mode_type,default_algo_type,
2779 member,
2780 blocksize, blocksize,
2781 -one,
2782 A+1*astep, as0, as1,
2783 X, xs0,
2784 one,
2785 X-1*xstep, xs0);
2786 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2787 (default_mode_type,default_algo_type,
2788 member,
2789 KB::Diag::NonUnit,
2790 blocksize, blocksize,
2791 one,
2792 A, as0, as1,
2793 X-1*xstep,xs0);
2794 X -= 1*xstep;
2795 }
2796 // for multiple rhs
2797 //X += xs1;
2798 } else {
2799 const local_ordinal_type ws0 = WW.stride_0();
2800 auto W = WW.data() + v;
2801 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2802 (default_mode_type,
2803 member, blocksize, X, xs0, W, ws0);
2804 member.team_barrier();
2805 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2806 (default_mode_type,default_algo_type,
2807 member,
2808 blocksize, blocksize,
2809 one,
2810 A, as0, as1,
2811 W, xs0,
2812 zero,
2813 X, xs0);
2814 }
2815 }
2816
2817 template<typename WWViewType>
2818 KOKKOS_INLINE_FUNCTION
2819 void
2820 solveMultiVector(const member_type &member,
2821 const local_ordinal_type &/* blocksize */,
2822 const local_ordinal_type &i0,
2823 const local_ordinal_type &r0,
2824 const local_ordinal_type &nrows,
2825 const local_ordinal_type &v,
2826 const WWViewType &WW) const {
2827 typedef SolveTridiagsDefaultModeAndAlgo
2828 <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2829 typedef default_mode_and_algo_type::mode_type default_mode_type;
2830 typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2831
2832 // constant
2833 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2834 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2835
2836 // subview pattern
2837 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2838 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2839 auto X2 = X1;
2840
2841 local_ordinal_type i = i0, r = r0;
2842
2843
2844 if (nrows > 1) {
2845 // solve Lx = x
2846 KB::Trsm<member_type,
2847 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2848 default_mode_type,default_algo_type>
2849 ::invoke(member, one, A, X1);
2850 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2851 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2852 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2853 member.team_barrier();
2854 KB::Gemm<member_type,
2855 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2856 default_mode_type,default_algo_type>
2857 ::invoke(member, -one, A, X1, one, X2);
2858 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2859 KB::Trsm<member_type,
2860 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2861 default_mode_type,default_algo_type>
2862 ::invoke(member, one, A, X2);
2863 X1.assign_data( X2.data() );
2864 }
2865
2866 // solve Ux = x
2867 KB::Trsm<member_type,
2868 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2869 default_mode_type,default_algo_type>
2870 ::invoke(member, one, A, X1);
2871 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2872 i -= 3;
2873 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2874 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2875 member.team_barrier();
2876 KB::Gemm<member_type,
2877 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2878 default_mode_type,default_algo_type>
2879 ::invoke(member, -one, A, X1, one, X2);
2880
2881 A.assign_data( &D_internal_vector_values(i,0,0,v) );
2882 KB::Trsm<member_type,
2883 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2884 default_mode_type,default_algo_type>
2885 ::invoke(member, one, A, X2);
2886 X1.assign_data( X2.data() );
2887 }
2888 } else {
2889 // matrix is already inverted
2890 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2891 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2892 ::invoke(member, X1, W);
2893 member.team_barrier();
2894 KB::Gemm<member_type,
2895 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2896 default_mode_type,default_algo_type>
2897 ::invoke(member, one, A, W, zero, X1);
2898 }
2899 }
2900
2901 template<int B> struct SingleVectorTag {};
2902 template<int B> struct MultiVectorTag {};
2903
2904 template<int B>
2905 KOKKOS_INLINE_FUNCTION
2906 void
2907 operator() (const SingleVectorTag<B> &, const member_type &member) const {
2908 const local_ordinal_type packidx = member.league_rank();
2909 const local_ordinal_type partidx = packptr(packidx);
2910 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2911 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2912 const local_ordinal_type i0 = pack_td_ptr(partidx);
2913 const local_ordinal_type r0 = part2packrowidx0(partidx);
2914 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2915 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2916 const local_ordinal_type num_vectors = 1;
2917 internal_vector_scratch_type_3d_view
2918 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2919 Kokkos::single(Kokkos::PerTeam(member), [&]() {
2920 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2921 });
2922 Kokkos::parallel_for
2923 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2924 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
2925 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2926 });
2927 }
2928
2929 template<int B>
2930 KOKKOS_INLINE_FUNCTION
2931 void
2932 operator() (const MultiVectorTag<B> &, const member_type &member) const {
2933 const local_ordinal_type packidx = member.league_rank();
2934 const local_ordinal_type partidx = packptr(packidx);
2935 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2936 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2937 const local_ordinal_type i0 = pack_td_ptr(partidx);
2938 const local_ordinal_type r0 = part2packrowidx0(partidx);
2939 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2940 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2941 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2942
2943 internal_vector_scratch_type_3d_view
2944 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
2945 Kokkos::single(Kokkos::PerTeam(member), [&]() {
2946 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2947 });
2948 Kokkos::parallel_for
2949 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2950 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
2951 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2952 });
2953 }
2954
2955 void run(const impl_scalar_type_2d_view_tpetra &Y,
2956 const impl_scalar_type_1d_view &Z) {
2957 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2958 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
2959
2961 this->Y_scalar_multivector = Y;
2962 this->Z_scalar_vector = Z;
2963
2964 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2965 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
2966
2967 const local_ordinal_type team_size =
2968 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2969 recommended_team_size(blocksize, vector_length, internal_vector_length);
2970 const int per_team_scratch = internal_vector_scratch_type_3d_view
2971 ::shmem_size(blocksize, num_vectors, vector_loop_size);
2972
2973#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2974#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2975 if (num_vectors == 1) { \
2976 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2977 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2978 Kokkos::parallel_for \
2979 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2980 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2981 } else { \
2982 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2983 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2984 Kokkos::parallel_for \
2985 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2986 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2987 } break
2988#else
2989#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2990 if (num_vectors == 1) { \
2991 Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2992 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2993 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2994 Kokkos::parallel_for \
2995 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2996 policy, *this); \
2997 } else { \
2998 Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2999 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
3000 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
3001 Kokkos::parallel_for \
3002 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
3003 policy, *this); \
3004 } break
3005#endif
3006 switch (blocksize) {
3007 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
3008 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
3009 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
3010 case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
3011 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
3012 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
3013 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
3014 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
3015 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
3016 default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
3017 }
3018#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
3019
3020 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3021 }
3022 };
3023
3027 static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
3028 const int team_size) {
3029 int total_team_size(0);
3030 if (blksize <= 5) total_team_size = 32;
3031 else if (blksize <= 9) total_team_size = 32; // 64
3032 else if (blksize <= 12) total_team_size = 96;
3033 else if (blksize <= 16) total_team_size = 128;
3034 else if (blksize <= 20) total_team_size = 160;
3035 else total_team_size = 160;
3036 return total_team_size/team_size;
3037 }
3038
3039 static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
3040 const int team_size) {
3041 int total_team_size(0);
3042 if (blksize <= 5) total_team_size = 32;
3043 else if (blksize <= 9) total_team_size = 32; // 64
3044 else if (blksize <= 12) total_team_size = 96;
3045 else if (blksize <= 16) total_team_size = 128;
3046 else if (blksize <= 20) total_team_size = 160;
3047 else total_team_size = 160;
3048 return total_team_size/team_size;
3049 }
3050
3051 template<typename MatrixType>
3052 struct ComputeResidualVector {
3053 public:
3054 using impl_type = ImplType<MatrixType>;
3055 using node_device_type = typename impl_type::node_device_type;
3056 using execution_space = typename impl_type::execution_space;
3057 using memory_space = typename impl_type::memory_space;
3058
3059 using local_ordinal_type = typename impl_type::local_ordinal_type;
3060 using size_type = typename impl_type::size_type;
3061 using impl_scalar_type = typename impl_type::impl_scalar_type;
3062 using magnitude_type = typename impl_type::magnitude_type;
3063 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3064 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3066 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3067 using size_type_1d_view = typename impl_type::size_type_1d_view;
3068 using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
3069 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3070 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
3071 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3072 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3073 static constexpr int vector_length = impl_type::vector_length;
3074
3076 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3077
3078 // enum for max blocksize and vector length
3079 enum : int { max_blocksize = 32 };
3080
3081 private:
3082 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
3083 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
3084 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
3085 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
3086 Unmanaged<vector_type_3d_view> y_packed;
3087 Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
3088
3089 // AmD information
3090 const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
3091 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
3092 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
3093
3094 // block crs graph information
3095 // for cuda (kokkos crs graph uses a different size_type from size_t)
3096 const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
3097 const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
3098
3099 // blocksize
3100 const local_ordinal_type blocksize_requested;
3101
3102 // part interface
3103 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3104 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3105 const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
3106 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3107 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3108 const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
3109 const bool is_dm2cm_active;
3110
3111 public:
3112 template<typename LocalCrsGraphType>
3113 ComputeResidualVector(const AmD<MatrixType> &amd,
3114 const LocalCrsGraphType &graph,
3115 const local_ordinal_type &blocksize_requested_,
3116 const PartInterface<MatrixType> &interf,
3117 const local_ordinal_type_1d_view &dm2cm_)
3118 : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
3119 colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
3120 tpetra_values(amd.tpetra_values),
3121 A_rowptr(graph.row_map),
3122 A_colind(graph.entries),
3123 blocksize_requested(blocksize_requested_),
3124 part2packrowidx0(interf.part2packrowidx0),
3125 part2rowidx0(interf.part2rowidx0),
3126 rowidx2part(interf.rowidx2part),
3127 partptr(interf.partptr),
3128 lclrow(interf.lclrow),
3129 dm2cm(dm2cm_),
3130 is_dm2cm_active(dm2cm_.span() > 0)
3131 {}
3132
3133 inline
3134 void
3135 SerialGemv(const local_ordinal_type &blocksize,
3136 const impl_scalar_type * const KOKKOS_RESTRICT AA,
3137 const impl_scalar_type * const KOKKOS_RESTRICT xx,
3138 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
3139 for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3140 impl_scalar_type val = 0;
3141 const local_ordinal_type offset = k0*blocksize;
3142#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
3143# pragma ivdep
3144#endif
3145#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
3146# pragma unroll
3147#endif
3148 for (local_ordinal_type k1=0;k1<blocksize;++k1)
3149 val += AA[offset+k1]*xx[k1];
3150 yy[k0] -= val;
3151 }
3152 }
3153
3154 template<typename bbViewType, typename yyViewType>
3155 KOKKOS_INLINE_FUNCTION
3156 void
3157 VectorCopy(const member_type &member,
3158 const local_ordinal_type &blocksize,
3159 const bbViewType &bb,
3160 const yyViewType &yy) const {
3161 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
3162 yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
3163 });
3164 }
3165
3166 template<typename AAViewType, typename xxViewType, typename yyViewType>
3167 KOKKOS_INLINE_FUNCTION
3168 void
3169 TeamVectorGemv(const member_type &member,
3170 const local_ordinal_type &blocksize,
3171 const AAViewType &AA,
3172 const xxViewType &xx,
3173 const yyViewType &yy) const {
3174 Kokkos::parallel_for
3175 (Kokkos::TeamThreadRange(member, blocksize),
3176 [&](const local_ordinal_type &k0) {
3177 impl_scalar_type val = 0;
3178 Kokkos::parallel_for
3179 (Kokkos::ThreadVectorRange(member, blocksize),
3180 [&](const local_ordinal_type &k1) {
3181 val += AA(k0,k1)*xx(k1);
3182 });
3183 Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3184 });
3185 }
3186
3187 template<typename AAViewType, typename xxViewType, typename yyViewType>
3188 KOKKOS_INLINE_FUNCTION
3189 void
3190 VectorGemv(const member_type &member,
3191 const local_ordinal_type &blocksize,
3192 const AAViewType &AA,
3193 const xxViewType &xx,
3194 const yyViewType &yy) const {
3195 Kokkos::parallel_for
3196 (Kokkos::ThreadVectorRange(member, blocksize),
3197 [&](const local_ordinal_type &k0) {
3198 impl_scalar_type val(0);
3199 for (local_ordinal_type k1=0;k1<blocksize;++k1) {
3200 val += AA(k0,k1)*xx(k1);
3201 }
3202 Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3203 });
3204 }
3205
3206 // template<typename AAViewType, typename xxViewType, typename yyViewType>
3207 // KOKKOS_INLINE_FUNCTION
3208 // void
3209 // VectorGemv(const member_type &member,
3210 // const local_ordinal_type &blocksize,
3211 // const AAViewType &AA,
3212 // const xxViewType &xx,
3213 // const yyViewType &yy) const {
3214 // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3215 // impl_scalar_type val = 0;
3216 // Kokkos::parallel_for
3217 // (Kokkos::ThreadVectorRange(member, blocksize),
3218 // [&](const local_ordinal_type &k1) {
3219 // val += AA(k0,k1)*xx(k1);
3220 // });
3221 // Kokkos::atomic_fetch_add(&yy(k0), -val);
3222 // }
3223 // }
3224
3225 struct SeqTag {};
3226
3227 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3228 KOKKOS_INLINE_FUNCTION
3229 void
3230 operator() (const SeqTag &, const local_ordinal_type& i) const {
3231 const local_ordinal_type blocksize = blocksize_requested;
3232 const local_ordinal_type blocksize_square = blocksize*blocksize;
3233
3234 // constants
3235 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3236 const local_ordinal_type num_vectors = y.extent(1);
3237 const local_ordinal_type row = i*blocksize;
3238 for (local_ordinal_type col=0;col<num_vectors;++col) {
3239 // y := b
3240 impl_scalar_type *yy = &y(row, col);
3241 const impl_scalar_type * const bb = &b(row, col);
3242 memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
3243
3244 // y -= Rx
3245 const size_type A_k0 = A_rowptr[i];
3246 for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
3247 const size_type j = A_k0 + colindsub[k];
3248 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3249 const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
3250 SerialGemv(blocksize,AA,xx,yy);
3251 }
3252 }
3253 }
3254
3255 KOKKOS_INLINE_FUNCTION
3256 void
3257 operator() (const SeqTag &, const member_type &member) const {
3258
3259 // constants
3260 const local_ordinal_type blocksize = blocksize_requested;
3261 const local_ordinal_type blocksize_square = blocksize*blocksize;
3262
3263 const local_ordinal_type lr = member.league_rank();
3264 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3265 const local_ordinal_type num_vectors = y.extent(1);
3266
3267 // subview pattern
3268 auto bb = Kokkos::subview(b, block_range, 0);
3269 auto xx = bb;
3270 auto yy = Kokkos::subview(y, block_range, 0);
3271 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3272
3273 const local_ordinal_type row = lr*blocksize;
3274 for (local_ordinal_type col=0;col<num_vectors;++col) {
3275 // y := b
3276 yy.assign_data(&y(row, col));
3277 bb.assign_data(&b(row, col));
3278 if (member.team_rank() == 0)
3279 VectorCopy(member, blocksize, bb, yy);
3280 member.team_barrier();
3281
3282 // y -= Rx
3283 const size_type A_k0 = A_rowptr[lr];
3284 Kokkos::parallel_for
3285 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3286 [&](const local_ordinal_type &k) {
3287 const size_type j = A_k0 + colindsub[k];
3288 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3289 xx.assign_data( &x(A_colind[j]*blocksize, col) );
3290 VectorGemv(member, blocksize, A_block, xx, yy);
3291 });
3292 }
3293 }
3294
3295 template<int B>
3296 struct AsyncTag {};
3297
3298 template<int B>
3299 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3300 KOKKOS_INLINE_FUNCTION
3301 void
3302 operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
3303 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3304 const local_ordinal_type blocksize_square = blocksize*blocksize;
3305
3306 // constants
3307 const local_ordinal_type partidx = rowidx2part(rowidx);
3308 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3309 const local_ordinal_type v = partidx % vector_length;
3310
3311 const local_ordinal_type num_vectors = y_packed.extent(2);
3312 const local_ordinal_type num_local_rows = lclrow.extent(0);
3313
3314 // temporary buffer for y flat
3315 impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
3316
3317 const local_ordinal_type lr = lclrow(rowidx);
3318 const local_ordinal_type row = lr*blocksize;
3319 for (local_ordinal_type col=0;col<num_vectors;++col) {
3320 // y := b
3321 memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3322
3323 // y -= Rx
3324 const size_type A_k0 = A_rowptr[lr];
3325 for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
3326 const size_type j = A_k0 + colindsub[k];
3327 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3328 const local_ordinal_type A_colind_at_j = A_colind[j];
3329 if (A_colind_at_j < num_local_rows) {
3330 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3331 const impl_scalar_type * const xx = &x(loc*blocksize, col);
3332 SerialGemv(blocksize, AA,xx,yy);
3333 } else {
3334 const auto loc = A_colind_at_j - num_local_rows;
3335 const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3336 SerialGemv(blocksize, AA,xx_remote,yy);
3337 }
3338 }
3339 // move yy to y_packed
3340 for (local_ordinal_type k=0;k<blocksize;++k)
3341 y_packed(pri, k, col)[v] = yy[k];
3342 }
3343 }
3344
3345 template<int B>
3346 KOKKOS_INLINE_FUNCTION
3347 void
3348 operator() (const AsyncTag<B> &, const member_type &member) const {
3349 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3350 const local_ordinal_type blocksize_square = blocksize*blocksize;
3351
3352 // constants
3353 const local_ordinal_type rowidx = member.league_rank();
3354 const local_ordinal_type partidx = rowidx2part(rowidx);
3355 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3356 const local_ordinal_type v = partidx % vector_length;
3357
3358 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3359 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3360 const local_ordinal_type num_local_rows = lclrow.extent(0);
3361
3362 // subview pattern
3363 auto bb = Kokkos::subview(b, block_range, 0);
3364 auto xx = Kokkos::subview(x, block_range, 0);
3365 auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
3366 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3367 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3368
3369 const local_ordinal_type lr = lclrow(rowidx);
3370 const local_ordinal_type row = lr*blocksize;
3371 for (local_ordinal_type col=0;col<num_vectors;++col) {
3372 // y := b
3373 bb.assign_data(&b(row, col));
3374 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3375 if (member.team_rank() == 0)
3376 VectorCopy(member, blocksize, bb, yy);
3377 member.team_barrier();
3378
3379 // y -= Rx
3380 const size_type A_k0 = A_rowptr[lr];
3381 Kokkos::parallel_for
3382 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3383 [&](const local_ordinal_type &k) {
3384 const size_type j = A_k0 + colindsub[k];
3385 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3386
3387 const local_ordinal_type A_colind_at_j = A_colind[j];
3388 if (A_colind_at_j < num_local_rows) {
3389 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3390 xx.assign_data( &x(loc*blocksize, col) );
3391 VectorGemv(member, blocksize, A_block, xx, yy);
3392 } else {
3393 const auto loc = A_colind_at_j - num_local_rows;
3394 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3395 VectorGemv(member, blocksize, A_block, xx_remote, yy);
3396 }
3397 });
3398 }
3399 }
3400
3401 template <int P, int B> struct OverlapTag {};
3402
3403 template<int P, int B>
3404 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3405 KOKKOS_INLINE_FUNCTION
3406 void
3407 operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
3408 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3409 const local_ordinal_type blocksize_square = blocksize*blocksize;
3410
3411 // constants
3412 const local_ordinal_type partidx = rowidx2part(rowidx);
3413 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3414 const local_ordinal_type v = partidx % vector_length;
3415
3416 const local_ordinal_type num_vectors = y_packed.extent(2);
3417 const local_ordinal_type num_local_rows = lclrow.extent(0);
3418
3419 // temporary buffer for y flat
3420 impl_scalar_type yy[max_blocksize] = {};
3421
3422 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3423 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3424
3425 const local_ordinal_type lr = lclrow(rowidx);
3426 const local_ordinal_type row = lr*blocksize;
3427 for (local_ordinal_type col=0;col<num_vectors;++col) {
3428 if (P == 0) {
3429 // y := b
3430 memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3431 } else {
3432 // y (temporary) := 0
3433 memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
3434 }
3435
3436 // y -= Rx
3437 const size_type A_k0 = A_rowptr[lr];
3438 for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
3439 const size_type j = A_k0 + colindsub_used[k];
3440 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3441 const local_ordinal_type A_colind_at_j = A_colind[j];
3442 if (P == 0) {
3443 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3444 const impl_scalar_type * const xx = &x(loc*blocksize, col);
3445 SerialGemv(blocksize,AA,xx,yy);
3446 } else {
3447 const auto loc = A_colind_at_j - num_local_rows;
3448 const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3449 SerialGemv(blocksize,AA,xx_remote,yy);
3450 }
3451 }
3452 // move yy to y_packed
3453 if (P == 0) {
3454 for (local_ordinal_type k=0;k<blocksize;++k)
3455 y_packed(pri, k, col)[v] = yy[k];
3456 } else {
3457 for (local_ordinal_type k=0;k<blocksize;++k)
3458 y_packed(pri, k, col)[v] += yy[k];
3459 }
3460 }
3461 }
3462
3463 template<int P, int B>
3464 KOKKOS_INLINE_FUNCTION
3465 void
3466 operator() (const OverlapTag<P,B> &, const member_type &member) const {
3467 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3468 const local_ordinal_type blocksize_square = blocksize*blocksize;
3469
3470 // constants
3471 const local_ordinal_type rowidx = member.league_rank();
3472 const local_ordinal_type partidx = rowidx2part(rowidx);
3473 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3474 const local_ordinal_type v = partidx % vector_length;
3475
3476 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3477 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3478 const local_ordinal_type num_local_rows = lclrow.extent(0);
3479
3480 // subview pattern
3481 auto bb = Kokkos::subview(b, block_range, 0);
3482 auto xx = bb; //Kokkos::subview(x, block_range, 0);
3483 auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3484 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3485 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3486 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3487 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3488
3489 const local_ordinal_type lr = lclrow(rowidx);
3490 const local_ordinal_type row = lr*blocksize;
3491 for (local_ordinal_type col=0;col<num_vectors;++col) {
3492 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3493 if (P == 0) {
3494 // y := b
3495 bb.assign_data(&b(row, col));
3496 if (member.team_rank() == 0)
3497 VectorCopy(member, blocksize, bb, yy);
3498 member.team_barrier();
3499 }
3500
3501 // y -= Rx
3502 const size_type A_k0 = A_rowptr[lr];
3503 Kokkos::parallel_for
3504 (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3505 [&](const local_ordinal_type &k) {
3506 const size_type j = A_k0 + colindsub_used[k];
3507 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3508
3509 const local_ordinal_type A_colind_at_j = A_colind[j];
3510 if (P == 0) {
3511 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3512 xx.assign_data( &x(loc*blocksize, col) );
3513 VectorGemv(member, blocksize, A_block, xx, yy);
3514 } else {
3515 const auto loc = A_colind_at_j - num_local_rows;
3516 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3517 VectorGemv(member, blocksize, A_block, xx_remote, yy);
3518 }
3519 });
3520 }
3521 }
3522
3523 // y = b - Rx; seq method
3524 template<typename MultiVectorLocalViewTypeY,
3525 typename MultiVectorLocalViewTypeB,
3526 typename MultiVectorLocalViewTypeX>
3527 void run(const MultiVectorLocalViewTypeY &y_,
3528 const MultiVectorLocalViewTypeB &b_,
3529 const MultiVectorLocalViewTypeX &x_) {
3530 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3531 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
3532
3533 y = y_; b = b_; x = x_;
3534 if (is_cuda<execution_space>::value) {
3535#if defined(KOKKOS_ENABLE_CUDA)
3536 const local_ordinal_type blocksize = blocksize_requested;
3537 const local_ordinal_type team_size = 8;
3538 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3539 const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3540 Kokkos::parallel_for
3541 ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3542#endif
3543 } else if(is_hip<execution_space>::value) {
3544#if defined(KOKKOS_ENABLE_HIP)
3545 const local_ordinal_type blocksize = blocksize_requested;
3546 const local_ordinal_type team_size = 8;
3547 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3548 const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3549 Kokkos::parallel_for
3550 ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3551#endif
3552 } else {
3553#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3554 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3555#else
3556 const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3557 Kokkos::parallel_for
3558 ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3559#endif
3560 }
3561 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3562 }
3563
3564 // y = b - R (x , x_remote)
3565 template<typename MultiVectorLocalViewTypeB,
3566 typename MultiVectorLocalViewTypeX,
3567 typename MultiVectorLocalViewTypeX_Remote>
3568 void run(const vector_type_3d_view &y_packed_,
3569 const MultiVectorLocalViewTypeB &b_,
3570 const MultiVectorLocalViewTypeX &x_,
3571 const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3572 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3573 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
3574
3575 b = b_; x = x_; x_remote = x_remote_;
3576 if (is_cuda<execution_space>::value) {
3577#if defined(KOKKOS_ENABLE_CUDA)
3578 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3579 y_packed_.extent(0),
3580 y_packed_.extent(1),
3581 y_packed_.extent(2),
3582 vector_length);
3583#endif
3584 } else if (is_hip<execution_space>::value) {
3585#if defined(KOKKOS_ENABLE_HIP)
3586 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3587 y_packed_.extent(0),
3588 y_packed_.extent(1),
3589 y_packed_.extent(2),
3590 vector_length);
3591#endif
3592 } else {
3593 y_packed = y_packed_;
3594 }
3595
3596 if (is_cuda<execution_space>::value) {
3597#if defined(KOKKOS_ENABLE_CUDA)
3598 const local_ordinal_type blocksize = blocksize_requested;
3599 const local_ordinal_type team_size = 8;
3600 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3601 // local_ordinal_type vl_power_of_two = 1;
3602 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3603 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3604 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3605#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3606 const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3607 policy(rowidx2part.extent(0), team_size, vector_size); \
3608 Kokkos::parallel_for \
3609 ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3610 policy, *this); } break
3611 switch (blocksize_requested) {
3612 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3613 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3614 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3615 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3616 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3617 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3618 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3619 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3620 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3621 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3622 }
3623#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3624#endif
3625 } else if (is_hip<execution_space>::value) {
3626#if defined(KOKKOS_ENABLE_HIP)
3627 const local_ordinal_type blocksize = blocksize_requested;
3628 const local_ordinal_type team_size = 8;
3629 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3630 // local_ordinal_type vl_power_of_two = 1;
3631 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3632 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3633 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3634#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3635 const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3636 policy(rowidx2part.extent(0), team_size, vector_size); \
3637 Kokkos::parallel_for \
3638 ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3639 policy, *this); } break
3640 switch (blocksize_requested) {
3641 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3642 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3643 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3644 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3645 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3646 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3647 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3648 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3649 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3650 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3651 }
3652#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3653#endif
3654 } else {
3655#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3656 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3657#else
3658#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3659 const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3660 Kokkos::parallel_for \
3661 ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3662 policy, *this); } break
3663 switch (blocksize_requested) {
3664 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3665 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3666 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3667 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3668 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3669 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3670 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3671 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3672 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3673 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3674 }
3675#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3676#endif
3677 }
3678 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3679 }
3680
3681 // y = b - R (y , y_remote)
3682 template<typename MultiVectorLocalViewTypeB,
3683 typename MultiVectorLocalViewTypeX,
3684 typename MultiVectorLocalViewTypeX_Remote>
3685 void run(const vector_type_3d_view &y_packed_,
3686 const MultiVectorLocalViewTypeB &b_,
3687 const MultiVectorLocalViewTypeX &x_,
3688 const MultiVectorLocalViewTypeX_Remote &x_remote_,
3689 const bool compute_owned) {
3690 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3691 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
3692
3693 b = b_; x = x_; x_remote = x_remote_;
3694 if (is_cuda<execution_space>::value) {
3695#if defined(KOKKOS_ENABLE_CUDA)
3696 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3697 y_packed_.extent(0),
3698 y_packed_.extent(1),
3699 y_packed_.extent(2),
3700 vector_length);
3701#endif
3702 } else if (is_hip<execution_space>::value) {
3703#if defined(KOKKOS_ENABLE_HIP)
3704 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3705 y_packed_.extent(0),
3706 y_packed_.extent(1),
3707 y_packed_.extent(2),
3708 vector_length);
3709#endif
3710 } else {
3711 y_packed = y_packed_;
3712 }
3713
3714 if (is_cuda<execution_space>::value) {
3715#if defined(KOKKOS_ENABLE_CUDA)
3716 const local_ordinal_type blocksize = blocksize_requested;
3717 const local_ordinal_type team_size = 8;
3718 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3719 // local_ordinal_type vl_power_of_two = 1;
3720 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3721 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3722 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3723#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3724 if (compute_owned) { \
3725 const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3726 policy(rowidx2part.extent(0), team_size, vector_size); \
3727 Kokkos::parallel_for \
3728 ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3729 } else { \
3730 const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3731 policy(rowidx2part.extent(0), team_size, vector_size); \
3732 Kokkos::parallel_for \
3733 ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3734 } break
3735 switch (blocksize_requested) {
3736 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3737 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3738 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3739 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3740 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3741 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3742 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3743 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3744 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3745 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3746 }
3747#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3748#endif
3749 } else if (is_hip<execution_space>::value) {
3750#if defined(KOKKOS_ENABLE_HIP)
3751 const local_ordinal_type blocksize = blocksize_requested;
3752 const local_ordinal_type team_size = 8;
3753 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3754 // local_ordinal_type vl_power_of_two = 1;
3755 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3756 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3757 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3758#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3759 if (compute_owned) { \
3760 const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3761 policy(rowidx2part.extent(0), team_size, vector_size); \
3762 Kokkos::parallel_for \
3763 ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3764 } else { \
3765 const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3766 policy(rowidx2part.extent(0), team_size, vector_size); \
3767 Kokkos::parallel_for \
3768 ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3769 } break
3770 switch (blocksize_requested) {
3771 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3772 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3773 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3774 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3775 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3776 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3777 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3778 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3779 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3780 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3781 }
3782#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3783#endif
3784 } else {
3785#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3786 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3787#else
3788#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3789 if (compute_owned) { \
3790 const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3791 policy(0, rowidx2part.extent(0)); \
3792 Kokkos::parallel_for \
3793 ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3794 } else { \
3795 const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3796 policy(0, rowidx2part.extent(0)); \
3797 Kokkos::parallel_for \
3798 ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3799 } break
3800
3801 switch (blocksize_requested) {
3802 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3803 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3804 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3805 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3806 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3807 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3808 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3809 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3810 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3811 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3812 }
3813#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3814#endif
3815 }
3816 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3817 }
3818 };
3819
3820 template<typename MatrixType>
3821 void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3822 /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3823 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3824 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
3825
3826 using impl_type = ImplType<MatrixType>;
3827 using local_ordinal_type = typename impl_type::local_ordinal_type;
3828 using impl_scalar_type = typename impl_type::impl_scalar_type;
3829#if 0
3830 const auto norm2 = KokkosBlas::nrm1(zz);
3831#else
3832 impl_scalar_type norm2(0);
3833 Kokkos::parallel_reduce
3834 ("ReduceMultiVector::Device",
3835 Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3836 KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3837 update += zz(i);
3838 }, norm2);
3839#endif
3840 vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
3841
3842 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3843 }
3844
3848 template<typename MatrixType>
3850 public:
3852 using host_execution_space = typename impl_type::host_execution_space;
3853 using magnitude_type = typename impl_type::magnitude_type;
3854
3855 private:
3856 bool collective_;
3857 int sweep_step_, sweep_step_upper_bound_;
3858#ifdef HAVE_IFPACK2_MPI
3859 MPI_Request mpi_request_;
3860 MPI_Comm comm_;
3861#endif
3862 magnitude_type work_[3];
3863
3864 public:
3865 NormManager() = default;
3866 NormManager(const NormManager &b) = default;
3867 NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3868 sweep_step_ = 1;
3869 sweep_step_upper_bound_ = 1;
3870 collective_ = comm->getSize() > 1;
3871 if (collective_) {
3872#ifdef HAVE_IFPACK2_MPI
3873 const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3874 TEUCHOS_ASSERT( ! mpi_comm.is_null());
3875 comm_ = *mpi_comm->getRawMpiComm();
3876#endif
3877 }
3878 const magnitude_type zero(0), minus_one(-1);
3879 work_[0] = zero;
3880 work_[1] = zero;
3881 work_[2] = minus_one;
3882 }
3883
3884 // Check the norm every sweep_step sweeps.
3885 void setCheckFrequency(const int sweep_step) {
3886 TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3887 sweep_step_upper_bound_ = sweep_step;
3888 sweep_step_ = 1;
3889 }
3890
3891 // Get the buffer into which to store rank-local squared norms.
3892 magnitude_type* getBuffer() { return &work_[0]; }
3893
3894 // Call MPI_Iallreduce to find the global squared norms.
3895 void ireduce(const int sweep, const bool force = false) {
3896 if ( ! force && sweep % sweep_step_) return;
3897
3898 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3899
3900 work_[1] = work_[0];
3901#ifdef HAVE_IFPACK2_MPI
3902 auto send_data = &work_[1];
3903 auto recv_data = &work_[0];
3904 if (collective_) {
3905# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3906 MPI_Iallreduce(send_data, recv_data, 1,
3907 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3908 MPI_SUM, comm_, &mpi_request_);
3909# else
3910 MPI_Allreduce (send_data, recv_data, 1,
3911 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3912 MPI_SUM, comm_);
3913# endif
3914 }
3915#endif
3916 }
3917
3918 // Check if the norm-based termination criterion is met. tol2 is the
3919 // tolerance squared. Sweep is the sweep index. If not every iteration is
3920 // being checked, this function immediately returns false. If a check must
3921 // be done at this iteration, it waits for the reduction triggered by
3922 // ireduce to complete, then checks the global norm against the tolerance.
3923 bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3924 // early return
3925 if (sweep <= 0) return false;
3926
3927 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3928
3929 TEUCHOS_ASSERT(sweep >= 1);
3930 if ( ! force && (sweep - 1) % sweep_step_) return false;
3931 if (collective_) {
3932#ifdef HAVE_IFPACK2_MPI
3933# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3934 MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3935# else
3936 // Do nothing.
3937# endif
3938#endif
3939 }
3940 bool r_val = false;
3941 if (sweep == 1) {
3942 work_[2] = work_[0];
3943 } else {
3944 r_val = (work_[0] < tol2*work_[2]);
3945 }
3946
3947 // adjust sweep step
3948 const auto adjusted_sweep_step = 2*sweep_step_;
3949 if (adjusted_sweep_step < sweep_step_upper_bound_) {
3950 sweep_step_ = adjusted_sweep_step;
3951 } else {
3952 sweep_step_ = sweep_step_upper_bound_;
3953 }
3954 return r_val;
3955 }
3956
3957 // After termination has occurred, finalize the norms for use in
3958 // get_norms{0,final}.
3959 void finalize () {
3960 work_[0] = std::sqrt(work_[0]); // after converged
3961 if (work_[2] >= 0)
3962 work_[2] = std::sqrt(work_[2]); // first norm
3963 // if work_[2] is minus one, then norm is not requested.
3964 }
3965
3966 // Report norms to the caller.
3967 const magnitude_type getNorms0 () const { return work_[2]; }
3968 const magnitude_type getNormsFinal () const { return work_[0]; }
3969 };
3970
3974 template<typename MatrixType>
3975 int
3976 applyInverseJacobi(// importer
3977 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3978 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3979 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3980 const bool overlap_communication_and_computation,
3981 // tpetra interface
3982 const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3983 /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3984 /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3985 /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3986 // local object interface
3987 const PartInterface<MatrixType> &interf, // mesh interface
3988 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3989 const AmD<MatrixType> &amd, // R = A - D
3990 /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3991 /* */ NormManager<MatrixType> &norm_manager,
3992 // preconditioner parameters
3993 const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3994 /* */ bool is_y_zero,
3995 const int max_num_sweeps,
3996 const typename ImplType<MatrixType>::magnitude_type tol,
3997 const int check_tol_every) {
3998 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3999
4000 using impl_type = ImplType<MatrixType>;
4001 using node_memory_space = typename impl_type::node_memory_space;
4002 using local_ordinal_type = typename impl_type::local_ordinal_type;
4003 using size_type = typename impl_type::size_type;
4004 using impl_scalar_type = typename impl_type::impl_scalar_type;
4005 using magnitude_type = typename impl_type::magnitude_type;
4006 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4007 using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4008 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4009 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4010
4011 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4012
4013 // either tpetra importer or async importer must be active
4014 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4015 "Neither Tpetra importer nor Async importer is null.");
4016 // max number of sweeps should be positive number
4017 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4018 "Maximum number of sweeps must be >= 1.");
4019
4020 // const parameters
4021 const bool is_seq_method_requested = !tpetra_importer.is_null();
4022 const bool is_async_importer_active = !async_importer.is_null();
4023 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4024 const magnitude_type tolerance = tol*tol;
4025 const local_ordinal_type blocksize = btdm.values.extent(1);
4026 const local_ordinal_type num_vectors = Y.getNumVectors();
4027 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4028
4029 const impl_scalar_type zero(0.0);
4030
4031 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4032 "The seq method for applyInverseJacobi, " <<
4033 "which in any case is for developer use only, " <<
4034 "does not support norm-based termination.");
4035 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4036 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4037 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4038 std::invalid_argument,
4039 "The seq method for applyInverseJacobi, " <<
4040 "which in any case is for developer use only, " <<
4041 "only supports memory spaces accessible from host.");
4042
4043 // if workspace is needed more, resize it
4044 const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4045 if (work.span() < work_span_required)
4046 work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4047
4048 // construct W
4049 const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4050 if (local_ordinal_type(W.extent(0)) < W_size)
4051 W = impl_scalar_type_1d_view("W", W_size);
4052
4053 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4054 {
4055 if (is_seq_method_requested) {
4056 if (Z.getNumVectors() != Y.getNumVectors())
4057 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4058 } else {
4059 if (is_async_importer_active) {
4060 // create comm data buffer and keep it here
4061 async_importer->createDataBuffer(num_vectors);
4062 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4063 }
4064 }
4065 }
4066
4067 // wrap the workspace with 3d view
4068 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4069 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4070 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4071 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4072 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4073
4074 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4075 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4076 damping_factor, is_norm_manager_active);
4077
4078 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4079 ComputeResidualVector<MatrixType>
4080 compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4081 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4082
4083 // norm manager workspace resize
4084 if (is_norm_manager_active)
4085 norm_manager.setCheckFrequency(check_tol_every);
4086
4087 // iterate
4088 int sweep = 0;
4089 for (;sweep<max_num_sweeps;++sweep) {
4090 {
4091 if (is_y_zero) {
4092 // pmv := x(lclrow)
4093 multivector_converter.run(XX);
4094 } else {
4095 if (is_seq_method_requested) {
4096 // SEQ METHOD IS TESTING ONLY
4097
4098 // y := x - R y
4099 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4100 compute_residual_vector.run(YY, XX, ZZ);
4101
4102 // pmv := y(lclrow).
4103 multivector_converter.run(YY);
4104 } else {
4105 // fused y := x - R y and pmv := y(lclrow);
4106 // real use case does not use overlap comp and comm
4107 if (overlap_communication_and_computation || !is_async_importer_active) {
4108 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4109 compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
4110 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4111 if (is_async_importer_active) async_importer->cancel();
4112 break;
4113 }
4114 if (is_async_importer_active) {
4115 async_importer->syncRecv();
4116 compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
4117 }
4118 } else {
4119 if (is_async_importer_active)
4120 async_importer->syncExchange(YY);
4121 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
4122 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4123 }
4124 }
4125 }
4126 }
4127
4128 // pmv := inv(D) pmv.
4129 {
4130 solve_tridiags.run(YY, W);
4131 }
4132 {
4133 if (is_norm_manager_active) {
4134 // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
4135 reduceVector<MatrixType>(W, norm_manager.getBuffer());
4136 if (sweep + 1 == max_num_sweeps) {
4137 norm_manager.ireduce(sweep, true);
4138 norm_manager.checkDone(sweep + 1, tolerance, true);
4139 } else {
4140 norm_manager.ireduce(sweep);
4141 }
4142 }
4143 }
4144 is_y_zero = false;
4145 }
4146
4147 //sqrt the norms for the caller's use.
4148 if (is_norm_manager_active) norm_manager.finalize();
4149
4150 return sweep;
4151 }
4152
4153
4154 template<typename MatrixType>
4155 struct ImplObject {
4156 using impl_type = ImplType<MatrixType>;
4157 using part_interface_type = PartInterface<MatrixType>;
4158 using block_tridiags_type = BlockTridiags<MatrixType>;
4159 using amd_type = AmD<MatrixType>;
4160 using norm_manager_type = NormManager<MatrixType>;
4161 using async_import_type = AsyncableImport<MatrixType>;
4162
4163 // distructed objects
4164 Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
4165 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
4166 Teuchos::RCP<async_import_type> async_importer;
4167 bool overlap_communication_and_computation;
4168
4169 // copy of Y (mutable to penentrate const)
4170 mutable typename impl_type::tpetra_multivector_type Z;
4171 mutable typename impl_type::impl_scalar_type_1d_view W;
4172
4173 // local objects
4174 part_interface_type part_interface;
4175 block_tridiags_type block_tridiags; // D
4176 amd_type a_minus_d; // R = A - D
4177 mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
4178 mutable norm_manager_type norm_manager;
4179 };
4180
4181 } // namespace BlockTriDiContainerDetails
4182
4183} // namespace Ifpack2
4184
4185#endif
Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:425
BlockTridiags< MatrixType > createBlockTridiags(const PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1375
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1050
void performNumericPhase(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename ImplType< MatrixType >::magnitude_type tiny)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2258
PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::Array< Teuchos::Array< typename ImplType< MatrixType >::local_ordinal_type > > &partitions)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1164
static int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize, const int team_size)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3027
int applyInverseJacobi(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename ImplType< MatrixType >::tpetra_multivector_type &X, typename ImplType< MatrixType >::tpetra_multivector_type &Y, typename ImplType< MatrixType >::tpetra_multivector_type &Z, typename ImplType< MatrixType >::impl_scalar_type_1d_view &W, const PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const AmD< MatrixType > &amd, typename ImplType< MatrixType >::vector_type_1d_view &work, NormManager< MatrixType > &norm_manager, const typename ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3976
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:122
void performSymbolicPhase(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, AmD< MatrixType > &amd, const bool overlap_communication_and_computation)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1562
std::string get_msg_prefix(const CommPtrType &comm)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:228
Preconditioners and smoothers for Tpetra sparse matrices.
Definition: Ifpack2_AdditiveSchwarz_decl.hpp:73
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1532
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:240
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:166
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1331
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:192
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:332
Kokkos::DefaultHostExecutionSpace host_execution_space
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:354
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:345
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:401
KB::Vector< T, l > Vector
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:388
size_t size_type
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:336
node_type::device_type node_device_type
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:359
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2271
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3849
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2420
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:276
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:175
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:183