Tpetra parallel linear algebra Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_MULTIVECTOR_DEF_HPP
41#define TPETRA_MULTIVECTOR_DEF_HPP
42
50
51#include "Tpetra_Core.hpp"
52#include "Tpetra_Util.hpp"
53#include "Tpetra_Vector.hpp"
65#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
66# include "Teuchos_SerialDenseMatrix.hpp"
67#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
68#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
69#include "KokkosCompat_View.hpp"
70#include "KokkosBlas.hpp"
71#include "KokkosKernels_Utils.hpp"
72#include "Kokkos_Random.hpp"
73#include "Kokkos_ArithTraits.hpp"
74#include <memory>
75#include <sstream>
76
77#ifdef HAVE_TPETRA_INST_FLOAT128
78namespace Kokkos {
79 // FIXME (mfh 04 Sep 2015) Just a stub for now!
80 template<class Generator>
81 struct rand<Generator, __float128> {
82 static KOKKOS_INLINE_FUNCTION __float128 max ()
83 {
84 return static_cast<__float128> (1.0);
85 }
86 static KOKKOS_INLINE_FUNCTION __float128
87 draw (Generator& gen)
88 {
89 // Half the smallest normalized double, is the scaling factor of
90 // the lower-order term in the double-double representation.
91 const __float128 scalingFactor =
92 static_cast<__float128> (std::numeric_limits<double>::min ()) /
93 static_cast<__float128> (2.0);
94 const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
95 const __float128 lowerOrderTerm =
96 static_cast<__float128> (gen.drand ()) * scalingFactor;
97 return higherOrderTerm + lowerOrderTerm;
98 }
99 static KOKKOS_INLINE_FUNCTION __float128
100 draw (Generator& gen, const __float128& range)
101 {
102 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
103 const __float128 scalingFactor =
104 static_cast<__float128> (std::numeric_limits<double>::min ()) /
105 static_cast<__float128> (2.0);
106 const __float128 higherOrderTerm =
107 static_cast<__float128> (gen.drand (range));
108 const __float128 lowerOrderTerm =
109 static_cast<__float128> (gen.drand (range)) * scalingFactor;
110 return higherOrderTerm + lowerOrderTerm;
111 }
112 static KOKKOS_INLINE_FUNCTION __float128
113 draw (Generator& gen, const __float128& start, const __float128& end)
114 {
115 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
116 const __float128 scalingFactor =
117 static_cast<__float128> (std::numeric_limits<double>::min ()) /
118 static_cast<__float128> (2.0);
119 const __float128 higherOrderTerm =
120 static_cast<__float128> (gen.drand (start, end));
121 const __float128 lowerOrderTerm =
122 static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
123 return higherOrderTerm + lowerOrderTerm;
124 }
125 };
126} // namespace Kokkos
127#endif // HAVE_TPETRA_INST_FLOAT128
128
129namespace { // (anonymous)
130
145 template<class ST, class LO, class GO, class NT>
147 allocDualView (const size_t lclNumRows,
148 const size_t numCols,
149 const bool zeroOut = true,
150 const bool allowPadding = false)
151 {
152 using ::Tpetra::Details::Behavior;
153 using Kokkos::AllowPadding;
154 using Kokkos::view_alloc;
155 using Kokkos::WithoutInitializing;
156 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
157 typedef typename dual_view_type::t_dev dev_view_type;
158 // This needs to be a string and not a char*, if given as an
159 // argument to Kokkos::view_alloc. This is because view_alloc
160 // also allows a raw pointer as its first argument. See
161 // https://github.com/kokkos/kokkos/issues/434.
162 const std::string label ("MV::DualView");
163 const bool debug = Behavior::debug ();
164
165 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
166 // creation of the DualView's Views works around
167 // Kokkos::DualView's current inability to accept an
168 // AllocationProperties initial argument (as Kokkos::View does).
169 // However, the work-around is harmless, since it does what the
170 // (currently nonexistent) equivalent DualView constructor would
171 // have done anyway.
172
173 dev_view_type d_view;
174 if (zeroOut) {
175 if (allowPadding) {
176 d_view = dev_view_type (view_alloc (label, AllowPadding),
177 lclNumRows, numCols);
178 }
179 else {
180 d_view = dev_view_type (view_alloc (label),
181 lclNumRows, numCols);
182 }
183 }
184 else {
185 if (allowPadding) {
186 d_view = dev_view_type (view_alloc (label,
187 WithoutInitializing,
188 AllowPadding),
189 lclNumRows, numCols);
190 }
191 else {
192 d_view = dev_view_type (view_alloc (label, WithoutInitializing),
193 lclNumRows, numCols);
194 }
195 if (debug) {
196 // Filling with NaN is a cheap and effective way to tell if
197 // downstream code is trying to use a MultiVector's data
198 // without them having been initialized. ArithTraits lets us
199 // call nan() even if the scalar type doesn't define it; it
200 // just returns some undefined value in the latter case. This
201 // won't hurt anything because by setting zeroOut=false, users
202 // already agreed that they don't care about the contents of
203 // the MultiVector.
204 const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
205 KokkosBlas::fill (d_view, nan);
206 }
207 }
208 if (debug) {
209 TEUCHOS_TEST_FOR_EXCEPTION
210 (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
211 static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
212 "allocDualView: d_view's dimensions actual dimensions do not match "
213 "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
214 d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
215 << ". Please report this bug to the Tpetra developers.");
216 }
217
218 dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
219 // Whether or not the user cares about the initial contents of the
220 // MultiVector, the device and host views are out of sync. We
221 // prefer to work in device memory. The way to ensure this
222 // happens is to mark the device view as modified.
223 dv.modify_device ();
224
225 return dv;
226 }
227
228 // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
229 //
230 // T: The type of the entries of the View.
231 // ExecSpace: The Kokkos execution space.
232 template<class T, class ExecSpace>
233 struct MakeUnmanagedView {
234 // The 'false' part of the branch carefully ensures that this
235 // won't attempt to use a host execution space that hasn't been
236 // initialized. For example, if Kokkos::OpenMP is disabled and
237 // Kokkos::Threads is enabled, the latter is always the default
238 // execution space of Kokkos::HostSpace, even when ExecSpace is
239 // Kokkos::Serial. That's why we go through the trouble of asking
240 // Kokkos::DualView what _its_ space is. That seems to work
241 // around this default execution space issue.
242 //
243 typedef typename Kokkos::Impl::if_c<
244 Kokkos::Impl::SpaceAccessibility<
245 typename ExecSpace::memory_space,
246 Kokkos::HostSpace>::accessible,
247 typename ExecSpace::device_type,
248 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
249 typedef Kokkos::LayoutLeft array_layout;
250 typedef Kokkos::View<T*, array_layout, host_exec_space,
251 Kokkos::MemoryUnmanaged> view_type;
252
253 static view_type getView (const Teuchos::ArrayView<T>& x_in)
254 {
255 const size_t numEnt = static_cast<size_t> (x_in.size ());
256 if (numEnt == 0) {
257 return view_type ();
258 } else {
259 return view_type (x_in.getRawPtr (), numEnt);
260 }
261 }
262 };
263
264 // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
265 // taking a subview of a 0 x N DualView incorrectly always results
266 // in a 0 x 0 DualView.
267 template<class DualViewType>
268 DualViewType
269 takeSubview (const DualViewType& X,
270 const Kokkos::Impl::ALL_t&,
271 const std::pair<size_t, size_t>& colRng)
272 {
273 if (X.extent (0) == 0 && X.extent (1) != 0) {
274 return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
275 }
276 else {
277 return subview (X, Kokkos::ALL (), colRng);
278 }
279 }
280
281 // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
282 // taking a subview of a 0 x N DualView incorrectly always results
283 // in a 0 x 0 DualView.
284 template<class DualViewType>
285 DualViewType
286 takeSubview (const DualViewType& X,
287 const std::pair<size_t, size_t>& rowRng,
288 const std::pair<size_t, size_t>& colRng)
289 {
290 if (X.extent (0) == 0 && X.extent (1) != 0) {
291 return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
292 }
293 else {
294 return subview (X, rowRng, colRng);
295 }
296 }
297
298 template<class DualViewType>
299 size_t
300 getDualViewStride (const DualViewType& dv)
301 {
302 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
303 // method yet, but its Views do.
304 const size_t LDA = dv.d_view.stride (1);
305 const size_t numRows = dv.extent (0);
306
307 if (LDA == 0) {
308 return (numRows == 0) ? size_t (1) : numRows;
309 }
310 else {
311 return LDA;
312 }
313 }
314
315 template<class ViewType>
316 size_t
317 getViewStride (const ViewType& view)
318 {
319 const size_t LDA = view.stride (1);
320 const size_t numRows = view.extent (0);
321
322 if (LDA == 0) {
323 return (numRows == 0) ? size_t (1) : numRows;
324 }
325 else {
326 return LDA;
327 }
328 }
329
330 template <class impl_scalar_type, class buffer_device_type>
331 bool
332 runKernelOnHost (
333 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
334 )
335 {
336 if (! imports.need_sync_device ()) {
337 return false; // most up-to-date on device
338 }
339 else { // most up-to-date on host,
340 // but if large enough, worth running on device anyway
341 size_t localLengthThreshold =
343 return imports.extent(0) <= localLengthThreshold;
344 }
345 }
346
347
348 template <class SC, class LO, class GO, class NT>
349 bool
350 runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
351 {
352 if (! X.need_sync_device ()) {
353 return false; // most up-to-date on device
354 }
355 else { // most up-to-date on host
356 // but if large enough, worth running on device anyway
357 size_t localLengthThreshold =
359 return X.getLocalLength () <= localLengthThreshold;
360 }
361 }
362
363 template <class SC, class LO, class GO, class NT>
364 void
365 multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
368 {
369 using namespace Tpetra;
372 using val_type = typename MV::impl_scalar_type;
373 using mag_type = typename MV::mag_type;
374 using dual_view_type = typename MV::dual_view_type;
375
376 auto map = X.getMap ();
377 auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
378 auto whichVecs = getMultiVectorWhichVectors (X);
379 const bool isConstantStride = X.isConstantStride ();
380 const bool isDistributed = X.isDistributed ();
381
382 const bool runOnHost = runKernelOnHost (X);
383 if (runOnHost) {
384 using view_type = typename dual_view_type::t_host;
385 using array_layout = typename view_type::array_layout;
386 using device_type = typename view_type::device_type;
387
388 auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
389 normImpl<val_type, array_layout, device_type,
390 mag_type> (norms, X_lcl, whichNorm, whichVecs,
391 isConstantStride, isDistributed, comm);
392 }
393 else {
394 using view_type = typename dual_view_type::t_dev;
395 using array_layout = typename view_type::array_layout;
396 using device_type = typename view_type::device_type;
397
398 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
399 normImpl<val_type, array_layout, device_type,
400 mag_type> (norms, X_lcl, whichNorm, whichVecs,
401 isConstantStride, isDistributed, comm);
402 }
403 }
404} // namespace (anonymous)
405
406
407namespace Tpetra {
408
409 namespace Details {
410 template <typename DstView, typename SrcView>
411 struct AddAssignFunctor {
412 // This functor would be better as a lambda, but CUDA cannot compile
413 // lambdas in protected functions. It compiles fine with the functor.
414 AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
415
416 KOKKOS_INLINE_FUNCTION void
417 operator () (const size_t k) const { tgt(k) += src(k); }
418
419 DstView tgt;
420 SrcView src;
421 };
422 }
423
424 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
425 bool
427 vectorIndexOutOfRange (const size_t VectorIndex) const {
428 return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
429 }
430
431 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
433 MultiVector () :
434 base_type (Teuchos::rcp (new map_type ()))
435 {}
436
437 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
439 MultiVector (const Teuchos::RCP<const map_type>& map,
440 const size_t numVecs,
441 const bool zeroOut) : /* default is true */
442 base_type (map)
443 {
444 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
445
446 const size_t lclNumRows = this->getLocalLength ();
447 view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
450 }
451
452 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
455 const Teuchos::DataAccess copyOrView) :
456 base_type (source),
457 view_ (source.view_),
458 origView_ (source.origView_),
459 owningView_ (source.owningView_),
460 whichVectors_ (source.whichVectors_)
461 {
463 const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
464 "const Teuchos::DataAccess): ";
465
466 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
467
468 if (copyOrView == Teuchos::Copy) {
469 // Reuse the conveniently already existing function that creates
470 // a deep copy.
471 MV cpy = createCopy (source);
472 this->view_ = cpy.view_;
473 this->origView_ = cpy.origView_;
474 this->owningView_ = cpy.owningView_;
475 this->whichVectors_ = cpy.whichVectors_;
476 }
477 else if (copyOrView == Teuchos::View) {
478 }
479 else {
480 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
481 true, std::invalid_argument, "Second argument 'copyOrView' has an "
482 "invalid value " << copyOrView << ". Valid values include "
483 "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
484 << Teuchos::View << ".");
485 }
486 }
487
488 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
490 MultiVector (const Teuchos::RCP<const map_type>& map,
491 const dual_view_type& view) :
492 base_type (map),
493 view_ (view),
494 origView_ (view),
495 owningView_ (view)
496 {
497 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
498 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
499 map->getNodeNumElements ();
500 const size_t lclNumRows_view = view.extent (0);
501 const size_t LDA = getDualViewStride (view);
502
503 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
504 (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
505 std::invalid_argument, "Kokkos::DualView does not match Map. "
506 "map->getNodeNumElements() = " << lclNumRows_map
507 << ", view.extent(0) = " << lclNumRows_view
508 << ", and getStride() = " << LDA << ".");
509
510 using ::Tpetra::Details::Behavior;
511 const bool debug = Behavior::debug ();
512 if (debug) {
513 using ::Tpetra::Details::checkGlobalDualViewValidity;
514 std::ostringstream gblErrStrm;
515 const bool verbose = Behavior::verbose ();
516 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
517 const bool gblValid =
518 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
519 comm.getRawPtr ());
520 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
521 (! gblValid, std::runtime_error, gblErrStrm.str ());
522 }
523 }
524
525
526 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
528 MultiVector (const Teuchos::RCP<const map_type>& map,
529 const typename dual_view_type::t_dev& d_view) :
530 base_type (map)
532 using Teuchos::ArrayRCP;
533 using Teuchos::RCP;
534 const char tfecfFuncName[] = "MultiVector(map,d_view): ";
535
536 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
537
538 const size_t LDA = getViewStride (d_view);
539 const size_t lclNumRows = map->getNodeNumElements ();
540 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
541 (LDA < lclNumRows, std::invalid_argument, "Map does not match "
542 "Kokkos::View. map->getNodeNumElements() = " << lclNumRows
543 << ", View's column stride = " << LDA
544 << ", and View's extent(0) = " << d_view.extent (0) << ".");
545
546 auto h_view = Kokkos::create_mirror_view (d_view);
547 view_ = dual_view_type (d_view, h_view);
548
549 using ::Tpetra::Details::Behavior;
550 const bool debug = Behavior::debug ();
551 if (debug) {
552 using ::Tpetra::Details::checkGlobalDualViewValidity;
553 std::ostringstream gblErrStrm;
554 const bool verbose = Behavior::verbose ();
555 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
556 const bool gblValid =
557 checkGlobalDualViewValidity (&gblErrStrm, view_, verbose,
558 comm.getRawPtr ());
559 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
560 (! gblValid, std::runtime_error, gblErrStrm.str ());
561 }
562 // The user gave us a device view. In order to respect its
563 // initial contents, we mark the DualView as "modified on device."
564 // That way, the next sync will synchronize it with the host view.
565 view_.modify_device ();
566 origView_ = view_;
567 owningView_ = view_;
568 }
569
570 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
572 MultiVector (const Teuchos::RCP<const map_type>& map,
573 const dual_view_type& view,
574 const dual_view_type& origView) :
575 base_type (map),
576 view_ (view),
577 origView_ (origView),
578 owningView_ (origView)
579 {
580 const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
581
582 const size_t LDA = getDualViewStride (origView);
583 const size_t lclNumRows = this->getLocalLength (); // comes from the Map
584 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
585 LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
586 "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
587 << ". This may also mean that the input origView's first dimension (number "
588 "of rows = " << origView.extent (0) << ") does not not match the number "
589 "of entries in the Map on the calling process.");
590
591 using ::Tpetra::Details::Behavior;
592 const bool debug = Behavior::debug ();
593 if (debug) {
594 using ::Tpetra::Details::checkGlobalDualViewValidity;
595 std::ostringstream gblErrStrm;
596 const bool verbose = Behavior::verbose ();
597 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
598 const bool gblValid_0 =
599 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
600 comm.getRawPtr ());
601 const bool gblValid_1 =
602 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
603 comm.getRawPtr ());
604 const bool gblValid = gblValid_0 && gblValid_1;
605 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
606 (! gblValid, std::runtime_error, gblErrStrm.str ());
607 }
608 }
609
610
611 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
613 MultiVector (const Teuchos::RCP<const map_type>& map,
614 const dual_view_type& view,
615 const Teuchos::ArrayView<const size_t>& whichVectors) :
616 base_type (map),
617 view_ (view),
618 origView_ (view),
619 owningView_ (view),
620 whichVectors_ (whichVectors.begin (), whichVectors.end ())
621 {
622 using Kokkos::ALL;
623 using Kokkos::subview;
624 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
625
626 using ::Tpetra::Details::Behavior;
627 const bool debug = Behavior::debug ();
628 if (debug) {
629 using ::Tpetra::Details::checkGlobalDualViewValidity;
630 std::ostringstream gblErrStrm;
631 const bool verbose = Behavior::verbose ();
632 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
633 const bool gblValid =
634 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
635 comm.getRawPtr ());
636 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
637 (! gblValid, std::runtime_error, gblErrStrm.str ());
638 }
639
640 const size_t lclNumRows = map.is_null () ? size_t (0) :
641 map->getNodeNumElements ();
642 // Check dimensions of the input DualView. We accept that Kokkos
643 // might not allow construction of a 0 x m (Dual)View with m > 0,
644 // so we only require the number of rows to match if the
645 // (Dual)View has more than zero columns. Likewise, we only
646 // require the number of columns to match if the (Dual)View has
647 // more than zero rows.
648 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
649 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
650 std::invalid_argument, "view.extent(0) = " << view.extent (0)
651 << " < map->getNodeNumElements() = " << lclNumRows << ".");
652 if (whichVectors.size () != 0) {
653 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
654 view.extent (1) != 0 && view.extent (1) == 0,
655 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
656 " = " << whichVectors.size () << " > 0.");
657 size_t maxColInd = 0;
658 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
659 for (size_type k = 0; k < whichVectors.size (); ++k) {
660 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
661 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
662 std::invalid_argument, "whichVectors[" << k << "] = "
663 "Teuchos::OrdinalTraits<size_t>::invalid().");
664 maxColInd = std::max (maxColInd, whichVectors[k]);
665 }
666 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
667 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
668 std::invalid_argument, "view.extent(1) = " << view.extent (1)
669 << " <= max(whichVectors) = " << maxColInd << ".");
670 }
671
672 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
673 // zero strides, so modify in that case.
674 const size_t LDA = getDualViewStride (view);
675 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
676 (LDA < lclNumRows, std::invalid_argument,
677 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
678
679 if (whichVectors.size () == 1) {
680 // If whichVectors has only one entry, we don't need to bother
681 // with nonconstant stride. Just shift the view over so it
682 // points to the desired column.
683 //
684 // NOTE (mfh 10 May 2014) This is a special case where we set
685 // origView_ just to view that one column, not all of the
686 // original columns. This ensures that the use of origView_ in
687 // offsetView works correctly.
688 //
689 // However, owningView_ is not a subview.
690 const std::pair<size_t, size_t> colRng (whichVectors[0],
691 whichVectors[0] + 1);
692 origView_ = view_;
693 view_ = takeSubview (view_, ALL (), colRng);
694 // whichVectors_.size() == 0 means "constant stride."
695 whichVectors_.clear ();
696 }
697 }
698
699 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
701 MultiVector (const Teuchos::RCP<const map_type>& map,
702 const dual_view_type& view,
703 const dual_view_type& origView,
704 const Teuchos::ArrayView<const size_t>& whichVectors) :
705 base_type (map),
706 view_ (view),
707 origView_ (origView),
708 owningView_ (origView),
709 whichVectors_ (whichVectors.begin (), whichVectors.end ())
710 {
711 using Kokkos::ALL;
712 using Kokkos::subview;
713 const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
714
715 using ::Tpetra::Details::Behavior;
716 const bool debug = Behavior::debug ();
717 if (debug) {
718 using ::Tpetra::Details::checkGlobalDualViewValidity;
719 std::ostringstream gblErrStrm;
720 const bool verbose = Behavior::verbose ();
721 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
722 const bool gblValid_0 =
723 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
724 comm.getRawPtr ());
725 const bool gblValid_1 =
726 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
727 comm.getRawPtr ());
728 const bool gblValid = gblValid_0 && gblValid_1;
729 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
730 (! gblValid, std::runtime_error, gblErrStrm.str ());
731 }
732
733 const size_t lclNumRows = this->getLocalLength ();
734 // Check dimensions of the input DualView. We accept that Kokkos
735 // might not allow construction of a 0 x m (Dual)View with m > 0,
736 // so we only require the number of rows to match if the
737 // (Dual)View has more than zero columns. Likewise, we only
738 // require the number of columns to match if the (Dual)View has
739 // more than zero rows.
740 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
741 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
742 std::invalid_argument, "view.extent(0) = " << view.extent (0)
743 << " < map->getNodeNumElements() = " << lclNumRows << ".");
744 if (whichVectors.size () != 0) {
745 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
746 view.extent (1) != 0 && view.extent (1) == 0,
747 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
748 " = " << whichVectors.size () << " > 0.");
749 size_t maxColInd = 0;
750 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
751 for (size_type k = 0; k < whichVectors.size (); ++k) {
752 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
753 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
754 std::invalid_argument, "whichVectors[" << k << "] = "
755 "Teuchos::OrdinalTraits<size_t>::invalid().");
756 maxColInd = std::max (maxColInd, whichVectors[k]);
757 }
758 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
759 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
760 std::invalid_argument, "view.extent(1) = " << view.extent (1)
761 << " <= max(whichVectors) = " << maxColInd << ".");
762 }
763
764 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
765 // zero strides, so modify in that case.
766 const size_t LDA = getDualViewStride (origView);
767 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
768 (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
769 "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
770 lclNumRows << ". origView.extent(0) = " << origView.extent(0)
771 << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
772
773 if (whichVectors.size () == 1) {
774 // If whichVectors has only one entry, we don't need to bother
775 // with nonconstant stride. Just shift the view over so it
776 // points to the desired column.
777 //
778 // NOTE (mfh 10 May 2014) This is a special case where we set
779 // origView_ just to view that one column, not all of the
780 // original columns. This ensures that the use of origView_ in
781 // offsetView works correctly.
782 const std::pair<size_t, size_t> colRng (whichVectors[0],
783 whichVectors[0] + 1);
784 view_ = takeSubview (view_, ALL (), colRng);
785 origView_ = takeSubview (origView_, ALL (), colRng);
786 // whichVectors_.size() == 0 means "constant stride."
787 whichVectors_.clear ();
788 }
789 }
790
791 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
793 MultiVector (const Teuchos::RCP<const map_type>& map,
794 const Teuchos::ArrayView<const Scalar>& data,
795 const size_t LDA,
796 const size_t numVecs) :
797 base_type (map)
798 {
799 typedef LocalOrdinal LO;
800 typedef GlobalOrdinal GO;
801 const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
802 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
803
804 // Deep copy constructor, constant stride (NO whichVectors_).
805 // There is no need for a deep copy constructor with nonconstant stride.
806
807 const size_t lclNumRows =
808 map.is_null () ? size_t (0) : map->getNodeNumElements ();
809 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
810 (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
811 "map->getNodeNumElements() = " << lclNumRows << ".");
812 if (numVecs != 0) {
813 const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
814 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
815 (static_cast<size_t> (data.size ()) < minNumEntries,
816 std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
817 "entries, given the input Map and number of vectors in the MultiVector."
818 " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
819 "map->getNodeNumElements () = " << minNumEntries << ".");
820 }
821
822 this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
823 //Note: X_out will be completely overwritten
824 auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
827
828 // Make an unmanaged host Kokkos::View of the input data. First
829 // create a View (X_in_orig) with the original stride. Then,
830 // create a subview (X_in) with the right number of columns.
831 const impl_scalar_type* const X_in_raw =
832 reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
833 Kokkos::View<const impl_scalar_type**,
834 Kokkos::LayoutLeft,
835 Kokkos::HostSpace,
836 Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
837 const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
838 auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
839
840 // If LDA != X_out's column stride, then we need to copy one
841 // column at a time; Kokkos::deep_copy refuses to work in that
842 // case.
843 const size_t outStride =
844 X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
845 if (LDA == outStride) { // strides are the same; deep_copy once
846 // This only works because MultiVector uses LayoutLeft.
847 // We would need a custom copy functor otherwise.
848 Kokkos::deep_copy (X_out, X_in);
849 }
850 else { // strides differ; copy one column at a time
851 typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
852 out_col_view_type;
853 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
854 in_col_view_type;
855 for (size_t j = 0; j < numVecs; ++j) {
856 out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
857 in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
858 Kokkos::deep_copy (X_out_j, X_in_j);
859 }
860 }
861 }
862
863 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
865 MultiVector (const Teuchos::RCP<const map_type>& map,
866 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
867 const size_t numVecs) :
868 base_type (map)
869 {
870 typedef impl_scalar_type IST;
871 typedef LocalOrdinal LO;
872 typedef GlobalOrdinal GO;
873 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
874 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
875
876 const size_t lclNumRows =
877 map.is_null () ? size_t (0) : map->getNodeNumElements ();
878 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
879 (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
880 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
881 "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
882 for (size_t j = 0; j < numVecs; ++j) {
883 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
884 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
885 static_cast<size_t> (X_j_av.size ()) < lclNumRows,
886 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
887 << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
888 << ".");
889 }
890
891 view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
892 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
893
894 // Make sure that the type of a single input column has the same
895 // array layout as each output column does, so we can deep_copy.
896 using array_layout = typename decltype (X_out)::array_layout;
897 using input_col_view_type = typename Kokkos::View<const IST*,
898 array_layout,
899 Kokkos::HostSpace,
900 Kokkos::MemoryUnmanaged>;
901
902 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
903 for (size_t j = 0; j < numVecs; ++j) {
904 Teuchos::ArrayView<const IST> X_j_av =
905 Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
906 input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
907 auto X_j_out = Kokkos::subview (X_out, rowRng, j);
908 Kokkos::deep_copy (X_j_out, X_j_in);
909 }
912 }
913
914 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916 isConstantStride () const {
917 return whichVectors_.empty ();
918 }
919
920 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921 size_t
923 getLocalLength () const
924 {
925 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
926 return static_cast<size_t> (0);
927 } else {
928 return this->getMap ()->getNodeNumElements ();
929 }
930 }
931
932 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
935 getGlobalLength () const
936 {
937 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
938 return static_cast<size_t> (0);
939 } else {
940 return this->getMap ()->getGlobalNumElements ();
941 }
942 }
943
944 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
945 size_t
947 getStride () const
948 {
949 return isConstantStride () ? getDualViewStride (origView_) : size_t (0);
950 }
951
952 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
953 bool
956 {
957 //Don't actually get a view, just get pointers.
958 auto thisData = view_.h_view.data();
959 auto otherData = other.view_.h_view.data();
960 size_t thisSpan = view_.h_view.span();
961 size_t otherSpan = other.view_.h_view.span();
962 return (otherData <= thisData && thisData < otherData + otherSpan)
963 || (thisData <= otherData && otherData < thisData + thisSpan);
964 }
965
966 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
967 bool
969 checkSizes (const SrcDistObject& sourceObj)
970 {
971 // Check whether the source object is a MultiVector. If not, then
972 // we can't even compare sizes, so it's definitely not OK to
973 // Import or Export from it.
975 const MV* src = dynamic_cast<const MV*> (&sourceObj);
976 if (src == nullptr) {
977 return false;
978 }
979 else {
980 // The target of the Import or Export calls checkSizes() in
981 // DistObject::doTransfer(). By that point, we've already
982 // constructed an Import or Export object using the two
983 // multivectors' Maps, which means that (hopefully) we've
984 // already checked other attributes of the multivectors. Thus,
985 // all we need to do here is check the number of columns.
986 return src->getNumVectors () == this->getNumVectors ();
987 }
988 }
989
990 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
991 size_t
994 return this->getNumVectors ();
995 }
996
997 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
998 void
1001 (const SrcDistObject& sourceObj,
1002 const size_t numSameIDs,
1003 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1004 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1005 const CombineMode CM)
1006 {
1007 using ::Tpetra::Details::Behavior;
1009 using ::Tpetra::Details::ProfilingRegion;
1010 using std::endl;
1011 using KokkosRefactor::Details::permute_array_multi_column;
1012 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1013 using Kokkos::Compat::create_const_view;
1015 const char tfecfFuncName[] = "copyAndPermute: ";
1016 ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
1017
1018 const bool verbose = Behavior::verbose ();
1019 std::unique_ptr<std::string> prefix;
1020 if (verbose) {
1021 auto map = this->getMap ();
1022 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1023 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1024 std::ostringstream os;
1025 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1026 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1027 os << "Start" << endl;
1028 std::cerr << os.str ();
1029 }
1030
1031 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1032 (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1033 std::logic_error, "permuteToLIDs.extent(0) = "
1034 << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1035 << permuteFromLIDs.extent (0) << ".");
1036
1037 // We've already called checkSizes(), so this cast must succeed.
1038 MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1039 const size_t numCols = this->getNumVectors ();
1040
1041 // sourceMV doesn't belong to us, so we can't sync it. Do the
1042 // copying where it's currently sync'd.
1043 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1044 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1045 std::logic_error, "Input MultiVector needs sync to both host "
1046 "and device.");
1047 const bool copyOnHost = runKernelOnHost(sourceMV);
1048 if (verbose) {
1049 std::ostringstream os;
1050 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1051 std::cerr << os.str ();
1052 }
1053
1054 if (verbose) {
1055 std::ostringstream os;
1056 os << *prefix << "Copy" << endl;
1057 std::cerr << os.str ();
1058 }
1059
1060 // TODO (mfh 15 Sep 2013) When we replace
1061 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1062 // ways to copy the data:
1063 //
1064 // 1. Get a (sub)view of each column and call deep_copy on that.
1065 // 2. Write a custom kernel to copy the data.
1066 //
1067 // The first is easier, but the second might be more performant in
1068 // case we decide to use layouts other than LayoutLeft. It might
1069 // even make sense to hide whichVectors_ in an entirely new layout
1070 // for Kokkos Views.
1071
1072 // Copy rows [0, numSameIDs-1] of the local multivectors.
1073 //
1074 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1075 // time, even when both multivectors are constant-stride, since
1076 // deep_copy between strided subviews with more than one column
1077 // doesn't currently work.
1078
1079 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1080 // both source and target are constant stride and have multiple
1081 // columns.
1082 if (numSameIDs > 0) {
1083 const std::pair<size_t, size_t> rows (0, numSameIDs);
1084 if (copyOnHost) {
1085 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1086 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1087
1088 for (size_t j = 0; j < numCols; ++j) {
1089 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1090 const size_t srcCol =
1091 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1092
1093 auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1094 auto src_j = Kokkos::subview (src_h, rows, srcCol);
1095 if (CM == ADD_ASSIGN) {
1096 // Sum src_j into tgt_j
1097 using range_t =
1098 Kokkos::RangePolicy<typename Node::execution_space, size_t>;
1099 range_t rp(0,numSameIDs);
1100 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1101 aaf(tgt_j, src_j);
1102 Kokkos::parallel_for(rp, aaf);
1103 }
1104 else {
1105 // Copy src_j into tgt_j
1106 Kokkos::deep_copy (tgt_j, src_j);
1107 }
1108 }
1109 }
1110 else { // copy on device
1111 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1112 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1113
1114 for (size_t j = 0; j < numCols; ++j) {
1115 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1116 const size_t srcCol =
1117 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1118
1119 auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1120 auto src_j = Kokkos::subview (src_d, rows, srcCol);
1121 if (CM == ADD_ASSIGN) {
1122 // Sum src_j into tgt_j
1123 using range_t =
1124 Kokkos::RangePolicy<typename Node::execution_space, size_t>;
1125 range_t rp(0,numSameIDs);
1126 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1127 aaf(tgt_j, src_j);
1128 Kokkos::parallel_for(rp, aaf);
1129 }
1130 else {
1131 // Copy src_j into tgt_j
1132 Kokkos::deep_copy (tgt_j, src_j);
1133 }
1134 }
1135 }
1136 }
1137
1138
1139 // For the remaining GIDs, execute the permutations. This may
1140 // involve noncontiguous access of both source and destination
1141 // vectors, depending on the LID lists.
1142 //
1143 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1144 // the same process, this merges their values by replacement of
1145 // the last encountered GID, not by the specified merge rule
1146 // (such as ADD).
1147
1148 // If there are no permutations, we are done
1149 if (permuteFromLIDs.extent (0) == 0 ||
1150 permuteToLIDs.extent (0) == 0) {
1151 if (verbose) {
1152 std::ostringstream os;
1153 os << *prefix << "No permutations. Done!" << endl;
1154 std::cerr << os.str ();
1155 }
1156 return;
1157 }
1158
1159 if (verbose) {
1160 std::ostringstream os;
1161 os << *prefix << "Permute" << endl;
1162 std::cerr << os.str ();
1163 }
1164
1165 // We could in theory optimize for the case where exactly one of
1166 // them is constant stride, but we don't currently do that.
1167 const bool nonConstStride =
1168 ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1169
1170 if (verbose) {
1171 std::ostringstream os;
1172 os << *prefix << "nonConstStride="
1173 << (nonConstStride ? "true" : "false") << endl;
1174 std::cerr << os.str ();
1175 }
1176
1177 // We only need the "which vectors" arrays if either the source or
1178 // target MV is not constant stride. Since we only have one
1179 // kernel that must do double-duty, we have to create a "which
1180 // vectors" array for the MV that _is_ constant stride.
1181 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1182 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1183 if (nonConstStride) {
1184 if (this->whichVectors_.size () == 0) {
1185 Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1186 tmpTgt.modify_host ();
1187 for (size_t j = 0; j < numCols; ++j) {
1188 tmpTgt.h_view(j) = j;
1189 }
1190 if (! copyOnHost) {
1191 tmpTgt.sync_device ();
1192 }
1193 tgtWhichVecs = tmpTgt;
1194 }
1195 else {
1196 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1197 tgtWhichVecs =
1198 getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1199 "tgtWhichVecs",
1200 copyOnHost);
1201 }
1202 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1203 (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1204 this->getNumVectors (),
1205 std::logic_error, "tgtWhichVecs.extent(0) = " <<
1206 tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1207 this->getNumVectors () << ".");
1209 if (sourceMV.whichVectors_.size () == 0) {
1210 Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1211 tmpSrc.modify_host ();
1212 for (size_t j = 0; j < numCols; ++j) {
1213 tmpSrc.h_view(j) = j;
1214 }
1215 if (! copyOnHost) {
1216 tmpSrc.sync_device ();
1217 }
1218 srcWhichVecs = tmpSrc;
1219 }
1220 else {
1221 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1222 sourceMV.whichVectors_ ();
1223 srcWhichVecs =
1224 getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1225 "srcWhichVecs",
1226 copyOnHost);
1227 }
1228 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1229 (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1230 sourceMV.getNumVectors (), std::logic_error,
1231 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1232 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1233 << ".");
1234 }
1235
1236 if (copyOnHost) { // permute on host too
1237 if (verbose) {
1238 std::ostringstream os;
1239 os << *prefix << "Get permute LIDs on host" << std::endl;
1240 std::cerr << os.str ();
1241 }
1242 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1243 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1244
1245 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1246 auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1247 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1248 auto permuteFromLIDs_h =
1249 create_const_view (permuteFromLIDs.view_host ());
1250
1251 if (verbose) {
1252 std::ostringstream os;
1253 os << *prefix << "Permute on host" << endl;
1254 std::cerr << os.str ();
1255 }
1256 if (nonConstStride) {
1257 // No need to sync first, because copyOnHost argument to
1258 // getDualViewCopyFromArrayView puts them in the right place.
1259 auto tgtWhichVecs_h =
1260 create_const_view (tgtWhichVecs.view_host ());
1261 auto srcWhichVecs_h =
1262 create_const_view (srcWhichVecs.view_host ());
1263 if (CM == ADD_ASSIGN) {
1264 using op_type = KokkosRefactor::Details::AddOp;
1265 permute_array_multi_column_variable_stride (tgt_h, src_h,
1266 permuteToLIDs_h,
1267 permuteFromLIDs_h,
1268 tgtWhichVecs_h,
1269 srcWhichVecs_h, numCols,
1270 op_type());
1271 }
1272 else {
1273 using op_type = KokkosRefactor::Details::InsertOp;
1274 permute_array_multi_column_variable_stride (tgt_h, src_h,
1275 permuteToLIDs_h,
1276 permuteFromLIDs_h,
1277 tgtWhichVecs_h,
1278 srcWhichVecs_h, numCols,
1279 op_type());
1280 }
1281 }
1282 else {
1283 if (CM == ADD_ASSIGN) {
1284 using op_type = KokkosRefactor::Details::AddOp;
1285 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1286 permuteFromLIDs_h, numCols, op_type());
1287 }
1288 else {
1289 using op_type = KokkosRefactor::Details::InsertOp;
1290 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1291 permuteFromLIDs_h, numCols, op_type());
1292 }
1293 }
1294 }
1295 else { // permute on device
1296 if (verbose) {
1297 std::ostringstream os;
1298 os << *prefix << "Get permute LIDs on device" << endl;
1299 std::cerr << os.str ();
1300 }
1301 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1302 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1303
1304 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1305 auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1306 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1307 auto permuteFromLIDs_d =
1308 create_const_view (permuteFromLIDs.view_device ());
1309
1310 if (verbose) {
1311 std::ostringstream os;
1312 os << *prefix << "Permute on device" << endl;
1313 std::cerr << os.str ();
1314 }
1315 if (nonConstStride) {
1316 // No need to sync first, because copyOnHost argument to
1317 // getDualViewCopyFromArrayView puts them in the right place.
1318 auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1319 auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1320 if (CM == ADD_ASSIGN) {
1321 using op_type = KokkosRefactor::Details::AddOp;
1322 permute_array_multi_column_variable_stride (tgt_d, src_d,
1323 permuteToLIDs_d,
1324 permuteFromLIDs_d,
1325 tgtWhichVecs_d,
1326 srcWhichVecs_d, numCols,
1327 op_type());
1328 }
1329 else {
1330 using op_type = KokkosRefactor::Details::InsertOp;
1331 permute_array_multi_column_variable_stride (tgt_d, src_d,
1332 permuteToLIDs_d,
1333 permuteFromLIDs_d,
1334 tgtWhichVecs_d,
1335 srcWhichVecs_d, numCols,
1336 op_type());
1337 }
1338 }
1339 else {
1340 if (CM == ADD_ASSIGN) {
1341 using op_type = KokkosRefactor::Details::AddOp;
1342 permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1343 permuteFromLIDs_d, numCols, op_type());
1344 }
1345 else {
1346 using op_type = KokkosRefactor::Details::InsertOp;
1347 permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1348 permuteFromLIDs_d, numCols, op_type());
1349 }
1350 }
1351 }
1352
1353 if (verbose) {
1354 std::ostringstream os;
1355 os << *prefix << "Done!" << endl;
1356 std::cerr << os.str ();
1357 }
1358 }
1359
1360 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1361 void
1364 (const SrcDistObject& sourceObj,
1365 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1366 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1367 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1368 size_t& constantNumPackets)
1369 {
1370 using ::Tpetra::Details::Behavior;
1371 using ::Tpetra::Details::ProfilingRegion;
1373 using Kokkos::Compat::create_const_view;
1374 using Kokkos::Compat::getKokkosViewDeepCopy;
1375 using std::endl;
1377 const char tfecfFuncName[] = "packAndPrepare: ";
1378 ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1379
1380 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1381 // have the option to check indices. We do so when Tpetra is in
1382 // debug mode. It is in debug mode by default in a debug build,
1383 // but you may control this at run time, before launching the
1384 // executable, by setting the TPETRA_DEBUG environment variable to
1385 // "1" (or "TRUE").
1386 const bool debugCheckIndices = Behavior::debug ();
1387 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1388 // environment variable to "1" (or "TRUE") for copious debug
1389 // output to std::cerr on every MPI process. This is unwise for
1390 // runs with large numbers of MPI processes.
1391 const bool printDebugOutput = Behavior::verbose ();
1392
1393 std::unique_ptr<std::string> prefix;
1394 if (printDebugOutput) {
1395 auto map = this->getMap ();
1396 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1397 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1398 std::ostringstream os;
1399 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1400 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1401 os << "Start" << endl;
1402 std::cerr << os.str ();
1403 }
1404
1405 // We've already called checkSizes(), so this cast must succeed.
1406 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1408 const size_t numCols = sourceMV.getNumVectors ();
1409
1410 // This spares us from needing to fill numExportPacketsPerLID.
1411 // Setting constantNumPackets to a nonzero value signals that
1412 // all packets have the same number of entries.
1413 constantNumPackets = numCols;
1415 // If we have no exports, there is nothing to do. Make sure this
1416 // goes _after_ setting constantNumPackets correctly.
1417 if (exportLIDs.extent (0) == 0) {
1418 if (printDebugOutput) {
1419 std::ostringstream os;
1420 os << *prefix << "No exports on this proc, DONE" << std::endl;
1421 std::cerr << os.str ();
1423 return;
1424 }
1425
1426 /* The layout in the export for MultiVectors is as follows:
1427 exports = { all of the data from row exportLIDs.front() ;
1428 ....
1429 all of the data from row exportLIDs.back() }
1430 This doesn't have the best locality, but is necessary because
1431 the data for a Packet (all data associated with an LID) is
1432 required to be contiguous. */
1433
1434 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1435 // packing scheme in the above comment? The data going to a
1436 // particular process must be contiguous, of course, but those
1437 // data could include entries from multiple LIDs. DistObject just
1438 // needs to know how to index into that data. Kokkos is good at
1439 // decoupling storage intent from data layout choice.
1440
1441 const size_t numExportLIDs = exportLIDs.extent (0);
1442 const size_t newExportsSize = numCols * numExportLIDs;
1443 if (printDebugOutput) {
1444 std::ostringstream os;
1445 os << *prefix << "realloc: "
1446 << "numExportLIDs: " << numExportLIDs
1447 << ", exports.extent(0): " << exports.extent (0)
1448 << ", newExportsSize: " << newExportsSize << std::endl;
1449 std::cerr << os.str ();
1450 }
1451 reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1452
1453 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1454 // sync it. Pack it where it's currently sync'd.
1455 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1456 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1457 std::logic_error, "Input MultiVector needs sync to both host "
1458 "and device.");
1459 const bool packOnHost = runKernelOnHost(sourceMV);
1460 if (printDebugOutput) {
1461 std::ostringstream os;
1462 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1463 std::cerr << os.str ();
1464 }
1465
1466 // Mark 'exports' here, since we might have resized it above.
1467 // Resizing currently requires calling the constructor, which
1468 // clears out the 'modified' flags.
1469 if (packOnHost) {
1470 // nde 06 Feb 2020: If 'exports' does not require resize
1471 // when reallocDualViewIfNeeded is called, the modified flags
1472 // are not cleared out. This can result in host and device views
1473 // being out-of-sync, resuling in an error in exports.modify_* calls.
1474 // Clearing the sync flags prevents this possible case.
1475 exports.clear_sync_state ();
1476 exports.modify_host ();
1477 }
1478 else {
1479 // nde 06 Feb 2020: If 'exports' does not require resize
1480 // when reallocDualViewIfNeeded is called, the modified flags
1481 // are not cleared out. This can result in host and device views
1482 // being out-of-sync, resuling in an error in exports.modify_* calls.
1483 // Clearing the sync flags prevents this possible case.
1484 exports.clear_sync_state ();
1485 exports.modify_device ();
1486 }
1487
1488 if (numCols == 1) { // special case for one column only
1489 // MultiVector always represents a single column with constant
1490 // stride, but it doesn't hurt to implement both cases anyway.
1491 //
1492 // ETP: I'm not sure I agree with the above statement. Can't a single-
1493 // column multivector be a subview of another multi-vector, in which case
1494 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1495 // separately.
1496 //
1497 // mfh 18 Jan 2016: In answer to ETP's comment above:
1498 // MultiVector treats single-column MultiVectors created using a
1499 // "nonconstant stride constructor" as a special case, and makes
1500 // them constant stride (by making whichVectors_ have length 0).
1501 if (sourceMV.isConstantStride ()) {
1502 using KokkosRefactor::Details::pack_array_single_column;
1503 if (printDebugOutput) {
1504 std::ostringstream os;
1505 os << *prefix << "Pack numCols=1 const stride" << endl;
1506 std::cerr << os.str ();
1507 }
1508 if (packOnHost) {
1509 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1510 pack_array_single_column (exports.view_host (),
1511 src_host,
1512 exportLIDs.view_host (),
1513 0,
1514 debugCheckIndices);
1515 }
1516 else { // pack on device
1517 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1518 pack_array_single_column (exports.view_device (),
1519 src_dev,
1520 exportLIDs.view_device (),
1521 0,
1522 debugCheckIndices);
1523 }
1524 }
1525 else {
1526 using KokkosRefactor::Details::pack_array_single_column;
1527 if (printDebugOutput) {
1528 std::ostringstream os;
1529 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1530 std::cerr << os.str ();
1531 }
1532 if (packOnHost) {
1533 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1534 pack_array_single_column (exports.view_host (),
1535 src_host,
1536 exportLIDs.view_host (),
1537 sourceMV.whichVectors_[0],
1538 debugCheckIndices);
1539 }
1540 else { // pack on device
1541 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1542 pack_array_single_column (exports.view_device (),
1543 src_dev,
1544 exportLIDs.view_device (),
1545 sourceMV.whichVectors_[0],
1546 debugCheckIndices);
1547 }
1548 }
1549 }
1550 else { // the source MultiVector has multiple columns
1551 if (sourceMV.isConstantStride ()) {
1552 using KokkosRefactor::Details::pack_array_multi_column;
1553 if (printDebugOutput) {
1554 std::ostringstream os;
1555 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1556 std::cerr << os.str ();
1557 }
1558 if (packOnHost) {
1559 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1560 pack_array_multi_column (exports.view_host (),
1561 src_host,
1562 exportLIDs.view_host (),
1563 numCols,
1564 debugCheckIndices);
1565 }
1566 else { // pack on device
1567 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1568 pack_array_multi_column (exports.view_device (),
1569 src_dev,
1570 exportLIDs.view_device (),
1571 numCols,
1572 debugCheckIndices);
1573 }
1574 }
1575 else {
1576 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1577 if (printDebugOutput) {
1578 std::ostringstream os;
1579 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1580 << endl;
1581 std::cerr << os.str ();
1582 }
1583 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1584 // whichVectors_ can be expensive, but pack and unpack for
1585 // nonconstant-stride MultiVectors is slower anyway.
1586 using IST = impl_scalar_type;
1587 using DV = Kokkos::DualView<IST*, device_type>;
1588 using HES = typename DV::t_host::execution_space;
1589 using DES = typename DV::t_dev::execution_space;
1590 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1591 if (packOnHost) {
1592 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1593 pack_array_multi_column_variable_stride
1594 (exports.view_host (),
1595 src_host,
1596 exportLIDs.view_host (),
1597 getKokkosViewDeepCopy<HES> (whichVecs),
1598 numCols,
1599 debugCheckIndices);
1600 }
1601 else { // pack on device
1602 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1603 pack_array_multi_column_variable_stride
1604 (exports.view_device (),
1605 src_dev,
1606 exportLIDs.view_device (),
1607 getKokkosViewDeepCopy<DES> (whichVecs),
1608 numCols,
1609 debugCheckIndices);
1610 }
1611 }
1612 }
1613
1614 if (printDebugOutput) {
1615 std::ostringstream os;
1616 os << *prefix << "Done!" << endl;
1617 std::cerr << os.str ();
1618 }
1619
1620 }
1621
1622
1623 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1624 template <class NO>
1625 typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1626 typename NO::device_type::memory_space>::value,
1627 bool>::type
1629 reallocImportsIfNeededImpl (const size_t newSize,
1630 const bool verbose,
1631 const std::string* prefix,
1632 const bool areRemoteLIDsContiguous,
1633 const CombineMode CM)
1634 {
1635 // This implementation of reallocImportsIfNeeded is an
1636 // optimization that is specific to MultiVector. We check if the
1637 // imports_ view can be aliased to the end of the data view_. If
1638 // that is the case, we can skip the unpackAndCombine call.
1639
1640 if (verbose) {
1641 std::ostringstream os;
1642 os << *prefix << "Realloc (if needed) imports_ from "
1643 << this->imports_.extent (0) << " to " << newSize << std::endl;
1644 std::cerr << os.str ();
1645 }
1646
1647 bool reallocated = false;
1648
1649 using IST = impl_scalar_type;
1650 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1651
1652 // Conditions for aliasing memory:
1653 // - When both sides of the dual view are in the same memory
1654 // space, we do not need to worry about syncing things.
1655 // - When both memory spaces are different, we only alias if this
1656 // does not incur additional sync'ing.
1657 // - The remote LIDs need to be contiguous, so that we do not need
1658 // to reorder received information.
1659 // - CombineMode needs to be INSERT.
1660 // - The number of vectors needs to be 1, otherwise we need to
1661 // reorder the received data.
1662 if ((dual_view_type::impl_dualview_is_single_device::value ||
1663 (Details::Behavior::assumeMpiIsCudaAware () && !this->need_sync_device()) ||
1664 (!Details::Behavior::assumeMpiIsCudaAware () && !this->need_sync_host())) &&
1665 areRemoteLIDsContiguous &&
1666 (CM == INSERT || CM == REPLACE) &&
1667 (getNumVectors() == 1) &&
1668 (newSize > 0)) {
1669
1670 size_t offset = getLocalLength () - newSize;
1671 reallocated = this->imports_.d_view.data() != view_.d_view.data() + offset;
1672 if (reallocated) {
1673 typedef std::pair<size_t, size_t> range_type;
1674 this->imports_ = DV(view_,
1675 range_type (offset, getLocalLength () ),
1676 0);
1677
1678 if (verbose) {
1679 std::ostringstream os;
1680 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1681 std::cerr << os.str ();
1682 }
1683 }
1684 return reallocated;
1685 }
1686 {
1688 reallocated =
1689 reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1690 if (verbose) {
1691 std::ostringstream os;
1692 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1693 std::cerr << os.str ();
1694 }
1695 this->imports_ = this->unaliased_imports_;
1696 }
1697 return reallocated;
1698 }
1699
1700 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1701 template <class NO>
1702 typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1703 typename NO::device_type::memory_space>::value,
1704 bool>::type
1706 reallocImportsIfNeededImpl (const size_t newSize,
1707 const bool verbose,
1708 const std::string* prefix,
1709 const bool areRemoteLIDsContiguous,
1710 const CombineMode CM)
1711 {
1712 return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1713 }
1714
1715 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1716 bool
1718 reallocImportsIfNeeded(const size_t newSize,
1719 const bool verbose,
1720 const std::string* prefix,
1721 const bool areRemoteLIDsContiguous,
1722 const CombineMode CM) {
1724 return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1725 }
1726
1727 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1728 bool
1731 return (this->imports_.d_view.data() + this->imports_.d_view.extent(0) ==
1732 view_.d_view.data() + view_.d_view.extent(0));
1733 }
1734
1735
1736 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1737 void
1740 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1741 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1742 Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1743 const size_t constantNumPackets,
1744 const CombineMode CM)
1745 {
1746 using ::Tpetra::Details::Behavior;
1747 using ::Tpetra::Details::ProfilingRegion;
1748 using KokkosRefactor::Details::unpack_array_multi_column;
1749 using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1750 using Kokkos::Compat::getKokkosViewDeepCopy;
1751 using std::endl;
1752 const char longFuncName[] = "Tpetra::MultiVector::unpackAndCombine";
1753 const char tfecfFuncName[] = "unpackAndCombine: ";
1754 ProfilingRegion regionUAC (longFuncName);
1755
1756 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1757 // have the option to check indices. We do so when Tpetra is in
1758 // debug mode. It is in debug mode by default in a debug build,
1759 // but you may control this at run time, before launching the
1760 // executable, by setting the TPETRA_DEBUG environment variable to
1761 // "1" (or "TRUE").
1762 const bool debugCheckIndices = Behavior::debug ();
1763
1764 const bool printDebugOutput = Behavior::verbose ();
1765 std::unique_ptr<std::string> prefix;
1766 if (printDebugOutput) {
1767 auto map = this->getMap ();
1768 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1769 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1770 std::ostringstream os;
1771 os << "Proc " << myRank << ": " << longFuncName << ": ";
1772 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1773 os << "Start" << endl;
1774 std::cerr << os.str ();
1775 }
1776
1777 // If we have no imports, there is nothing to do
1778 if (importLIDs.extent (0) == 0) {
1779 if (printDebugOutput) {
1780 std::ostringstream os;
1781 os << *prefix << "No imports. Done!" << endl;
1782 std::cerr << os.str ();
1783 }
1784 return;
1785 }
1786
1787 // Check, whether imports_ is a subview of the MV view.
1788 if (importsAreAliased()) {
1789 if (printDebugOutput) {
1790 std::ostringstream os;
1791 os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1792 std::cerr << os.str ();
1793 }
1794 return;
1795 }
1796
1797
1798 const size_t numVecs = getNumVectors ();
1799 if (debugCheckIndices) {
1800 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1801 (static_cast<size_t> (imports.extent (0)) !=
1802 numVecs * importLIDs.extent (0),
1803 std::runtime_error,
1804 "imports.extent(0) = " << imports.extent (0)
1805 << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1806 << " * " << importLIDs.extent (0) << " = "
1807 << numVecs * importLIDs.extent (0) << ".");
1808
1809 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1810 (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1811 "constantNumPackets input argument must be nonzero.");
1812
1813 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1814 (static_cast<size_t> (numVecs) !=
1815 static_cast<size_t> (constantNumPackets),
1816 std::runtime_error, "constantNumPackets must equal numVecs.");
1817 }
1818
1819 // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1820 // the memory space in which the imports buffer was last modified and
1821 // the size of the imports buffer.
1822 // DistObject::doTransferNew decides where it was last modified (based on
1823 // whether communication buffers used were on host or device).
1824 const bool unpackOnHost = runKernelOnHost(imports);
1825 if (unpackOnHost) {
1826 if (this->imports_.need_sync_host()) this->imports_.sync_host();
1827 }
1828 else {
1829 if (this->imports_.need_sync_device()) this->imports_.sync_device();
1830 }
1831
1832 if (printDebugOutput) {
1833 std::ostringstream os;
1834 os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1835 << endl;
1836 std::cerr << os.str ();
1837 }
1838
1839 // We have to sync before modifying, because this method may read
1840 // as well as write (depending on the CombineMode).
1841 auto imports_d = imports.view_device ();
1842 auto imports_h = imports.view_host ();
1843 auto importLIDs_d = importLIDs.view_device ();
1844 auto importLIDs_h = importLIDs.view_host ();
1845
1846 Kokkos::DualView<size_t*, device_type> whichVecs;
1847 if (! isConstantStride ()) {
1848 Kokkos::View<const size_t*, Kokkos::HostSpace,
1849 Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1850 numVecs);
1851 whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1852 if (unpackOnHost) {
1853 whichVecs.modify_host ();
1854 Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1855 }
1856 else {
1857 whichVecs.modify_device ();
1858 Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1859 }
1860 }
1861 auto whichVecs_d = whichVecs.view_device ();
1862 auto whichVecs_h = whichVecs.view_host ();
1863
1864 /* The layout in the export for MultiVectors is as follows:
1865 imports = { all of the data from row exportLIDs.front() ;
1866 ....
1867 all of the data from row exportLIDs.back() }
1868 This doesn't have the best locality, but is necessary because
1869 the data for a Packet (all data associated with an LID) is
1870 required to be contiguous. */
1871
1872 if (numVecs > 0 && importLIDs.extent (0) > 0) {
1873 using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1874 using host_exec_space = typename dual_view_type::t_host::execution_space;
1875
1876 // This fixes GitHub Issue #4418.
1877 const bool use_atomic_updates = unpackOnHost ?
1878 host_exec_space::concurrency () != 1 :
1879 dev_exec_space::concurrency () != 1;
1880
1881 if (printDebugOutput) {
1882 std::ostringstream os;
1883 os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
1884 std::cerr << os.str ();
1885 }
1886
1887 // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1888 // custom combine modes, start editing here.
1889
1890 if (CM == INSERT || CM == REPLACE) {
1891 using op_type = KokkosRefactor::Details::InsertOp;
1892 if (isConstantStride ()) {
1893 if (unpackOnHost) {
1894 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1895 unpack_array_multi_column (host_exec_space (),
1896 X_h, imports_h, importLIDs_h,
1897 op_type (), numVecs,
1898 use_atomic_updates,
1899 debugCheckIndices);
1900
1901 }
1902 else { // unpack on device
1903 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1904 unpack_array_multi_column (dev_exec_space (),
1905 X_d, imports_d, importLIDs_d,
1906 op_type (), numVecs,
1907 use_atomic_updates,
1908 debugCheckIndices);
1909 }
1910 }
1911 else { // not constant stride
1912 if (unpackOnHost) {
1913 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1914 unpack_array_multi_column_variable_stride (host_exec_space (),
1915 X_h, imports_h,
1916 importLIDs_h,
1917 whichVecs_h,
1918 op_type (),
1919 numVecs,
1920 use_atomic_updates,
1921 debugCheckIndices);
1922 }
1923 else { // unpack on device
1924 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1925 unpack_array_multi_column_variable_stride (dev_exec_space (),
1926 X_d, imports_d,
1927 importLIDs_d,
1928 whichVecs_d,
1929 op_type (),
1930 numVecs,
1931 use_atomic_updates,
1932 debugCheckIndices);
1933 }
1934 }
1935 }
1936 else if (CM == ADD || CM == ADD_ASSIGN) {
1937 using op_type = KokkosRefactor::Details::AddOp;
1938 if (isConstantStride ()) {
1939 if (unpackOnHost) {
1940 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1941 unpack_array_multi_column (host_exec_space (),
1942 X_h, imports_h, importLIDs_h,
1943 op_type (), numVecs,
1944 use_atomic_updates,
1945 debugCheckIndices);
1946 }
1947 else { // unpack on device
1948 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1949 unpack_array_multi_column (dev_exec_space (),
1950 X_d, imports_d, importLIDs_d,
1951 op_type (), numVecs,
1952 use_atomic_updates,
1953 debugCheckIndices);
1954 }
1955 }
1956 else { // not constant stride
1957 if (unpackOnHost) {
1958 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1959 unpack_array_multi_column_variable_stride (host_exec_space (),
1960 X_h, imports_h,
1961 importLIDs_h,
1962 whichVecs_h,
1963 op_type (),
1964 numVecs,
1965 use_atomic_updates,
1966 debugCheckIndices);
1967 }
1968 else { // unpack on device
1969 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1970 unpack_array_multi_column_variable_stride (dev_exec_space (),
1971 X_d, imports_d,
1972 importLIDs_d,
1973 whichVecs_d,
1974 op_type (),
1975 numVecs,
1976 use_atomic_updates,
1977 debugCheckIndices);
1978 }
1979 }
1980 }
1981 else if (CM == ABSMAX) {
1982 using op_type = KokkosRefactor::Details::AbsMaxOp;
1983 if (isConstantStride ()) {
1984 if (unpackOnHost) {
1985 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1986 unpack_array_multi_column (host_exec_space (),
1987 X_h, imports_h, importLIDs_h,
1988 op_type (), numVecs,
1989 use_atomic_updates,
1990 debugCheckIndices);
1991 }
1992 else { // unpack on device
1993 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1994 unpack_array_multi_column (dev_exec_space (),
1995 X_d, imports_d, importLIDs_d,
1996 op_type (), numVecs,
1997 use_atomic_updates,
1998 debugCheckIndices);
1999 }
2000 }
2001 else {
2002 if (unpackOnHost) {
2003 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2004 unpack_array_multi_column_variable_stride (host_exec_space (),
2005 X_h, imports_h,
2006 importLIDs_h,
2007 whichVecs_h,
2008 op_type (),
2009 numVecs,
2010 use_atomic_updates,
2011 debugCheckIndices);
2012 }
2013 else { // unpack on device
2014 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2015 unpack_array_multi_column_variable_stride (dev_exec_space (),
2016 X_d, imports_d,
2017 importLIDs_d,
2018 whichVecs_d,
2019 op_type (),
2020 numVecs,
2021 use_atomic_updates,
2022 debugCheckIndices);
2023 }
2024 }
2025 }
2026 else {
2027 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2028 (true, std::logic_error, "Invalid CombineMode");
2029 }
2030 }
2031 else {
2032 if (printDebugOutput) {
2033 std::ostringstream os;
2034 os << *prefix << "Nothing to unpack" << endl;
2035 std::cerr << os.str ();
2036 }
2037 }
2038
2039 if (printDebugOutput) {
2040 std::ostringstream os;
2041 os << *prefix << "Done!" << endl;
2042 std::cerr << os.str ();
2043 }
2044 }
2045
2046 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2047 size_t
2049 getNumVectors () const
2050 {
2051 if (isConstantStride ()) {
2052 return static_cast<size_t> (view_.extent (1));
2053 } else {
2054 return static_cast<size_t> (whichVectors_.size ());
2055 }
2056 }
2057
2058 namespace { // (anonymous)
2059
2060 template<class RV>
2061 void
2062 gblDotImpl (const RV& dotsOut,
2063 const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2064 const bool distributed)
2066 using Teuchos::REDUCE_MAX;
2067 using Teuchos::REDUCE_SUM;
2068 using Teuchos::reduceAll;
2069 typedef typename RV::non_const_value_type dot_type;
2070
2071 const size_t numVecs = dotsOut.extent (0);
2072
2073 // If the MultiVector is distributed over multiple processes, do
2074 // the distributed (interprocess) part of the dot product. We
2075 // assume that the MPI implementation can read from and write to
2076 // device memory.
2077 //
2078 // replaceMap() may have removed some processes. Those
2079 // processes have a null Map. They must not participate in any
2080 // collective operations. We ask first whether the Map is null,
2081 // because isDistributed() defers that question to the Map. We
2082 // still compute and return local dots for processes not
2083 // participating in collective operations; those probably don't
2084 // make any sense, but it doesn't hurt to do them, since it's
2085 // illegal to call dot() on those processes anyway.
2086 if (distributed && ! comm.is_null ()) {
2087 // The calling process only participates in the collective if
2088 // both the Map and its Comm on that process are nonnull.
2089 const int nv = static_cast<int> (numVecs);
2090 const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2091
2092 if (commIsInterComm) {
2093 // If comm is an intercomm, then we may not alias input and
2094 // output buffers, so we have to make a copy of the local
2095 // sum.
2096 typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2097 Kokkos::deep_copy (lclDots, dotsOut);
2098 const dot_type* const lclSum = lclDots.data ();
2099 dot_type* const gblSum = dotsOut.data ();
2100 reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2102 else {
2103 dot_type* const inout = dotsOut.data ();
2104 reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2105 }
2106 }
2107 }
2108 } // namespace (anonymous)
2109
2110 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2111 void
2114 const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2115 {
2116 using ::Tpetra::Details::Behavior;
2117 using Kokkos::subview;
2118 using Teuchos::Comm;
2119 using Teuchos::null;
2120 using Teuchos::RCP;
2121 // View of all the dot product results.
2122 typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2123 typedef typename dual_view_type::t_dev_const XMV;
2124 const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2125
2126 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2127
2128 const size_t numVecs = this->getNumVectors ();
2129 if (numVecs == 0) {
2130 return; // nothing to do
2131 }
2132 const size_t lclNumRows = this->getLocalLength ();
2133 const size_t numDots = static_cast<size_t> (dots.extent (0));
2134 const bool debug = Behavior::debug ();
2135
2136 if (debug) {
2137 const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2138 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2139 (! compat, std::invalid_argument, "'*this' MultiVector is not "
2140 "compatible with the input MultiVector A. We only test for this "
2141 "in debug mode.");
2142 }
2143
2144 // FIXME (mfh 11 Jul 2014) These exception tests may not
2145 // necessarily be thrown on all processes consistently. We should
2146 // instead pass along error state with the inner product. We
2147 // could do this by setting an extra slot to
2148 // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
2149 // final sum should be
2150 // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2151 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2152 lclNumRows != A.getLocalLength (), std::runtime_error,
2153 "MultiVectors do not have the same local length. "
2154 "this->getLocalLength() = " << lclNumRows << " != "
2155 "A.getLocalLength() = " << A.getLocalLength () << ".");
2156 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2157 numVecs != A.getNumVectors (), std::runtime_error,
2158 "MultiVectors must have the same number of columns (vectors). "
2159 "this->getNumVectors() = " << numVecs << " != "
2160 "A.getNumVectors() = " << A.getNumVectors () << ".");
2161 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2162 numDots != numVecs, std::runtime_error,
2163 "The output array 'dots' must have the same number of entries as the "
2164 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2165 numDots << " != this->getNumVectors() = " << numVecs << ".");
2166
2167 const std::pair<size_t, size_t> colRng (0, numVecs);
2168 RV dotsOut = subview (dots, colRng);
2169 RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2170 this->getMap ()->getComm ();
2171
2172 auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2173 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2174
2175 ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2176 this->whichVectors_.getRawPtr (),
2177 A.whichVectors_.getRawPtr (),
2178 this->isConstantStride (), A.isConstantStride ());
2179 gblDotImpl (dotsOut, comm, this->isDistributed ());
2180 }
2181
2182 namespace { // (anonymous)
2183 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2187 {
2188 using ::Tpetra::Details::ProfilingRegion;
2190 using dot_type = typename MV::dot_type;
2191 ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2192
2193 auto map = x.getMap ();
2194 Teuchos::RCP<const Teuchos::Comm<int> > comm =
2195 map.is_null () ? Teuchos::null : map->getComm ();
2196 if (comm.is_null ()) {
2197 return Kokkos::ArithTraits<dot_type>::zero ();
2198 }
2199 else {
2200 using LO = LocalOrdinal;
2201 // The min just ensures that we don't overwrite memory that
2202 // doesn't belong to us, in the erroneous input case where x
2203 // and y have different numbers of rows.
2204 const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2205 y.getLocalLength ()));
2206 const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2207 dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2208 dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2209
2210 // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2211 //const_cast<MV&>(x).sync_device ();
2212 //const_cast<MV&>(y).sync_device ();
2213
2214 auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2215 auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2216 auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2217 auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2218 lclDot = KokkosBlas::dot (x_1d, y_1d);
2219
2220 if (x.isDistributed ()) {
2221 using Teuchos::outArg;
2222 using Teuchos::REDUCE_SUM;
2223 using Teuchos::reduceAll;
2224 reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2225 }
2226 else {
2227 gblDot = lclDot;
2228 }
2229 return gblDot;
2230 }
2231 }
2232 } // namespace (anonymous)
2233
2234
2236 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2237 void
2240 const Teuchos::ArrayView<dot_type>& dots) const
2241 {
2242 const char tfecfFuncName[] = "dot: ";
2243 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2245 const size_t numVecs = this->getNumVectors ();
2246 const size_t lclNumRows = this->getLocalLength ();
2247 const size_t numDots = static_cast<size_t> (dots.size ());
2248
2249 // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2250 // not necessarily be thrown on all processes consistently. We
2251 // keep them for now, because MultiVector's unit tests insist on
2252 // them. In the future, we should instead pass along error state
2253 // with the inner product. We could do this by setting an extra
2254 // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
2255 // The final sum should be
2256 // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2257 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2258 (lclNumRows != A.getLocalLength (), std::runtime_error,
2259 "MultiVectors do not have the same local length. "
2260 "this->getLocalLength() = " << lclNumRows << " != "
2261 "A.getLocalLength() = " << A.getLocalLength () << ".");
2262 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2263 (numVecs != A.getNumVectors (), std::runtime_error,
2264 "MultiVectors must have the same number of columns (vectors). "
2265 "this->getNumVectors() = " << numVecs << " != "
2266 "A.getNumVectors() = " << A.getNumVectors () << ".");
2267 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2268 (numDots != numVecs, std::runtime_error,
2269 "The output array 'dots' must have the same number of entries as the "
2270 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2271 numDots << " != this->getNumVectors() = " << numVecs << ".");
2272
2273 if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2274 const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2275 dots[0] = gblDot;
2276 }
2277 else {
2278 this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2279 }
2280 }
2281
2282 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2283 void
2285 norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2286 {
2288 using ::Tpetra::Details::NORM_TWO;
2289 using ::Tpetra::Details::ProfilingRegion;
2290 ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2291
2292 // The function needs to be able to sync X.
2293 MV& X = const_cast<MV&> (*this);
2294 multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2295 }
2296
2297 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2298 void
2300 norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2301 {
2302 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2303 this->norm2 (norms_av);
2304 }
2305
2307 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2308 void
2310 norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2311 {
2313 using ::Tpetra::Details::NORM_ONE;
2314 using ::Tpetra::Details::ProfilingRegion;
2315 ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2316
2317 // The function needs to be able to sync X.
2318 MV& X = const_cast<MV&> (*this);
2319 multiVectorNormImpl (norms.data (), X, NORM_ONE);
2320 }
2321
2322 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2323 void
2325 norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2326 {
2327 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2328 this->norm1 (norms_av);
2329 }
2330
2331 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2332 void
2334 normInf (const Teuchos::ArrayView<mag_type>& norms) const
2335 {
2337 using ::Tpetra::Details::NORM_INF;
2338 using ::Tpetra::Details::ProfilingRegion;
2339 ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2340
2341 // The function needs to be able to sync X.
2342 MV& X = const_cast<MV&> (*this);
2343 multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2344 }
2345
2346 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2347 void
2349 normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2350 {
2351 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2352 this->normInf (norms_av);
2353 }
2354
2355 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2356 void
2358 meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2359 {
2360 // KR FIXME Overload this method to take a View.
2361 using Kokkos::ALL;
2362 using Kokkos::subview;
2363 using Teuchos::Comm;
2364 using Teuchos::RCP;
2365 using Teuchos::reduceAll;
2366 using Teuchos::REDUCE_SUM;
2367 typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2368
2369 const size_t lclNumRows = this->getLocalLength ();
2370 const size_t numVecs = this->getNumVectors ();
2371 const size_t numMeans = static_cast<size_t> (means.size ());
2372
2373 TEUCHOS_TEST_FOR_EXCEPTION(
2374 numMeans != numVecs, std::runtime_error,
2375 "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2376 << " != this->getNumVectors() = " << numVecs << ".");
2377
2378 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2379 const std::pair<size_t, size_t> colRng (0, numVecs);
2380
2381 // Make sure that the final output view has the same layout as the
2382 // temporary view's HostMirror. Left or Right doesn't matter for
2383 // a 1-D array anyway; this is just to placate the compiler.
2384 typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2385 typedef Kokkos::View<impl_scalar_type*,
2386 typename local_view_type::HostMirror::array_layout,
2387 Kokkos::HostSpace,
2388 Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2389 host_local_view_type meansOut (means.getRawPtr (), numMeans);
2391 RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2392 this->getMap ()->getComm ();
2393
2394 // If we need sync to device, then host has the most recent version.
2395 const bool useHostVersion = this->need_sync_device ();
2396 if (useHostVersion) {
2397 // DualView was last modified on host, so run the local kernel there.
2398 auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2399 rowRng, Kokkos::ALL ());
2400 // Compute the local sum of each column.
2401 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2402 if (isConstantStride ()) {
2403 KokkosBlas::sum (lclSums, X_lcl);
2404 }
2405 else {
2406 for (size_t j = 0; j < numVecs; ++j) {
2407 const size_t col = whichVectors_[j];
2408 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2409 }
2410 }
2411
2412 // If there are multiple MPI processes, the all-reduce reads
2413 // from lclSums, and writes to meansOut. Otherwise, we just
2414 // copy lclSums into meansOut.
2415 if (! comm.is_null () && this->isDistributed ()) {
2416 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2417 lclSums.data (), meansOut.data ());
2418 }
2419 else {
2420 Kokkos::deep_copy (meansOut, lclSums);
2421 }
2422 }
2423 else {
2424 // DualView was last modified on device, so run the local kernel there.
2425 auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2426 rowRng, Kokkos::ALL ());
2427
2428 // Compute the local sum of each column.
2429 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2430 if (isConstantStride ()) {
2431 KokkosBlas::sum (lclSums, X_lcl);
2432 }
2433 else {
2434 for (size_t j = 0; j < numVecs; ++j) {
2435 const size_t col = whichVectors_[j];
2436 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2437 }
2438 }
2439
2440 // If there are multiple MPI processes, the all-reduce reads
2441 // from lclSums, and writes to meansOut. (We assume that MPI
2442 // can read device memory.) Otherwise, we just copy lclSums
2443 // into meansOut.
2444 if (! comm.is_null () && this->isDistributed ()) {
2445 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2446 lclSums.data (), meansOut.data ());
2447 }
2448 else {
2449 Kokkos::deep_copy (meansOut, lclSums);
2450 }
2451 }
2452
2453 // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2454 // to the magnitude type, since operator/ (std::complex<T>, int)
2455 // isn't necessarily defined.
2456 const impl_scalar_type OneOverN =
2457 ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2458 for (size_t k = 0; k < numMeans; ++k) {
2459 meansOut(k) = meansOut(k) * OneOverN;
2460 }
2461 }
2462
2463
2464 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2465 void
2467 randomize ()
2468 {
2469 typedef impl_scalar_type IST;
2470 typedef Kokkos::Details::ArithTraits<IST> ATS;
2471 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2472 typedef typename pool_type::generator_type generator_type;
2473
2474 const IST max = Kokkos::rand<generator_type, IST>::max ();
2475 const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2476
2477 this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2478 }
2479
2480
2481 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2482 void
2484 randomize (const Scalar& minVal, const Scalar& maxVal)
2485 {
2486 typedef impl_scalar_type IST;
2487 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2488
2489 // Seed the pseudorandom number generator using the calling
2490 // process' rank. This helps decorrelate different process'
2491 // pseudorandom streams. It's not perfect but it's effective and
2492 // doesn't require MPI communication. The seed also includes bits
2493 // from the standard library's rand().
2494 //
2495 // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2496 // The code below just makes a new seed each time.
2497
2498 const uint64_t myRank =
2499 static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2500 uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2501 unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2502
2503 pool_type rand_pool (seed);
2504 const IST max = static_cast<IST> (maxVal);
2505 const IST min = static_cast<IST> (minVal);
2506
2507 auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2508
2509 if (isConstantStride ()) {
2510 Kokkos::fill_random (thisView, rand_pool, min, max);
2511 }
2512 else {
2513 const size_t numVecs = getNumVectors ();
2514 for (size_t k = 0; k < numVecs; ++k) {
2515 const size_t col = whichVectors_[k];
2516 auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2517 Kokkos::fill_random (X_k, rand_pool, min, max);
2518 }
2519 }
2521
2522 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2523 void
2525 putScalar (const Scalar& alpha)
2526 {
2527 using ::Tpetra::Details::ProfilingRegion;
2528 using ::Tpetra::Details::Blas::fill;
2529 using DES = typename dual_view_type::t_dev::execution_space;
2530 using HES = typename dual_view_type::t_host::execution_space;
2531 using LO = LocalOrdinal;
2532 ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2533
2534 // We need this cast for cases like Scalar = std::complex<T> but
2535 // impl_scalar_type = Kokkos::complex<T>.
2536 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2537 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2538 const LO numVecs = static_cast<LO> (this->getNumVectors ());
2539
2540 // Modify the most recently updated version of the data. This
2541 // avoids sync'ing, which could violate users' expectations.
2542 //
2543 // If we need sync to device, then host has the most recent version.
2544 const bool runOnHost = runKernelOnHost(*this);
2545 if (! runOnHost) {
2546 auto X = this->getLocalViewDevice(Access::OverwriteAll);
2547 if (this->isConstantStride ()) {
2548 fill (DES (), X, theAlpha, lclNumRows, numVecs);
2549 }
2550 else {
2551 fill (DES (), X, theAlpha, lclNumRows, numVecs,
2552 this->whichVectors_.getRawPtr ());
2553 }
2554 }
2555 else { // last modified in host memory, so modify data there.
2556 auto X = this->getLocalViewHost(Access::OverwriteAll);
2557 if (this->isConstantStride ()) {
2558 fill (HES (), X, theAlpha, lclNumRows, numVecs);
2559 }
2560 else {
2561 fill (HES (), X, theAlpha, lclNumRows, numVecs,
2562 this->whichVectors_.getRawPtr ());
2563 }
2564 }
2565 }
2566
2567
2568 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2569 void
2571 replaceMap (const Teuchos::RCP<const map_type>& newMap)
2572 {
2573 using Teuchos::ArrayRCP;
2574 using Teuchos::Comm;
2575 using Teuchos::RCP;
2576 using ST = Scalar;
2577 using LO = LocalOrdinal;
2578 using GO = GlobalOrdinal;
2579
2580 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2581 // it might work if the MV is a column view of another MV.
2582 // However, things might go wrong when restoring the original
2583 // Map, so we don't allow this case for now.
2584 TEUCHOS_TEST_FOR_EXCEPTION(
2585 ! this->isConstantStride (), std::logic_error,
2586 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2587 "if the MultiVector is a column view of another MultiVector (that is, if "
2588 "isConstantStride() == false).");
2589
2590 // Case 1: current Map and new Map are both nonnull on this process.
2591 // Case 2: current Map is nonnull, new Map is null.
2592 // Case 3: current Map is null, new Map is nonnull.
2593 // Case 4: both Maps are null: forbidden.
2594 //
2595 // Case 1 means that we don't have to do anything on this process,
2596 // other than assign the new Map. (We always have to do that.)
2597 // It's an error for the user to supply a Map that requires
2598 // resizing in this case.
2599 //
2600 // Case 2 means that the calling process is in the current Map's
2601 // communicator, but will be excluded from the new Map's
2602 // communicator. We don't have to do anything on the calling
2603 // process; just leave whatever data it may have alone.
2604 //
2605 // Case 3 means that the calling process is excluded from the
2606 // current Map's communicator, but will be included in the new
2607 // Map's communicator. This means we need to (re)allocate the
2608 // local DualView if it does not have the right number of rows.
2609 // If the new number of rows is nonzero, we'll fill the newly
2610 // allocated local data with zeros, as befits a projection
2611 // operation.
2612 //
2613 // The typical use case for Case 3 is that the MultiVector was
2614 // first created with the Map with more processes, then that Map
2615 // was replaced with a Map with fewer processes, and finally the
2616 // original Map was restored on this call to replaceMap.
2617
2618#ifdef HAVE_TEUCHOS_DEBUG
2619 // mfh 28 Mar 2013: We can't check for compatibility across the
2620 // whole communicator, unless we know that the current and new
2621 // Maps are nonnull on _all_ participating processes.
2622 // TEUCHOS_TEST_FOR_EXCEPTION(
2623 // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2624 // std::invalid_argument, "Tpetra::MultiVector::project: "
2625 // "If the input Map's communicator is compatible (has the same number of "
2626 // "processes as) the current Map's communicator, then the two Maps must be "
2627 // "compatible. The replaceMap() method is not for data redistribution; "
2628 // "use Import or Export for that purpose.");
2629
2630 // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2631 // of the Map, in case the process counts don't match.
2632#endif // HAVE_TEUCHOS_DEBUG
2633
2634 if (this->getMap ().is_null ()) { // current Map is null
2635 // If this->getMap() is null, that means that this MultiVector
2636 // has already had replaceMap happen to it. In that case, just
2637 // reallocate the DualView with the right size.
2638
2639 TEUCHOS_TEST_FOR_EXCEPTION(
2640 newMap.is_null (), std::invalid_argument,
2641 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2642 "This probably means that the input Map is incorrect.");
2643
2644 // Case 3: current Map is null, new Map is nonnull.
2645 // Reallocate the DualView with the right dimensions.
2646 const size_t newNumRows = newMap->getNodeNumElements ();
2647 const size_t origNumRows = view_.extent (0);
2648 const size_t numCols = this->getNumVectors ();
2649
2650 if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2651 view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2652 }
2653 }
2654 else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2655 // I am an excluded process. Reinitialize my data so that I
2656 // have 0 rows. Keep the number of columns as before.
2657 const size_t newNumRows = static_cast<size_t> (0);
2658 const size_t numCols = this->getNumVectors ();
2659 view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2660 }
2661
2662 this->map_ = newMap;
2663 }
2664
2665 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2666 void
2668 scale (const Scalar& alpha)
2669 {
2670 using Kokkos::ALL;
2671 using IST = impl_scalar_type;
2672
2673 const IST theAlpha = static_cast<IST> (alpha);
2674 if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2675 return; // do nothing
2676 }
2677 const size_t lclNumRows = getLocalLength ();
2678 const size_t numVecs = getNumVectors ();
2679 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2680 const std::pair<size_t, size_t> colRng (0, numVecs);
2681
2682 // We can't substitute putScalar(0.0) for scale(0.0), because the
2683 // former will overwrite NaNs present in the MultiVector. The
2684 // semantics of this call require multiplying them by 0, which
2685 // IEEE 754 requires to be NaN.
2686
2687 // If we need sync to device, then host has the most recent version.
2688 const bool useHostVersion = need_sync_device ();
2689 if (useHostVersion) {
2690 auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2691 if (isConstantStride ()) {
2692 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2693 }
2694 else {
2695 for (size_t k = 0; k < numVecs; ++k) {
2696 const size_t Y_col = whichVectors_[k];
2697 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2698 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2699 }
2700 }
2701 }
2702 else { // work on device
2703 auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2704 if (isConstantStride ()) {
2705 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2706 }
2707 else {
2708 for (size_t k = 0; k < numVecs; ++k) {
2709 const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2710 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2711 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2712 }
2713 }
2714 }
2715 }
2716
2717
2718 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2719 void
2721 scale (const Teuchos::ArrayView<const Scalar>& alphas)
2722 {
2723 const size_t numVecs = this->getNumVectors ();
2724 const size_t numAlphas = static_cast<size_t> (alphas.size ());
2725 TEUCHOS_TEST_FOR_EXCEPTION(
2726 numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2727 "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2728 << numVecs << ".");
2729
2730 // Use a DualView to copy the scaling constants onto the device.
2731 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2732 k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2733 k_alphas.modify_host ();
2734 for (size_t i = 0; i < numAlphas; ++i) {
2735 k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2736 }
2737 k_alphas.sync_device ();
2738 // Invoke the scale() overload that takes a device View of coefficients.
2739 this->scale (k_alphas.view_device ());
2740 }
2741
2742 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2743 void
2745 scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2746 {
2747 using Kokkos::ALL;
2748 using Kokkos::subview;
2749
2750 const size_t lclNumRows = this->getLocalLength ();
2751 const size_t numVecs = this->getNumVectors ();
2752 TEUCHOS_TEST_FOR_EXCEPTION(
2753 static_cast<size_t> (alphas.extent (0)) != numVecs,
2754 std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2755 "alphas.extent(0) = " << alphas.extent (0)
2756 << " != this->getNumVectors () = " << numVecs << ".");
2757 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2758 const std::pair<size_t, size_t> colRng (0, numVecs);
2759
2760 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2761 // type of the return value of subview. This is because if we
2762 // switch the array layout from LayoutLeft to LayoutRight
2763 // (preferred for performance of block operations), the types
2764 // below won't be valid. (A view of a column of a LayoutRight
2765 // multivector has LayoutStride, not LayoutLeft.)
2766
2767 // If we need sync to device, then host has the most recent version.
2768 const bool useHostVersion = this->need_sync_device ();
2769 if (useHostVersion) {
2770 // Work in host memory. This means we need to create a host
2771 // mirror of the input View of coefficients.
2772 auto alphas_h = Kokkos::create_mirror_view (alphas);
2773 Kokkos::deep_copy (alphas_h, alphas);
2774
2775 auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2776 if (isConstantStride ()) {
2777 KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2778 }
2779 else {
2780 for (size_t k = 0; k < numVecs; ++k) {
2781 const size_t Y_col = this->isConstantStride () ? k :
2782 this->whichVectors_[k];
2783 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2784 // We don't have to use the entire 1-D View here; we can use
2785 // the version that takes a scalar coefficient.
2786 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2787 }
2788 }
2789 }
2790 else { // Work in device memory, using the input View 'alphas' directly.
2791 auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2792 if (isConstantStride ()) {
2793 KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2794 }
2795 else {
2796 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2797 // as values on host, so copy them to host. Another approach
2798 // would be to fix scal() so that it takes a 0-D View as the
2799 // second argument.
2800 auto alphas_h = Kokkos::create_mirror_view (alphas);
2801 Kokkos::deep_copy (alphas_h, alphas);
2802
2803 for (size_t k = 0; k < numVecs; ++k) {
2804 const size_t Y_col = this->isConstantStride () ? k :
2805 this->whichVectors_[k];
2806 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2807 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2808 }
2809 }
2810 }
2811 }
2812
2813 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2814 void
2816 scale (const Scalar& alpha,
2818 {
2819 using Kokkos::ALL;
2820 using Kokkos::subview;
2821 const char tfecfFuncName[] = "scale: ";
2822
2823 const size_t lclNumRows = getLocalLength ();
2824 const size_t numVecs = getNumVectors ();
2825
2826 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2827 lclNumRows != A.getLocalLength (), std::invalid_argument,
2828 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2829 << A.getLocalLength () << ".");
2830 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2831 numVecs != A.getNumVectors (), std::invalid_argument,
2832 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2833 << A.getNumVectors () << ".");
2834
2835 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2836 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2837 const std::pair<size_t, size_t> colRng (0, numVecs);
2838
2839 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2840 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2841 auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2842 auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2843
2844 if (isConstantStride () && A.isConstantStride ()) {
2845 KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2846 }
2847 else {
2848 // Make sure that Kokkos only uses the local length for add.
2849 for (size_t k = 0; k < numVecs; ++k) {
2850 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2851 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2852 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2853 auto X_k = subview (X_lcl, ALL (), X_col);
2854
2855 KokkosBlas::scal (Y_k, theAlpha, X_k);
2856 }
2857 }
2858 }
2859
2860
2861
2862 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2863 void
2866 {
2867 const char tfecfFuncName[] = "reciprocal: ";
2868
2869 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2870 getLocalLength () != A.getLocalLength (), std::runtime_error,
2871 "MultiVectors do not have the same local length. "
2872 "this->getLocalLength() = " << getLocalLength ()
2873 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2874 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2875 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2876 ": MultiVectors do not have the same number of columns (vectors). "
2877 "this->getNumVectors() = " << getNumVectors ()
2878 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2879
2880 const size_t numVecs = getNumVectors ();
2881
2882 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2883 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2884
2885 if (isConstantStride () && A.isConstantStride ()) {
2886 KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2887 }
2888 else {
2889 using Kokkos::ALL;
2890 using Kokkos::subview;
2891 for (size_t k = 0; k < numVecs; ++k) {
2892 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2893 auto vector_k = subview (this_view_dev, ALL (), this_col);
2894 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2895 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2896 KokkosBlas::reciprocal (vector_k, vector_Ak);
2897 }
2898 }
2899 }
2900
2901 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2902 void
2905 {
2906 const char tfecfFuncName[] = "abs";
2907
2908 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2909 getLocalLength () != A.getLocalLength (), std::runtime_error,
2910 ": MultiVectors do not have the same local length. "
2911 "this->getLocalLength() = " << getLocalLength ()
2912 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2913 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2914 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2915 ": MultiVectors do not have the same number of columns (vectors). "
2916 "this->getNumVectors() = " << getNumVectors ()
2917 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2918 const size_t numVecs = getNumVectors ();
2919
2920 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2921 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2922
2923 if (isConstantStride () && A.isConstantStride ()) {
2924 KokkosBlas::abs (this_view_dev, A_view_dev);
2925 }
2926 else {
2927 using Kokkos::ALL;
2928 using Kokkos::subview;
2929
2930 for (size_t k=0; k < numVecs; ++k) {
2931 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2932 auto vector_k = subview (this_view_dev, ALL (), this_col);
2933 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2934 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2935 KokkosBlas::abs (vector_k, vector_Ak);
2936 }
2937 }
2938 }
2939
2940 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2941 void
2943 update (const Scalar& alpha,
2945 const Scalar& beta)
2946 {
2947 const char tfecfFuncName[] = "update: ";
2948 using Kokkos::subview;
2949 using Kokkos::ALL;
2950
2951 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
2952
2953 const size_t lclNumRows = getLocalLength ();
2954 const size_t numVecs = getNumVectors ();
2955
2956 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2957 lclNumRows != A.getLocalLength (), std::invalid_argument,
2958 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2959 << A.getLocalLength () << ".");
2960 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2961 numVecs != A.getNumVectors (), std::invalid_argument,
2962 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2963 << A.getNumVectors () << ".");
2964
2965 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2966 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2967 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2968 const std::pair<size_t, size_t> colRng (0, numVecs);
2969
2970 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2971 auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2972 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2973 auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2974
2975 // The device memory of *this is about to be modified
2976 if (isConstantStride () && A.isConstantStride ()) {
2977 KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2978 }
2979 else {
2980 // Make sure that Kokkos only uses the local length for add.
2981 for (size_t k = 0; k < numVecs; ++k) {
2982 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2983 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2984 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2985 auto X_k = subview (X_lcl, ALL (), X_col);
2986
2987 KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2988 }
2989 }
2990 }
2991
2992 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2993 void
2995 update (const Scalar& alpha,
2997 const Scalar& beta,
2999 const Scalar& gamma)
3000 {
3001 using Kokkos::ALL;
3002 using Kokkos::subview;
3003
3004 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3005
3006 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3007
3008 const size_t lclNumRows = this->getLocalLength ();
3009 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3010 lclNumRows != A.getLocalLength (), std::invalid_argument,
3011 "The input MultiVector A has " << A.getLocalLength () << " local "
3012 "row(s), but this MultiVector has " << lclNumRows << " local "
3013 "row(s).");
3014 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3015 lclNumRows != B.getLocalLength (), std::invalid_argument,
3016 "The input MultiVector B has " << B.getLocalLength () << " local "
3017 "row(s), but this MultiVector has " << lclNumRows << " local "
3018 "row(s).");
3019 const size_t numVecs = getNumVectors ();
3020 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3021 A.getNumVectors () != numVecs, std::invalid_argument,
3022 "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3023 "but this MultiVector has " << numVecs << " column(s).");
3024 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3025 B.getNumVectors () != numVecs, std::invalid_argument,
3026 "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3027 "but this MultiVector has " << numVecs << " column(s).");
3028
3029 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3030 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3031 const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3032
3033 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3034 const std::pair<size_t, size_t> colRng (0, numVecs);
3035
3036 // Prefer 'auto' over specifying the type explicitly. This avoids
3037 // issues with a subview possibly having a different type than the
3038 // original view.
3039 auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3040 auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3041 auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3042
3043 if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3044 KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3045 }
3046 else {
3047 // Some input (or *this) is not constant stride,
3048 // so perform the update one column at a time.
3049 for (size_t k = 0; k < numVecs; ++k) {
3050 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3051 const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3052 const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3053 KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3054 theBeta, subview (B_lcl, rowRng, B_col),
3055 theGamma, subview (C_lcl, rowRng, this_col));
3056 }
3057 }
3058 }
3059
3060 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3061 Teuchos::ArrayRCP<const Scalar>
3063 getData (size_t j) const
3064 {
3065 using Kokkos::ALL;
3066 using IST = impl_scalar_type;
3067 const char tfecfFuncName[] = "getData: ";
3068
3069 // Any MultiVector method that called the (classic) Kokkos Node's
3070 // viewBuffer or viewBufferNonConst methods always implied a
3071 // device->host synchronization. Thus, we synchronize here as
3072 // well.
3073
3074 auto hostView = getLocalViewHost(Access::ReadOnly);
3075 const size_t col = isConstantStride () ? j : whichVectors_[j];
3076 auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3077 Teuchos::ArrayRCP<const IST> dataAsArcp =
3078 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3079
3080 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3081 (static_cast<size_t> (hostView_j.extent (0)) <
3082 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3083 "hostView_j.extent(0) = " << hostView_j.extent (0)
3084 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3085 "Please report this bug to the Tpetra developers.");
3086
3087 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3088 }
3089
3090 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3091 Teuchos::ArrayRCP<Scalar>
3093 getDataNonConst (size_t j)
3094 {
3095 using Kokkos::ALL;
3096 using Kokkos::subview;
3097 using IST = impl_scalar_type;
3098 const char tfecfFuncName[] = "getDataNonConst: ";
3099
3100 auto hostView = getLocalViewHost(Access::ReadWrite);
3101 const size_t col = isConstantStride () ? j : whichVectors_[j];
3102 auto hostView_j = subview (hostView, ALL (), col);
3103 Teuchos::ArrayRCP<IST> dataAsArcp =
3104 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3105
3106 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3107 (static_cast<size_t> (hostView_j.extent (0)) <
3108 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3109 "hostView_j.extent(0) = " << hostView_j.extent (0)
3110 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3111 "Please report this bug to the Tpetra developers.");
3112
3113 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3114 }
3115
3116 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3117 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3119 subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3120 {
3121 using Teuchos::RCP;
3123
3124 // Check whether the index set in cols is contiguous. If it is,
3125 // use the more efficient Range1D version of subCopy.
3126 bool contiguous = true;
3127 const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3128 for (size_t j = 1; j < numCopyVecs; ++j) {
3129 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3130 contiguous = false;
3131 break;
3132 }
3133 }
3134 if (contiguous && numCopyVecs > 0) {
3135 return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3136 }
3137 else {
3138 RCP<const MV> X_sub = this->subView (cols);
3139 RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3140 Y->assign (*X_sub);
3141 return Y;
3142 }
3143 }
3144
3145 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3146 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3148 subCopy (const Teuchos::Range1D &colRng) const
3149 {
3150 using Teuchos::RCP;
3152
3153 RCP<const MV> X_sub = this->subView (colRng);
3154 RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3155 Y->assign (*X_sub);
3156 return Y;
3157 }
3158
3159 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3160 size_t
3162 getOrigNumLocalRows () const {
3163 return origView_.extent (0);
3164 }
3165
3166 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3167 size_t
3169 getOrigNumLocalCols () const {
3170 return origView_.extent (1);
3171 }
3172
3173 template <class Scalar, class LO, class GO, class Node>
3176 const Teuchos::RCP<const map_type>& subMap,
3177 const local_ordinal_type rowOffset) :
3178 base_type (subMap)
3179 {
3180 using Kokkos::ALL;
3181 using Kokkos::subview;
3182 using Teuchos::outArg;
3183 using Teuchos::RCP;
3184 using Teuchos::rcp;
3185 using Teuchos::reduceAll;
3186 using Teuchos::REDUCE_MIN;
3187 using std::endl;
3189 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3190 const char suffix[] = "Please report this bug to the Tpetra developers.";
3191 int lclGood = 1;
3192 int gblGood = 1;
3193 std::unique_ptr<std::ostringstream> errStrm;
3194 const bool debug = ::Tpetra::Details::Behavior::debug ();
3195 const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3196
3197 // Be careful to use the input Map's communicator, not X's. The
3198 // idea is that, on return, *this is a subview of X, using the
3199 // input Map.
3200 const auto comm = subMap->getComm ();
3201 TEUCHOS_ASSERT( ! comm.is_null () );
3202 const int myRank = comm->getRank ();
3203
3204 const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3205 const LO numCols = static_cast<LO> (X.getNumVectors ());
3206 const LO newNumRows = static_cast<LO> (subMap->getNodeNumElements ());
3207 if (verbose) {
3208 std::ostringstream os;
3209 os << "Proc " << myRank << ": " << prefix
3210 << "X: {lclNumRows: " << lclNumRowsBefore
3211 << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3212 << ", numCols: " << numCols << "}, "
3213 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3214 std::cerr << os.str ();
3215 }
3216 // We ask for the _original_ number of rows in X, because X could
3217 // be a shorter (fewer rows) view of a longer MV. (For example, X
3218 // could be a domain Map view of a column Map MV.)
3219 const bool tooManyElts =
3220 newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3221 if (tooManyElts) {
3222 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3223 *errStrm << " Proc " << myRank << ": subMap->getNodeNumElements() (="
3224 << newNumRows << ") + rowOffset (=" << rowOffset
3225 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3226 << ")." << endl;
3227 lclGood = 0;
3228 TEUCHOS_TEST_FOR_EXCEPTION
3229 (! debug && tooManyElts, std::invalid_argument,
3230 prefix << errStrm->str () << suffix);
3231 }
3232
3233 if (debug) {
3234 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3235 if (gblGood != 1) {
3236 std::ostringstream gblErrStrm;
3237 const std::string myErrStr =
3238 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3239 ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3240 TEUCHOS_TEST_FOR_EXCEPTION
3241 (true, std::invalid_argument, gblErrStrm.str ());
3242 }
3243 }
3244
3245 using range_type = std::pair<LO, LO>;
3246 const range_type origRowRng
3247 (rowOffset, static_cast<LO> (X.origView_.extent (0)));
3248 const range_type rowRng
3249 (rowOffset, rowOffset + newNumRows);
3250
3251 dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3252 dual_view_type newView = subview (X.view_, rowRng, ALL ());
3253
3254 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3255 // handling subviews of degenerate Views quite so well. For some
3256 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3257 // 0. We work around by creating a new empty DualView of the
3258 // desired (degenerate) dimensions.
3259 if (newOrigView.extent (0) == 0 &&
3260 newOrigView.extent (1) != X.origView_.extent (1)) {
3261 newOrigView =
3262 allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3263 }
3264 if (newView.extent (0) == 0 &&
3265 newView.extent (1) != X.view_.extent (1)) {
3266 newView =
3267 allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3268 }
3269
3270 MV subViewMV = X.isConstantStride () ?
3271 MV (subMap, newView, newOrigView) :
3272 MV (subMap, newView, newOrigView, X.whichVectors_ ());
3273
3274 subViewMV.owningView_ = X.owningView_;
3275
3276 if (debug) {
3277 const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3278 const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3279 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3280 lclGood = 0;
3281 if (errStrm.get () == nullptr) {
3282 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3283 }
3284 *errStrm << " Proc " << myRank <<
3285 ": subMap.getNodeNumElements(): " << newNumRows <<
3286 ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3287 ", X.getNumVectors(): " << numCols <<
3288 ", subViewMV.getNumVectors(): " << numColsRet << endl;
3289 }
3290 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3291 if (gblGood != 1) {
3292 std::ostringstream gblErrStrm;
3293 if (myRank == 0) {
3294 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3295 "dimensions on one or more processes:" << endl;
3296 }
3297 const std::string myErrStr =
3298 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3299 ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3300 gblErrStrm << suffix << endl;
3301 TEUCHOS_TEST_FOR_EXCEPTION
3302 (true, std::invalid_argument, gblErrStrm.str ());
3303 }
3304 }
3305
3306 if (verbose) {
3307 std::ostringstream os;
3308 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3309 std::cerr << os.str ();
3310 }
3311
3312 *this = subViewMV; // shallow copy
3313
3314 if (verbose) {
3315 std::ostringstream os;
3316 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3317 std::cerr << os.str ();
3318 }
3319 }
3320
3321 template <class Scalar, class LO, class GO, class Node>
3324 const map_type& subMap,
3325 const size_t rowOffset) :
3326 MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3327 static_cast<local_ordinal_type> (rowOffset))
3328 {}
3329
3330 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3331 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3333 offsetView (const Teuchos::RCP<const map_type>& subMap,
3334 const size_t offset) const
3335 {
3337 return Teuchos::rcp (new MV (*this, *subMap, offset));
3338 }
3339
3340 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3341 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3343 offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3344 const size_t offset)
3345 {
3347 return Teuchos::rcp (new MV (*this, *subMap, offset));
3348 }
3349
3350 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3351 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3353 subView (const Teuchos::ArrayView<const size_t>& cols) const
3354 {
3355 using Teuchos::Array;
3356 using Teuchos::rcp;
3358
3359 const size_t numViewCols = static_cast<size_t> (cols.size ());
3360 TEUCHOS_TEST_FOR_EXCEPTION(
3361 numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3362 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3363 "contain at least one entry, but cols.size() = " << cols.size ()
3364 << " == 0.");
3365
3366 // Check whether the index set in cols is contiguous. If it is,
3367 // use the more efficient Range1D version of subView.
3368 bool contiguous = true;
3369 for (size_t j = 1; j < numViewCols; ++j) {
3370 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3371 contiguous = false;
3372 break;
3373 }
3374 }
3375 if (contiguous) {
3376 if (numViewCols == 0) {
3377 // The output MV has no columns, so there is nothing to view.
3378 return rcp (new MV (this->getMap (), numViewCols));
3379 } else {
3380 // Use the more efficient contiguous-index-range version.
3381 return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3382 }
3383 }
3384
3385 if (isConstantStride ()) {
3386 return rcp (new MV (this->getMap (), view_, origView_, cols));
3387 }
3388 else {
3389 Array<size_t> newcols (cols.size ());
3390 for (size_t j = 0; j < numViewCols; ++j) {
3391 newcols[j] = whichVectors_[cols[j]];
3392 }
3393 return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3394 }
3395 }
3396
3397
3398 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3399 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3401 subView (const Teuchos::Range1D& colRng) const
3402 {
3403 using ::Tpetra::Details::Behavior;
3404 using Kokkos::ALL;
3405 using Kokkos::subview;
3406 using Teuchos::Array;
3407 using Teuchos::RCP;
3408 using Teuchos::rcp;
3410 const char tfecfFuncName[] = "subView(Range1D): ";
3411
3412 const size_t lclNumRows = this->getLocalLength ();
3413 const size_t numVecs = this->getNumVectors ();
3414 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3415 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3416 // "at least one vector.");
3417 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3418 static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3419 "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3420 << numVecs << ".");
3421 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3422 numVecs != 0 && colRng.size () != 0 &&
3423 (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3424 static_cast<size_t> (colRng.ubound ()) >= numVecs),
3425 std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3426 "," << colRng.ubound () << "] exceeds the valid range of column indices "
3427 "[0, " << numVecs << "].");
3428
3429 RCP<const MV> X_ret; // the MultiVector subview to return
3430
3431 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3432 // broken for the case of views with zero rows. I will brutally
3433 // enforce that the subview has the correct dimensions. In
3434 // particular, in the case of zero rows, I will, if necessary,
3435 // create a new dual_view_type with zero rows and the correct
3436 // number of columns. In a debug build, I will use an all-reduce
3437 // to ensure that it has the correct dimensions on all processes.
3438
3439 const std::pair<size_t, size_t> rows (0, lclNumRows);
3440 if (colRng.size () == 0) {
3441 const std::pair<size_t, size_t> cols (0, 0); // empty range
3442 dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3443 X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3444 }
3445 else {
3446 // Returned MultiVector is constant stride only if *this is.
3447 if (isConstantStride ()) {
3448 const std::pair<size_t, size_t> cols (colRng.lbound (),
3449 colRng.ubound () + 1);
3450 dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3451 X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3452 }
3453 else {
3454 if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3455 // We're only asking for one column, so the result does have
3456 // constant stride, even though this MultiVector does not.
3457 const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3458 whichVectors_[0] + colRng.ubound () + 1);
3459 dual_view_type X_sub = takeSubview (view_, ALL (), col);
3460 X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3461 }
3462 else {
3463 Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3464 whichVectors_.begin () + colRng.ubound () + 1);
3465 X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3466 }
3467 }
3468 }
3469
3470 const bool debug = Behavior::debug ();
3471 if (debug) {
3472 using Teuchos::Comm;
3473 using Teuchos::outArg;
3474 using Teuchos::REDUCE_MIN;
3475 using Teuchos::reduceAll;
3476
3477 RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3478 Teuchos::null : this->getMap ()->getComm ();
3479 if (! comm.is_null ()) {
3480 int lclSuccess = 1;
3481 int gblSuccess = 1;
3482
3483 if (X_ret.is_null ()) {
3484 lclSuccess = 0;
3485 }
3486 reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3487 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3488 (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3489 "MultiVector; the return value of this method) is null on some MPI "
3490 "process in this MultiVector's communicator. This should never "
3491 "happen. Please report this bug to the Tpetra developers.");
3492 if (! X_ret.is_null () &&
3493 X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3494 lclSuccess = 0;
3495 }
3496 reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3497 outArg (gblSuccess));
3498 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3499 (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3500 "colRng.size(), on at least one MPI process in this MultiVector's "
3501 "communicator. This should never happen. "
3502 "Please report this bug to the Tpetra developers.");
3503 }
3504 }
3505 return X_ret;
3506 }
3507
3508
3509 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3510 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3512 subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3513 {
3515 return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3516 }
3517
3518
3519 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3520 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3522 subViewNonConst (const Teuchos::Range1D &colRng)
3523 {
3525 return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3526 }
3527
3528
3529 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3532 const size_t j)
3533 : base_type (X.getMap ())
3534 {
3535 using Kokkos::subview;
3536 typedef std::pair<size_t, size_t> range_type;
3537 const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3538
3539 const size_t numCols = X.getNumVectors ();
3540 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3541 (j >= numCols, std::invalid_argument, "Input index j (== " << j
3542 << ") exceeds valid column index range [0, " << numCols << " - 1].");
3543 const size_t jj = X.isConstantStride () ?
3544 static_cast<size_t> (j) :
3545 static_cast<size_t> (X.whichVectors_[j]);
3546 this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3547 this->origView_ = X.origView_;
3548 this->owningView_ = X.owningView_;
3549
3550 // mfh 31 Jul 2017: It would be unwise to execute concurrent
3551 // Export or Import operations with different subviews of a
3552 // MultiVector. Thus, it is safe to reuse communication buffers.
3553 // See #1560 discussion.
3554 //
3555 // We only need one column's worth of buffer for imports_ and
3556 // exports_. Taking subviews now ensures that their lengths will
3557 // be exactly what we need, so we won't have to resize them later.
3558 {
3559 const size_t newSize = X.imports_.extent (0) / numCols;
3560 const size_t offset = jj*newSize;
3561 auto newImports = X.imports_;
3562 newImports.d_view = subview (X.imports_.d_view,
3563 range_type (offset, offset+newSize));
3564 newImports.h_view = subview (X.imports_.h_view,
3565 range_type (offset, offset+newSize));
3566 this->imports_ = newImports;
3567 }
3568 {
3569 const size_t newSize = X.exports_.extent (0) / numCols;
3570 const size_t offset = jj*newSize;
3571 auto newExports = X.exports_;
3572 newExports.d_view = subview (X.exports_.d_view,
3573 range_type (offset, offset+newSize));
3574 newExports.h_view = subview (X.exports_.h_view,
3575 range_type (offset, offset+newSize));
3576 this->exports_ = newExports;
3577 }
3578 // These two DualViews already either have the right number of
3579 // entries, or zero entries. This means that we don't need to
3580 // resize them.
3583 }
3584
3585
3586 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3587 Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3589 getVector (const size_t j) const
3590 {
3592 return Teuchos::rcp (new V (*this, j));
3593 }
3594
3595
3596 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3597 Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3599 getVectorNonConst (const size_t j)
3600 {
3602 return Teuchos::rcp (new V (*this, j));
3603 }
3604
3605
3606 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3607 void
3609 get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3610 {
3611 using IST = impl_scalar_type;
3612 using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3613 Kokkos::HostSpace,
3614 Kokkos::MemoryUnmanaged>;
3615 const char tfecfFuncName[] = "get1dCopy: ";
3616
3617 const size_t numRows = this->getLocalLength ();
3618 const size_t numCols = this->getNumVectors ();
3619
3620 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3621 (LDA < numRows, std::runtime_error,
3622 "LDA = " << LDA << " < numRows = " << numRows << ".");
3623 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3624 (numRows > size_t (0) && numCols > size_t (0) &&
3625 size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3626 std::runtime_error,
3627 "A.size() = " << A.size () << ", but its size must be at least "
3628 << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3629
3630 const std::pair<size_t, size_t> rowRange (0, numRows);
3631 const std::pair<size_t, size_t> colRange (0, numCols);
3632
3633 input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3634 LDA, numCols);
3635 auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3636
3648 if (this->need_sync_host() && this->need_sync_device()) {
3651 throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3652 }
3653 else {
3654 const bool useHostView = owningView_.h_view.use_count() >= owningView_.d_view.use_count();
3655 if (this->isConstantStride ()) {
3656 if (useHostView) {
3657 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3658 Kokkos::deep_copy (A_view, srcView_host);
3659 } else {
3660 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3661 Kokkos::deep_copy (A_view, srcView_device);
3662 }
3663 }
3664 else {
3665 for (size_t j = 0; j < numCols; ++j) {
3666 const size_t srcCol = this->whichVectors_[j];
3667 auto dstColView = Kokkos::subview (A_view, rowRange, j);
3668
3669 if (useHostView) {
3670 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3671 auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3672 Kokkos::deep_copy (dstColView, srcColView_host);
3673 } else {
3674 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3675 auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3676 Kokkos::deep_copy (dstColView, srcColView_device);
3677 }
3678 }
3679 }
3680 }
3681 }
3682
3683
3684 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3685 void
3687 get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3688 {
3690 const char tfecfFuncName[] = "get2dCopy: ";
3691 const size_t numRows = this->getLocalLength ();
3692 const size_t numCols = this->getNumVectors ();
3693
3694 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3695 static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3696 std::runtime_error, "Input array of pointers must contain as many "
3697 "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3698 << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3699
3700 if (numRows != 0 && numCols != 0) {
3701 // No side effects until we've validated the input.
3702 for (size_t j = 0; j < numCols; ++j) {
3703 const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3704 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3705 dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3706 "the input array of arrays is not long enough to fit all entries in "
3707 "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3708 << " < getLocalLength() = " << numRows << ".");
3709 }
3710
3711 // We've validated the input, so it's safe to start copying.
3712 for (size_t j = 0; j < numCols; ++j) {
3713 Teuchos::RCP<const V> X_j = this->getVector (j);
3714 const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3715 X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3716 }
3717 }
3718 }
3719
3720
3721 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3722 Teuchos::ArrayRCP<const Scalar>
3724 get1dView () const
3725 {
3726 if (getLocalLength () == 0 || getNumVectors () == 0) {
3727 return Teuchos::null;
3728 } else {
3729 TEUCHOS_TEST_FOR_EXCEPTION(
3730 ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3731 "get1dView: This MultiVector does not have constant stride, so it is "
3732 "not possible to view its data as a single array. You may check "
3733 "whether a MultiVector has constant stride by calling "
3734 "isConstantStride().");
3735 // Since get1dView() is and was always marked const, I have to
3736 // cast away const here in order not to break backwards
3737 // compatibility.
3738 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3739 Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3740 Kokkos::Compat::persistingView (X_lcl);
3741 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3742 }
3743 }
3744
3745 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3746 Teuchos::ArrayRCP<Scalar>
3749 {
3750 if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3751 return Teuchos::null;
3752 }
3753 else {
3754 TEUCHOS_TEST_FOR_EXCEPTION
3755 (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3756 "get1dViewNonConst: This MultiVector does not have constant stride, "
3757 "so it is not possible to view its data as a single array. You may "
3758 "check whether a MultiVector has constant stride by calling "
3759 "isConstantStride().");
3760 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3761 Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3762 Kokkos::Compat::persistingView (X_lcl);
3763 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3764 }
3765 }
3766
3767 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3768 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3771 {
3772 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3773
3774 // Don't use the row range here on the outside, in order to avoid
3775 // a strided return type (in case Kokkos::subview is conservative
3776 // about that). Instead, use the row range for the column views
3777 // in the loop.
3778 const size_t myNumRows = this->getLocalLength ();
3779 const size_t numCols = this->getNumVectors ();
3780 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3781
3782 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3783 for (size_t j = 0; j < numCols; ++j) {
3784 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3785 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3786 Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3787 Kokkos::Compat::persistingView (X_lcl_j);
3788 views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3789 }
3790 return views;
3791 }
3792
3793 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3796 getLocalViewHost(Access::ReadOnlyStruct) const
3797 {
3798 //returning dual_view_type::t_host::const_type
3799 if (owningView_.d_view.use_count() > owningView_.h_view.use_count()) {
3800 const bool debug = Details::Behavior::debug();
3801 const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3802 " host while a device view is alive";
3803 if (debug) {
3804 std::cout << "Rank " << this->getMap()->getComm()->getRank()
3805 << ": " << msg << std::endl;
3806 }
3807 throw std::runtime_error(msg);
3808 }
3809 owningView_.sync_host();
3810 return view_.view_host();
3811 }
3812
3813 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3816 getLocalViewHost(Access::ReadWriteStruct)
3817 {
3818 //returning dual_view_type::t_host::type
3819 if (owningView_.d_view.use_count() > owningView_.h_view.use_count()) {
3820 const bool debug = Details::Behavior::debug();
3821 const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3822 " host while a device view is alive";
3823 if (debug) {
3824 std::cout << "Rank " << this->getMap()->getComm()->getRank()
3825 << ": " << msg << std::endl;
3826 }
3827 throw std::runtime_error(msg);
3828 }
3829 owningView_.sync_host();
3830 owningView_.modify_host();
3831 return view_.view_host();
3832 }
3833
3834 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3837 getLocalViewHost(Access::OverwriteAllStruct)
3838 {
3839 //returning dual_view_type::t_host::type
3840 if (owningView_.h_view != view_.h_view) {
3841 // view is a subview of owningView_; for safety, need to do ReadWrite
3842 return getLocalViewHost(Access::ReadWrite);
3843 }
3844
3845 if (owningView_.d_view.use_count() > owningView_.h_view.use_count()) {
3846 const bool debug = Details::Behavior::debug();
3847 const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3848 " host while a device view is alive";
3849 if (debug) {
3850 std::cout << "Rank " << this->getMap()->getComm()->getRank()
3851 << ": " << msg << std::endl;
3852 }
3853 throw std::runtime_error(msg);
3854 }
3855 owningView_.clear_sync_state();
3856 owningView_.modify_host();
3857 return view_.view_host();
3858 }
3859
3860 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3863 getLocalViewDevice(Access::ReadOnlyStruct) const
3864 {
3865 //returning dual_view_type::t_dev::const_type
3866 if (owningView_.h_view.use_count() > owningView_.d_view.use_count()) {
3867 const bool debug = Details::Behavior::debug();
3868 const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3869 " device while a host view is alive";
3870 if (debug) {
3871 std::cout << "Rank " << this->getMap()->getComm()->getRank()
3872 << ": " << msg << std::endl;
3873 }
3874 throw std::runtime_error(msg);
3875 }
3876 owningView_.sync_device();
3877 return view_.view_device();
3878 }
3879
3880 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3883 getLocalViewDevice(Access::ReadWriteStruct)
3884 {
3885 //returning dual_view_type::t_dev::type
3886 if(owningView_.h_view.use_count() > owningView_.d_view.use_count()) {
3887 const bool debug = Details::Behavior::debug();
3888 const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3889 " device while a host view is alive";
3890 if (debug) {
3891 std::cout << "Rank " << this->getMap()->getComm()->getRank()
3892 << ": " << msg << std::endl;
3893 }
3894 throw std::runtime_error(msg);
3895 }
3896 owningView_.sync_device();
3897 owningView_.modify_device();
3898 return view_.view_device();
3899 }
3900
3901 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3904 getLocalViewDevice(Access::OverwriteAllStruct)
3905 {
3906 //returning dual_view_type::t_dev::type
3907 if (owningView_.h_view != view_.h_view) {
3908 // view_ is a subview of owningView_; for safety, need to do ReadWrite
3909 return getLocalViewDevice(Access::ReadWrite);
3910 }
3911
3912 if (owningView_.h_view.use_count() > owningView_.d_view.use_count()) {
3913 const bool debug = Details::Behavior::debug();
3914 const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3915 " device while a host view is alive";
3916 if (debug) {
3917 std::cout << "Rank " << this->getMap()->getComm()->getRank()
3918 << ": " << msg << std::endl;
3919 }
3920 throw std::runtime_error(msg);
3921 }
3922 owningView_.clear_sync_state();
3923 owningView_.modify_device();
3924 return view_.view_device();
3925 }
3926
3927 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3928 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3930 get2dView () const
3931 {
3932 // Since get2dView() is and was always marked const, I have to
3933 // cast away const here in order not to break backwards
3934 // compatibility.
3935 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3936
3937 // Don't use the row range here on the outside, in order to avoid
3938 // a strided return type (in case Kokkos::subview is conservative
3939 // about that). Instead, use the row range for the column views
3940 // in the loop.
3941 const size_t myNumRows = this->getLocalLength ();
3942 const size_t numCols = this->getNumVectors ();
3943 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3944
3945 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3946 for (size_t j = 0; j < numCols; ++j) {
3947 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3948 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3949 Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3950 Kokkos::Compat::persistingView (X_lcl_j);
3951 views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3952 }
3953 return views;
3954 }
3955
3956 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3957 void
3959 multiply (Teuchos::ETransp transA,
3960 Teuchos::ETransp transB,
3961 const Scalar& alpha,
3964 const Scalar& beta)
3965 {
3966 using ::Tpetra::Details::ProfilingRegion;
3967 using Teuchos::CONJ_TRANS;
3968 using Teuchos::NO_TRANS;
3969 using Teuchos::TRANS;
3970 using Teuchos::RCP;
3971 using Teuchos::rcp;
3972 using Teuchos::rcpFromRef;
3973 using std::endl;
3974 using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3975 using LO = local_ordinal_type;
3976 using STS = Teuchos::ScalarTraits<Scalar>;
3978 const char tfecfFuncName[] = "multiply: ";
3979 ProfilingRegion region ("Tpetra::MV::multiply");
3980
3981 // This routine performs a variety of matrix-matrix multiply
3982 // operations, interpreting the MultiVector (this-aka C , A and B)
3983 // as 2D matrices. Variations are due to the fact that A, B and C
3984 // can be locally replicated or globally distributed MultiVectors
3985 // and that we may or may not operate with the transpose of A and
3986 // B. Possible cases are:
3987 //
3988 // Operations # Cases Notes
3989 // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3990 // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3991 // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3992 //
3993 // The following operations are not meaningful for 1-D
3994 // distributions:
3995 //
3996 // u1) C(local) = A^T(distr) * B^T(distr) 1
3997 // u2) C(local) = A (distr) * B^X(distr) 2
3998 // u3) C(distr) = A^X(local) * B^X(local) 4
3999 // u4) C(distr) = A^X(local) * B^X(distr) 4
4000 // u5) C(distr) = A^T(distr) * B^X(local) 2
4001 // u6) C(local) = A^X(distr) * B^X(local) 4
4002 // u7) C(distr) = A^X(distr) * B^X(local) 4
4003 // u8) C(local) = A^X(local) * B^X(distr) 4
4004 //
4005 // Total number of cases: 32 (= 2^5).
4006
4007 impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4008
4009 const bool A_is_local = ! A.isDistributed ();
4010 const bool B_is_local = ! B.isDistributed ();
4011 const bool C_is_local = ! this->isDistributed ();
4012
4013 // In debug mode, check compatibility of local dimensions. We
4014 // only do this in debug mode, since it requires an all-reduce
4015 // to ensure correctness on all processses. It's entirely
4016 // possible that only some processes may have incompatible local
4017 // dimensions. Throwing an exception only on those processes
4018 // could cause this method to hang.
4019 const bool debug = ::Tpetra::Details::Behavior::debug ();
4020 if (debug) {
4021 auto myMap = this->getMap ();
4022 if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4023 using Teuchos::REDUCE_MIN;
4024 using Teuchos::reduceAll;
4025 using Teuchos::outArg;
4026
4027 auto comm = myMap->getComm ();
4028 const size_t A_nrows =
4029 (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4030 const size_t A_ncols =
4031 (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4032 const size_t B_nrows =
4033 (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4034 const size_t B_ncols =
4035 (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4036
4037 const bool lclBad = this->getLocalLength () != A_nrows ||
4038 this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4039
4040 const int myRank = comm->getRank ();
4041 std::ostringstream errStrm;
4042 if (this->getLocalLength () != A_nrows) {
4043 errStrm << "Proc " << myRank << ": this->getLocalLength()="
4044 << this->getLocalLength () << " != A_nrows=" << A_nrows
4045 << "." << std::endl;
4046 }
4047 if (this->getNumVectors () != B_ncols) {
4048 errStrm << "Proc " << myRank << ": this->getNumVectors()="
4049 << this->getNumVectors () << " != B_ncols=" << B_ncols
4050 << "." << std::endl;
4051 }
4052 if (A_ncols != B_nrows) {
4053 errStrm << "Proc " << myRank << ": A_ncols="
4054 << A_ncols << " != B_nrows=" << B_nrows
4055 << "." << std::endl;
4056 }
4057
4058 const int lclGood = lclBad ? 0 : 1;
4059 int gblGood = 0;
4060 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4061 if (gblGood != 1) {
4062 std::ostringstream os;
4063 ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4064
4065 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4066 (true, std::runtime_error, "Inconsistent local dimensions on at "
4067 "least one process in this object's communicator." << std::endl
4068 << "Operation: "
4069 << "C(" << (C_is_local ? "local" : "distr") << ") = "
4070 << alpha << "*A"
4071 << (transA == Teuchos::TRANS ? "^T" :
4072 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4073 << "(" << (A_is_local ? "local" : "distr") << ") * "
4074 << beta << "*B"
4075 << (transA == Teuchos::TRANS ? "^T" :
4076 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4077 << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4078 << "Global dimensions: C(" << this->getGlobalLength () << ", "
4079 << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4080 << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4081 << ", " << B.getNumVectors () << ")." << std::endl
4082 << os.str ());
4083 }
4084 }
4085 }
4086
4087 // Case 1: C(local) = A^X(local) * B^X(local)
4088 const bool Case1 = C_is_local && A_is_local && B_is_local;
4089 // Case 2: C(local) = A^T(distr) * B (distr)
4090 const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4091 transA != NO_TRANS &&
4092 transB == NO_TRANS;
4093 // Case 3: C(distr) = A (distr) * B^X(local)
4094 const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4095 transA == NO_TRANS;
4096
4097 // Test that we are considering a meaningful case
4098 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4099 (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4100 "Multiplication of op(A) and op(B) into *this is not a "
4101 "supported use case.");
4102
4103 if (beta != STS::zero () && Case2) {
4104 // If Case2, then C is local and contributions must be summed
4105 // across all processes. However, if beta != 0, then accumulate
4106 // beta*C into the sum. When summing across all processes, we
4107 // only want to accumulate this once, so set beta == 0 on all
4108 // processes except Process 0.
4109 const int myRank = this->getMap ()->getComm ()->getRank ();
4110 if (myRank != 0) {
4111 beta_local = ATS::zero ();
4112 }
4113 }
4114
4115 // We only know how to do matrix-matrix multiplies if all the
4116 // MultiVectors have constant stride. If not, we have to make
4117 // temporary copies of those MultiVectors (including possibly
4118 // *this) that don't have constant stride.
4119 RCP<MV> C_tmp;
4120 if (! isConstantStride ()) {
4121 C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4122 } else {
4123 C_tmp = rcp (this, false);
4124 }
4125
4126 RCP<const MV> A_tmp;
4127 if (! A.isConstantStride ()) {
4128 A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4129 } else {
4130 A_tmp = rcpFromRef (A);
4131 }
4132
4133 RCP<const MV> B_tmp;
4134 if (! B.isConstantStride ()) {
4135 B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4136 } else {
4137 B_tmp = rcpFromRef (B);
4138 }
4139
4140 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4141 (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4142 ! A_tmp->isConstantStride (), std::logic_error,
4143 "Failed to make temporary constant-stride copies of MultiVectors.");
4144
4145 {
4146 const LO A_lclNumRows = A_tmp->getLocalLength ();
4147 const LO A_numVecs = A_tmp->getNumVectors ();
4148 auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4149 auto A_sub = Kokkos::subview (A_lcl,
4150 std::make_pair (LO (0), A_lclNumRows),
4151 std::make_pair (LO (0), A_numVecs));
4152
4153
4154 const LO B_lclNumRows = B_tmp->getLocalLength ();
4155 const LO B_numVecs = B_tmp->getNumVectors ();
4156 auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4157 auto B_sub = Kokkos::subview (B_lcl,
4158 std::make_pair (LO (0), B_lclNumRows),
4159 std::make_pair (LO (0), B_numVecs));
4160
4161 const LO C_lclNumRows = C_tmp->getLocalLength ();
4162 const LO C_numVecs = C_tmp->getNumVectors ();
4163
4164 auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4165 auto C_sub = Kokkos::subview (C_lcl,
4166 std::make_pair (LO (0), C_lclNumRows),
4167 std::make_pair (LO (0), C_numVecs));
4168 const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4169 (transA == Teuchos::TRANS ? 'T' : 'C'));
4170 const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4171 (transB == Teuchos::TRANS ? 'T' : 'C'));
4172 const impl_scalar_type alpha_IST (alpha);
4173
4174 ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4175
4176 KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4177 beta_local, C_sub);
4178 }
4179
4180 if (! isConstantStride ()) {
4181 ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4182 }
4183
4184 // Dispose of (possibly) extra copies of A and B.
4185 A_tmp = Teuchos::null;
4186 B_tmp = Teuchos::null;
4187
4188 // If Case 2 then sum up *this and distribute it to all processes.
4189 if (Case2) {
4190 this->reduce ();
4191 }
4192 }
4193
4194 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4195 void
4197 elementWiseMultiply (Scalar scalarAB,
4200 Scalar scalarThis)
4201 {
4202 using Kokkos::ALL;
4203 using Kokkos::subview;
4204 const char tfecfFuncName[] = "elementWiseMultiply: ";
4205
4206 const size_t lclNumRows = this->getLocalLength ();
4207 const size_t numVecs = this->getNumVectors ();
4208
4209 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4210 (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4211 std::runtime_error, "MultiVectors do not have the same local length.");
4212 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4213 numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4214 "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4215 << ".");
4216
4217 auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4218 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4219 auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4220
4221 if (isConstantStride () && B.isConstantStride ()) {
4222 // A is just a Vector; it only has one column, so it always has
4223 // constant stride.
4224 //
4225 // If both *this and B have constant stride, we can do an
4226 // element-wise multiply on all columns at once.
4227 KokkosBlas::mult (scalarThis,
4228 this_view,
4229 scalarAB,
4230 subview (A_view, ALL (), 0),
4231 B_view);
4232 }
4233 else {
4234 for (size_t j = 0; j < numVecs; ++j) {
4235 const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4236 const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4237 KokkosBlas::mult (scalarThis,
4238 subview (this_view, ALL (), C_col),
4239 scalarAB,
4240 subview (A_view, ALL (), 0),
4241 subview (B_view, ALL (), B_col));
4242 }
4243 }
4244 }
4245
4246 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4247 void
4249 reduce ()
4250 {
4252 using ::Tpetra::Details::ProfilingRegion;
4253 ProfilingRegion region ("Tpetra::MV::reduce");
4254
4255 const auto map = this->getMap ();
4256 if (map.get () == nullptr) {
4257 return;
4258 }
4259 const auto comm = map->getComm ();
4260 if (comm.get () == nullptr) {
4261 return;
4262 }
4263
4264 // Avoid giving device buffers to MPI. Even if MPI handles them
4265 // correctly, doing so may not perform well.
4266 const bool changed_on_device = this->need_sync_host ();
4267 if (changed_on_device) {
4268 // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4269 // and host will be separate allocations. In that case, it may
4270 // pay to do the all-reduce from device to host.
4271 Kokkos::fence(); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4272 auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4273 allReduceView (X_lcl, X_lcl, *comm);
4274 }
4275 else {
4276 auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4277 allReduceView (X_lcl, X_lcl, *comm);
4278 }
4279 }
4280
4281 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4282 void
4284 replaceLocalValue (const LocalOrdinal lclRow,
4285 const size_t col,
4286 const impl_scalar_type& ScalarValue)
4287 {
4289 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4290 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4291 TEUCHOS_TEST_FOR_EXCEPTION(
4292 lclRow < minLocalIndex || lclRow > maxLocalIndex,
4293 std::runtime_error,
4294 "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4295 << " is invalid. The range of valid row indices on this process "
4296 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4297 << ", " << maxLocalIndex << "].");
4298 TEUCHOS_TEST_FOR_EXCEPTION(
4299 vectorIndexOutOfRange(col),
4300 std::runtime_error,
4301 "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4302 << " of the multivector is invalid.");
4303 }
4304
4305 auto view = this->getLocalViewHost(Access::ReadWrite);
4306 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4307 view (lclRow, colInd) = ScalarValue;
4308 }
4309
4310
4311 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4312 void
4314 sumIntoLocalValue (const LocalOrdinal lclRow,
4315 const size_t col,
4316 const impl_scalar_type& value,
4317 const bool atomic)
4318 {
4320 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4321 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4322 TEUCHOS_TEST_FOR_EXCEPTION(
4323 lclRow < minLocalIndex || lclRow > maxLocalIndex,
4324 std::runtime_error,
4325 "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4326 << " is invalid. The range of valid row indices on this process "
4327 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4328 << ", " << maxLocalIndex << "].");
4329 TEUCHOS_TEST_FOR_EXCEPTION(
4330 vectorIndexOutOfRange(col),
4331 std::runtime_error,
4332 "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4333 << " of the multivector is invalid.");
4334 }
4335
4336 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4337
4338 auto view = this->getLocalViewHost(Access::ReadWrite);
4339 if (atomic) {
4340 Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4341 }
4342 else {
4343 view(lclRow, colInd) += value;
4344 }
4345 }
4346
4347
4348 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4349 void
4351 replaceGlobalValue (const GlobalOrdinal gblRow,
4352 const size_t col,
4353 const impl_scalar_type& ScalarValue)
4354 {
4355 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4356 // touches the RCP's reference count, which isn't thread safe.
4357 const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4358
4360 const char tfecfFuncName[] = "replaceGlobalValue: ";
4361 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4362 (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4363 std::runtime_error,
4364 "Global row index " << gblRow << " is not present on this process "
4365 << this->getMap ()->getComm ()->getRank () << ".");
4366 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4367 (this->vectorIndexOutOfRange (col), std::runtime_error,
4368 "Vector index " << col << " of the MultiVector is invalid.");
4369 }
4370
4371 this->replaceLocalValue (lclRow, col, ScalarValue);
4372 }
4373
4374 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4375 void
4377 sumIntoGlobalValue (const GlobalOrdinal globalRow,
4378 const size_t col,
4379 const impl_scalar_type& value,
4380 const bool atomic)
4381 {
4382 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4383 // touches the RCP's reference count, which isn't thread safe.
4384 const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4385
4387 TEUCHOS_TEST_FOR_EXCEPTION(
4388 lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4389 std::runtime_error,
4390 "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4391 << " is not present on this process "
4392 << this->getMap ()->getComm ()->getRank () << ".");
4393 TEUCHOS_TEST_FOR_EXCEPTION(
4394 vectorIndexOutOfRange(col),
4395 std::runtime_error,
4396 "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4397 << " of the multivector is invalid.");
4398 }
4399
4400 this->sumIntoLocalValue (lclRow, col, value, atomic);
4401 }
4402
4403 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4404 template <class T>
4405 Teuchos::ArrayRCP<T>
4407 getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4408 size_t j) const
4409 {
4410 typedef Kokkos::DualView<impl_scalar_type*,
4411 typename dual_view_type::array_layout,
4412 execution_space> col_dual_view_type;
4413 const size_t col = isConstantStride () ? j : whichVectors_[j];
4414 col_dual_view_type X_col =
4415 Kokkos::subview (view_, Kokkos::ALL (), col);
4416 return Kokkos::Compat::persistingView (X_col.d_view);
4417 }
4418
4419
4420#ifdef TPETRA_ENABLE_DEPRECATED_CODE
4421 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4422 void
4425 owningView_.clear_sync_state ();
4426 }
4427
4428 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4429 void
4431 sync_host () {
4432 owningView_.sync_host ();
4433 }
4434
4435 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4436 void
4438 sync_device () {
4439 owningView_.sync_device ();
4440 }
4441#endif // TPETRA_ENABLE_DEPRECATED_CODE
4442
4443 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4444 bool
4446 need_sync_host () const {
4447 return owningView_.need_sync_host ();
4448 }
4449
4450 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4451 bool
4453 need_sync_device () const {
4454 return owningView_.need_sync_device ();
4455 }
4456
4457#ifdef TPETRA_ENABLE_DEPRECATED_CODE
4458 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4459 void
4461 modify_device () {
4462 owningView_.modify_device ();
4463 }
4464
4465 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4466 void
4468 modify_host () {
4469 owningView_.modify_host ();
4470 }
4471#endif // TPETRA_ENABLE_DEPRECATED_CODE
4472
4473#ifdef TPETRA_ENABLE_DEPRECATED_CODE
4474 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4477 getLocalViewDevice () const {
4478 return view_.view_device ();
4479 }
4480
4481 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4484 getLocalViewHost () const {
4485 return view_.view_host ();
4486 }
4487#endif // TPETRA_ENABLE_DEPRECATED_CODE
4488
4489 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4490 std::string
4492 descriptionImpl (const std::string& className) const
4493 {
4494 using Teuchos::TypeNameTraits;
4495
4496 std::ostringstream out;
4497 out << "\"" << className << "\": {";
4498 out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4499 << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4500 << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4501 << ", Node" << Node::name ()
4502 << "}, ";
4503 if (this->getObjectLabel () != "") {
4504 out << "Label: \"" << this->getObjectLabel () << "\", ";
4505 }
4506 out << ", numRows: " << this->getGlobalLength ();
4507 if (className != "Tpetra::Vector") {
4508 out << ", numCols: " << this->getNumVectors ()
4509 << ", isConstantStride: " << this->isConstantStride ();
4510 }
4511 if (this->isConstantStride ()) {
4512 out << ", columnStride: " << this->getStride ();
4513 }
4514 out << "}";
4515
4516 return out.str ();
4517 }
4518
4519 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4520 std::string
4522 description () const
4523 {
4524 return this->descriptionImpl ("Tpetra::MultiVector");
4525 }
4526
4527 namespace Details
4528 {
4529 template<typename ViewType>
4530 void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4531 {
4532 using std::endl;
4533 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4534 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4535 static_assert(ViewType::rank == 2,
4536 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4537 // The square braces [] and their contents are in Matlab
4538 // format, so users may copy and paste directly into Matlab.
4539 out << "Values("<<prefix<<"): " << std::endl
4540 << "[";
4541 const size_t numRows = v.extent(0);
4542 const size_t numCols = v.extent(1);
4543 if (numCols == 1) {
4544 for (size_t i = 0; i < numRows; ++i) {
4545 out << v(i,0);
4546 if (i + 1 < numRows) {
4547 out << "; ";
4548 }
4549 }
4550 }
4551 else {
4552 for (size_t i = 0; i < numRows; ++i) {
4553 for (size_t j = 0; j < numCols; ++j) {
4554 out << v(i,j);
4555 if (j + 1 < numCols) {
4556 out << ", ";
4557 }
4558 }
4559 if (i + 1 < numRows) {
4560 out << "; ";
4561 }
4562 }
4563 }
4564 out << "]" << endl;
4565 }
4566 }
4567
4568 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4569 std::string
4571 localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4572 {
4573 typedef LocalOrdinal LO;
4574 using std::endl;
4575
4576 if (vl <= Teuchos::VERB_LOW) {
4577 return std::string ();
4578 }
4579 auto map = this->getMap ();
4580 if (map.is_null ()) {
4581 return std::string ();
4582 }
4583 auto outStringP = Teuchos::rcp (new std::ostringstream ());
4584 auto outp = Teuchos::getFancyOStream (outStringP);
4585 Teuchos::FancyOStream& out = *outp;
4586 auto comm = map->getComm ();
4587 const int myRank = comm->getRank ();
4588 const int numProcs = comm->getSize ();
4589
4590 out << "Process " << myRank << " of " << numProcs << ":" << endl;
4591 Teuchos::OSTab tab1 (out);
4592
4593 // VERB_MEDIUM and higher prints getLocalLength()
4594 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4595 out << "Local number of rows: " << lclNumRows << endl;
4596
4597 if (vl > Teuchos::VERB_MEDIUM) {
4598 // VERB_HIGH and higher prints isConstantStride() and getStride().
4599 // The first is only relevant if the Vector has multiple columns.
4600 if (this->getNumVectors () != static_cast<size_t> (1)) {
4601 out << "isConstantStride: " << this->isConstantStride () << endl;
4602 }
4603 if (this->isConstantStride ()) {
4604 out << "Column stride: " << this->getStride () << endl;
4605 }
4606
4607 if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4608 // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4610 // so we can't use our regular accessor functins
4611
4612 // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4613 // to trigger (since this function is conceptually const)
4614 auto X_dev = view_.view_device();
4615 auto X_host = view_.view_host();
4616
4617 if(X_dev.data() == X_host.data()) {
4618 // One single allocation
4619 Details::print_vector(out,"unified",X_host);
4620 }
4621 else {
4622 Details::print_vector(out,"host",X_host);
4623 if(X_dev.span_is_contiguous())
4624 {
4625 auto X_dev_on_host = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), X_dev);
4626 Details::print_vector(out,"dev",X_dev_on_host);
4627 }
4628 else
4629 {
4630 auto X_contig = Tpetra::Details::TempView::toLayout<decltype(X_dev), Kokkos::LayoutLeft>(X_dev);
4631 auto X_dev_on_host = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), X_contig);
4632 Details::print_vector(out,"dev",X_dev_on_host);
4633 }
4634 }
4635 }
4636 }
4637 out.flush (); // make sure the ostringstream got everything
4638 return outStringP->str ();
4639 }
4640
4641 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4642 void
4644 describeImpl (Teuchos::FancyOStream& out,
4645 const std::string& className,
4646 const Teuchos::EVerbosityLevel verbLevel) const
4647 {
4648 using Teuchos::TypeNameTraits;
4649 using Teuchos::VERB_DEFAULT;
4650 using Teuchos::VERB_NONE;
4651 using Teuchos::VERB_LOW;
4652 using std::endl;
4653 const Teuchos::EVerbosityLevel vl =
4654 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4655
4656 if (vl == VERB_NONE) {
4657 return; // don't print anything
4658 }
4659 // If this Vector's Comm is null, then the Vector does not
4660 // participate in collective operations with the other processes.
4661 // In that case, it is not even legal to call this method. The
4662 // reasonable thing to do in that case is nothing.
4663 auto map = this->getMap ();
4664 if (map.is_null ()) {
4665 return;
4666 }
4667 auto comm = map->getComm ();
4668 if (comm.is_null ()) {
4669 return;
4670 }
4671
4672 const int myRank = comm->getRank ();
4673
4674 // Only Process 0 should touch the output stream, but this method
4675 // in general may need to do communication. Thus, we may need to
4676 // preserve the current tab level across multiple "if (myRank ==
4677 // 0) { ... }" inner scopes. This is why we sometimes create
4678 // OSTab instances by pointer, instead of by value. We only need
4679 // to create them by pointer if the tab level must persist through
4680 // multiple inner scopes.
4681 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4682
4683 // VERB_LOW and higher prints the equivalent of description().
4684 if (myRank == 0) {
4685 tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4686 out << "\"" << className << "\":" << endl;
4687 tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4688 {
4689 out << "Template parameters:" << endl;
4690 Teuchos::OSTab tab2 (out);
4691 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4692 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4693 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4694 << "Node: " << Node::name () << endl;
4695 }
4696 if (this->getObjectLabel () != "") {
4697 out << "Label: \"" << this->getObjectLabel () << "\", ";
4698 }
4699 out << "Global number of rows: " << this->getGlobalLength () << endl;
4700 if (className != "Tpetra::Vector") {
4701 out << "Number of columns: " << this->getNumVectors () << endl;
4702 }
4703 // getStride() may differ on different processes, so it (and
4704 // isConstantStride()) properly belong to per-process data.
4705 }
4706
4707 // This is collective over the Map's communicator.
4708 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4709 const std::string lclStr = this->localDescribeToString (vl);
4710 ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4711 }
4712 }
4713
4714 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4715 void
4717 describe (Teuchos::FancyOStream &out,
4718 const Teuchos::EVerbosityLevel verbLevel) const
4719 {
4720 this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4721 }
4722
4723 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4724 void
4726 removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4727 {
4728 replaceMap (newMap);
4729 }
4730
4731 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4732 void
4735 {
4736 using ::Tpetra::Details::localDeepCopy;
4737 const char prefix[] = "Tpetra::MultiVector::assign: ";
4738
4739 TEUCHOS_TEST_FOR_EXCEPTION
4740 (this->getGlobalLength () != src.getGlobalLength () ||
4741 this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4742 prefix << "Global dimensions of the two Tpetra::MultiVector "
4743 "objects do not match. src has dimensions [" << src.getGlobalLength ()
4744 << "," << src.getNumVectors () << "], and *this has dimensions ["
4745 << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4746 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4747 TEUCHOS_TEST_FOR_EXCEPTION
4748 (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4749 prefix << "The local row counts of the two Tpetra::MultiVector "
4750 "objects do not match. src has " << src.getLocalLength () << " row(s) "
4751 << " and *this has " << this->getLocalLength () << " row(s).");
4752
4753
4754 // See #1510. We're writing to *this, so we don't care about its
4755 // contents in either memory space, and we don't want
4756 // DualView::modify to complain about "concurrent modification" of
4757 // host and device Views.
4758
4762
4763 // If need sync to device, then host has most recent version.
4764 const bool src_last_updated_on_host = src.need_sync_device ();
4765
4766 if (src_last_updated_on_host) {
4767 localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4768 src.getLocalViewHost(Access::ReadOnly),
4769 this->isConstantStride (),
4770 src.isConstantStride (),
4771 this->whichVectors_ (),
4772 src.whichVectors_ ());
4773 }
4774 else {
4775 localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4776 src.getLocalViewDevice(Access::ReadOnly),
4777 this->isConstantStride (),
4778 src.isConstantStride (),
4779 this->whichVectors_ (),
4780 src.whichVectors_ ());
4781 }
4782 }
4783
4784 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4785 template<class T>
4786 Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4788 convert () const
4789 {
4790 using Teuchos::RCP;
4791
4792 auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4793 this->getNumVectors(),
4794 /*zeroOut=*/false));
4795 Tpetra::deep_copy(*newMV, *this);
4796 return newMV;
4797 }
4798
4799 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4800 bool
4803 {
4804 using ::Tpetra::Details::PackTraits;
4805
4806 const size_t l1 = this->getLocalLength();
4807 const size_t l2 = vec.getLocalLength();
4808 if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4809 return false;
4810 }
4811
4812 return true;
4813 }
4814
4815 template <class ST, class LO, class GO, class NT>
4818 std::swap(mv.map_, this->map_);
4819 std::swap(mv.view_, this->view_);
4820 std::swap(mv.origView_, this->origView_);
4821 std::swap(mv.owningView_, this->owningView_);
4822 std::swap(mv.whichVectors_, this->whichVectors_);
4823 }
4824
4825#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4826 template <class ST, class LO, class GO, class NT>
4827 void
4829 const Teuchos::SerialDenseMatrix<int, ST>& src)
4830 {
4831 using ::Tpetra::Details::localDeepCopy;
4832 using MV = MultiVector<ST, LO, GO, NT>;
4833 using IST = typename MV::impl_scalar_type;
4834 using input_view_type =
4835 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4836 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4837 using pair_type = std::pair<LO, LO>;
4838
4839 const LO numRows = static_cast<LO> (src.numRows ());
4840 const LO numCols = static_cast<LO> (src.numCols ());
4841
4842 TEUCHOS_TEST_FOR_EXCEPTION
4843 (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4844 numCols != static_cast<LO> (dst.getNumVectors ()),
4845 std::invalid_argument, "Tpetra::deep_copy: On Process "
4846 << dst.getMap ()->getComm ()->getRank () << ", dst is "
4847 << dst.getLocalLength () << " x " << dst.getNumVectors ()
4848 << ", but src is " << numRows << " x " << numCols << ".");
4849
4850 const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4851 input_view_type src_orig (src_raw, src.stride (), numCols);
4852 input_view_type src_view =
4853 Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4854
4855 constexpr bool src_isConstantStride = true;
4856 Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4857 localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4858 src_view,
4859 dst.isConstantStride (),
4860 src_isConstantStride,
4861 getMultiVectorWhichVectors (dst),
4862 srcWhichVectors);
4863 }
4864
4865 template <class ST, class LO, class GO, class NT>
4866 void
4867 deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4868 const MultiVector<ST, LO, GO, NT>& src)
4869 {
4870 using ::Tpetra::Details::localDeepCopy;
4871 using MV = MultiVector<ST, LO, GO, NT>;
4872 using IST = typename MV::impl_scalar_type;
4873 using output_view_type =
4874 Kokkos::View<IST**, Kokkos::LayoutLeft,
4875 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4876 using pair_type = std::pair<LO, LO>;
4877
4878 const LO numRows = static_cast<LO> (dst.numRows ());
4879 const LO numCols = static_cast<LO> (dst.numCols ());
4880
4881 TEUCHOS_TEST_FOR_EXCEPTION
4882 (numRows != static_cast<LO> (src.getLocalLength ()) ||
4883 numCols != static_cast<LO> (src.getNumVectors ()),
4884 std::invalid_argument, "Tpetra::deep_copy: On Process "
4885 << src.getMap ()->getComm ()->getRank () << ", src is "
4886 << src.getLocalLength () << " x " << src.getNumVectors ()
4887 << ", but dst is " << numRows << " x " << numCols << ".");
4888
4889 IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4890 output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4891 auto dst_view =
4892 Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4893
4894 constexpr bool dst_isConstantStride = true;
4895 Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4896
4897 // Prefer the host version of src's data.
4898 localDeepCopy (dst_view,
4899 src.getLocalViewHost(Access::ReadOnly),
4900 dst_isConstantStride,
4901 src.isConstantStride (),
4902 dstWhichVectors,
4903 getMultiVectorWhichVectors (src));
4904 }
4905#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4906
4907 template <class Scalar, class LO, class GO, class NT>
4908 Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4909 createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4910 size_t numVectors)
4911 {
4913 return Teuchos::rcp (new MV (map, numVectors));
4914 }
4915
4916 template <class ST, class LO, class GO, class NT>
4919 {
4920 typedef MultiVector<ST, LO, GO, NT> MV;
4921 MV cpy (src.getMap (), src.getNumVectors (), false);
4922 cpy.assign (src);
4923 return cpy;
4924 }
4925
4926} // namespace Tpetra
4927
4928//
4929// Explicit instantiation macro
4930//
4931// Must be expanded from within the Tpetra namespace!
4932//
4933
4934#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4935# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4936 template class MultiVector< SCALAR , LO , GO , NODE >; \
4937 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4938 template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4939 template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4940 template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4941
4942#else
4943# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4944 template class MultiVector< SCALAR , LO , GO , NODE >; \
4945 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4946
4947#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4948
4949
4950#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4951 \
4952 template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4953 MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4954
4955
4956#endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void fill(const ExecutionSpace &execSpace, const ViewType &X, const ValueType &alpha, const IndexType numRows, const IndexType numCols)
Fill the entries of the given 1-D or 2-D Kokkos::View with the given scalar value alpha.
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
static bool assumeMpiIsCudaAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
Base class for distributed Tpetra objects that support data redistribution.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
dual_view_type origView_
The "original view" of the MultiVector's data.
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
virtual size_t constantNumberOfPackets() const
Number of packets to send per LID.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual std::string description() const
A simple one-line description of this object.
void reduce()
Sum values of a locally replicated multivector across all processes.
void randomize()
Set all values in the multivector to pseudorandom numbers.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes owning zero rows from the Map and their communicator.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM)
Perform copies and permutations that are local to the calling (MPI) process.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
typename Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type owningView_
The true original DualView - it owns the memory of this MultiVector, and was not constructed as a sub...
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
virtual bool checkSizes(const SrcDistObject &sourceObj)
Whether data redistribution between sourceObj and this object is legal.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
Implementation details of Tpetra.
int local_ordinal_type
Default value of Scalar template parameter.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
EWhichNorm
Input argument for normImpl() (which see).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.