Tpetra parallel linear algebra Version of the Day
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42
43#include "TpetraCore_config.h"
44#include "Teuchos_Array.hpp"
45#include "Teuchos_ArrayView.hpp"
46#include "Teuchos_OrdinalTraits.hpp"
54#include "Kokkos_Core.hpp"
55#include <memory>
56#include <string>
57
78
79namespace Tpetra {
80
81//
82// Users must never rely on anything in the Details namespace.
83//
84namespace Details {
85
86namespace UnpackAndCombineCrsMatrixImpl {
87
97template<class ST, class LO, class GO>
98KOKKOS_FUNCTION int
99unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
100 const typename PackTraits<int>::output_array_type& pids_out,
101 const typename PackTraits<ST>::output_array_type& vals_out,
102 const char imports[],
103 const size_t offset,
104 const size_t /* num_bytes */,
105 const size_t num_ent,
106 const size_t bytes_per_value)
107{
108 if (num_ent == 0) {
109 // Empty rows always take zero bytes, to ensure sparsity.
110 return 0;
111 }
112 bool unpack_pids = pids_out.size() > 0;
113
114 const size_t num_ent_beg = offset;
115 const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
116
117 const size_t gids_beg = num_ent_beg + num_ent_len;
118 const size_t gids_len =
119 num_ent * PackTraits<GO>::packValueCount (GO (0));
120
121 const size_t pids_beg = gids_beg + gids_len;
122 const size_t pids_len = unpack_pids ?
123 size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
124 size_t (0);
125
126 const size_t vals_beg = gids_beg + gids_len + pids_len;
127 const size_t vals_len = num_ent * bytes_per_value;
128
129 const char* const num_ent_in = imports + num_ent_beg;
130 const char* const gids_in = imports + gids_beg;
131 const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
132 const char* const vals_in = imports + vals_beg;
133
134 size_t num_bytes_out = 0;
135 LO num_ent_out;
136 num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
137 if (static_cast<size_t> (num_ent_out) != num_ent) {
138 return 20; // error code
139 }
140
141 {
142 Kokkos::pair<int, size_t> p;
143 p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
144 if (p.first != 0) {
145 return 21; // error code
146 }
147 num_bytes_out += p.second;
148
149 if (unpack_pids) {
150 p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
151 if (p.first != 0) {
152 return 22; // error code
153 }
154 num_bytes_out += p.second;
155 }
156
157 p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
158 if (p.first != 0) {
159 return 23; // error code
160 }
161 num_bytes_out += p.second;
162 }
163
164 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165 if (num_bytes_out != expected_num_bytes) {
166 return 24; // error code
167 }
168 return 0; // no errors
169}
170
181template<class LocalMatrix, class LocalMap, class BufferDeviceType>
183 typedef LocalMatrix local_matrix_type;
184 typedef LocalMap local_map_type;
185
186 typedef typename local_matrix_type::value_type ST;
187 typedef typename local_map_type::local_ordinal_type LO;
188 typedef typename local_map_type::global_ordinal_type GO;
189 typedef typename local_map_type::device_type DT;
190 typedef typename DT::execution_space XS;
191
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
197
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
200
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and "
203 "LocalMatrix::ordinal_type must be the same.");
204
205 local_matrix_type local_matrix;
206 local_map_type local_col_map;
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
212 Tpetra::CombineMode combine_mode;
213 size_t batch_size;
214 size_t bytes_per_value;
215 bool atomic;
216 error_type error_code;
217
219 const local_matrix_type& local_matrix_in,
220 const local_map_type& local_col_map_in,
221 const input_buffer_type& imports_in,
222 const num_packets_per_lid_type& num_packets_per_lid_in,
223 const import_lids_type& import_lids_in,
224 const Kokkos::View<const LO*[2], DT>& batch_info_in,
225 const offsets_type& offsets_in,
226 const Tpetra::CombineMode combine_mode_in,
227 const size_t batch_size_in,
228 const size_t bytes_per_value_in,
229 const bool atomic_in) :
230 local_matrix (local_matrix_in),
231 local_col_map (local_col_map_in),
232 imports (imports_in),
233 num_packets_per_lid (num_packets_per_lid_in),
234 import_lids (import_lids_in),
235 batch_info (batch_info_in),
236 offsets (offsets_in),
237 combine_mode (combine_mode_in),
238 batch_size (batch_size_in),
239 bytes_per_value (bytes_per_value_in),
240 atomic (atomic_in),
241 error_code("error")
242 {}
243
244 KOKKOS_INLINE_FUNCTION
245 void operator()(member_type team_member) const
246 {
247 using Kokkos::View;
248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
250
251 const LO batch = team_member.league_rank();
252 const LO lid_no = batch_info(batch, 0);
253 const LO batch_no = batch_info(batch, 1);
254
255 const size_t num_bytes = num_packets_per_lid(lid_no);
256
257 // Only unpack data if there is a nonzero number of bytes.
258 if (num_bytes == 0)
259 return;
260
261 // there is actually something in the row
262 const LO import_lid = import_lids(lid_no);
263 const size_t buf_size = imports.size();
264 const size_t offset = offsets(lid_no);
265
266 // Get the number of entries to expect in the received data for this row.
267 LO num_ent_LO = 0;
268 const char* const in_buf = imports.data() + offset;
269 (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
270 const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
271
272 // Count the number of bytes expected to unpack
273 size_t expected_num_bytes = 0;
274 {
275 expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
276 expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
277 expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
278 }
279
280 if (expected_num_bytes > num_bytes)
281 {
282// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
283#ifndef KOKKOS_ENABLE_SYCL
284 printf(
285 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287 (int) lid_no, (int) expected_num_bytes, (int) num_bytes
288 );
289#endif
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
291 return;
292 }
293
294 if (offset > buf_size || offset + num_bytes > buf_size)
295 {
296// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
297#ifndef KOKKOS_ENABLE_SYCL
298 printf(
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
301 (int) lid_no, (int) offset, (int) buf_size
302 );
303#endif
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
305 return;
306 }
307
308 // Determine the number of entries to unpack in this batch
309 size_t num_entries_in_batch = 0;
310 if (num_entries_in_row <= batch_size)
311 num_entries_in_batch = num_entries_in_row;
312 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313 num_entries_in_batch = batch_size;
314 else
315 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
316
317 const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
318 const size_t num_ent_start = offset;
319 const size_t num_ent_end = num_ent_start + bytes_per_lid;
320
321 const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
322 const size_t gids_start = num_ent_end;
323 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
324
325 const size_t vals_start = gids_end;
326
327 const size_t shift = batch_no * batch_size;
328 const char* const num_ent_in = imports.data() + num_ent_start;
329 const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330 const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
331
332 LO num_ent_out;
333 (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
334 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
335 {
336// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
337#ifndef KOKKOS_ENABLE_SYCL
338 printf(
339 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341 (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
342 );
343#endif
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
345 }
346
347 constexpr bool matrix_has_sorted_rows = true; // see #6282
348 Kokkos::parallel_for(
349 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
350 KOKKOS_LAMBDA(const LO& j)
351 {
352 size_t distance = 0;
353
354 GO gid_out;
355 distance = j * bytes_per_gid;
356 (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
357 auto lid_out = local_col_map.getLocalElement(gid_out);
358
359 // Column indices come in as global indices, in case the
360 // source object's column Map differs from the target object's
361 // (this's) column Map, and must be converted local index values
362
363 // assume that ST is default constructible
364 ST val_out;
365 distance = j * bytes_per_value;
366 (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
367
368 if (combine_mode == ADD) {
369 // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
370 // different threads don't touch the same row (i.e., no
371 // duplicates in incoming LIDs list).
372 const bool use_atomic_updates = atomic;
373 (void)local_matrix.sumIntoValues(
374 import_lid,
375 &lid_out,
376 1,
377 &val_out,
378 matrix_has_sorted_rows,
379 use_atomic_updates
380 );
381 } else if (combine_mode == REPLACE) {
382 // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
383 // combine mode with multiple incoming rows that touch the same
384 // target matrix entries, so we never need atomic updates.
385 const bool use_atomic_updates = false;
386 (void)local_matrix.replaceValues(
387 import_lid,
388 &lid_out,
389 1,
390 &val_out,
391 matrix_has_sorted_rows,
392 use_atomic_updates
393 );
394 } else {
395 // should never get here
396// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
397#ifndef KOKKOS_ENABLE_SYCL
398 printf(
399 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
400 "At row %d, an unknown error occurred during unpack\n", (int) lid_no
401 );
402#endif
403 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
404 }
405 }
406 );
407
408 team_member.team_barrier();
409
410 }
411
413 int error() const {
414 auto error_code_h = Kokkos::create_mirror_view_and_copy(
415 Kokkos::HostSpace(), error_code
416 );
417 return error_code_h();
418 }
419
420};
421
422struct MaxNumEntTag {};
423struct TotNumEntTag {};
424
433template<class LO, class DT, class BDT>
435public:
436 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
437 typedef Kokkos::View<const size_t*, DT> offsets_type;
438 typedef Kokkos::View<const char*, BDT> input_buffer_type;
439 // This needs to be public, since it appears in the argument list of
440 // public methods (see below). Otherwise, build errors may happen.
441 typedef size_t value_type;
442
443private:
444 num_packets_per_lid_type num_packets_per_lid;
445 offsets_type offsets;
446 input_buffer_type imports;
447
448public:
449 NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
450 const offsets_type& offsets_in,
451 const input_buffer_type& imports_in) :
452 num_packets_per_lid (num_packets_per_lid_in),
453 offsets (offsets_in),
454 imports (imports_in)
455 {}
456
457 KOKKOS_INLINE_FUNCTION void
458 operator() (const MaxNumEntTag, const LO i, value_type& update) const {
459 // Get how many entries to expect in the received data for this row.
460 const size_t num_bytes = num_packets_per_lid(i);
461 if (num_bytes > 0) {
462 LO num_ent_LO = 0; // output argument of unpackValue
463 const char* const in_buf = imports.data () + offsets(i);
464 (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
465 const size_t num_ent = static_cast<size_t> (num_ent_LO);
466
467 update = (update < num_ent) ? num_ent : update;
468 }
469 }
470
471 KOKKOS_INLINE_FUNCTION void
472 join (const MaxNumEntTag,
473 volatile value_type& dst,
474 const volatile value_type& src) const
475 {
476 if (dst < src) dst = src;
477 }
478
479 KOKKOS_INLINE_FUNCTION void
480 operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
481 // Get how many entries to expect in the received data for this row.
482 const size_t num_bytes = num_packets_per_lid(i);
483 if (num_bytes > 0) {
484 LO num_ent_LO = 0; // output argument of unpackValue
485 const char* const in_buf = imports.data () + offsets(i);
486 (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
487 tot_num_ent += static_cast<size_t> (num_ent_LO);
488 }
489 }
490};
491
499template<class LO, class DT, class BDT>
500size_t
502 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
503 const Kokkos::View<const size_t*, DT>& offsets,
504 const Kokkos::View<const char*, BDT>& imports)
505{
506 typedef typename DT::execution_space XS;
507 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
508 MaxNumEntTag> range_policy;
509
510 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
511 imports);
512 const LO numRowsToUnpack =
513 static_cast<LO> (num_packets_per_lid.extent (0));
514 size_t max_num_ent = 0;
515 Kokkos::parallel_reduce ("Max num entries in CRS",
516 range_policy (0, numRowsToUnpack),
517 functor, max_num_ent);
518 return max_num_ent;
519}
520
528template<class LO, class DT, class BDT>
529size_t
531 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
532 const Kokkos::View<const size_t*, DT>& offsets,
533 const Kokkos::View<const char*, BDT>& imports)
534{
535 typedef typename DT::execution_space XS;
536 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
537 size_t tot_num_ent = 0;
538 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
539 imports);
540 const LO numRowsToUnpack =
541 static_cast<LO> (num_packets_per_lid.extent (0));
542 Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
543 range_policy (0, numRowsToUnpack),
544 functor, tot_num_ent);
545 return tot_num_ent;
546}
547
548template<class LO>
549KOKKOS_INLINE_FUNCTION
550size_t
551unpackRowCount(const char imports[],
552 const size_t offset,
553 const size_t num_bytes)
554{
555 using PT = PackTraits<LO>;
556
557 LO num_ent_LO = 0;
558 if (num_bytes > 0) {
559 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
560 if (p_num_bytes > num_bytes) {
561 return OrdinalTraits<size_t>::invalid();
562 }
563 const char* const in_buf = imports + offset;
564 (void) PT::unpackValue(num_ent_LO, in_buf);
565 }
566 return static_cast<size_t>(num_ent_LO);
567}
568
573template<class View1, class View2>
574inline
575bool
577 const View1& batches_per_lid,
578 View2& batch_info
579)
580{
581 using LO = typename View2::value_type;
582 size_t batch = 0;
583 for (size_t i=0; i<batches_per_lid.extent(0); i++)
584 {
585 for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
586 {
587 batch_info(batch, 0) = static_cast<LO>(i);
588 batch_info(batch, 1) = batch_no;
589 batch++;
590 }
591 }
592 return batch == batch_info.extent(0);
593}
594
602template<class LocalMatrix, class LocalMap, class BufferDeviceType>
603void
605 const LocalMatrix& local_matrix,
606 const LocalMap& local_map,
607 const Kokkos::View<const char*, BufferDeviceType>& imports,
608 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
610 const Tpetra::CombineMode combine_mode)
611{
612 using ST = typename LocalMatrix::value_type;
613 using LO = typename LocalMap::local_ordinal_type;
614 using DT = typename LocalMap::device_type;
615 using XS = typename DT::execution_space;
616 const char prefix[] =
617 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
618 "unpackAndCombineIntoCrsMatrix: ";
619
620 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
621 if (num_import_lids == 0) {
622 // Nothing to unpack
623 return;
624 }
625
626 {
627 // Check for correct input
628 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
629 std::invalid_argument,
630 prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
631 "static graph (i.e., was constructed with the CrsMatrix constructor "
632 "that takes a const CrsGraph pointer).");
633
634 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
635 std::invalid_argument,
636 prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
637 "(i.e., was constructed with the CrsMatrix constructor that takes a "
638 "const CrsGraph pointer).");
639
640 // Unknown combine mode!
641 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
642 std::invalid_argument,
643 prefix << "Invalid combine mode; should never get "
644 "here! Please report this bug to the Tpetra developers.");
645
646 // Check that sizes of input objects are consistent.
647 bool bad_num_import_lids =
648 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
649 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
650 std::invalid_argument,
651 prefix << "importLIDs.size() (" << num_import_lids << ") != "
652 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
653 } // end QA error checking
654
655 // Get the offsets
656 Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
657 computeOffsetsFromCounts(offsets, num_packets_per_lid);
658
659 // Determine the sizes of the unpack batches
660 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
661 const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
662 const size_t batch_size = std::min(default_batch_size, max_num_ent);
663
664 // To achieve some balance amongst threads, unpack each row in equal size batches
665 size_t num_batches = 0;
666 Kokkos::View<LO*[2], DT> batch_info("", num_batches);
667 Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
668 // Compute meta data that allows batch unpacking
669 Kokkos::parallel_reduce(
670 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
671 KOKKOS_LAMBDA(const size_t i, size_t& batches)
672 {
673 const size_t num_entries_in_row = unpackRowCount<LO>(
674 imports.data(), offsets(i), num_packets_per_lid(i)
675 );
676 batches_per_lid(i) =
677 (num_entries_in_row <= batch_size) ?
678 1 :
679 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
680 batches += batches_per_lid(i);
681 },
682 num_batches
683 );
684 Kokkos::resize(batch_info, num_batches);
685
686 Kokkos::HostSpace host_space;
687 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
688 Kokkos::deep_copy(batches_per_lid_h, batches_per_lid);
689
690 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
691
692 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
693 Kokkos::deep_copy(batch_info, batch_info_h);
694
695 // FIXME (TJF SEP 2017)
696 // The scalar type is not necessarily default constructible
697 size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
698
699 // Now do the actual unpack!
700 const bool atomic = XS::concurrency() != 1;
702 functor f(
703 local_matrix,
704 local_map,
705 imports,
706 num_packets_per_lid,
707 import_lids,
708 batch_info,
709 offsets,
710 combine_mode,
711 batch_size,
712 bytes_per_value,
713 atomic
714 );
715
716 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
718#if defined(KOKKOS_ENABLE_CUDA)
719 constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
720#else
721 constexpr bool is_cuda = false;
722#endif
723 if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
724 {
725 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
726 }
727 else
728 {
729 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
730 }
731
732 auto error_code = f.error();
733 TEUCHOS_TEST_FOR_EXCEPTION(
734 error_code != 0,
735 std::runtime_error,
736 prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
737 );
738}
739
740template<class LocalMatrix, class BufferDeviceType>
741size_t
743 const LocalMatrix& local_matrix,
745 const Kokkos::View<const char*, BufferDeviceType>& imports,
746 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
747 const size_t num_same_ids)
748{
749 using Kokkos::parallel_reduce;
750 typedef typename LocalMatrix::ordinal_type LO;
751 typedef typename LocalMatrix::device_type device_type;
752 typedef typename device_type::execution_space XS;
753 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
754 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
755 typedef BufferDeviceType BDT;
756
757 size_t count = 0;
758 LO num_items;
759
760 // Number of matrix entries to unpack (returned by this function).
761 num_items = static_cast<LO>(num_same_ids);
762 if (num_items) {
763 size_t kcnt = 0;
764 parallel_reduce(range_policy(0, num_items),
765 KOKKOS_LAMBDA(const LO lid, size_t& update) {
766 update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
767 -local_matrix.graph.row_map[lid]);
768 }, kcnt);
769 count += kcnt;
770 }
771
772 // Count entries copied directly from the source matrix with permuting.
773 num_items = static_cast<LO>(permute_from_lids.extent(0));
774 if (num_items) {
775 size_t kcnt = 0;
776 parallel_reduce(range_policy(0, num_items),
777 KOKKOS_LAMBDA(const LO i, size_t& update) {
778 const LO lid = permute_from_lids(i);
779 update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
780 - local_matrix.graph.row_map[lid]);
781 }, kcnt);
782 count += kcnt;
783 }
784
785 {
786 // Count entries received from other MPI processes.
787 const size_type np = num_packets_per_lid.extent(0);
788 Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
789 computeOffsetsFromCounts(offsets, num_packets_per_lid);
790 count +=
791 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
792 offsets, imports);
793 }
794
795 return count;
796}
797
799template<class LO, class DT, class BDT>
800int
801setupRowPointersForRemotes(
802 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
803 const typename PackTraits<LO>::input_array_type& import_lids,
804 const Kokkos::View<const char*, BDT>& imports,
805 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
806 const typename PackTraits<size_t>::input_array_type& offsets)
807{
808 using Kokkos::parallel_reduce;
809 typedef typename DT::execution_space XS;
810 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
811 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
812
813 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
814 const size_type N = num_packets_per_lid.extent(0);
815
816 int errors = 0;
817 parallel_reduce ("Setup row pointers for remotes",
818 range_policy (0, N),
819 KOKKOS_LAMBDA (const size_t i, int& k_error) {
820 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
821 const size_t num_bytes = num_packets_per_lid(i);
822 const size_t offset = offsets(i);
823 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
824 if (num_ent == InvalidNum) {
825 k_error += 1;
826 }
827 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
828 }, errors);
829 return errors;
830}
831
832// Convert array of row lengths to a CRS pointer array
833template<class DT>
834void
835makeCrsRowPtrFromLengths(
836 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
837 const Kokkos::View<size_t*,DT>& new_start_row)
838{
839 using Kokkos::parallel_scan;
840 typedef typename DT::execution_space XS;
841 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
842 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
843 const size_type N = new_start_row.extent(0);
844 parallel_scan(range_policy(0, N),
845 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
846 auto cur_val = tgt_rowptr(i);
847 if (final) {
848 tgt_rowptr(i) = update;
849 new_start_row(i) = tgt_rowptr(i);
850 }
851 update += cur_val;
852 }
853 );
854}
855
856template<class LocalMatrix, class LocalMap>
857void
858copyDataFromSameIDs(
860 const typename PackTraits<int>::output_array_type& tgt_pids,
862 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
863 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
864 const typename PackTraits<int>::input_array_type& src_pids,
865 const LocalMatrix& local_matrix,
866 const LocalMap& local_col_map,
867 const size_t num_same_ids,
868 const int my_pid)
869{
870 using Kokkos::parallel_for;
871 typedef typename LocalMap::device_type DT;
872 typedef typename LocalMap::local_ordinal_type LO;
873 typedef typename DT::execution_space XS;
874 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
875
876 parallel_for(range_policy(0, num_same_ids),
877 KOKKOS_LAMBDA(const size_t i) {
878 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
879
880 const LO src_lid = static_cast<LO>(i);
881 size_t src_row = local_matrix.graph.row_map(src_lid);
882
883 const LO tgt_lid = static_cast<LO>(i);
884 const size_t tgt_row = tgt_rowptr(tgt_lid);
885
886 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
887 - local_matrix.graph.row_map(src_lid);
888 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
889
890 for (size_t j=local_matrix.graph.row_map(src_lid);
891 j<local_matrix.graph.row_map(src_lid+1); ++j) {
892 LO src_col = local_matrix.graph.entries(j);
893 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
894 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
895 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
896 }
897 }
898 );
899}
900
901template<class LocalMatrix, class LocalMap>
902void
903copyDataFromPermuteIDs(
905 const typename PackTraits<int>::output_array_type& tgt_pids,
907 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
908 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
909 const typename PackTraits<int>::input_array_type& src_pids,
912 const LocalMatrix& local_matrix,
913 const LocalMap& local_col_map,
914 const int my_pid)
915{
916 using Kokkos::parallel_for;
917 typedef typename LocalMap::device_type DT;
918 typedef typename LocalMap::local_ordinal_type LO;
919 typedef typename DT::execution_space XS;
920 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
921 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
922
923 const size_type num_permute_to_lids = permute_to_lids.extent(0);
924
925 parallel_for(range_policy(0, num_permute_to_lids),
926 KOKKOS_LAMBDA(const size_t i) {
927 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
928
929 const LO src_lid = permute_from_lids(i);
930 const size_t src_row = local_matrix.graph.row_map(src_lid);
931
932 const LO tgt_lid = permute_to_lids(i);
933 const size_t tgt_row = tgt_rowptr(tgt_lid);
934
935 size_t nsr = local_matrix.graph.row_map(src_lid+1)
936 - local_matrix.graph.row_map(src_lid);
937 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
938
939 for (size_t j=local_matrix.graph.row_map(src_lid);
940 j<local_matrix.graph.row_map(src_lid+1); ++j) {
941 LO src_col = local_matrix.graph.entries(j);
942 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
943 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
944 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
945 }
946 }
947 );
948}
949
950template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
951int
952unpackAndCombineIntoCrsArrays2(
954 const typename PackTraits<int>::output_array_type& tgt_pids,
956 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
957 const typename PackTraits<size_t>::input_array_type& offsets,
959 const Kokkos::View<const char*, BufferDeviceType>& imports,
960 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
961 const LocalMatrix& /* local_matrix */,
962 const LocalMap /*& local_col_map*/,
963 const int my_pid,
964 const size_t bytes_per_value)
965{
966 using Kokkos::View;
967 using Kokkos::subview;
968 using Kokkos::MemoryUnmanaged;
969 using Kokkos::parallel_reduce;
970 using Kokkos::atomic_fetch_add;
972 typedef typename LocalMap::device_type DT;
973 typedef typename LocalMap::local_ordinal_type LO;
974 typedef typename LocalMap::global_ordinal_type GO;
975 typedef typename LocalMatrix::value_type ST;
976 typedef typename DT::execution_space XS;
977 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
978 typedef typename Kokkos::pair<size_type, size_type> slice;
979 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
980
981 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
982 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
983 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
984
985 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
986
987 int errors = 0;
988 const size_type num_import_lids = import_lids.size();
989
990 // RemoteIDs: Loop structure following UnpackAndCombine
991 parallel_reduce ("Unpack and combine into CRS",
992 range_policy (0, num_import_lids),
993 KOKKOS_LAMBDA (const size_t i, int& k_error) {
994 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
995 const size_t num_bytes = num_packets_per_lid(i);
996 const size_t offset = offsets(i);
997 if (num_bytes == 0) {
998 // Empty buffer means that the row is empty.
999 return;
1000 }
1001 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1002 if (num_ent == InvalidNum) {
1003 k_error += 1;
1004 return;
1005 }
1006 const LO lcl_row = import_lids(i);
1007 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1008 const size_t end_row = start_row + num_ent;
1009
1010 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1011 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1012 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1013
1014 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1015 imports.data(), offset, num_bytes,
1016 num_ent, bytes_per_value);
1017
1018 // Correct target PIDs.
1019 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1020 const int pid = pids_out(j);
1021 pids_out(j) = (pid != my_pid) ? pid : -1;
1022 }
1023 }, errors);
1024
1025 return errors;
1026}
1027
1028template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1029void
1031 const LocalMatrix & local_matrix,
1032 const LocalMap & local_col_map,
1034 const Kokkos::View<const char*, BufferDeviceType>& imports,
1035 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1038 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1041 const typename PackTraits<int>::input_array_type& src_pids,
1042 const typename PackTraits<int>::output_array_type& tgt_pids,
1043 const size_t num_same_ids,
1044 const size_t tgt_num_rows,
1045 const size_t tgt_num_nonzeros,
1046 const int my_tgt_pid,
1047 const size_t bytes_per_value)
1048{
1049 using Kokkos::View;
1050 using Kokkos::subview;
1051 using Kokkos::parallel_for;
1052 using Kokkos::MemoryUnmanaged;
1054 typedef typename LocalMap::device_type DT;
1055 typedef typename LocalMap::local_ordinal_type LO;
1056 typedef typename DT::execution_space XS;
1057 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1058 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1059 typedef BufferDeviceType BDT;
1060
1061 const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1062
1063 const size_t N = tgt_num_rows;
1064
1065 // In the case of reduced communicators, the sourceMatrix won't have
1066 // the right "my_pid", so thus we have to supply it.
1067 const int my_pid = my_tgt_pid;
1068
1069 // Zero the rowptr
1070 parallel_for(range_policy(0, N+1),
1071 KOKKOS_LAMBDA(const size_t i) {
1072 tgt_rowptr(i) = 0;
1073 }
1074 );
1075
1076 // same IDs: Always first, always in the same place
1077 parallel_for(range_policy(0, num_same_ids),
1078 KOKKOS_LAMBDA(const size_t i) {
1079 const LO tgt_lid = static_cast<LO>(i);
1080 const LO src_lid = static_cast<LO>(i);
1081 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1082 - local_matrix.graph.row_map(src_lid);
1083 }
1084 );
1085
1086 // Permute IDs: Still local, but reordered
1087 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1088 parallel_for(range_policy(0, num_permute_to_lids),
1089 KOKKOS_LAMBDA(const size_t i) {
1090 const LO tgt_lid = permute_to_lids(i);
1091 const LO src_lid = permute_from_lids(i);
1092 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1093 - local_matrix.graph.row_map(src_lid);
1094 }
1095 );
1096
1097 // Get the offsets from the number of packets per LID
1098 const size_type num_import_lids = import_lids.extent(0);
1099 View<size_t*, DT> offsets("offsets", num_import_lids+1);
1100 computeOffsetsFromCounts(offsets, num_packets_per_lid);
1101
1102#ifdef HAVE_TPETRA_DEBUG
1103 {
1104 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1105 const bool condition =
1106 nth_offset_h != static_cast<size_t>(imports.extent (0));
1107 TEUCHOS_TEST_FOR_EXCEPTION
1108 (condition, std::logic_error, prefix
1109 << "The final offset in bytes " << nth_offset_h
1110 << " != imports.size() = " << imports.extent(0)
1111 << ". Please report this bug to the Tpetra developers.");
1112 }
1113#endif // HAVE_TPETRA_DEBUG
1114
1115 // Setup row pointers for remotes
1116 int k_error =
1117 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1118 import_lids, imports, num_packets_per_lid, offsets);
1119 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1120 << " Error transferring data to target row pointers. "
1121 "Please report this bug to the Tpetra developers.");
1122
1123 // If multiple processes contribute to the same row, we may need to
1124 // update row offsets. This tracks that.
1125 View<size_t*, DT> new_start_row ("new_start_row", N+1);
1126
1127 // Turn row length into a real CRS row pointer
1128 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1129
1130 // SameIDs: Copy the data over
1131 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1132 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1133
1134 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1135 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1136 local_matrix, local_col_map, my_pid);
1137
1138 if (imports.extent(0) <= 0) {
1139 return;
1140 }
1141
1142 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1143 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1144 local_matrix, local_col_map, my_pid, bytes_per_value);
1145 TEUCHOS_TEST_FOR_EXCEPTION(
1146 unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1147 "should never happen. Please report this bug to the Tpetra developers.");
1148
1149 return;
1150}
1151
1152} // namespace UnpackAndCombineCrsMatrixImpl
1153
1193template<typename ST, typename LO, typename GO, typename Node>
1194void
1196 const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1197 const Teuchos::ArrayView<const char>& imports,
1198 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1199 const Teuchos::ArrayView<const LO>& importLIDs,
1200 size_t /* constantNumPackets */,
1201 CombineMode combineMode)
1202{
1203 using Kokkos::View;
1204 typedef typename Node::device_type device_type;
1205 typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1206 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1207 "Node::device_type and LocalMatrix::device_type must be the same.");
1208
1209 // Convert all Teuchos::Array to Kokkos::View.
1210 device_type outputDevice;
1211
1212 // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1213 // them to device. Since unpacking is done directly in to the local matrix
1214 // (lclMatrix), no copying needs to be performed after unpacking.
1215 auto num_packets_per_lid_d =
1216 create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1217 numPacketsPerLID.size(), true, "num_packets_per_lid");
1218
1219 auto import_lids_d =
1220 create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1221 importLIDs.size(), true, "import_lids");
1222
1223 auto imports_d =
1224 create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1225 imports.size(), true, "imports");
1226
1227 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1228 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1229
1230//KDDKDD This loop doesn't appear to do anything; what is it?
1231//KDDKDD for (int i=0; i<importLIDs.size(); i++)
1232//KDDKDD {
1233//KDDKDD auto lclRow = importLIDs[i];
1234//KDDKDD Teuchos::ArrayView<const LO> A_indices;
1235//KDDKDD Teuchos::ArrayView<const ST> A_values;
1236//KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1237//KDDKDD }
1238 // Now do the actual unpack!
1239 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1240 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1241 import_lids_d, combineMode);
1242
1243}
1244
1245template<typename ST, typename LO, typename GO, typename NT>
1246void
1247unpackCrsMatrixAndCombineNew(
1248 const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1249 Kokkos::DualView<char*,
1251 Kokkos::DualView<size_t*,
1252 typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1253 const Kokkos::DualView<const LO*,
1255 const size_t /* constantNumPackets */,
1256 const CombineMode combineMode)
1257{
1258 using Kokkos::View;
1259 using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1260 using dist_object_type = DistObject<char, LO, GO, NT>;
1261 using device_type = typename crs_matrix_type::device_type;
1262 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1263 using buffer_device_type = typename dist_object_type::buffer_device_type;
1264
1265 static_assert
1266 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1267 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1268 "must be the same.");
1269
1270 if (numPacketsPerLID.need_sync_device()) {
1271 numPacketsPerLID.sync_device ();
1272 }
1273 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1274
1275 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1276 auto import_lids_d = importLIDs.view_device ();
1277
1278 if (imports.need_sync_device()) {
1279 imports.sync_device ();
1280 }
1281 auto imports_d = imports.view_device ();
1282
1283 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1284 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1285 typedef decltype (local_col_map) local_map_type;
1286
1287 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1288 local_matrix_device_type,
1289 local_map_type,
1290 buffer_device_type
1291 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1292 import_lids_d, combineMode);
1293}
1294
1341//
1350template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1351size_t
1354 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1355 const Teuchos::ArrayView<const char> &imports,
1356 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1357 size_t /* constantNumPackets */,
1358 CombineMode /* combineMode */,
1359 size_t numSameIDs,
1360 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1361 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1362{
1363 using Kokkos::MemoryUnmanaged;
1364 using Kokkos::View;
1365 typedef typename Node::device_type DT;
1367 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1368
1369 TEUCHOS_TEST_FOR_EXCEPTION
1370 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1371 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1372 "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1373 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1374 // process, then the matrix is neither locally nor globally indexed.
1375 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1376 TEUCHOS_TEST_FOR_EXCEPTION
1377 (! locallyIndexed, std::invalid_argument, prefix << "The input "
1378 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1379 TEUCHOS_TEST_FOR_EXCEPTION
1380 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1381 prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1382 "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1383
1384 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1385 auto permute_from_lids_d =
1387 permuteFromLIDs.getRawPtr (),
1388 permuteFromLIDs.size (), true,
1389 "permute_from_lids");
1390 auto imports_d =
1392 imports.getRawPtr (),
1393 imports.size (), true,
1394 "imports");
1395 auto num_packets_per_lid_d =
1397 numPacketsPerLID.getRawPtr (),
1398 numPacketsPerLID.size (), true,
1399 "num_packets_per_lid");
1400
1401 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1402 local_matrix, permute_from_lids_d, imports_d,
1403 num_packets_per_lid_d, numSameIDs);
1404}
1405
1420template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1421void
1424 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1425 const Teuchos::ArrayView<const char>& imports,
1426 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1427 const size_t /* constantNumPackets */,
1428 const CombineMode /* combineMode */,
1429 const size_t numSameIDs,
1430 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1431 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1432 size_t TargetNumRows,
1433 size_t TargetNumNonzeros,
1434 const int MyTargetPID,
1435 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1436 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1437 const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1438 const Teuchos::ArrayView<const int>& SourcePids,
1439 Teuchos::Array<int>& TargetPids)
1440{
1442
1443 using Kokkos::View;
1444 using Kokkos::deep_copy;
1445
1446 using Teuchos::ArrayView;
1447 using Teuchos::outArg;
1448 using Teuchos::REDUCE_MAX;
1449 using Teuchos::reduceAll;
1450
1451 typedef LocalOrdinal LO;
1452
1453 typedef typename Node::device_type DT;
1454
1456 typedef typename matrix_type::impl_scalar_type ST;
1457 typedef typename ArrayView<const LO>::size_type size_type;
1458
1459 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1460
1461 TEUCHOS_TEST_FOR_EXCEPTION(
1462 TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1463 std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1464 CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1465
1466 TEUCHOS_TEST_FOR_EXCEPTION(
1467 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1468 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1469 << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1470 const size_type numImportLIDs = importLIDs.size ();
1471
1472 TEUCHOS_TEST_FOR_EXCEPTION(
1473 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1474 prefix << "importLIDs.size() = " << numImportLIDs << " != "
1475 "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1476
1477 // Preseed TargetPids with -1 for local
1478 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1479 TargetPids.resize (TargetNumNonzeros);
1480 }
1481 TargetPids.assign (TargetNumNonzeros, -1);
1482
1483 // Grab pointers for sourceMatrix
1484 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1485 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1486
1487 // Convert input arrays to Kokkos::View
1488 DT outputDevice;
1489 auto import_lids_d =
1490 create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1491 importLIDs.size(), true, "import_lids");
1492
1493 auto imports_d =
1494 create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1495 imports.size(), true, "imports");
1496
1497 auto num_packets_per_lid_d =
1498 create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1499 numPacketsPerLID.size(), true, "num_packets_per_lid");
1500
1501 auto permute_from_lids_d =
1502 create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1503 permuteFromLIDs.size(), true, "permute_from_lids");
1504
1505 auto permute_to_lids_d =
1506 create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1507 permuteToLIDs.size(), true, "permute_to_lids");
1508
1509 auto crs_rowptr_d =
1510 create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1511 CRS_rowptr.size(), true, "crs_rowptr");
1512
1513 auto crs_colind_d =
1514 create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1515 CRS_colind.size(), true, "crs_colidx");
1516
1517#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1518 static_assert (! std::is_same<
1519 typename std::remove_const<
1520 typename std::decay<
1521 decltype (CRS_vals)
1522 >::type::value_type
1523 >::type,
1524 std::complex<double> >::value,
1525 "CRS_vals::value_type is std::complex<double>; this should never happen"
1526 ", since std::complex does not work in Kokkos::View objects.");
1527#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1528
1529 auto crs_vals_d =
1530 create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1531 CRS_vals.size(), true, "crs_vals");
1532
1533#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1534 static_assert (! std::is_same<
1535 typename decltype (crs_vals_d)::non_const_value_type,
1536 std::complex<double> >::value,
1537 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1538 "never happen, since std::complex does not work in Kokkos::View objects.");
1539#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1540
1541 auto src_pids_d =
1542 create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1543 SourcePids.size(), true, "src_pids");
1544
1545 auto tgt_pids_d =
1546 create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1547 TargetPids.size(), true, "tgt_pids");
1548
1549 size_t bytes_per_value = 0;
1551 // assume that ST is default constructible
1552 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1553 }
1554 else {
1555 // Since the packed data come from the source matrix, we can use the source
1556 // matrix to get the number of bytes per Scalar value stored in the matrix.
1557 // This assumes that all Scalar values in the source matrix require the same
1558 // number of bytes. If the source matrix has no entries on the calling
1559 // process, then we hope that some process does have some idea how big
1560 // a Scalar value is. Of course, if no processes have any entries, then no
1561 // values should be packed (though this does assume that in our packing
1562 // scheme, rows with zero entries take zero bytes).
1563 size_t bytes_per_value_l = 0;
1564 if (local_matrix.values.extent(0) > 0) {
1565 const ST& val = local_matrix.values(0);
1566 bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1567 } else {
1568 const ST& val = crs_vals_d(0);
1569 bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1570 }
1571 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1572 Teuchos::REDUCE_MAX,
1573 bytes_per_value_l,
1574 outArg(bytes_per_value));
1575 }
1576
1577#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1578 static_assert (! std::is_same<
1579 typename decltype (crs_vals_d)::non_const_value_type,
1580 std::complex<double> >::value,
1581 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1582 "never happen, since std::complex does not work in Kokkos::View objects.");
1583#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1584
1585 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1586 local_matrix, local_col_map, import_lids_d, imports_d,
1587 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1589 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1590 bytes_per_value);
1591
1592 // Copy outputs back to host
1593 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1594 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1595 deep_copy(crs_rowptr_h, crs_rowptr_d);
1596
1597 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1598 CRS_colind.getRawPtr(), CRS_colind.size());
1599 deep_copy(crs_colind_h, crs_colind_d);
1600
1601 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1602 CRS_vals.getRawPtr(), CRS_vals.size());
1603 deep_copy(crs_vals_h, crs_vals_d);
1604
1605 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1606 TargetPids.getRawPtr(), TargetPids.size());
1607 deep_copy(tgt_pids_h, tgt_pids_d);
1608
1609}
1610
1611} // namespace Details
1612} // namespace Tpetra
1613
1614#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1615 template void \
1616 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1617 const CrsMatrix<ST, LO, GO, NT>&, \
1618 const Teuchos::ArrayView<const char>&, \
1619 const Teuchos::ArrayView<const size_t>&, \
1620 const Teuchos::ArrayView<const LO>&, \
1621 size_t, \
1622 CombineMode); \
1623 template void \
1624 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1625 const CrsMatrix<ST, LO, GO, NT>&, \
1626 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1627 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1628 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1629 const size_t, \
1630 const CombineMode); \
1631 template void \
1632 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1633 const CrsMatrix<ST, LO, GO, NT> &, \
1634 const Teuchos::ArrayView<const LO>&, \
1635 const Teuchos::ArrayView<const char>&, \
1636 const Teuchos::ArrayView<const size_t>&, \
1637 const size_t, \
1638 const CombineMode, \
1639 const size_t, \
1640 const Teuchos::ArrayView<const LO>&, \
1641 const Teuchos::ArrayView<const LO>&, \
1642 size_t, \
1643 size_t, \
1644 const int, \
1645 const Teuchos::ArrayView<size_t>&, \
1646 const Teuchos::ArrayView<GO>&, \
1647 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1648 const Teuchos::ArrayView<const int>&, \
1649 Teuchos::Array<int>&); \
1650 template size_t \
1651 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1652 const CrsMatrix<ST, LO, GO, NT> &, \
1653 const Teuchos::ArrayView<const LO> &, \
1654 const Teuchos::ArrayView<const char> &, \
1655 const Teuchos::ArrayView<const size_t>&, \
1656 size_t, \
1657 CombineMode, \
1658 size_t, \
1659 const Teuchos::ArrayView<const LO>&, \
1660 const Teuchos::ArrayView<const LO>&);
1661
1662#endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Base class for distributed Tpetra objects that support data redistribution.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.