Zoltan2
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// Zoltan2: A package of combinatorial algorithms for scientific computing
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact Karen Devine (kddevin@sandia.gov)
39// Erik Boman (egboman@sandia.gov)
40// Siva Rajamanickam (srajama@sandia.gov)
41//
42// ***********************************************************************
43//
44// @HEADER
49#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50#define _ZOLTAN2_ALGMultiJagged_HPP_
51
55#include <Zoltan2_Algorithm.hpp>
58#include <Zoltan2_Util.hpp>
59#include <Tpetra_Distributor.hpp>
60#include <Teuchos_StandardParameterEntryValidators.hpp>
61#include <Teuchos_ParameterList.hpp>
62#include <Kokkos_Sort.hpp>
63
64#include <algorithm> // std::sort
65#include <vector>
66#include <unordered_map>
67
68#ifdef ZOLTAN2_USEZOLTANCOMM
69#ifdef HAVE_ZOLTAN2_MPI
70#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71#include "zoltan_comm_cpp.h"
72#include "zoltan_types.h" // for error codes
73#endif
74#endif
75
76namespace Teuchos{
77
81template <typename Ordinal, typename T>
82class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
83{
84private:
85 Ordinal size;
86 T epsilon;
87
88public:
92 epsilon(std::numeric_limits<T>::epsilon()) {}
93
98 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
99
105 void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
106 for(Ordinal i = 0; i < count; i++) {
107 if(Z2_ABS(inBuffer[i]) > epsilon) {
108 inoutBuffer[i] = inBuffer[i];
109 }
110 }
111 }
112};
113
114} // namespace Teuchos
115
116namespace Zoltan2{
117
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;    // identity of the item; breaks exact ties
  volatile CT count;    // number of entries pointed to by val
  volatile WT *val;     // value vector compared lexicographically
  volatile WT epsilon;  // values closer than this are considered equal

  /*! \brief Default constructor: empty item with a null value vector. */
  uMultiSortItem() {
    this->index = 0;
    this->count = 0;
    this->val = NULL;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  /*! \brief Construct from an index and a value vector of length count_. */
  uMultiSortItem(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  ~uMultiSortItem() {
  }

  /*! \brief Reassign the item; epsilon keeps its previous value. */
  void set(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  /*! \brief Lexicographic comparison of the value vectors with an
   *  epsilon tolerance; exact ties fall back to the index so the order
   *  is deterministic.
   *  NOTE(review): tolerance-based equality is not transitive, so this
   *  may not be a strict weak ordering in pathological inputs — confirm
   *  callers tolerate that.
   */
  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for(CT dim = 0; dim < this->count; ++dim) {
      // within tolerance: treat as equal, inspect the next dimension
      if(std::abs(this->val[dim] - other.val[dim]) < this->epsilon) {
        continue;
      }
      // first differing dimension decides the order
      return this->val[dim] < other.val[dim];
    }
    // all dimensions equal: order by index
    return this->index < other.index;
  }
};
179
/*! \brief Sort item: an id tagged with the value used as the sort key. */
template <class IT, class WT>
struct uSortItem
{
  IT id;   // index/identity of the item
  WT val;  // sort key
};

/*! \brief Iterative quicksort (Numerical-Recipes style) of arr[0..n-1]
 *  in ascending order of uSortItem::val.
 *
 *  Subranges shorter than M are finished with insertion sort; larger
 *  ones use median-of-three partitioning with an explicit stack instead
 *  of recursion. Terminates the program if the pending-subrange stack
 *  overflows (would require astronomically large n).
 *
 *  \param n   number of items in arr.
 *  \param arr items, sorted in place.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50;  // max pending subranges on the explicit stack
  const int M = 7;        // below this size insertion sort is used
  IT i, ir = n, j, k, l = 1;
  IT jstack = 0;
  // Stack slots 1..NSTACK are used (the code below is 1-based), so the
  // array needs NSTACK+1 entries. The previous fixed-size istack[50]
  // wrote one past the end when jstack reached NSTACK.
  IT istack[NSTACK + 1];
  WT aval;
  uSortItem<IT, WT> a;

  // Shift so arr[1..n] addresses the caller's arr[0..n-1] (1-based style).
  --arr;
  for(;;) {
    if(ir - l < M) {
      // Insertion sort on the small subrange [l, ir].
      for(j = l + 1; j <= ir; j++) {
        a = arr[j];
        aval = a.val;
        for(i = j - 1; i >= 1; i--) {
          if(arr[i].val <= aval)
            break;
          arr[i + 1] = arr[i];
        }
        arr[i + 1] = a;
      }
      if(jstack == 0)
        break;  // no pending subranges: done
      ir = istack[jstack--];
      l = istack[jstack--];
    }
    else {
      // Median-of-three: order so arr[l+1] <= arr[l] <= arr[ir] and use
      // arr[l] as the partitioning element.
      k = (l + ir) >> 1;
      std::swap(arr[k], arr[l + 1]);
      if(arr[l + 1].val > arr[ir].val) {
        std::swap(arr[l + 1], arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l], arr[ir]);
      }
      if(arr[l + 1].val > arr[l].val) {
        std::swap(arr[l + 1], arr[l]);
      }
      i = l + 1;
      j = ir;
      a = arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;  // pointers crossed: partitioning complete
        std::swap(arr[i], arr[j]);
      }
      // Put the partitioning element into its final position.
      arr[l] = arr[j];
      arr[j] = a;
      // Push the larger subrange, iterate on the smaller one.
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      if(ir - i + 1 >= j - l) {
        istack[jstack] = ir;
        istack[jstack - 1] = i;
        ir = j - 1;
      }
      else {
        istack[jstack] = j - 1;
        istack[jstack - 1] = l;
        l = i;
      }
    }
  }
}
262
// Sort item carrying the value plus an explicit sign bit
// (1 = positive, 0 = negative); ordering is by sign first, then value.
// NOTE(review): the struct name line and the signatures of the two
// comparison operators (operator< and operator<=) appear truncated in
// this extract; the bodies below are kept verbatim.
template <class IT, class WT, class SIGN>
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.
  // strict "less than": any negative item precedes any positive one;
  // among equal signs, smaller val is "less" for positives while the
  // direction reverses for negatives (presumably val stores a
  // magnitude — TODO confirm against callers).
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both has the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) {//if my value is smaller,
        return this->signbit;//then if we both are positive return true.
        //if we both are negative, return false.
      }
      else if(this->val > rhs.val) {//if my value is larger,
        return !this->signbit; //then if we both are positive return false.
        //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  // "less than or equal": exact (val, signbit) equality, or strict <.
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};
298
// Iterative median-of-three quicksort over items compared with the
// sign-aware operator< / operator<= defined above.
// NOTE(review): the function signature line and the declaration of the
// temporary item `a` appear truncated in this extract; the body is kept
// verbatim.
template <class IT, class WT, class SIGN>
  IT NSTACK = 50;
  IT M = 7; // subranges below this length are finished by insertion sort
  IT i, ir=n, j, k, l=1;
  // NOTE(review): istack has valid indices 0..49 while jstack can reach
  // NSTACK (50) before the overflow check fires, so istack[jstack] can
  // write one past the end — confirm and size the stack NSTACK+1.
  IT jstack=0, istack[50];

  --arr; // switch to 1-based indexing (Numerical-Recipes style)
  for(;;) {
    if(ir < M + l) {
      // insertion sort on the small subrange [l, ir]
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        for(i=j-1;i>=1;i--) {
          if(arr[i] <= a) {
            break;
          }
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0) {
        break; // nothing pending: done
      }
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // median-of-three: order so arr[l+1] <= arr[l] <= arr[ir],
      // then partition around arr[l]
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[ir] < arr[l+1]) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[ir] < arr[l] ) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l] < arr[l+1]) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l]; // partitioning element
      for(;;) {
        do i++; while (arr[i] < a);
        do j--; while (a < arr[j]);
        if(j < i) break; // pointers crossed: partitioning complete
        std::swap(arr[i],arr[j]);
      }
      // place the pivot into its final position
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger subrange, continue with the smaller one
      // (ir+l+1 >= j+i  is equivalent to  ir-i+1 >= j-l)
      if(ir+l+1 >= j+i) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
370
371// This exists only so we can track how many times the MJ algorithm is
372// called and put each of those into different timer names.
373// Currently the MultiJaggedTest.cpp will actually call it twice.
374// First time with data from a Tpetra MultiVector and then a second time using
375// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
376// results of the two runs are compared which helps to catch a lot of bugs. For
377// profiling I'm mostly just interested in the UVM off case and need it to be
378// in separate timers. Passing a value through would mess up the API. Possibly
379// we could check the Adapter and use that. The statics have to be outside the
380// templated class as the two called instances will be different template
381// parameters. Another complication is that MultiJagged.cpp will call through
382// the Zoltan2_AlgMJ class and we want to time things in both classes. However
383// TaskMapper will directly call AlgMJ so I made two counters for the two
384// classes to make sure it was always correct. This does not impact any
385// behavior and has the sole purpose of generating unique timer names. If you
386// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
387// 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
  // Returns the number of times AlgMJ has been entered so far and
  // advances the count; used solely to build unique timer names.
  static int get_counter_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
  // NOTE(review): the declaration line of the second accessor (the
  // counter for the Zoltan2_AlgMJ wrapper class) appears truncated in
  // this extract; only its body is visible below.
    static int counter = 0;
    return counter++;
  }
};
398
401template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
402 typename mj_part_t, typename mj_node_t>
403class AlgMJ
404{
405private:
406 typedef typename mj_node_t::device_type device_t; // for views
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
409
410 //if the (last dimension reduce all count) x the mpi world size
411 //estimated to be bigger than this number then migration will be forced
412 //in earlier iterations.
413 static constexpr size_t future_reduceall_cutoff = 1500000;
414
415 //if parts right before last dimension are estimated to have less than
416 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
417 static constexpr mj_lno_t min_work_last_dim = 1000;
418
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr int significance_mul = 1000;
421
422 std::string mj_timer_base_string; // for convenience making timer names
423
424 RCP<const Environment> mj_env; // the environment object
425 RCP<const Comm<int> > mj_problemComm; // initial comm object
426 RCP<Comm<int> > comm; // comm object than can be altered during execution
427 double imbalance_tolerance; // input imbalance tolerance.
428 int recursion_depth; // number of steps that partitioning will be solved in.
429 int coord_dim; // coordinate dim
430 int num_weights_per_coord; // # of weights per coord
431 size_t initial_num_loc_coords; // initial num local coords.
432 global_size_t initial_num_glob_coords; // initial num global coords.
433 mj_lno_t num_local_coords; // number of local coords.
434 mj_gno_t num_global_coords; // number of global coords.
435 mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
436
  // can distribute points on same coordinate to different parts.
438 bool distribute_points_on_cut_lines;
439
440 // how many parts we can calculate concurrently.
441 mj_part_t max_concurrent_part_calculation;
442
443 bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
444 int mj_user_recursion_depth; // the recursion depth value provided by user.
445 bool mj_keep_part_boxes; // if the boxes need to be kept.
446
447 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
448 int check_migrate_avoid_migration_option;
449
450 // when doing the migration, 0 will aim for perfect load-imbalance, 1 - will
451 // aim for minimized number of messages with possibly bad load-imbalance
452 int migration_type;
453
454 // when MJ decides whether to migrate, the minimum imbalance for migration.
455 double minimum_migration_imbalance;
456
457 // Nonuniform first level partitioning
458 // (Currently available only for sequential_task_partitioning):
459 // Used for Dragonfly task mapping by partitioning Dragonfly RCA
460 // machine coordinates and application coordinates.
461 // An optimization that completely partitions the most important machine dimension
462 // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
463 // MJ alg follows after the nonuniform first level partitioning.
464 //
465 // Ex. (first level partitioning): If we have 120 elements,
466 // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
467 // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
468 // continues for all subsequent levels.
469
470 // If used, number of parts requested for a nonuniform
471 // first level partitioning
472 mj_part_t num_first_level_parts;
473
474 // If used, the requested distribution of parts for the
475 // nonuniform first level partitioning
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
477
478 mj_part_t total_num_cut ; // how many cuts will be totally
479 mj_part_t total_num_part; // how many parts will be totally
480
481 mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
482 mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
483
484 // maximum part+cut count along a dimension.
485 size_t max_num_total_part_along_dim;
486
487 mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
488
489 // max no of parts that might occur during the partition before the last
490 // partitioning dimension.
491 mj_part_t last_dim_num_part;
492
493 // input part array specifying num part to divide along each dim.
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
495
496 // two dimension coordinate array
497 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
499 mj_coordinates;
500
501 // two dimension weight array
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
503
504 // if the target parts are uniform
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
506
507 // if the coordinates have uniform weights
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
509
510 int mj_num_teams; // the number of teams
511
512 size_t num_global_parts; // the targeted number of parts
513
514 // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
515 RCP<mj_partBoxVector_t> kept_boxes;
516
517 RCP<mj_partBox_t> global_box;
518
519 int myRank; // processor rank
520 int myActualRank; // initial rank
521
522 bool divide_to_prime_first;
523
524 // initial global ids of the coordinates.
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
526
527 // current global ids of the coordinates, might change during migration.
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
529
530 // the actual processor owner of the coordinate, to track after migrations.
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
532
533 // permutation of coordinates, for partitioning.
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
535
536 // permutation work array.
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
538
539 // the part ids assigned to coordinates.
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
541
542 // beginning and end of each part.
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
544
545 // work array for beginning and end of each part.
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
547
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
549
550 // how much weight should a MPI put left side of the each cutline
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
553
554 // weight percentage each thread in MPI puts left side of the each outline
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
557
558 // work array to manipulate coordinate of cutlines in different iterations.
559 // necessary because previous cut line information is used for determining
560 // the next cutline information. therefore, cannot update the cut work array
561 // until all cutlines are determined.
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
563
564 // Used for swapping above cut_coordinates_work_array
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
566
567 // cumulative part weight array.
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
569
570 // upper bound coordinate of a cut line
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
572
573 // lower bound coordinate of a cut line
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
575
576 // lower bound weight of a cut line
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
578
579 // upper bound weight of a cut line
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
581
582 // combined array to exchange the min and max coordinate, and total
583 // weight of part.
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
586
587 // global combined array with the results for min, max and total weight.
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
590
591 // isDone is used to determine if a cutline is determined already. If a cut
592 // line is already determined, the next iterations will skip this cut line.
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
594
595 // incomplete_cut_count count holds the number of cutlines that have not
596 // been finalized for each part when concurrentPartCount>1, using this
597 // information, if incomplete_cut_count[x]==0, then no work is done
598 // for this part.
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
602
603 // Need a quick accessor for this on host
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
605
606 // local part weights of each thread.
607 Kokkos::View<double *, device_t>
608 thread_part_weights;
609
  // the work manipulation array for part weights.
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
613
614 // thread_cut_left_closest_point to hold the closest coordinate
615 // to a cutline from left (for each thread).
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
618
619 // thread_cut_right_closest_point to hold the closest coordinate
620 // to a cutline from right (for each thread)
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
623
624 // to store how many points in each part a thread has.
625 Kokkos::View<mj_lno_t *, device_t>
626 thread_point_counts;
627
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
630
  // for faster communication, concatenation of
632 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
633 // leftClosest distances sized P-1, since P-1 cut lines
634 // rightClosest distances size P-1, since P-1 cut lines.
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
639
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim; // for quick access on host
643
  /* \brief helper function to calculate imbalance.
645 * \param achieved balance we achieved.
646 * \param expected balance expected.
647 */
648 static
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
652 }
653
654 /* \brief Either the mj array (part_no_array) or num_global_parts should be
655 * provided in the input. part_no_array takes precedence if both are
656 * provided. Depending on these parameters, total cut/part number, maximum
657 * part/cut number along a dimension, estimated number of reduceAlls,
658 * and the number of parts before the last dimension is calculated.
659 * */
660 void set_part_specifications();
661
662 /* \brief Tries to determine the part number for current dimension,
663 * by trying to make the partitioning as square as possible.
664 * \param num_total_future how many more partitionings are required.
665 * \param root how many more recursion depth is left.
666 */
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
669 double root);
670
671 /* \brief for part communication we keep track of the box boundaries.
672 * This is performed when either asked specifically, or when geometric
673 * mapping is performed afterwards. This function initializes a single box
674 * with all global min and max coordinates.
675 * \param initial_partitioning_boxes the input and output vector for boxes.
676 */
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
678
679 /* \brief Function returns how many parts that will be obtained after this
680 * dimension partitioning. It sets how many parts each current part will be
681 * partitioned into in this dimension to device_num_partitioning_in_current_dim
682 * vector, sets how many total future parts each obtained part will be
683 * partitioned into in next_future_num_parts_in_parts vector, If part boxes
684 * are kept, then sets initializes the output_part_boxes as its ancestor.
685 * \param future_num_part_in_parts: input, how many future parts each
686 * current part will be partitioned into.
687 * \param next_future_num_parts_in_parts: output, how many future parts
688 * each obtained part will be partitioned into.
689 * \param future_num_parts: output, max number of future parts that will be
690 * obtained from a single
691 * \param current_num_parts: input, how many parts are there currently.
692 * \param current_iteration: input, current dimension iteration number.
693 * \param input_part_boxes: input, if boxes are kept, current boxes.
694 * \param output_part_boxes: output, if boxes are kept, the initial box
695 * boundaries for obtained parts.
696 * \param atomic_part_count // DOCWORK: Documentation
697 */
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
707
719 static
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
729
754 bool mj_perform_migration(
755 mj_part_t in_num_parts, //current number of parts
756 mj_part_t &out_num_parts, //output number of parts.
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
764
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
785 mj_part_t num_procs,
786 mj_part_t num_parts,
787 mj_gno_t *num_points_in_all_processor_parts);
788
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
815 mj_part_t num_parts,
816 mj_part_t num_procs,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
824
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
852 mj_part_t num_parts,
853 mj_part_t num_procs,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
860
876 void assign_send_destinations(
877 mj_part_t num_parts,
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
882
897 void assign_send_destinations2(
898 mj_part_t num_parts,
899 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
903
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
928 mj_part_t num_parts,
929 mj_part_t num_procs,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
936
950 void mj_migrate_coords(
951 mj_part_t num_procs,
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
956
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
964
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
973 while (n > 1) {
974 while (n % divisor == 0) {
975 n = n / divisor;
976 largest_factor = divisor;
977 }
978 ++divisor;
979 if(divisor * divisor > n) {
980 if(n > 1) {
981 largest_factor = n;
982 }
983 break;
984 }
985 }
986 return largest_factor;
987 }
988
989public:
990 AlgMJ();
991
992 // DOCWORK: Make param documentation use : consistently
1018 void multi_jagged_part(
1019 const RCP<const Environment> &env,
1020 RCP<const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1022 int num_teams,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1026 int coord_dim,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1030 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1038
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1057
1061
1064 RCP<mj_partBox_t> get_global_box() const;
1065
1068 RCP<mj_partBoxVector_t> get_kept_boxes() const;
1069
1072 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1073 RCP<mj_partBoxVector_t> &localPartBoxes) const;
1074
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1118 int coord_dim,
1119 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1132
1133#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1134 public:
1135#else
1136 private:
1137#endif
1138
1139 /* \brief Allocates all required memory for the mj partitioning algorithm.
1140 */
1141 void allocate_set_work_memory();
1142
1143 /* \brief compute global bounding box: min/max coords of global domain */
1144 void compute_global_box();
1145
1146 // DOCWORK: Inconsisent use of ! for descriptive/brief commenting - decide.
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1157
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1174
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts/*p-1*/ ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1219
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1246
1263 void mj_1D_part(
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1272
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1282 int loop_count);
1283
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1294
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1314
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1352 mj_part_t kk,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1370
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1384
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1392
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1422 int coordInd,
1423 bool longest_dim_part,
1424 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1425
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1439};
1440
/*! \brief Default constructor: zero-initializes all size/counter members and
 *  sets the tunable defaults visible here (points on cut lines are
 *  distributed, one concurrent part computation, 0.30 minimum migration
 *  imbalance, a single first-level part, one global part). sEpsilon is
 *  machine epsilon scaled by 100 - presumably a floating-point comparison
 *  tolerance for coordinates; confirm at its use sites.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
  recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
  initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0),
  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
  distribute_points_on_cut_lines(true),
  max_concurrent_part_calculation(1),
  mj_run_as_rcb(false), mj_user_recursion_depth(0),
  mj_keep_part_boxes(false),
  check_migrate_avoid_migration_option(0), migration_type(0),
  minimum_migration_imbalance(0.30),
  num_first_level_parts(1),
  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0),
  max_num_total_part_along_dim(0),
  total_dim_num_reduce_all(0),
  last_dim_num_part(0),
  mj_num_teams(0),
  num_global_parts(1),
  kept_boxes(), global_box(),
  myRank(0), myActualRank(0),
  divide_to_prime_first(false)
{
}
1471
// Sequential (single-rank) multi-jagged partitioning entry used for task
// mapping: recursively partitions the coordinates into num_target_part
// parts, writes the resulting coordinate permutation back into
// initial_adjList_output_adjlist, and returns the part boundaries through
// output_xadj in CSR format (output_xadj[0] == 0). Optionally partitions
// along the longest remaining dimension at each level instead of cycling
// through the dimensions round-robin.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
  const RCP<const Environment> &env,
  mj_lno_t num_total_coords,
  mj_lno_t num_selected_coords,
  size_t num_target_part,
  int coord_dim_,
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
    mj_coordinates_,
  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
  mj_lno_t *output_xadj,
  int recursion_depth_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
  bool partition_along_longest_dim,
  int num_ranks_per_node,
  bool divide_to_prime_first_,
  mj_part_t num_first_level_parts_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
{
  this->mj_env = env;
  // Task mapping always runs on a serial communicator.
  const RCP<Comm<int> > commN;
  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
  // NOTE(review): rank is set to 1 (not 0) on a serial comm - confirm this
  // is intentional.
  this->myActualRank = this->myRank = 1;

  this->divide_to_prime_first = divide_to_prime_first_;
  //weights are uniform for task mapping

  //parts are uniform for task mapping
  //as input indices.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = part_no_array_;
  this->recursion_depth = recursion_depth_;

  // If nonuniform first level partitioning, the requested num of parts and the
  // requested distribution of elements for each part
  this->num_first_level_parts = num_first_level_parts_;

  this->first_level_distribution = first_level_distribution_;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;

  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;


  this->initial_mj_gnos =
    Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);

  // Task mapping carries no weights; everything below sets up the
  // uniform-weight / uniform-part placeholders.
  this->num_weights_per_coord = 0;

  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
    "uniform weights", 1);
  this->mj_uniform_weights(0) = true;

  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
    ("weights", 1, 1);

  this->mj_uniform_parts =
    Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
  this->mj_uniform_parts(0) = true;

  this->set_part_specifications();

  this->allocate_set_work_memory();

  // Do single init
  auto local_part_xadj = this->part_xadj;
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
    KOKKOS_LAMBDA (int dummy) {
    local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
  });

  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);

  mj_part_t current_num_parts = 1;

  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
    this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  std::vector<mj_part_t> *future_num_part_in_parts =
    new std::vector<mj_part_t>();
  std::vector<mj_part_t> *next_future_num_parts_in_parts =
    new std::vector<mj_part_t>();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;
  RCP<mj_partBoxVector_t> t2;

  // Scratch used only by the partition_along_longest_dim option.
  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
    coord_dimension_range_sorted(this->coord_dim);
  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
    &(coord_dimension_range_sorted[0]);
  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);

  // Need a device counter - how best to allocate?
  // Putting this allocation in the loops is very costly so moved out here.
  Kokkos::View<mj_part_t*, device_t>
    view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
  Kokkos::View<size_t*, device_t>
    view_total_reduction_size("view_total_reduction_size", 1);

  // One recursion level per iteration: each level partitions every current
  // part along one coordinate dimension.
  for(int rd = 0; rd < this->recursion_depth; ++rd) {
    // next_future_num_parts_in_parts will be as the size of outnumParts,
    // and this will hold how many more parts that each output part
    // should be divided. this array will also be used to determine the weight
    // ratios of the parts.
    // swap the arrays to use iteratively..
    std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    // clear next_future_num_parts_in_parts array as
    // getPartitionArrays expects it to be empty.
    next_future_num_parts_in_parts->clear();

    // returns the total number of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
      this->update_part_num_arrays(
        future_num_part_in_parts,
        next_future_num_parts_in_parts,
        future_num_parts,
        current_num_parts,
        rd,
        t1,
        t2, num_ranks_per_node);

    // if the number of obtained parts equal to current number of parts,
    // skip this dimension. For example, this happens when 1 is given in
    // the input part array, e.g. P=4,5,1,2.
    if(output_part_count_in_dimension == current_num_parts) {
      // swap back so the next level re-reads the same future counts.
      tmpPartVect = future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    //convert i to string to be used for debugging purposes.
    std::string istring = std::to_string(rd);

    // alloc Memory to point the indices
    // of the parts in the permutation array.
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
      "new part xadj", output_part_count_in_dimension);

    // the index where in the outtotalCounts will be written.

    mj_part_t output_part_index = 0;

    // whatever is written to outTotalCounts will be added with previousEnd
    // so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    // NOTE(review): concurrency is fixed at 1 on this sequential path
    // (never reassigned below).
    mj_part_t current_concurrent_num_parts = 1;

    mj_part_t obtained_part_index = 0;

    // get the coordinate axis along which the partitioning will be done.
    int coordInd = rd % this->coord_dim;

    Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
      Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);

    auto host_process_local_min_max_coord_total_weight =
      Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
    auto host_global_min_max_coord_total_weight =
      Kokkos::create_mirror_view(global_min_max_coord_total_weight);

    // run for all available parts.
    for(; current_work_part < current_num_parts;
      current_work_part += current_concurrent_num_parts) {

      mj_part_t actual_work_part_count = 0;

      // initialization for 1D partitioning.
      // get the min and max coordinates of each part
      // together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
        mj_part_t current_work_part_in_concurrent_parts =
          current_work_part + kk;

        // if this part wont be partitioned any further
        // dont do any work for this part.
        mj_part_t partition_count = host_num_partitioning_in_current_dim(
          current_work_part_in_concurrent_parts);
        if(partition_count == 1) {
          continue;
        }
        ++actual_work_part_count;
        if(partition_along_longest_dim) {
          auto local_process_local_min_max_coord_total_weight =
            this->process_local_min_max_coord_total_weight;
          // Measure the coordinate extent of every dimension so the
          // largest-range dimension can be selected for this part.
          for(int coord_traverse_ind = 0;
            coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {

            Kokkos::View<mj_scalar_t *, device_t> coords =
              Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);

            this->mj_get_local_min_max_coord_totW(
              current_work_part,
              current_concurrent_num_parts,
              coords);

            coord_dimension_range_sorted[coord_traverse_ind].id =
              coord_traverse_ind;
            coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;

            Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
              process_local_min_max_coord_total_weight);

            coord_dim_mins[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(kk);
            coord_dim_maxs[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts);
            coord_dimension_range_sorted[coord_traverse_ind].val =
              host_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts) -
              host_process_local_min_max_coord_total_weight(kk);
          }

          // Sort dimensions by extent; the last entry has the largest range
          // and becomes the partitioning axis for this level.
          uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
          coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
          auto set_min = coord_dim_mins[coordInd];
          auto set_max = coord_dim_maxs[coordInd];
          // Re-publish the chosen dimension's min/max to the device view
          // that the 1D partitioner reads.
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
            (0, 1), KOKKOS_LAMBDA (int dummy) {
            local_process_local_min_max_coord_total_weight(kk) = set_min;
            local_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts) = set_max;
          });

          mj_current_dim_coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
        }
        else {
          Kokkos::View<mj_scalar_t *, device_t> coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
          this->mj_get_local_min_max_coord_totW(
            current_work_part,
            current_concurrent_num_parts,
            coords);
        }
      }

      // 1D partitioning
      if(actual_work_part_count > 0) {
        // obtain global Min max of the part.
        this->mj_get_global_min_max_coord_totW(
          current_concurrent_num_parts,
          this->process_local_min_max_coord_total_weight,
          this->global_min_max_coord_total_weight);

        // update host copy
        Kokkos::deep_copy(host_global_min_max_coord_total_weight,
          global_min_max_coord_total_weight);

        // represents the total number of cutlines
        // whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        //Compute weight ratios for parts & cuts:
        //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
        // part0 cut0 part1 cut1 part2 cut2 part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          // Layout of global_min_max_coord_total_weight: mins, then maxs,
          // then total weights, each stride current_concurrent_num_parts.
          mj_scalar_t min_coordinate =
            host_global_min_max_coord_total_weight(kk);
          mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
            kk + current_concurrent_num_parts);
          mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
            kk + 2*current_concurrent_num_parts);

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = host_num_partitioning_in_current_dim(
            concurrent_current_part_index);

          Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            current_target_part_weights =
            Kokkos::subview(target_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_part_shift,
                target_part_weights.size()));

          // shift the usedCutCoordinate array as noCuts.
          concurrent_part_cut_shift += partition_count - 1;
          // shift the partRatio array as noParts.
          concurrent_part_part_shift += partition_count;
          // calculate only if part is not empty,
          // and part will be further partitioned.
          if(partition_count > 1 && min_coordinate <= max_coordinate) {
            // increase allDone by the number of cuts of the current
            // part's cut line number.
            total_incomplete_cut_count += partition_count - 1;

            this->incomplete_cut_count(kk) = partition_count - 1;

            // When num_first_level_parts != 1 we have
            // nonuniform partitioning on the first level, providing
            // requested number of parts (num_first_level_parts) and
            // requested distribution in parts (first_level_distribution)

            // Get the target part weights given a desired distribution
            this->mj_get_initial_cut_coords_target_weights(
              min_coordinate,
              max_coordinate,
              partition_count - 1,
              global_total_weight,
              usedCutCoordinate,
              current_target_part_weights,
              future_num_part_in_parts,
              next_future_num_parts_in_parts,
              concurrent_current_part_index,
              obtained_part_index,
              rd == 0 ? this->num_first_level_parts : 1,
              this->first_level_distribution);

            mj_lno_t coordinate_end_index =
              host_part_xadj(concurrent_current_part_index);
            mj_lno_t coordinate_begin_index =
              (concurrent_current_part_index==0) ? 0 :
              host_part_xadj[concurrent_current_part_index - 1];

            // get the initial estimated part assignments of the coordinates.
            this->set_initial_coordinate_parts(
              max_coordinate,
              min_coordinate,
              coordinate_begin_index, coordinate_end_index,
              this->coordinate_permutations,
              mj_current_dim_coords,
              this->assigned_part_ids,
              partition_count);
          }
          else {
            // e.g., if have fewer coordinates than parts, don't need to do
            // next dim.
            this->incomplete_cut_count(kk) = 0;
          }
          obtained_part_index += partition_count;
        }

        // used imbalance, it is always 0, as it is difficult
        // to estimate a range.
        double used_imbalance = 0;

        // Determine cut lines for k parts here.
        this->mj_env->timerStart(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");

        this->mj_1D_part(
          mj_current_dim_coords,
          used_imbalance,
          current_work_part,
          current_concurrent_num_parts,
          current_cut_coordinates,
          total_incomplete_cut_count,
          view_rectilinear_cut_count,
          view_total_reduction_size);

        this->mj_env->timerStop(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");
      }
      else {
        obtained_part_index += current_concurrent_num_parts;
      }
      // create part chunks
      {
        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t current_concurrent_work_part = current_work_part + kk;

          mj_part_t num_parts = host_num_partitioning_in_current_dim(
            current_concurrent_work_part);

          // if the part is empty, skip the part.
          // (min > max means the part had no coordinates.)
          int coordinateA_bigger_than_coordinateB =
            host_global_min_max_coord_total_weight(kk) >
            host_global_min_max_coord_total_weight(
              kk + current_concurrent_num_parts);

          if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
            // we still need to write the begin and end point of the empty part.
            // simply set it zero, the array indices will be shifted later
            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space,
                mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
              local_new_part_xadj(
                output_part_index + output_array_shift + jj) = 0;
            });

            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }
          mj_lno_t coordinate_end =
            host_part_xadj(current_concurrent_work_part);
          mj_lno_t coordinate_begin =
            current_concurrent_work_part==0 ? 0 :
            host_part_xadj(current_concurrent_work_part-1);

          Kokkos::View<mj_scalar_t *, device_t>
            current_concurrent_cut_coordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            used_local_cut_line_weight_to_left =
            Kokkos::subview(process_cut_line_weight_to_put_left,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                process_cut_line_weight_to_put_left.size()));

          this->thread_part_weight_work =
            Kokkos::subview(
              this->thread_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                partweight_array_shift,
                this->thread_part_weights.size()));

          if(num_parts > 1) {
            // Rewrite the indices based on the computed cuts.
            Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
              Kokkos::subview(this->new_part_xadj,
                std::pair<mj_lno_t, mj_lno_t>(
                  output_part_index + output_array_shift,
                  this->new_part_xadj.size()));

            this->create_consistent_chunks(
              num_parts,
              mj_current_dim_coords,
              current_concurrent_cut_coordinate,
              coordinate_begin,
              coordinate_end,
              used_local_cut_line_weight_to_left,
              subview_new_part_xadj,
              coordInd,
              partition_along_longest_dim,
              p_coord_dimension_range_sorted);
          }
          else {
            // if this part is partitioned into 1 then just copy
            // the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;

            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
              (0, 1), KOKKOS_LAMBDA (int dummy) {
              local_new_part_xadj(output_part_index + output_array_shift)
                = part_size;
            });

            auto subview_new_coordinate_permutations =
              Kokkos::subview(this->new_coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            auto subview_coordinate_permutations =
              Kokkos::subview(this->coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            Kokkos::deep_copy(subview_new_coordinate_permutations,
              subview_coordinate_permutations);
          }

          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        // shift cut coordinates so that all cut coordinates are stored.
        // current_cut_coordinates += cutShift;

        // getChunks from coordinates partitioned the parts and
        // wrote the indices as if there were a single part.
        // now we need to shift the beginning indices.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t num_parts =
            host_num_partitioning_in_current_dim(current_work_part + kk);
          auto local_new_part_xadj = this->new_part_xadj;
          auto local_mj_current_dim_coords = mj_current_dim_coords;
          auto local_new_coordinate_permutations =
            new_coordinate_permutations;
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
              0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
            //shift it by previousCount
            local_new_part_xadj(output_part_index+ii) +=
              output_coordinate_end_index;

            if(ii % 2 == 1) {
              mj_lno_t coordinate_end =
                local_new_part_xadj(output_part_index+ii);
              mj_lno_t coordinate_begin =
                local_new_part_xadj(output_part_index);

              for(mj_lno_t task_traverse = coordinate_begin;
                task_traverse < coordinate_end; ++task_traverse) {
                mj_lno_t l = local_new_coordinate_permutations(task_traverse);
                //MARKER: FLIPPED ZORDER BELOW
                // Negating the coordinates of odd-indexed parts appears to
                // make the next dimension's ordering alternate direction
                // (snake-like traversal) - TODO confirm intent.
                local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
              }
            }
          });

          // increase the previous count by current end.
          mj_part_t get_single;
          // Single-element device read of this chunk's last offset.
          Kokkos::parallel_reduce("Read new_part_xadj",
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
            KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
            set_single = local_new_part_xadj(output_part_index + num_parts - 1);
          }, get_single);;

          output_coordinate_end_index = get_single;
          // increase the current out.
          output_part_index += num_parts;
        }
      }
    }

    // end of this partitioning dimension
    // set the current num parts for next dim partitioning
    current_num_parts = output_part_count_in_dimension;

    //swap the coordinate permutations for the next dimension.
    Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    this->part_xadj = this->new_part_xadj;
    this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
    Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
  }

  // Write the final permutation back to the caller-provided view.
  Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);

  // Return output_xadj in CSR format
  output_xadj[0] = 0;
  for(size_t i = 0; i < this->num_global_parts ; ++i) {
    output_xadj[i+1] = host_part_xadj(i);
  }

  // Free the scratch vectors allocated above.
  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;
}
2088
/*! \brief Accessor: returns the stored bounding box of the global domain
 *  (the global_box member).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
RCP<typename AlgMJ
  <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
  get_global_box() const
{
  return this->global_box;
}
2101
2104template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2105 typename mj_part_t, typename mj_node_t>
2106void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2108{
2109 this->mj_keep_part_boxes = true;
2110}
2111
2112/* \brief Either the mj array (part_no_array) or num_global_parts should be
2113 * provided in the input. part_no_array takes
2114 * precedence if both are provided.
2115 * Depending on these parameters, total cut/part number,
2116 * maximum part/cut number along a dimension, estimated number of reduceAlls,
2117 * and the number of parts before the last dimension is calculated.
2118 * */
// Computes the derived part/cut bookkeeping from either part_no_array or
// num_global_parts (part_no_array takes precedence): total part and cut
// counts, the maximum part/cut/total-part count along any dimension, an
// estimate of the number of reduceAll operations, and the number of parts
// entering the last partitioning dimension. Finally clamps
// max_concurrent_part_calculation to what is actually usable.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
{
  this->total_num_cut = 0; //how many cuts will be totally
  this->total_num_part = 1; //how many parts will be totally
  this->max_num_part_along_dim = 0; // maximum part count along a dimension.
  this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
  this->last_dim_num_part = 1; //max no of parts that might occur
  //during the partition before the
  //last partitioning dimension.
  this->max_num_cut_along_dim = 0;
  this->max_num_total_part_along_dim = 0;

  if(this->part_no_array.size()) {
    auto local_recursion_depth = this->recursion_depth;

    // NOTE(review): total_num_part is still 1 at this point (set just
    // above), so this evaluates to recursion_depth; multiplying by the
    // product computed below may have been the intent - confirm.
    this->total_dim_num_reduce_all =
      this->total_num_part * this->recursion_depth;

    // Total parts is the product of the user-supplied per-level counts.
    this->total_num_part = 1;
    for(int i = 0; i < local_recursion_depth; ++i) {
      this->total_num_part *= this->part_no_array(i);
    }

    // The largest entry of part_no_array bounds the parts along any level.
    mj_part_t track_max = 0;
    for(int i = 0; i < local_recursion_depth; ++i) {
      if(part_no_array(i) > track_max) {
        track_max = this->part_no_array(i);
      };
    }

    // Parts present before the final level = total / last level's count.
    this->last_dim_num_part = this->total_num_part /
      this->part_no_array(local_recursion_depth-1);

    this->max_num_part_along_dim = track_max;
    this->num_global_parts = this->total_num_part;
  } else {
    mj_part_t future_num_parts = this->num_global_parts;

    // If using nonuniform first level partitioning.
    // initial value max_num_part_along_dim == num_first_level_parts
    if (this->first_level_distribution.size() != 0 &&
        this->num_first_level_parts > 1) {
      this->max_num_part_along_dim = this->num_first_level_parts;
    }

    // we need to calculate the part numbers now, to determine
    // the maximum along the dimensions.
    for(int rd = 0; rd < this->recursion_depth; ++rd) {
      mj_part_t maxNoPartAlongI = 0;
      mj_part_t nfutureNumParts = 0;

      // Nonuniform first level partitioning sets part specifications for
      // rd == 0 only, given requested num of parts and distribution in parts
      // for the first level.
      if (rd == 0 &&
          this->first_level_distribution.size() != 0 &&
          this->num_first_level_parts > 1) {

        maxNoPartAlongI = this->num_first_level_parts;
        this->max_num_part_along_dim = this->num_first_level_parts;

        mj_part_t sum_first_level_dist = 0;
        mj_part_t max_part = 0;

        // Cumulative sum of distribution of parts and size of largest part
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          sum_first_level_dist += this->first_level_distribution(i);
          if (this->first_level_distribution(i) > max_part)
            max_part = this->first_level_distribution(i);
        }

        // Total parts in largest nonuniform superpart from
        // first level partitioning
        nfutureNumParts =
          this->num_global_parts * max_part / sum_first_level_dist;
      }
      // Standard uniform partitioning this level
      else {
        // Take the (depth - rd)-th root to keep the split as square
        // as possible across the remaining levels.
        maxNoPartAlongI = this->get_part_count(future_num_parts,
          1.0f / (this->recursion_depth - rd));
        if (maxNoPartAlongI > this->max_num_part_along_dim)
          this->max_num_part_along_dim = maxNoPartAlongI;
        nfutureNumParts = future_num_parts / maxNoPartAlongI;
        if (future_num_parts % maxNoPartAlongI) {
          ++nfutureNumParts;
        }
      }
      future_num_parts = nfutureNumParts;
    }
    this->total_num_part = this->num_global_parts;

    if(this->divide_to_prime_first) {
      this->total_dim_num_reduce_all = this->num_global_parts * 2;
      this->last_dim_num_part = this->num_global_parts;
    }
    else {
      //this is the lower bound.
      //estimate reduceAll Count here.
      //we find the upperbound instead.
      size_t p = 1;
      for(int i = 0; i < this->recursion_depth; ++i) {
        this->total_dim_num_reduce_all += p;
        p *= this->max_num_part_along_dim;
      }

      if(p / this->max_num_part_along_dim > this->num_global_parts) {
        this->last_dim_num_part = this->num_global_parts;
      }
      else {
        this->last_dim_num_part = p / this->max_num_part_along_dim;
      }
    }
  }

  this->total_num_cut = this->total_num_part - 1;
  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
  this->max_num_total_part_along_dim = this->max_num_part_along_dim +
    size_t(this->max_num_cut_along_dim);
  // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1

  // refine the concurrent part count, if it is given bigger than the maximum
  // possible part count.
  if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
    if(this->mj_problemComm->getRank() == 0) {
      std::cerr << "Warning: Concurrent part count (" <<
        this->max_concurrent_part_calculation <<
        ") has been set bigger than maximum amount that can be used." <<
        " Setting to:" << this->last_dim_num_part << "." << std::endl;
    }
    this->max_concurrent_part_calculation = this->last_dim_num_part;
  }
}
2254
2255/* \brief Tries to determine the part number for current dimension,
2256 * by trying to make the partitioning as square as possible.
2257 * \param num_total_future how many more partitionings are required.
 * \param root the exponent applied to num_total_future (typically
 *  1.0 / remaining recursion depth) used to take its root.
2259 */
2260template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2261 typename mj_part_t, typename mj_node_t>
2262inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future, double root)
2264{
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2267 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2268 return ip;
2269 }
2270 else {
2271 return ip + 1;
2272 }
2273}
2274
/* \brief Function returns how many parts that will be obtained after this
 * dimension partitioning. It sets how many parts each current part will be
 * partitioned into in this dimension to device_num_partitioning_in_current_dim
 * view, and sets how many total future parts each obtained part will be
 * partitioned into in next_future_num_parts_in_parts vector. If part boxes
 * are kept, then it initializes the output_part_boxes from their ancestors.
 * \param future_num_part_in_parts: input, how many future parts each current
 * part will be partitioned into.
 * \param next_future_num_parts_in_parts: output, how many future parts each
 * obtained part will be partitioned into.
 * \param future_num_parts: output, max number of future parts that will be
 * obtained from a single part.
 * \param current_num_parts: input, how many parts are there currently.
 * \param current_iteration: input, current dimension iteration number.
 * \param input_part_boxes: input, if boxes are kept, current boxes.
 * \param output_part_boxes: output, if boxes are kept, the initial box
 * boundaries for obtained parts.
 * \param atomic_part_count: input, granularity unit for future part counts —
 * future part counts are kept as multiples of this value when divisible
 * (presumably to keep groups of parts together; TODO confirm with callers).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  update_part_num_arrays(
  std::vector<mj_part_t> *future_num_part_in_parts,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &future_num_parts,
  mj_part_t current_num_parts,
  int current_iteration,
  RCP<mj_partBoxVector_t> input_part_boxes,
  RCP<mj_partBoxVector_t> output_part_boxes,
  mj_part_t atomic_part_count)
{
  std::vector<mj_part_t> num_partitioning_in_current_dim;

  // how many parts that will be obtained after this dimension.
  mj_part_t output_num_parts = 0;
  if(this->part_no_array.size()) {
    // when the partNo array is provided as input,
    // each current partition will be partition to the same number of parts.
    // we dont need to use the future_num_part_in_parts vector in this case.
    mj_part_t current_part_no_array =
      this->part_no_array(current_iteration);

    // A part count below 1 is invalid input; report it and abort.
    if(current_part_no_array < 1) {
      std::cout << "Current recursive iteration: " << current_iteration <<
        " part_no_array[" << current_iteration << "] is given as:" <<
        current_part_no_array << std::endl;
      std::terminate();
    }
    // A part count of 1 means no partitioning along this dimension.
    if(current_part_no_array == 1) {
      return current_num_parts;
    }

    // If using part_no_array, ensure compatibility with num_first_level_parts.
    if (this->first_level_distribution.size() != 0 &&
      current_iteration == 0 &&
      current_part_no_array != this->num_first_level_parts) {
      std::cout << "Current recursive iteration: " << current_iteration
        << " part_no_array[" << current_iteration << "] is given as: " <<
        current_part_no_array << " and contradicts num_first_level_parts: " <<
        this->num_first_level_parts << std::endl;
      std::terminate();
    }

    // Each current part is split into the same number of parts.
    for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
      num_partitioning_in_current_dim.push_back(current_part_no_array);
    }

/*
    std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
      current_iteration << " current_num_parts: " <<
      current_num_parts << "\n\n";

    std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
      num_partitioning_in_current_dim[0] << "\n\n";

    std::cout << "\n\nfuture_num_parts: " << future_num_parts
      << " num_partitioning_in_current_dim[0]: " <<
      num_partitioning_in_current_dim[0] << " " <<
      future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
*/

    future_num_parts /= num_partitioning_in_current_dim[0];
    output_num_parts = current_num_parts *
      num_partitioning_in_current_dim[0];
    if(this->mj_keep_part_boxes) {
      for(mj_part_t k = 0; k < current_num_parts; ++k) {
        //initialized the output boxes as its ancestor.
        for(mj_part_t j = 0; j <
          num_partitioning_in_current_dim[0]; ++j) {
          output_part_boxes->push_back((*input_part_boxes)[k]);
        }
      }
    }

    // set the how many more parts each part will be divided.
    // this is obvious when partNo array is provided as input.
    // however, fill this so weights will be calculated according to this array.
    for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
      next_future_num_parts_in_parts->push_back(future_num_parts);
    }
  }
  else {
    // if partNo array is not provided as input, future_num_part_in_parts
    // holds how many parts each part should be divided. Initially it holds a
    // single number equal to the total number of global parts.

    // calculate the future_num_parts from beginning,
    // since each part might be divided into different number of parts.
    future_num_parts = 1;

    // cout << "i:" << i << std::endl;
    for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
      // get how many parts a part should be divided.
      mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];

      // get the ideal number of parts that is close to the
      // (recursion_depth - i) root of the future_num_parts_of_part_ii.
      mj_part_t num_partitions_in_current_dim =
        this->get_part_count(future_num_parts_of_part_ii,
          1.0 / (this->recursion_depth - current_iteration)
        );
      // max_num_part_along_dim was sized for the worst case earlier; exceeding
      // it indicates an internal inconsistency, so fail loudly.
      if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
        std::cerr << "ERROR: maxPartNo calculation is wrong."
          " num_partitions_in_current_dim: "
          << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
          << this->max_num_part_along_dim <<
          " this->recursion_depth: " << this->recursion_depth <<
          " current_iteration:" << current_iteration <<
          " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
          " might need to fix max part no calculation for "
          "largest_prime_first partitioning." <<
          std::endl;
        std::terminate();
      }
      // add this number to vector_num_partitioning_in_current_dim vector.
      // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
      // mj_part_t largest_prime_factor = num_partitions_in_current_dim;

      // Update part num arrays when on current_iteration == 0 and
      // using nonuniform first level partitioning
      // with requested num parts (num_first_level_parts) and
      // a requested distribution in parts (first_level_distribution).
      if (current_iteration == 0 &&
          this->first_level_distribution.size() != 0 &&
          this->num_first_level_parts > 1) {
        // Only 1 current part to begin and partitions into
        // num_first_level_parts many parts
        num_partitioning_in_current_dim.push_back(this->num_first_level_parts);

        // The output number of parts from first level partitioning
        output_num_parts = this->num_first_level_parts;

        // Remaining parts left to partition for all future levels
        future_num_parts /= this->num_first_level_parts;

        mj_part_t max_part = 0;
        mj_part_t sum_first_level_dist = 0;

        // Cumulative sum of distribution of first level parts
        // and size of largest first level part
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          sum_first_level_dist += this->first_level_distribution(i);

          if (this->first_level_distribution(i) > max_part)
            max_part = this->first_level_distribution(i);
        }

        // Maximum # of remaining parts left to partition for all future levels
        future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;

        // Number of parts remaining left to partition for each future_part
        // The sum must exactly equal global_num_parts
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
            this->num_global_parts / sum_first_level_dist);
        }
      }
      else if (this->divide_to_prime_first) {
        // Add this number to num_partitioning_in_current_dim vector.
        num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);

        mj_part_t largest_prime_factor = num_partitions_in_current_dim;

        //increase the output number of parts.
        output_num_parts += num_partitions_in_current_dim;

        // Drop atomic granularity to 1 when it cannot evenly divide the
        // remaining future part count.
        if (future_num_parts_of_part_ii == atomic_part_count ||
            future_num_parts_of_part_ii % atomic_part_count != 0) {
          atomic_part_count = 1;
        }

        largest_prime_factor =
          this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);

        // We divide to num_partitions_in_current_dim. But we adjust the weights
        // based on largest prime/ if num_partitions_in_current_dim = 2,
        // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
        // if the largest prime is less than part count, we use the part count
        // so that we divide uniformly.
        if (largest_prime_factor < num_partitions_in_current_dim) {
          largest_prime_factor = num_partitions_in_current_dim;
        }
        //ideal number of future partitions for each part.
        mj_part_t ideal_num_future_parts_in_part =
          (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
        //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
        mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;

/*
        std::cout << "\ncurrent num part: " << ii
          << " largest_prime_factor: " << largest_prime_factor
          << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
*/

        for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
          //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
          mj_part_t my_ideal_primescale = ideal_prime_scale;
          //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
          if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
            ++my_ideal_primescale;
          }
          //scale with 'x';
          mj_part_t num_future_parts_for_part_iii =
            ideal_num_future_parts_in_part * my_ideal_primescale;

          //if there is a remainder in the part increase the part weight.
          if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
            //if not uniform, add 1 for the extra parts.
            ++num_future_parts_for_part_iii;
          }

          next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);

          //if part boxes are stored, initialize the box of the parts as the ancestor.
          if (this->mj_keep_part_boxes) {
            output_part_boxes->push_back((*input_part_boxes)[ii]);
          }

          //set num future_num_parts to maximum in this part.
          if (num_future_parts_for_part_iii > future_num_parts)
            future_num_parts = num_future_parts_for_part_iii;

        }
      }
      else {
        // Add this number to num_partitioning_in_current_dim vector.
        num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);

        //increase the output number of parts.
        output_num_parts += num_partitions_in_current_dim;

        // Drop atomic granularity to 1 when it cannot evenly divide the
        // remaining future part count.
        if((future_num_parts_of_part_ii == atomic_part_count) ||
           (future_num_parts_of_part_ii % atomic_part_count != 0)) {
          atomic_part_count = 1;
        }
        //ideal number of future partitions for each part.
        mj_part_t ideal_num_future_parts_in_part =
          (future_num_parts_of_part_ii / atomic_part_count) /
          num_partitions_in_current_dim;
        for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
          mj_part_t num_future_parts_for_part_iii =
            ideal_num_future_parts_in_part;

          //if there is a remainder in the part increase the part weight.
          if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
            num_partitions_in_current_dim) {
            // if not uniform, add 1 for the extra parts.
            ++num_future_parts_for_part_iii;
          }

          next_future_num_parts_in_parts->push_back(
            num_future_parts_for_part_iii * atomic_part_count);

          // if part boxes are stored, initialize the box of the parts as
          // the ancestor.
          if(this->mj_keep_part_boxes) {
            output_part_boxes->push_back((*input_part_boxes)[ii]);
          }
          //set num future_num_parts to maximum in this part.
          if(num_future_parts_for_part_iii > future_num_parts)
            future_num_parts = num_future_parts_for_part_iii;
        }
      }
    }
  }
  // move temp std::vector to host view
  device_num_partitioning_in_current_dim = Kokkos::View<
    mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
  host_num_partitioning_in_current_dim =
    Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
  for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
    host_num_partitioning_in_current_dim(n) =
      num_partitioning_in_current_dim[n];
  }
  // setup device equivalent - this data is used on host and device and it's
  // more efficient to just setup array on both sides now rather than copy
  // values as needed later.
  Kokkos::deep_copy(device_num_partitioning_in_current_dim,
    host_num_partitioning_in_current_dim);
  return output_num_parts;
}
2577
2578/* \brief Allocates and initializes the work memory that will be used by MJ.
2579 * */
2580template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581 typename mj_part_t, typename mj_node_t>
2582void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2584{
2585 // Throughout the partitioning execution,
2586 // instead of the moving the coordinates, hold a permutation array for parts.
2587 // coordinate_permutations holds the current permutation.
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2596 });
2597
2598 // new_coordinate_permutations holds the current permutation.
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2601 this->num_local_coords);
2602
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608 this->num_local_coords);
2609 }
2610
2611 // single partition starts at index-0, and ends at numLocalCoords
2612 // inTotalCounts array holds the end points in coordinate_permutations array
2613 // for each partition. Initially sized 1, and single element is set to
2614 // numLocalCoords.
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2620
2621 // the ends points of the output, this is allocated later.
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624
2625 // only store this much if cuts are needed to be stored.
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629
2630 // how much weight percentage should a MPI put left side of the each cutline
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633
2634 // how much weight percentage should each thread in MPI put left side of
2635 // each outline
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2659 }
2660
2661 // work array to manipulate coordinate of cutlines in different iterations.
2662 // necessary because previous cut line information is used for determining
2663 // the next cutline information. therefore, cannot update the cut work array
2664 // until all cutlines are determined.
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669
2670 // cumulative part weight array.
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674
2675 // upper bound coordinate of a cut line
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680
2681 // lower bound coordinate of a cut line
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686
2687 // lower bound weight of a cut line
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692
2693 //upper bound weight of a cut line
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698
2699 // combined array to exchange the min and max coordinate,
2700 // and total weight of part.
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2706
2707 // global combined array with the results for min, max and total weight.
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2712
2713 // is_cut_line_determined is used to determine if a cutline is
2714 // determined already. If a cut line is already determined, the next
2715 // iterations will skip this cut line.
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719
2720 // incomplete_cut_count count holds the number of cutlines that have not
2721 // been finalized for each part when concurrentPartCount>1, using this
2722 // information, if incomplete_cut_count[x]==0, then no work is done for
2723 // this part.
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2729
2730 // local part weights of each thread.
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738
2739 // thread_cut_right_closest_point to hold the closest coordinate to a
2740 // cutline from right (for each thread)
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744
2745 // to store how many points in each part a thread has.
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748 this->max_num_part_along_dim);
2749
2750 // for faster communication, concatanation of
2751 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2752 // leftClosest distances sized P-1, since P-1 cut lines
2753 // rightClosest distances size P-1, since P-1 cut lines.
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2760
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773 num_local_coords);
2774
2775 // changes owners back to host - so we don't run them on device
2776 // this improves migration code but means we have to serial init here.
2777 // Note we might allow this to be OpenMP when available even for CUDA.
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786 });
2787}
2788
2789/* \brief compute the global bounding box
2790 */
2791template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792 typename mj_part_t, typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2795{
2796 //local min coords
2797 mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798 //global min coords
2799 mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800 //local max coords
2801 mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802 //global max coords
2803 mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804
2805 auto local_mj_coordinates = this->mj_coordinates;
2806
2807 // If we are only doing 2 parts then we don't need these values
2808 // for y and z. Init them all to 0 first
2809 for(int i = 0; i < this->coord_dim; ++i) {
2810 mins[i] = 0;
2811 maxs[i] = 0;
2812 }
2813
2814 for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce("MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2821 }
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce("MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2829 }
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831 }
2832
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2835 );
2836
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2839 );
2840
2841 //create single box with all areas.
2842 global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843 //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844 delete [] mins;
2845 delete [] gmins;
2846 delete [] maxs;
2847 delete [] gmaxs;
2848}
2849
2850/* \brief for part communication we keep track of the box boundaries.
2851 * This is performed when either asked specifically, or when geometric mapping
2852 * is performed afterwards.
2853 * This function initializes a single box with all global min, max coordinates.
2854 * \param initial_partitioning_boxes the input and output vector for boxes.
2855 */
2856template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857 typename mj_part_t, typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861{
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2864}
2865
/* \brief Computes the local (on-process) minimum coordinate, maximum
 * coordinate and total weight along the current dimension for each of the
 * concurrently processed parts, and writes the (min, max, weight) triplets
 * into process_local_min_max_coord_total_weight as three consecutive blocks
 * of current_concurrent_num_parts entries each.
 * \param current_work_part index of the first part handled in this round.
 * \param current_concurrent_num_parts number of parts handled concurrently.
 * \param mj_current_dim_coords coordinate values along the current dimension.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t,
          typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_get_local_min_max_coord_totW(
  mj_part_t current_work_part,
  mj_part_t current_concurrent_num_parts,
  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
{
  auto local_coordinate_permutations = this->coordinate_permutations;
  auto local_process_local_min_max_coord_total_weight =
    this->process_local_min_max_coord_total_weight;
  auto local_mj_weights = this->mj_weights;

  bool bUniformWeights = mj_uniform_weights(0);

  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {

    // part_xadj holds exclusive end offsets; part kk's coordinates live in
    // coordinate_permutations[begin, end).
    mj_part_t concurrent_current_part = current_work_part + kk;
    mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
      host_part_xadj(concurrent_current_part - 1);
    mj_lno_t coordinate_end_index =
      host_part_xadj(concurrent_current_part);

    mj_scalar_t my_min_coord = 0;
    mj_scalar_t my_max_coord = 0;
    mj_scalar_t my_total_weight;
    //if the part is empty.
    //set the min and max coordinates as reverse.
    if(coordinate_begin_index >= coordinate_end_index)
    {
      my_min_coord = std::numeric_limits<mj_scalar_t>::max();
      my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
      my_total_weight = 0;
    }
    else {
      // get min
      Kokkos::parallel_reduce("get min",
        Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (coordinate_begin_index, coordinate_end_index),
        KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
        int i = local_coordinate_permutations(j);
        if(mj_current_dim_coords(i) < running_min)
          running_min = mj_current_dim_coords(i);
      }, Kokkos::Min<mj_scalar_t>(my_min_coord));
      // get max
      Kokkos::parallel_reduce("get max",
        Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (coordinate_begin_index, coordinate_end_index),
        KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
        int i = local_coordinate_permutations(j);
        if(mj_current_dim_coords(i) > running_max)
          running_max = mj_current_dim_coords(i);
      }, Kokkos::Max<mj_scalar_t>(my_max_coord));
      if(bUniformWeights) {
        // uniform weights: total weight is simply the point count.
        my_total_weight = coordinate_end_index - coordinate_begin_index;
      }
      else {
        // otherwise sum the first weight component of each point.
        my_total_weight = 0;
        Kokkos::parallel_reduce("get weight",
          Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
          (coordinate_begin_index, coordinate_end_index),
          KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
          int i = local_coordinate_permutations(j);
          lsum += local_mj_weights(i,0);
        }, my_total_weight);
      }
    }

    // single write
    // (a one-iteration kernel so the device view is written on device,
    // avoiding a host/device copy per part)
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
      (0, 1), KOKKOS_LAMBDA (int dummy) {
      local_process_local_min_max_coord_total_weight(kk) =
        my_min_coord;
      local_process_local_min_max_coord_total_weight(
        kk + current_concurrent_num_parts) = my_max_coord;
      local_process_local_min_max_coord_total_weight(
        kk + 2*current_concurrent_num_parts) = my_total_weight;
    });
  }
}
2952
2965template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966 typename mj_part_t, typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972 // reduce min for first current_concurrent_num_parts elements, reduce
2973 // max for next concurrentPartCount elements, reduce sum for the last
2974 // concurrentPartCount elements.
2975 if(this->comm->getSize() > 1) {
2976 // We're using explicit host here as Spectrum MPI would fail
2977 // with the prior HostMirror UVMSpace to UVMSpace setup.
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2986 try {
2987 reduceAll<int, mj_scalar_t>(
2988 *(this->comm),
2989 reductionOp,
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2993 }
2994 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996 }
2997 else {
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3003 });
3004 }
3005}
3006
3039template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040 typename mj_part_t, typename mj_node_t>
3041void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts/*p-1*/ ,
3046 mj_scalar_t global_weight,
3047 /*p - 1 sized, coordinate of each cut line*/
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049 /*cumulative weights, at left side of each cut line. p-1 sized*/
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057{
3058 mj_scalar_t coord_range = max_coord - min_coord;
3059
3060 // We decided we could keep some std::vectors around for now. Eventually
3061 // it would be nice to have everything just as views with some being device
3062 // and some host. This particular case needs a bit of work to get setup
3063 // in a cleaner way so not going to mess with it at the moment.
3064
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073 std::terminate();
3074 }
3075 }
3076
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080
3081 mj_scalar_t cumulative = 0;
3082
3083 if(bUniformPartsCheck) {
3084 // How many total future parts the part will be partitioned into.
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087
3088 // How much each part should weigh in ideal case.
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3091
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3095 }
3096 }
3097 else {
3098 // Sum of entries in the first level partition distribution vector
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3102 }
3103
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3108 }
3109 }
3110
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3112
3113 Kokkos::parallel_for("Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116 // set target part weight.
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3120 // set this multiple times but here for device handling
3121 current_target_part_weights(num_cuts) = global_weight;
3122 });
3123
3124 // round the target part weights.
3125 // Note need to discuss regarding DragonFly commits and determine if we
3126 // would not simply check mj_uniform_weights here.
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130 (0, num_cuts + 1),
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3134 });
3135 }
3136}
3137
3154template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155 typename mj_part_t, typename mj_node_t>
3156void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3166{
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168
3169 // if there is single point, or if all points are along a line.
3170 // set initial part to 0 for all.
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177 });
3178 }
3179 else {
3180 // otherwise estimate an initial part for each coordinate.
3181 // assuming uniform distribution of points.
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188 mj_part_t pp =
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1; // don't want last coord in an invalid part
3192 }
3193 mj_part_ids[iii] = 2 * pp;
3194 });
3195 }
3196}
3197
// Partition along the current dimension: iteratively move the cut lines of
// every concurrently-processed part until all cuts converge (i.e. until
// total_incomplete_cut_count reaches zero), globally reducing part weights
// and left/right closest points across MPI ranks each iteration.
// NOTE(review): this extraction is missing a few original lines (the
// declaration at 3226 and the construction at 3236 of reductionOp) --
// presumably a Teuchos combined reduction operator; confirm against the
// full repository source.
3212template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213 typename mj_part_t, typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223{
3224 this->temp_cut_coords = current_cut_coordinates;
3225
3227 *reductionOp = NULL;
3228
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3230
 // The MPI reduction operator is only needed (and built) with multiple
 // ranks; it takes a host copy of the per-part partition counts.
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3235 }
3237 <mj_part_t, mj_scalar_t>(
3238 &temp,
3239 current_work_part,
3240 current_concurrent_num_parts);
3241 }
3242
 // Local copies of member views so the device lambdas below capture the
 // views by value instead of dereferencing `this` on device.
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3266
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3270
 // Single-iteration device kernel: initialize counters, flags and the
 // lower/upper coordinate and weight bounds for every cut of every
 // concurrent part before the convergence loop begins.
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (int dummy) {
3274
3275 // these need to be initialized
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3278
3279 // initialize the lower and upper bounds of the cuts.
3280 mj_part_t next = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) = false;
3289 // min coordinate
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3292 // max coordinate
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3296 // total weight
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3303 }
3304 ++next;
3305 }
3306 }
3307 });
3308
3309 // loop_count allows the kernel to behave differently on the first loop
3310 // and subsequent loops. First loop we do a binary search and subsequent
3311 // loops we simply step towards our target.
3312 int loop_count = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3316 current_work_part,
3317 mj_current_dim_coords,
3318 loop_count);
3319 ++loop_count;
3320
3321 this->mj_combine_rightleft_and_weights(
3322 current_work_part,
3323 current_concurrent_num_parts);
3324
3325 // now sum up the results of mpi processors.
3326 if(!bSingleProcess) {
3327 // We're using explicit host here as Spectrum MPI would fail
3328 // with the prior HostMirror UVMSpace to UVMSpace setup.
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3335
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3338
 // Read the device-resident reduction-size scalar back to host via a
 // one-element parallel_reduce.
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce("Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3345
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3352 }
3353 else {
 // Single rank: the local totals are already the global totals.
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3356 }
3357
3358 // how much cut will be shifted for the next part in the concurrent
3359 // part calculation.
3360 mj_part_t cut_shift = 0;
3361
3362 // how much the concantaneted array will be shifted for the next part
3363 // in concurrent part calculation.
3364 size_t tlr_shift = 0;
3365
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3369
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3374
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3377
3378 //if the cuts of this cut has already been completed.
3379 //nothing to do for this part.
3380 //just update the shift amount and proceed.
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3386 continue;
3387 }
3388
 // Slice the concatenated work arrays down to this part's windows.
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3392 tlr_shift,
3393 this->total_part_weight_left_right_closests.size()));
3394
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396 Kokkos::subview(
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3399 tlr_shift,
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3405 num_total_part,
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414 current_global_tlr;
3415
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3419 cut_shift,
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3424 cut_shift + kk,
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3430 cut_shift,
3431 local_process_cut_line_weight_to_put_left.size()));
3432
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3435
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3440 cut_shift,
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3445 cut_shift,
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3450 cut_shift,
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3455 cut_shift,
3456 local_cut_lower_bound_coordinates.size()));
3457
3458 // Now compute the new cut coordinates.
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3467
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3470 kk,
3471 num_cuts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3487
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3490 } // end of kk loop
3491
 // Deduct the cuts that converged this iteration from the outstanding
 // count that drives the enclosing while loop.
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3496 }
3497
 // Swap the cut-coordinate buffer with the work array so the next pass
 // reads the freshly computed cuts.
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3504 });
3505 } // end of the while loop
3506
3507 // Needed only if keep_cuts; otherwise can simply swap array pointers
3508 // cutCoordinates and cutCoordinatesWork.
3509 // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3510 // computed cuts must be in cutCoordinates.
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(int dummy) {
3515 mj_part_t next = 0;
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523 }
3524 next += num_cuts;
3525 }
3526 for(int n = 0; n <
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529 }
3530 });
3531 }
3532
 // reductionOp is NULL in the single-process case; deleting NULL is a no-op.
3533 delete reductionOp;
3534}
3535
// Thin wrapper around a raw scalar array pointer, used as the value type of
// the MJ team reductions; it does NOT own the memory it points to.
// NOTE(review): this extraction is missing the struct declaration line
// (3537), the default-constructor body (3543) and the assignment-operator
// signature (3548) -- confirm against the full repository source.
3536template<class scalar_t>
3538 scalar_t * ptr;
3539
3540 // With new kokkos setup parallel_reduce will call empty constructor and
3541 // we update the ptr in the init method.
3542 KOKKOS_INLINE_FUNCTION
3544
 // Wrap an externally owned array.
3545 KOKKOS_INLINE_FUNCTION
3546 Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547
 // Shallow assignment: both objects refer to the same underlying array.
3549 ptr = zmj.ptr;
3550 return *this;
3551 }
3552};
3553
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3555
// Host-only custom Kokkos reducer that combines the MJ per-thread arrays:
// the first value_count_weights entries are summed, and the trailing
// right/left-closest pairs are combined with max (even index) and min
// (odd index). The right/left segment is padded with an extra pair at each
// end, which is why the combine loops start at value_count_weights + 2 and
// stop 2 entries short of the end.
// NOTE(review): this extraction is missing the struct declaration and some
// member/typedef lines (originals 3557-3564, 3578, 3624) -- confirm the
// declarations against the full repository source.
3556template<class policy_t, class scalar_t, class part_t>
3558
3561 scalar_t max_scalar;
3565
 // max_scalar is the sentinel magnitude used when seeding the min/max slots.
3566 KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3567 scalar_t mj_max_scalar,
3568 value_type &val,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3571 max_scalar(mj_max_scalar),
3572 value(&val),
3573 value_count_rightleft(mj_value_count_rightleft),
3574 value_count_weights(mj_value_count_weights)
3575 {}
3576
3577 KOKKOS_INLINE_FUNCTION
3579 return *value;
3580 }
3581
 // Combine: sum the weight segment, then take max/min over the closest
 // point pairs (skipping the padding pair at each end).
3582 KOKKOS_INLINE_FUNCTION
3583 void join(value_type& dst, const value_type& src) const {
3584 for(int n = 0; n < value_count_weights; ++n) {
3585 dst.ptr[n] += src.ptr[n];
3586 }
3587
3588 for(int n = value_count_weights + 2;
3589 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3590 if(src.ptr[n] > dst.ptr[n]) {
3591 dst.ptr[n] = src.ptr[n];
3592 }
3593 if(src.ptr[n+1] < dst.ptr[n+1]) {
3594 dst.ptr[n+1] = src.ptr[n+1];
3595 }
3596 }
3597 }
3598
 // Same combine, for the volatile-based join interface (presumably kept
 // for compatibility with older Kokkos reducer requirements -- confirm).
3599 KOKKOS_INLINE_FUNCTION
3600 void join (volatile value_type& dst, const volatile value_type& src) const {
3601 for(int n = 0; n < value_count_weights; ++n) {
3602 dst.ptr[n] += src.ptr[n];
3603 }
3604
3605 for(int n = value_count_weights + 2;
3606 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3607 if(src.ptr[n] > dst.ptr[n]) {
3608 dst.ptr[n] = src.ptr[n];
3609 }
3610 if(src.ptr[n+1] < dst.ptr[n+1]) {
3611 dst.ptr[n+1] = src.ptr[n+1];
3612 }
3613 }
3614 }
3615
 // Identity: zero the sums and seed the min/max slots with -max/+max
 // sentinels. dst.ptr must be re-pointed at this thread's storage first.
3616 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3617 dst.ptr = value->ptr; // must update ptr
3618
3619 for(int n = 0; n < value_count_weights; ++n) {
3620 dst.ptr[n] = 0;
3621 }
3622
3623 for(int n = value_count_weights;
3625 dst.ptr[n] = -max_scalar;
3626 dst.ptr[n+1] = max_scalar;
3627 }
3628 }
3629};
3630#endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
3631
3632template<class policy_t, class scalar_t, class part_t, class index_t,
3633 class device_t, class array_t>
3635 typedef typename policy_t::member_type member_type;
3636 typedef Kokkos::View<scalar_t*> scalar_view_t;
3637
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3639 typedef array_t value_type[];
3640#endif
3641
3643 array_t max_scalar;
3644
3652 Kokkos::View<index_t*, device_t> permutations;
3653 Kokkos::View<scalar_t *, device_t> coordinates;
3654 Kokkos::View<scalar_t**, device_t> weights;
3655 Kokkos::View<part_t*, device_t> parts;
3656 Kokkos::View<scalar_t *, device_t> cut_coordinates;
3657 Kokkos::View<index_t *, device_t> part_xadj;
3659 scalar_t sEpsilon;
3660
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3665#endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3666
3668 int mj_loop_count,
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3671 part_t mj_num_cuts,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3688#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3689 ) :
3690 loop_count(mj_loop_count),
3691 max_scalar(mj_max_scalar),
3692 concurrent_current_part(mj_concurrent_current_part),
3693 num_cuts(mj_num_cuts),
3694 current_work_part(mj_current_work_part),
3695 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3696 value_count_rightleft(mj_left_right_array_size),
3697 value_count_weights(mj_weight_array_size),
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3699 permutations(mj_permutations),
3700 coordinates(mj_coordinates),
3701 weights(mj_weights),
3702 parts(mj_parts),
3703 cut_coordinates(mj_cut_coordinates),
3704 part_xadj(mj_part_xadj),
3705 uniform_weights0(mj_uniform_weights0),
3706 sEpsilon(mj_sEpsilon)
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3711#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3712 {
3713 }
3714
3715 size_t team_shmem_size (int team_size) const {
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result = sizeof(array_t) *
3719#else
3720 int result = sizeof(array_t) *
3722#endif
3723
3724 // pad this to a multiple of 8 or it will run corrupt
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3728 }
3729 return result;
3730 }
3731
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734 void operator() (const member_type & teamMember) const {
3735#else
3736 void operator() (const member_type & teamMember, value_type teamSum) const {
3737#endif
3738
3739 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3741 index_t all_end = part_xadj(concurrent_current_part);
3742
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3745
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3748 stride += 1; // make sure we have coverage for the final points
3749 }
3750
3751 // the last team may have less work than the other teams
3752 // the last team can be empty (begin > end) if num_teams > stride
3753 // which is true for many teams and small numbers of coords (tests)
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3756 if(end > all_end) {
3757 end = all_end;
3758 }
3759
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3761 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3763
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3765 sh_mem_size);
3766
3767 // init the shared array to 0
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3769 for(int n = 0; n < value_count_weights; ++n) {
3770 shared_ptr[n] = 0;
3771 }
3772 for(int n = value_count_weights;
3774 shared_ptr[n] = -max_scalar;
3775 shared_ptr[n+1] = max_scalar;
3776 }
3777 });
3778 teamMember.team_barrier();
3779
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3782 [=] (index_t ii) {
3783#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3784 // create the team shared data - each thread gets one of the arrays
3785 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3786 value_count_rightleft) * teamMember.team_size();
3787
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3789 sh_mem_size);
3790
3791 // select the array for this thread
3792 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3794
3795 // create reducer which handles the Zoltan2_MJArrayType class
3797 max_scalar, array,
3800
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3803 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3804#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3805
3806 int i = permutations(ii);
3807 scalar_t coord = coordinates(i);
3808 array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3809
3810 // now check each part and it's right cut
3811 index_t part = parts(i)/2;
3812
3813 int upper = num_cuts;
3814 int lower = 0;
3815
3816 // binary search - find matching part
3817 while(true) {
3818 scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3819 scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3820
3821 if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3822#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3824#else
3825 threadSum.ptr[part*2] += w;
3826#endif
3827
3828 parts(i) = part*2;
3829
3830 // now handle the left/right closest part
3831#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832 array_t new_value = (array_t) coord;
3833 array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3836 &shared_ptr[value_count_weights + part * 2 + 1],
3837 prev_value, new_value);
3838 }
3839 prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3842 &shared_ptr[value_count_weights + part * 2 + 2],
3843 prev_value, new_value);
3844 }
3845#else
3846 // note cut to left needs to set right closest and cut to right needs
3847 // to set left closest. It's index +1 and +2 instead of -1 and +0
3848 // because right/left segment is padded with an extra pair at
3849 // begining and end to avoid branching with if checks.
3850 if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3851 threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3852 }
3853 if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3854 threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3855 }
3856#endif
3857
3858 break;
3859 }
3860 else if(part != num_cuts) {
3861 if(coord < b + sEpsilon && coord > b - sEpsilon) {
3862 // Note if on cut we set right/left closest to the cut itself
3863 // but we add +2 because we buffered the area with an extra slot
3864 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3865#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3867 shared_ptr[value_count_weights + part * 2 + 2] = b;
3868 shared_ptr[value_count_weights + part * 2 + 3] = b;
3869#else
3870 threadSum.ptr[part*2+1] += w;
3871 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3872 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3873#endif
3874
3875 parts(i) = part*2+1;
3876
3877 // Need to scan up for any other cuts of same coordinate
3878 // This is costly but it's only relevant for the fix4785 test
3879 // which loads a lot of coordinates on the same point, so without
3880 // this our cuts would all just sit at 0.
3881 part_t base_b = part;
3882 scalar_t base_coord = cut_coordinates(base_b);
3883 part += 1;
3884 while(part < num_cuts) {
3885 b = cut_coordinates(part);
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3888 if(delta < sEpsilon) {
3889 // Note if on cut we set right/left closest to the cut itself
3890 // but we add +2 because we buffered the area with an extra slot
3891 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3892#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3894 shared_ptr[value_count_weights + part * 2 + 2] = b;
3895 shared_ptr[value_count_weights + part * 2 + 3] = b;
3896#else
3897 threadSum.ptr[part*2+1] += w;
3898 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3899 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3900#endif
3901 }
3902 else { break; }
3903 ++part;
3904 }
3905 part = base_b - 1;
3906 while(part >= 0) {
3907 b = cut_coordinates(part);
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3910 if(delta < sEpsilon) {
3911 // Note if on cut we set right/left closest to the cut itself
3912 // but we add +2 because we buffered the area with an extra slot
3913 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3914#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3916 shared_ptr[value_count_weights + part * 2 + 2] = b;
3917 shared_ptr[value_count_weights + part * 2 + 3] = b;
3918#else
3919 threadSum.ptr[part*2+1] += w;
3920 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3921 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3922#endif
3923 }
3924 else { break; }
3925 --part;
3926 }
3927
3928 break;
3929 }
3930 }
3931
3932 if(loop_count != 0) {
3933 // subsequent loops can just step towards target
3934 if(coord < b) {
3935 part -= 1;
3936 }
3937 else {
3938 part += 1;
3939 }
3940 }
3941 else {
3942 // initial loop binary search
3943 if(coord < b) {
3944 if(part == lower + 1) {
3945 part = lower;
3946 }
3947 else {
3948 upper = part - 1;
3949 part -= (part - lower)/2;
3950 }
3951 }
3952 else if(part == upper - 1) {
3953 part = upper;
3954 }
3955 else {
3956 lower = part + 1;
3957 part += (upper - part)/2;
3958 }
3959 }
3960 }
3961#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3962 });
3963#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3964 }, arraySumReducer);
3965#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3966
3967 teamMember.team_barrier();
3968
3969 // collect all the team's results
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3971 for(int n = 0; n < value_count_weights; ++n) {
3972#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973 Kokkos::atomic_add(&current_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3975#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3976 teamSum[n] += array.ptr[n];
3977#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3978 }
3979
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 int insert_left = 0;
3982 int insert_right = 0;
3983#endif
3984
3985 for(int n = 2 + value_count_weights;
3986 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3987#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 &current_right_closest(insert_right), prev_value, new_value);
3993 }
3994
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 &current_left_closest(insert_left), prev_value, new_value);
4000 }
4001
4002 ++insert_left;
4003 ++insert_right;
4004#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4007 }
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4010 }
4011#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4012 }
4013 });
4014
4015 teamMember.team_barrier();
4016 }
4017
4018#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019 KOKKOS_INLINE_FUNCTION
4020 void join(value_type dst, const value_type src) const {
4021 for(int n = 0; n < value_count_weights; ++n) {
4022 dst[n] += src[n];
4023 }
4024
4025 for(int n = value_count_weights + 2;
4026 n < value_count_weights + value_count_rightleft - 2; n += 2) {
4027 if(src[n] > dst[n]) {
4028 dst[n] = src[n];
4029 }
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4032 }
4033 }
4034 }
4035
4036 KOKKOS_INLINE_FUNCTION
4037 void join (volatile value_type dst, const volatile value_type src) const {
4038 for(int n = 0; n < value_count_weights; ++n) {
4039 dst[n] += src[n];
4040 }
4041
4042 for(int n = value_count_weights + 2;
4043 n < value_count_weights + value_count_rightleft - 2; n += 2) {
4044 if(src[n] > dst[n]) {
4045 dst[n] = src[n];
4046 }
4047 if(src[n+1] < dst[n+1]) {
4048 dst[n+1] = src[n+1];
4049 }
4050 }
4051 }
4052
4053 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4054 for(int n = 0; n < value_count_weights; ++n) {
4055 dst[n] = 0;
4056 }
4057
4058 for(int n = value_count_weights;
4060 dst[n] = -max_scalar;
4061 dst[n+1] = max_scalar;
4062 }
4063 }
4064#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4065};
4066
// Computes, for each concurrent part with undetermined cuts, the weight of
// every part/cut slot and the closest coordinate on the left and right of
// each cut, via one team-based kernel per concurrent part; afterwards a
// device kernel prefix-sums the part weights (collapsing cuts that share a
// coordinate so their common weight is only counted once).
// NOTE(review): this chunk lost lines in extraction -- the parameter list
// appears to be missing `mj_part_t current_concurrent_num_parts,` (the body
// uses that name), and the ReduceWeightsFunctor construction below appears
// to be missing some arguments. Verify against the upstream file before
// building.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
  mj_1D_part_get_part_weights(
  mj_part_t current_work_part,
  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
  int loop_count)
{
  auto local_is_cut_line_determined = is_cut_line_determined;
  auto local_thread_part_weights = thread_part_weights;
  auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
  auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;

  // Create some locals so we don't use this inside the kernels
  // which causes problems
  auto local_sEpsilon = this->sEpsilon;
  auto local_assigned_part_ids = this->assigned_part_ids;
  auto local_coordinate_permutations = this->coordinate_permutations;
  auto local_mj_weights = this->mj_weights;
  auto local_part_xadj = this->part_xadj;
  auto local_global_min_max_coord_total_weight =
    this->global_min_max_coord_total_weight;

  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;

  auto local_device_num_partitioning_in_current_dim =
    device_num_partitioning_in_current_dim;

  // mirror host-side incomplete-cut counts to the device for the kernels
  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
  auto local_device_incomplete_cut_count = device_incomplete_cut_count;

  // running offsets into the concatenated per-concurrent-part arrays
  mj_part_t total_part_shift = 0;

  mj_part_t concurrent_cut_shifts = 0;
  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
    // window of temp_cut_coords belonging to this concurrent part
    Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
      Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
        concurrent_cut_shifts, temp_cut_coords.size()));

    mj_part_t num_parts =
      host_num_partitioning_in_current_dim(current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;
    mj_part_t weight_array_length = num_cuts + num_parts;

    // for right/left closest + buffer cut on either side
    mj_part_t right_left_array_length = (num_cuts + 2) * 2;

    if(this->incomplete_cut_count(kk) == 0) {
      // all cuts already determined; skip the kernel but keep the offsets
      // in sync with the concatenated arrays
      total_part_shift += total_part_count;
      concurrent_cut_shifts += num_cuts;
      continue;
    }

    // if not set use 60 - was initial testing amount but somewhat arbitrary
    auto policy_ReduceWeightsFunctor = policy_t(
      mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    int total_array_length =
      weight_array_length + right_left_array_length;
#endif

    // Using float here caused some numerical errors for coord on cut calculations.
    // Probably that can be fixed with proper epsilon adjustment but since cuda
    // doesn't reduce right now the shared memory pressure is no longer relevant.
    // Just use scalar_t to match the original algorithm.
    typedef mj_scalar_t array_t;

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // host path: the team functor reduces into this flat scratch array
    array_t * reduce_array =
      new array_t[static_cast<size_t>(total_array_length)];
#endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP

    // cut offset of this concurrent part within the concatenated cut arrays
    int offset_cuts = 0;
    for(int kk2 = 0; kk2 < kk; ++kk2) {
      offset_cuts +=
        host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
    }
    Kokkos::View<double *, device_t> my_current_part_weights =
      Kokkos::subview(local_thread_part_weights,
        std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
          total_part_shift + total_part_count));
    Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
      Kokkos::subview(local_thread_cut_left_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_left_closest_point.size()));
    Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
      Kokkos::subview(local_thread_cut_right_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_right_closest_point.size()));

    array_t max_scalar = std::numeric_limits<array_t>::max();

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // initialize values
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
      KOKKOS_LAMBDA (int dummy) {
      for(int n = 0; n < weight_array_length; ++n) {
        my_current_part_weights(n) = 0;
      }
      for(int n = 0; n < num_cuts; ++n) {
        my_current_left_closest(n) = -max_scalar;
        my_current_right_closest(n) = max_scalar;
      }
    });
#endif

    mj_part_t concurrent_current_part =
      current_work_part + kk;

    // NOTE(review): some constructor arguments appear to have been dropped
    // here in extraction (lines between max_scalar/num_cuts and
    // right_left_array_length) -- verify against the upstream file.
    ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
      typename mj_node_t::device_type, array_t>
      teamFunctor(
        loop_count,
        max_scalar,
        num_cuts,
        right_left_array_length,
        weight_array_length,
        coordinate_permutations,
        mj_current_dim_coords,
        mj_weights,
        assigned_part_ids,
        local_temp_cut_coords,
        part_xadj,
        mj_uniform_weights(0), // host and currently only relevant to slot 0
        sEpsilon
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
        ,my_current_part_weights,
        my_current_left_closest,
        my_current_right_closest
#endif
      );

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // device path: the functor writes directly/atomically into the views
    Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
#else
    Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
      teamFunctor, reduce_array);
#endif

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // host path: scatter the flat reduction result back into the views
    auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);

    for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
      hostArray(i) = reduce_array[i];
    }

    Kokkos::deep_copy(my_current_part_weights, hostArray);

    auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
    auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
    // skip the leading buffer slot pair, hence (cut+1)*2
    for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
      hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
      hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
    }
    Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
    Kokkos::deep_copy(my_current_right_closest, hostRightArray);

    delete [] reduce_array;
#endif

    total_part_shift += total_part_count;
    concurrent_cut_shifts += num_cuts;
  }

  auto local_temp_cut_coords = temp_cut_coords;

  // prefix-sum the part weights per concurrent part (device side)
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
    (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
    mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
      current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;

    if(local_device_incomplete_cut_count(kk) > 0) {
      // get the prefix sum
      // This is an inefficiency but not sure if it matters much
      size_t offset = 0;
      size_t offset_cuts = 0;
      for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
        auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
          current_work_part + kk2);
        offset += num_parts_kk2 * 2 - 1;
        offset_cuts += num_parts_kk2 - 1;
      }

      for(mj_part_t i = 1; i < total_part_count; ++i) {
        // check for cuts sharing the same position; all cuts sharing a position
        // have the same weight == total weight for all cuts sharing the
        // position. Don't want to accumulate that total weight more than once.
        if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
          std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
            local_temp_cut_coords(offset_cuts + i /2 - 1))
            < local_sEpsilon) {
          // i % 2 = 0 when part i represents the cut coordinate.
          // if it is a cut, and if next cut also has the same coordinate, then
          // dont addup.
          local_thread_part_weights(offset + i)
            = local_thread_part_weights(offset + i-2);
          continue;
        }

        // otherwise do the prefix sum.
        local_thread_part_weights(offset + i) +=
          local_thread_part_weights(offset + i-1);
      }
    }
  });
}
4292
4300template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4301 typename mj_part_t, typename mj_node_t>
4302void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4303 mj_combine_rightleft_and_weights(
4304 mj_part_t current_work_part,
4306{
4307 auto local_thread_part_weights = this->thread_part_weights;
4308 auto local_is_cut_line_determined = this->is_cut_line_determined;
4309 auto local_thread_cut_left_closest_point =
4310 this->thread_cut_left_closest_point;
4311 auto local_thread_cut_right_closest_point =
4312 this->thread_cut_right_closest_point;
4313 auto local_total_part_weight_left_right_closests =
4314 this->total_part_weight_left_right_closests;
4315 auto local_device_num_partitioning_in_current_dim =
4316 device_num_partitioning_in_current_dim;
4317 Kokkos::parallel_for(
4318 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4319 KOKKOS_LAMBDA (int dummy) {
4320
4321 size_t tlr_array_shift = 0;
4322 mj_part_t cut_shift = 0;
4323 size_t total_part_array_shift = 0;
4324
4325 // iterate for all concurrent parts to find the left and right closest
4326 // points in the process.
4327 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4328
4329 mj_part_t num_parts_in_part =
4330 local_device_num_partitioning_in_current_dim(current_work_part + i);
4331 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4332 size_t num_total_part_in_part =
4333 num_parts_in_part + size_t (num_cuts_in_part);
4334
4335 // iterate for cuts in a single part.
4336 for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4337 mj_part_t next = tlr_array_shift + ii;
4338 mj_part_t cut_index = cut_shift + ii;
4339
4340 if(!local_is_cut_line_determined(cut_index)) {
4341 mj_scalar_t left_closest_in_process =
4342 local_thread_cut_left_closest_point(cut_index);
4343 mj_scalar_t right_closest_in_process =
4344 local_thread_cut_right_closest_point(cut_index);
4345
4346 // store the left and right closes points.
4347 local_total_part_weight_left_right_closests(
4348 num_total_part_in_part + next) = left_closest_in_process;
4349
4350 local_total_part_weight_left_right_closests(
4351 num_total_part_in_part + num_cuts_in_part + next) =
4352 right_closest_in_process;
4353 }
4354 }
4355
4356 for(size_t j = 0; j < num_total_part_in_part; ++j) {
4357 mj_part_t cut_ind = j / 2 + cut_shift;
4358
4359 // need to check j != num_total_part_in_part - 1
4360 // which is same as j/2 != num_cuts_in_part.
4361 // we cannot check it using cut_ind, because of the concurrent part
4362 // concantanetion.
4363 if(j == num_total_part_in_part - 1 ||
4364 !local_is_cut_line_determined(cut_ind)) {
4365 double pwj = local_thread_part_weights(total_part_array_shift + j);
4366 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4367 }
4368 }
4369
4370 // set the shift position in the arrays
4371 cut_shift += num_cuts_in_part;
4372 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4373 total_part_array_shift += num_total_part_in_part;
4374 }
4375 });
4376}
4377
4390template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4391 typename mj_part_t, typename mj_node_t>
4392KOKKOS_INLINE_FUNCTION
4393void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4394 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4395 mj_scalar_t cut_lower_bound,
4396 mj_scalar_t cut_upper_weight,
4397 mj_scalar_t cut_lower_weight,
4398 mj_scalar_t expected_weight,
4399 mj_scalar_t &new_cut_position,
4400 mj_scalar_t sEpsilon) {
4401
4402 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4403 new_cut_position = cut_upper_bound; //or lower bound does not matter.
4404 }
4405
4406 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4407 new_cut_position = cut_lower_bound;
4408 }
4409
4410 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4411 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4412 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4413
4414 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4415 int scale_constant = 20;
4416 int shiftint= int (required_shift * scale_constant);
4417 if(shiftint == 0) shiftint = 1;
4418 required_shift = mj_scalar_t (shiftint) / scale_constant;
4419 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4420}
4421
4422#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4423
4424template<class policy_t, class scalar_t>
4426
4431
4432 KOKKOS_INLINE_FUNCTION ArrayReducer(
4433 value_type &val,
4434 int mj_value_count) :
4435 value(&val),
4436 value_count(mj_value_count)
4437 {}
4438
4439 KOKKOS_INLINE_FUNCTION
4441 return *value;
4442 }
4443
4444 KOKKOS_INLINE_FUNCTION
4445 void join(value_type& dst, const value_type& src) const {
4446 for(int n = 0; n < value_count; ++n) {
4447 dst.ptr[n] += src.ptr[n];
4448 }
4449 }
4450
4451 KOKKOS_INLINE_FUNCTION
4452 void join (volatile value_type& dst, const volatile value_type& src) const {
4453 for(int n = 0; n < value_count; ++n) {
4454 dst.ptr[n] += src.ptr[n];
4455 }
4456 }
4457
4458 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4459 dst.ptr = value->ptr; // must update ptr
4460 for(int n = 0; n < value_count; ++n) {
4461 dst.ptr[n] = 0;
4462 }
4463 }
4464};
4465
4466#endif
4467
4468template<class policy_t, class scalar_t, class part_t, class index_t,
4469 class device_t, class array_t>
4471 typedef typename policy_t::member_type member_type;
4472 typedef Kokkos::View<scalar_t*> scalar_view_t;
4473
4474#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4475 typedef array_t value_type[];
4476#endif
4477
4480 Kokkos::View<index_t*, device_t> permutations;
4481 Kokkos::View<scalar_t *, device_t> coordinates;
4482 Kokkos::View<part_t*, device_t> parts;
4483 Kokkos::View<index_t *, device_t> part_xadj;
4484 Kokkos::View<index_t *, device_t> track_on_cuts;
4485
4486#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4487 Kokkos::View<int *, device_t> local_point_counts;
4488#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4489
4491 part_t mj_concurrent_current_part,
4492 part_t mj_weight_array_size,
4493 Kokkos::View<index_t*, device_t> & mj_permutations,
4494 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4495 Kokkos::View<part_t*, device_t> & mj_parts,
4496 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4497 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4498#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4499 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4500#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4501 ) :
4502 concurrent_current_part(mj_concurrent_current_part),
4503 value_count(mj_weight_array_size),
4504 permutations(mj_permutations),
4505 coordinates(mj_coordinates),
4506 parts(mj_parts),
4507 part_xadj(mj_part_xadj),
4508 track_on_cuts(mj_track_on_cuts)
4509#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510 ,local_point_counts(mj_local_point_counts)
4511#endif
4512 {
4513 }
4514
4515 size_t team_shmem_size (int team_size) const {
4516#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4517 int result = sizeof(array_t) * (value_count);
4518#else
4519 int result = sizeof(array_t) * (value_count) * team_size;
4520#endif
4521
4522 // pad this to a multiple of 8 or it will run corrupt
4523 int remainder = result % 8;
4524 if(remainder != 0) {
4525 result += 8 - remainder;
4526 }
4527 return result;
4528 }
4529
4530 KOKKOS_INLINE_FUNCTION
4531#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532 void operator() (const member_type & teamMember) const {
4533#else
4534 void operator() (const member_type & teamMember, value_type teamSum) const {
4535#endif
4536 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4538 index_t all_end = part_xadj(concurrent_current_part);
4539
4540 index_t num_working_points = all_end - all_begin;
4541 int num_teams = teamMember.league_size();
4542
4543 index_t stride = num_working_points / num_teams;
4544 if((num_working_points % num_teams) > 0) {
4545 stride += 1; // make sure we have coverage for the final points
4546 }
4547
4548 index_t begin = all_begin + stride * teamMember.league_rank();
4549 index_t end = begin + stride;
4550 if(end > all_end) {
4551 end = all_end; // the last team may have less work than the other teams
4552 }
4553
4554 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4555
4556 // create the team shared data - each thread gets one of the arrays
4557#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4558 size_t sh_mem_size = sizeof(array_t) * (value_count);
4559#else
4560 size_t sh_mem_size =
4561 sizeof(array_t) * (value_count) * teamMember.team_size();
4562#endif
4563
4564 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4565 sh_mem_size);
4566
4567#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4568 // init the shared array to 0
4569 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4570 for(int n = 0; n < value_count; ++n) {
4571 shared_ptr[n] = 0;
4572 }
4573 });
4574 teamMember.team_barrier();
4575
4576 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4577 [=] (index_t ii) {
4578#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4579 // select the array for this thread
4580 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4581 (value_count)]);
4582
4583 // create reducer which handles the Zoltan2_MJArrayType class
4584 ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4585
4586 Kokkos::parallel_reduce(
4587 Kokkos::TeamThreadRange(teamMember, begin, end),
4588 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4589#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4590
4591 index_t coordinate_index = permutations(ii);
4592 part_t place = parts(coordinate_index);
4593 part_t part = place / 2;
4594 if(place % 2 == 0) {
4595#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596 Kokkos::atomic_add(&shared_ptr[part], 1);
4597#else
4598 threadSum.ptr[part] += 1;
4599#endif
4600
4601 parts(coordinate_index) = part;
4602 }
4603 else {
4604 // fill a tracking array so we can process these slower points
4605 // in next cycle
4606 index_t set_index = Kokkos::atomic_fetch_add(
4607 &track_on_cuts(track_on_cuts_insert_index), 1);
4608 track_on_cuts(set_index) = ii;
4609 }
4610#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4611 });
4612#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4613 }, arrayReducer);
4614#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4615
4616 teamMember.team_barrier();
4617
4618 // collect all the team's results
4619 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4620 for(int n = 0; n < value_count; ++n) {
4621#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4622 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4623#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4624 teamSum[n] += array.ptr[n];
4625#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4626 }
4627 });
4628
4629 teamMember.team_barrier();
4630 }
4631
4632#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4633
4634 KOKKOS_INLINE_FUNCTION
4635 void join(value_type dst, const value_type src) const {
4636 for(int n = 0; n < value_count; ++n) {
4637 dst[n] += src[n];
4638 }
4639 }
4640
4641 KOKKOS_INLINE_FUNCTION
4642 void join (volatile value_type dst, const volatile value_type src) const {
4643 for(int n = 0; n < value_count; ++n) {
4644 dst[n] += src[n];
4645 }
4646 }
4647
4648 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4649 for(int n = 0; n < value_count; ++n) {
4650 dst[n] = 0;
4651 }
4652 }
4653#endif
4654};
4655
4671template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4672 typename mj_part_t, typename mj_node_t>
4673void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4674mj_create_new_partitions(
4675 mj_part_t num_parts,
4676 mj_part_t current_concurrent_work_part,
4677 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4678 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4679 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4680 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4681{
4682 // Get locals for cuda
4683 auto local_thread_part_weight_work = this->thread_part_weight_work;
4684 auto local_point_counts = this->thread_point_counts;
4685 auto local_distribute_points_on_cut_lines =
4686 this->distribute_points_on_cut_lines;
4687 auto local_thread_cut_line_weight_to_put_left =
4688 this->thread_cut_line_weight_to_put_left;
4689 auto local_sEpsilon = this->sEpsilon;
4690 auto local_coordinate_permutations = this->coordinate_permutations;
4691 auto local_mj_weights = this->mj_weights;
4692 auto local_assigned_part_ids = this->assigned_part_ids;
4693 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4694
4695 mj_part_t num_cuts = num_parts - 1;
4696
4697 Kokkos::parallel_for(
4698 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4699 KOKKOS_LAMBDA(int dummy) {
4700
4701 if(local_distribute_points_on_cut_lines) {
4702 for(int i = 0; i < num_cuts; ++i) {
4703 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4704 if(left_weight > local_sEpsilon) {
4705 // the weight of thread ii on cut.
4706 mj_scalar_t thread_ii_weight_on_cut =
4707 local_thread_part_weight_work(i * 2 + 1) -
4708 local_thread_part_weight_work(i * 2);
4709
4710 if(thread_ii_weight_on_cut < left_weight) {
4711 // if left weight is bigger than threads weight on cut.
4712 local_thread_cut_line_weight_to_put_left(i) =
4713 thread_ii_weight_on_cut;
4714 }
4715 else {
4716 // if thread's weight is bigger than space, then put only a portion.
4717 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4718 }
4719 left_weight -= thread_ii_weight_on_cut;
4720 }
4721 else {
4722 local_thread_cut_line_weight_to_put_left(i) = 0;
4723 }
4724 }
4725
4726 // this is a special case. If cutlines share the same coordinate,
4727 // their weights are equal. We need to adjust the ratio for that.
4728 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4729 if(std::abs(current_concurrent_cut_coordinate(i) -
4730 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4731 local_thread_cut_line_weight_to_put_left(i) -=
4732 local_thread_cut_line_weight_to_put_left(i - 1);
4733 }
4734 local_thread_cut_line_weight_to_put_left(i) =
4735 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4736 least_signifiance) * significance_mul) /
4737 static_cast<mj_scalar_t>(significance_mul);
4738 }
4739 }
4740
4741 for(mj_part_t i = 0; i < num_parts; ++i) {
4742 local_point_counts(i) = 0;
4743 }
4744 });
4745
4746 mj_lno_t coordinate_begin_index =
4747 current_concurrent_work_part == 0 ? 0 :
4748 host_part_xadj(current_concurrent_work_part - 1);
4749 mj_lno_t coordinate_end_index =
4750 host_part_xadj(current_concurrent_work_part);
4751
4752 mj_lno_t total_on_cut;
4753 Kokkos::parallel_reduce("Get total_on_cut",
4754 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4755 coordinate_begin_index, coordinate_end_index),
4756 KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4757 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4758 mj_part_t coordinate_assigned_place =
4759 local_assigned_part_ids(coordinate_index);
4760 if(coordinate_assigned_place % 2 == 1) {
4761 val += 1;
4762 }
4763 }, total_on_cut);
4764
4765 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4766 if(total_on_cut > 0) {
4767 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4768 "track_on_cuts", // would do WithoutInitialization but need last init to 0
4769 total_on_cut + 1); // extra index to use for tracking
4770 }
4771
4772 // here we need to parallel reduce an array to count coords in each part
4773 // atomically adding, especially for low part count would kill us
4774 // in the original setup we kept arrays allocated for each thread but for
4775 // the cuda version we'd like to avoid allocating N arrays for the number
4776 // of teams/threads which would be complicated based on running openmp or
4777 // cuda.
4778 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4779
4780 // if not set use 60 - somewhat arbitrary based on initial performance tests
4781 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4782
4783 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4784 typedef int array_t;
4785
4786 // just need parts - on the cuts will be handled in a separate serial
4787 // call after this.
4788#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4789 array_t * reduce_array = new array_t[static_cast<size_t>(num_parts)];
4790#endif
4791
4792 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4793 typename mj_node_t::device_type, array_t>teamFunctor(
4794 current_concurrent_work_part,
4795 num_parts,
4796 coordinate_permutations,
4797 mj_current_dim_coords,
4798 assigned_part_ids,
4799 part_xadj,
4800 track_on_cuts
4801#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4802 ,local_point_counts
4803#endif
4804 );
4805
4806#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4807 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4808#else
4809 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4810#endif
4811
4812#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4813 for(mj_part_t part = 0; part < num_parts; ++part) {
4814 local_point_counts(part) = reduce_array[part];
4815 }
4816 delete [] reduce_array;
4817#endif
4818
4819 // the last member is utility used for atomically inserting the values.
4820 // Sorting here avoids potential indeterminancy in the partitioning results
4821 if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4822 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4823 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4824 Kokkos::sort(track_on_cuts_sort);
4825 }
4826
4827 bool uniform_weights0 = this->mj_uniform_weights(0);
4828 Kokkos::parallel_for(
4829 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4830 KOKKOS_LAMBDA (int dummy) {
4831
4832 for(int j = 0; j < total_on_cut; ++j) {
4833 int ii = track_on_cuts(j);
4834 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4835 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4836 local_mj_weights(coordinate_index,0);
4837 mj_part_t coordinate_assigned_place =
4838 local_assigned_part_ids(coordinate_index);
4839 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4840 // if it is on the cut.
4841 if(local_distribute_points_on_cut_lines &&
4842 local_thread_cut_line_weight_to_put_left(
4843 coordinate_assigned_part) > local_sEpsilon) {
4844 // if the rectilinear partitioning is allowed,
4845 // and the thread has still space to put on the left of the cut
4846 // then thread puts the vertex to left.
4847 local_thread_cut_line_weight_to_put_left(
4848 coordinate_assigned_part) -= coordinate_weight;
4849 // if putting the vertex to left increased the weight more
4850 // than expected, and if the next cut is on the same coordinate,
4851 // then we need to adjust how much weight next cut puts to its left as
4852 // well, in order to take care of the imbalance.
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4855 num_cuts - 1 &&
4856 std::abs(current_concurrent_cut_coordinate(
4857 coordinate_assigned_part+1) -
4858 current_concurrent_cut_coordinate(
4859 coordinate_assigned_part)) < local_sEpsilon)
4860 {
4861 local_thread_cut_line_weight_to_put_left(
4862 coordinate_assigned_part + 1) +=
4863 local_thread_cut_line_weight_to_put_left(
4864 coordinate_assigned_part);
4865 }
4866 ++local_point_counts(coordinate_assigned_part);
4867 local_assigned_part_ids(coordinate_index) =
4868 coordinate_assigned_part;
4869 }
4870 else {
4871 // if there is no more space on the left, put the coordinate to the
4872 // right of the cut.
4873 ++coordinate_assigned_part;
4874 // this while loop is necessary when a line is partitioned into more
4875 // than 2 parts.
4876 while(local_distribute_points_on_cut_lines &&
4877 coordinate_assigned_part < num_cuts)
4878 {
4879 // traverse all the cut lines having the same partitiong
4880 if(std::abs(current_concurrent_cut_coordinate(
4881 coordinate_assigned_part) -
4882 current_concurrent_cut_coordinate(
4883 coordinate_assigned_part - 1)) < local_sEpsilon)
4884 {
4885 // if line has enough space on left, put it there.
4886 if(local_thread_cut_line_weight_to_put_left(
4887 coordinate_assigned_part) > local_sEpsilon &&
4888 local_thread_cut_line_weight_to_put_left(
4889 coordinate_assigned_part) >=
4890 std::abs(local_thread_cut_line_weight_to_put_left(
4891 coordinate_assigned_part) - coordinate_weight))
4892 {
4893 local_thread_cut_line_weight_to_put_left(
4894 coordinate_assigned_part) -= coordinate_weight;
4895 // Again if it put too much on left of the cut,
4896 // update how much the next cut sharing the same coordinate will
4897 // put to its left.
4898 if(local_thread_cut_line_weight_to_put_left(
4899 coordinate_assigned_part) < 0 &&
4900 coordinate_assigned_part < num_cuts - 1 &&
4901 std::abs(current_concurrent_cut_coordinate(
4902 coordinate_assigned_part+1) -
4903 current_concurrent_cut_coordinate(
4904 coordinate_assigned_part)) < local_sEpsilon)
4905 {
4906 local_thread_cut_line_weight_to_put_left(
4907 coordinate_assigned_part + 1) +=
4908 local_thread_cut_line_weight_to_put_left(
4909 coordinate_assigned_part);
4910 }
4911 break;
4912 }
4913 }
4914 else {
4915 break;
4916 }
4917 ++coordinate_assigned_part;
4918 }
4919 local_point_counts(coordinate_assigned_part) += 1;
4920 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4921 }
4922 }
4923
4924 for(int j = 0; j < num_parts; ++j) {
4925 out_part_xadj(j) = local_point_counts(j);
4926 local_point_counts(j) = 0;
4927
4928 if(j != 0) {
4929 out_part_xadj(j) += out_part_xadj(j - 1);
4930 local_point_counts(j) += out_part_xadj(j - 1);
4931 }
4932 }
4933 });
4934
4935 // here we will determine insert indices for N teams
4936 // then all the teams can fill
4937
4938#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4939
4940 // This is the fastest so far - just straight atomic writes for CUDA
4941 // However this is not a deterministic result since it is atomic.
4942 // The final result will be deterministic.
4943 Kokkos::parallel_for(
4944 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4945 coordinate_begin_index, coordinate_end_index),
4946 KOKKOS_LAMBDA (mj_lno_t ii) {
4947 mj_lno_t i = local_coordinate_permutations(ii);
4948 mj_part_t p = local_assigned_part_ids(i);
4949 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4950 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4951 });
4952
4953#else
4954
4955#ifdef KOKKOS_ENABLE_OPENMP
4956 // will return and fix this - revert back to 1 for clear auto testing
4957 const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4958#else
4959 const int num_threads = 1;
4960#endif
4961
4962 const int num_teams = 1; // cuda is handled above using a different format
4963
4964 // allow init - we want all 0's first
4965 Kokkos::View<mj_lno_t*, device_t>
4966 point_counter("insert indices", num_teams * num_threads * num_parts);
4967
4968 // count how many coords per thread
4969 // then we will fill each independently
4970 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4971 block_policy(num_teams, num_threads);
4972 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4973 member_type member_type;
4974 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4975 mj_lno_t block_size = range / num_teams + 1;
4976 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4977 int team = team_member.league_rank();
4978 int team_offset = team * num_threads * num_parts;
4979 mj_lno_t begin = coordinate_begin_index + team * block_size;
4980 mj_lno_t end = begin + block_size;
4981 if(end > coordinate_end_index) {
4982 end = coordinate_end_index;
4983 }
4984
4985 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4986 [=] (mj_lno_t ii) {
4987 int thread = team_member.team_rank();
4988 mj_lno_t i = local_coordinate_permutations(ii);
4989 mj_part_t p = local_assigned_part_ids(i);
4990 int index = team_offset + thread * num_parts + p;
4991 ++point_counter(index);
4992 });
4993 });
4994
4995 // now prefix sum
4996 // we currently have the counts in the slots
4997 // we want the first counter for each part to be 0
4998 // then the rest should be the sum of all the priors
4999 Kokkos::parallel_for(
5000 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5001 KOKKOS_LAMBDA (int dummy) {
5002 int num_sets = point_counter.size() / num_parts;
5003 for(int set = num_sets - 1; set >= 1; set -=1) {
5004 int base = set * num_parts;
5005 for(int part = 0; part < num_parts; ++part) {
5006 point_counter(base + part) = point_counter(base + part - num_parts);
5007 }
5008 }
5009
5010 for(int part = 0; part < num_parts; ++part) {
5011 point_counter(part) = 0;
5012 }
5013
5014 for(int set = 1; set < num_sets; ++set) {
5015 int base = set * num_parts;
5016 for(int part = 0; part < num_parts; ++part) {
5017 point_counter(base + part) += point_counter(base + part - num_parts);
5018 }
5019 }
5020 });
5021
5022 // now permute
5023 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5024 int team = team_member.league_rank();
5025 int team_offset = team * num_threads * num_parts;
5026 mj_lno_t begin = coordinate_begin_index + team * block_size;
5027 mj_lno_t end = begin + block_size;
5028 if(end > coordinate_end_index) {
5029 end = coordinate_end_index;
5030 }
5031 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5032 [=] (mj_lno_t ii) {
5033 int thread = team_member.team_rank();
5034 mj_lno_t i = local_coordinate_permutations(ii);
5035 mj_part_t p = local_assigned_part_ids(i);
5036 int index = team_offset + thread * num_parts + p;
5037 int set_counter = (point_counter(index)++) + local_point_counts(p);
5038 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5039 });
5040 });
5041#endif
5042}
5043
// Performs one step of the weighted-bisection search for every cut of the
// kk'th concurrently-processed part.  For each not-yet-determined cut it
// tightens the upper/lower coordinate bounds using the weights accumulated
// at neighboring cuts, computes a new trial cut position, and marks the cut
// "determined" when the imbalance is within tolerance or the cut stops
// moving.  Cuts that land exactly on a heavy coordinate line are finalized
// by distributing that line's weight across ranks (Teuchos::scan below).
// NOTE(review): three physical lines lost in extraction (the
// current_concurrent_num_parts parameter and the two index expressions
// using it) were restored from the upstream layout — confirm against the
// repository copy.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
  mj_node_t>::mj_get_new_cut_coordinates(
  mj_part_t current_concurrent_num_parts,
  mj_part_t kk,
  const mj_part_t &num_cuts,
  const double &used_imbalance_tolerance,
  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
  Kokkos::View<bool *, device_t> & current_cut_line_determined,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
  Kokkos::View<mj_scalar_t *, device_t> &
    current_part_cut_line_weight_to_put_left,
  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
{
  // Push the host-side incomplete-cut counters to the device; the kernels
  // below decrement them atomically as cuts get determined, and the result
  // is copied back at the end of this function.
  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);

  // Local copies of members so the device lambdas capture values, not `this`.
  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
  auto local_sEpsilon = sEpsilon;
  auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
  auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
  auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
  auto local_global_min_max_coord_total_weight =
    global_min_max_coord_total_weight;

  const auto _sEpsilon = this->sEpsilon;
  // Note for a 22 part system I tried removing the outer loop
  // and doing each sub loop as a simple parallel_for over num_cuts.
  // But that was about twice as slow (10ms) as the current form (5ms)
  // so I think the overhead of launching the new global parallel kernels
  // is costly. This form is just running one team so effectively using
  // a single warp to process the cuts. I expect with a lot of parts this
  // might need changing.
  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
    policy_one_team(1, Kokkos::AUTO());
  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
    member_type member_type;
  Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {

    // min/max coordinate and total weight of part kk; layout of the view is
    // [mins | maxes | totals], each block current_concurrent_num_parts wide.
    mj_scalar_t min_coordinate =
      local_global_min_max_coord_total_weight(kk);
    mj_scalar_t max_coordinate =
      local_global_min_max_coord_total_weight(
        kk + current_concurrent_num_parts);
    mj_scalar_t global_total_weight =
      local_global_min_max_coord_total_weight(
        kk + current_concurrent_num_parts * 2);

    Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
      [=] (mj_part_t i) {
      // if left and right closest points are not set yet,
      // set it to the cut itself.
      if(min_coordinate -
        current_global_left_closest_points(i) > local_sEpsilon) {
        current_global_left_closest_points(i) =
          current_cut_coordinates(i);
      }
      if(current_global_right_closest_points(i) -
        max_coordinate > local_sEpsilon) {
        current_global_right_closest_points(i) =
          current_cut_coordinates(i);
      }
    });
    team_member.team_barrier(); // for end of Kokkos::TeamThreadRange

    // Main bound-tightening pass: one thread per cut.
    Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
      [=] (mj_part_t i) {
      using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
        mj_node_t>;
      // seen weight in the part
      mj_scalar_t seen_weight_in_part = 0;
      // expected weight for part.
      mj_scalar_t expected_weight_in_part = 0;
      // imbalance for the left and right side of the cut.
      double imbalance_on_left = 0, imbalance_on_right = 0;
      if(local_distribute_points_on_cut_lines) {
        // init the weight on the cut.
        local_global_rectilinear_cut_weight(i) = 0;
        local_process_rectilinear_cut_weight(i) = 0;
      }
      bool bContinue = false;
      // if already determined at previous iterations,
      // then just write the coordinate to new array, and proceed.
      if(current_cut_line_determined(i)) {
        new_current_cut_coordinates(i) =
          current_cut_coordinates(i);
        bContinue = true;
      }
      if(!bContinue) {
        //current weight of the part at the left of the cut line.
        seen_weight_in_part = current_global_part_weights(i * 2);

        //expected ratio
        expected_weight_in_part = current_part_target_weights(i);

        //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
        imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
          expected_weight_in_part);
        // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
        // globalTotalWeight, 1 - expected);
        imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
          seen_weight_in_part, global_total_weight - expected_weight_in_part);
        bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
          used_imbalance_tolerance < local_sEpsilon ;
        bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
          used_imbalance_tolerance < local_sEpsilon;
        //if the cut line reaches to desired imbalance.
        if(is_left_imbalance_valid && is_right_imbalance_valid) {
          current_cut_line_determined(i) = true;
          Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
          new_current_cut_coordinates(i) = current_cut_coordinates(i);
        }
        else if(imbalance_on_left < 0) {
          //if left imbalance < 0 then we need to move the cut to right.
          if(local_distribute_points_on_cut_lines) {
            // if it is okay to distribute the coordinate on
            // the same coordinate to left and right.
            // then check if we can reach to the target weight by including the
            // coordinates in the part.
            // Note: weights(i*2) excludes, weights(i*2+1) includes the weight
            // sitting exactly on cut i.
            if(current_global_part_weights(i * 2 + 1) ==
              expected_weight_in_part) {
              // if it is we are done.
              current_cut_line_determined(i) = true;
              Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);

              //then assign everything on the cut to the left of the cut.
              new_current_cut_coordinates(i) =
                current_cut_coordinates(i);
              //for this cut all the weight on cut will be put to left.
              current_part_cut_line_weight_to_put_left(i) =
                current_local_part_weights(i * 2 + 1) -
                current_local_part_weights(i * 2);
              bContinue = true;
            }
            else if(current_global_part_weights(i * 2 + 1) >
              expected_weight_in_part) {
              // if the weight is larger than the expected weight,
              // then we need to distribute some points to left, some to right.
              current_cut_line_determined(i) = true;
              Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);

              // increase the num cuts to be determined with rectilinear
              // partitioning.
              Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
              new_current_cut_coordinates(i) =
                current_cut_coordinates(i);
              local_process_rectilinear_cut_weight[i] =
                current_local_part_weights(i * 2 + 1) -
                current_local_part_weights(i * 2);
              bContinue = true;
            }
          }

          if(!bContinue) {

            // we need to move further right,so set lower bound to current line,
            // and shift it to the closes point from right.
            current_cut_lower_bounds(i) =
              current_global_right_closest_points(i);

            //set the lower bound weight to the weight we have seen.
            current_cut_lower_bound_weights(i) = seen_weight_in_part;

            // compare the upper bound with what has been found in the
            // last iteration.
            // we try to make more strict bounds for the cut here.
            for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
              mj_scalar_t p_weight = current_global_part_weights(ii * 2);
              mj_scalar_t line_weight =
                current_global_part_weights(ii * 2 + 1);
              if(p_weight >= expected_weight_in_part) {
                // if a cut on the right has the expected weight, then we found
                // our cut position. Set up and low coordiantes to this
                // new cut coordinate, but we need one more iteration to
                // finalize the cut position, as wee need to update the part ids.
                if(p_weight == expected_weight_in_part) {
                  current_cut_upper_bounds(i) =
                    current_cut_coordinates(ii);
                  current_cut_upper_weights(i) = p_weight;
                  current_cut_lower_bounds(i) =
                    current_cut_coordinates(ii);
                  current_cut_lower_bound_weights(i) = p_weight;
                } else if(p_weight < current_cut_upper_weights(i)) {
                  // if a part weight is larger then my expected weight,
                  // but lower than my upper bound weight, update upper bound.
                  current_cut_upper_bounds(i) =
                    current_global_left_closest_points(ii);
                  current_cut_upper_weights(i) = p_weight;
                }
                break;
              }
              // if comes here then pw < ew
              // then compare the weight against line weight.
              if(line_weight >= expected_weight_in_part) {
                // if the line is larger than the expected weight, then we need
                // to reach to the balance by distributing coordinates on
                // this line.
                current_cut_upper_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_upper_weights(i) = line_weight;
                current_cut_lower_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_lower_bound_weights(i) = p_weight;
                break;
              }
              // if a stricter lower bound is found,
              // update the lower bound.
              if(p_weight <= expected_weight_in_part && p_weight >=
                current_cut_lower_bound_weights(i)) {
                current_cut_lower_bounds(i) =
                  current_global_right_closest_points(ii);
                current_cut_lower_bound_weights(i) = p_weight;
              }
            }

            mj_scalar_t new_cut_position = 0;
            algMJ_t::mj_calculate_new_cut_position(
              current_cut_upper_bounds(i),
              current_cut_lower_bounds(i),
              current_cut_upper_weights(i),
              current_cut_lower_bound_weights(i),
              expected_weight_in_part, new_cut_position,
              _sEpsilon);

            // if cut line does not move significantly.
            // then finalize the search.
            if(std::abs(current_cut_coordinates(i) -
              new_cut_position) < local_sEpsilon) {
              current_cut_line_determined(i) = true;
              Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);

              //set the cut coordinate and proceed.
              new_current_cut_coordinates(i) =
                current_cut_coordinates(i);
            } else {
              new_current_cut_coordinates(i) = new_cut_position;
            }
          } // bContinue
        } else {
          // need to move the cut line to left.
          // set upper bound to current line.
          current_cut_upper_bounds(i) =
            current_global_left_closest_points(i);
          current_cut_upper_weights(i) =
            seen_weight_in_part;
          // compare the current cut line weights with
          // previous upper and lower bounds.
          for(int ii = i - 1; ii >= 0; --ii) {
            mj_scalar_t p_weight =
              current_global_part_weights(ii * 2);
            mj_scalar_t line_weight =
              current_global_part_weights(ii * 2 + 1);
            if(p_weight <= expected_weight_in_part) {
              if(p_weight == expected_weight_in_part) {
                // if the weight of the part is my expected weight
                // then we find the solution.
                current_cut_upper_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_upper_weights(i) = p_weight;
                current_cut_lower_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_lower_bound_weights(i) = p_weight;
              }
              else if(p_weight > current_cut_lower_bound_weights(i)) {
                // if found weight is bigger than the lower bound
                // then update the lower bound.
                current_cut_lower_bounds(i) =
                  current_global_right_closest_points(ii);
                current_cut_lower_bound_weights(i) = p_weight;

                // at the same time, if weight of line is bigger than the
                // expected weight, then update the upper bound as well.
                // in this case the balance will be obtained by distributing
                // weights on this cut position.
                if(line_weight > expected_weight_in_part) {
                  current_cut_upper_bounds(i) =
                    current_global_right_closest_points(ii);
                  current_cut_upper_weights(i) = line_weight;
                }
              }
              break;
            }
            // if the weight of the cut on the left is still bigger than
            // my weight, and also if the weight is smaller than the current
            // upper weight, or if the weight is equal to current upper
            // weight, but on the left of the upper weight, then update
            // upper bound.
            if(p_weight >= expected_weight_in_part &&
              (p_weight < current_cut_upper_weights(i) ||
              (p_weight == current_cut_upper_weights(i) &&
              current_cut_upper_bounds(i) >
              current_global_left_closest_points(ii)))) {
              current_cut_upper_bounds(i) =
                current_global_left_closest_points(ii);
              current_cut_upper_weights(i) = p_weight;
            }
          }
          mj_scalar_t new_cut_position = 0;
          algMJ_t::mj_calculate_new_cut_position(
            current_cut_upper_bounds(i),
            current_cut_lower_bounds(i),
            current_cut_upper_weights(i),
            current_cut_lower_bound_weights(i),
            expected_weight_in_part,
            new_cut_position,
            _sEpsilon);

          // if cut line does not move significantly.
          if(std::abs(current_cut_coordinates(i) -
            new_cut_position) < local_sEpsilon) {
            current_cut_line_determined(i) = true;
            Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
            //set the cut coordinate and proceed.
            new_current_cut_coordinates(i) =
              current_cut_coordinates(i);
          } else {
            new_current_cut_coordinates(i) =
              new_cut_position;
          }
        }
      }; // bContinue
    });

    team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
  });

  // view_rectilinear_cut_count
  // Read the device-resident counter back to the host via a trivial
  // single-iteration reduce (avoids a separate mirror/deep_copy).
  mj_part_t rectilinear_cut_count;
  Kokkos::parallel_reduce("Read bDoingWork",
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
    KOKKOS_LAMBDA(int dummy, int & set_single) {
    set_single = view_rectilinear_cut_count(0);
  }, rectilinear_cut_count);

  if(rectilinear_cut_count > 0) {
    // Some cuts coincide with heavy coordinate lines: scan the per-rank
    // on-cut weights so each rank knows how much of the line weight the
    // ranks up to and including itself contribute.
    auto host_local_process_rectilinear_cut_weight =
      Kokkos::create_mirror_view(Kokkos::HostSpace(),
        local_process_rectilinear_cut_weight);
    auto host_local_global_rectilinear_cut_weight =
      Kokkos::create_mirror_view(Kokkos::HostSpace(),
        local_global_rectilinear_cut_weight);
    Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
      local_process_rectilinear_cut_weight);
    Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
      local_global_rectilinear_cut_weight);
    Teuchos::scan<int,mj_scalar_t>(
      *comm, Teuchos::REDUCE_SUM,
      num_cuts,
      host_local_process_rectilinear_cut_weight.data(),
      host_local_global_rectilinear_cut_weight.data());
    Kokkos::deep_copy(local_process_rectilinear_cut_weight,
      host_local_process_rectilinear_cut_weight);
    Kokkos::deep_copy(local_global_rectilinear_cut_weight,
      host_local_global_rectilinear_cut_weight);

    Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
      KOKKOS_LAMBDA(int dummy) {
      for(mj_part_t i = 0; i < num_cuts; ++i) {
        // if cut line weight to be distributed.
        if(local_global_rectilinear_cut_weight(i) > 0) {
          // expected weight to go to left of the cut.
          mj_scalar_t expected_part_weight = current_part_target_weights(i);
          // the weight that should be put to left of the cut.
          mj_scalar_t necessary_weight_on_line_for_left =
            expected_part_weight - current_global_part_weights(i * 2);

          // the weight of the cut in the process
          mj_scalar_t my_weight_on_line =
            local_process_rectilinear_cut_weight(i);

          // the sum of the cut weights upto this process,
          // including the weight of this process.
          mj_scalar_t weight_on_line_upto_process_inclusive =
            local_global_rectilinear_cut_weight(i);
          // the space on the left side of the cut after all processes
          // before this process (including this process)
          // puts their weights on cut to left.
          mj_scalar_t space_to_put_left =
            necessary_weight_on_line_for_left -
            weight_on_line_upto_process_inclusive;
          // add my weight to this space to find out how much space
          // is left to me.
          mj_scalar_t space_left_to_me =
            space_to_put_left + my_weight_on_line;

          /*
          cout << "expected_part_weight:" << expected_part_weight
            << " necessary_weight_on_line_for_left:"
            << necessary_weight_on_line_for_left
            << " my_weight_on_line" << my_weight_on_line
            << " weight_on_line_upto_process_inclusive:"
            << weight_on_line_upto_process_inclusive
            << " space_to_put_left:" << space_to_put_left
            << " space_left_to_me" << space_left_to_me << endl;
          */

          if(space_left_to_me < 0) {
            // space_left_to_me is negative and i dont need to put
            // anything to left.
            current_part_cut_line_weight_to_put_left(i) = 0;
          }
          else if(space_left_to_me >= my_weight_on_line) {
            // space left to me is bigger than the weight of the
            // processor on cut.
            // so put everything to left.
            current_part_cut_line_weight_to_put_left(i) =
              my_weight_on_line;
            // cout << "setting current_part_cut_line_weight_to_put_left
            // to my_weight_on_line:" << my_weight_on_line << endl;
          }
          else {
            // put only the weight as much as the space.
            current_part_cut_line_weight_to_put_left(i) =
              space_left_to_me;
            // cout << "setting current_part_cut_line_weight_to_put_left
            // to space_left_to_me:" << space_left_to_me << endl;
          }
        }
      }
      view_rectilinear_cut_count(0) = 0;
    });
  }

  // Return the updated incomplete-cut counters to the host copy so the
  // caller can test for convergence.
  Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
}
5522
5532template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5533 typename mj_part_t, typename mj_node_t>
5534void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5535 get_processor_num_points_in_parts(
5536 mj_part_t num_procs,
5537 mj_part_t num_parts,
5538 mj_gno_t *&num_points_in_all_processor_parts)
5539{
5540 // initially allocation_size is num_parts
5541 size_t allocation_size = num_parts * (num_procs + 1);
5542
5543 // this will be output
5544 // holds how many each processor has in each part.
5545 // last portion is the sum of all processor points in each part.
5546
5547 // allocate memory for the local num coordinates in each part.
5548 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5549 new mj_gno_t[allocation_size];
5550
5551 // this is the portion of the memory which will be used
5552 // at the summation to obtain total number of processors' points in each part.
5553 mj_gno_t *my_local_points_to_reduce_sum =
5554 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5555
5556 // this is the portion of the memory where each stores its local number.
5557 // this information is needed by other processors.
5558 mj_gno_t *my_local_point_counts_in_each_part =
5559 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5560
5561 // initialize the array with 0's.
5562 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5563 sizeof(mj_gno_t)*allocation_size);
5564
5565 auto local_new_part_xadj = this->new_part_xadj;
5566 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5567 Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5568 Kokkos::parallel_for("get vals on device",
5569 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5570 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5571 points_per_part(i) =
5572 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5573 });
5574 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5575 Kokkos::deep_copy(host_points_per_part, points_per_part);
5576 for(int i = 0; i < num_parts; ++i) {
5577 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5578 }
5579
5580 // copy the local num parts to the last portion of array, so that this portion
5581 // will represent the global num points in each part after the reduction.
5582 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5583 sizeof(mj_gno_t) * (num_parts) );
5584
5585 // reduceAll operation.
5586 // the portion that belongs to a processor with index p
5587 // will start from myRank * num_parts.
5588 // the global number of points will be held at the index
5589 try{
5590 reduceAll<int, mj_gno_t>(
5591 *(this->comm),
5592 Teuchos::REDUCE_SUM,
5593 allocation_size,
5594 num_local_points_in_each_part_to_reduce_sum,
5595 num_points_in_all_processor_parts);
5596 }
5597 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5598
5599 delete [] num_local_points_in_each_part_to_reduce_sum;
5600}
5601
5617template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5618 typename mj_part_t, typename mj_node_t>
5619bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5620 mj_check_to_migrate(
5621 size_t migration_reduce_all_population,
5622 mj_lno_t num_coords_for_last_dim_part,
5623 mj_part_t num_procs,
5624 mj_part_t num_parts,
5625 mj_gno_t *num_points_in_all_processor_parts)
5626{
5627 // if reduce all count and population in the last dim is too high
5628 if(migration_reduce_all_population > future_reduceall_cutoff) {
5629 return true;
5630 }
5631
5632 // if the work in a part per processor in the last dim is too low.
5633 if(num_coords_for_last_dim_part < min_work_last_dim) {
5634 return true;
5635 }
5636
5637 // if migration is to be checked and the imbalance is too high
5638 if(this->check_migrate_avoid_migration_option == 0) {
5639 double global_imbalance = 0;
5640 // global shift to reach the sum of coordiante count in each part.
5641 size_t global_shift = num_procs * num_parts;
5642
5643 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5644 for(mj_part_t i = 0; i < num_parts; ++i) {
5645 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5646 / double(num_procs);
5647
5648 global_imbalance += std::abs(ideal_num -
5649 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5650 }
5651 }
5652 global_imbalance /= num_parts;
5653 global_imbalance /= num_procs;
5654
5655 if(global_imbalance <= this->minimum_migration_imbalance) {
5656 return false;
5657 }
5658 else {
5659 return true;
5660 }
5661 }
5662 else {
5663 // if migration is forced
5664 return true;
5665 }
5666}
5667
5681template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5682 typename mj_part_t, typename mj_node_t>
5683void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5684 assign_send_destinations(
5685 mj_part_t num_parts,
5686 mj_part_t *part_assignment_proc_begin_indices,
5687 mj_part_t *processor_chains_in_parts,
5688 mj_lno_t *send_count_to_each_proc,
5689 int *coordinate_destinations) {
5690
5691 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5692 deep_copy(host_new_part_xadj, this->new_part_xadj);
5693
5694 auto host_new_coordinate_permutations =
5695 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5696 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5697
5698 for(mj_part_t p = 0; p < num_parts; ++p) {
5699 mj_lno_t part_begin = 0;
5700 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5701 mj_lno_t part_end = host_new_part_xadj(p);
5702 // get the first part that current processor will send its part-p.
5703 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5704 // initialize how many point I sent to this processor.
5705 mj_lno_t num_total_send = 0;
5706 for(mj_lno_t j=part_begin; j < part_end; j++) {
5707 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5708 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5709 // then get the next processor to send the points in part p.
5710 num_total_send = 0;
5711 // assign new processor to part_assign_begin[p]
5712 part_assignment_proc_begin_indices[p] =
5713 processor_chains_in_parts[proc_to_sent];
5714 // remove the previous processor
5715 processor_chains_in_parts[proc_to_sent] = -1;
5716 // choose the next processor as the next one to send.
5717 proc_to_sent = part_assignment_proc_begin_indices[p];
5718 }
5719 // write the gno index to corresponding position in sendBuf.
5720 coordinate_destinations[local_ind] = proc_to_sent;
5721 ++num_total_send;
5722 }
5723 }
5724}
5725
5746template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5747 typename mj_part_t, typename mj_node_t>
5748void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5749 mj_assign_proc_to_parts(
5750 mj_gno_t * num_points_in_all_processor_parts,
5751 mj_part_t num_parts,
5752 mj_part_t num_procs,
5753 mj_lno_t *send_count_to_each_proc,
5754 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5755 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5756 mj_part_t &out_part_index,
5757 mj_part_t &output_part_numbering_begin_index,
5758 int * coordinate_destinations) {
5759 mj_gno_t *global_num_points_in_parts =
5760 num_points_in_all_processor_parts + num_procs * num_parts;
5761 mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5762
5763 // boolean variable if the process finds its part to be assigned.
5764 bool did_i_find_my_group = false;
5765
5766 mj_part_t num_free_procs = num_procs;
5767 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5768
5769 double max_imbalance_difference = 0;
5770 mj_part_t max_differing_part = 0;
5771
5772 // find how many processor each part requires.
5773 for(mj_part_t i = 0; i < num_parts; i++) {
5774
5775 // scalar portion of the required processors
5776 double scalar_required_proc = num_procs *
5777 (double (global_num_points_in_parts[i]) /
5778 double (this->num_global_coords));
5779
5780 // round it to closest integer; make sure have at least one proc.
5781 mj_part_t required_proc =
5782 static_cast<mj_part_t> (0.5 + scalar_required_proc);
5783 if(required_proc == 0) required_proc = 1;
5784
5785 // if assigning the required num procs, creates problems for the rest
5786 // of the parts, then only assign {num_free_procs -
5787 // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5788 if(num_free_procs -
5789 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5790 required_proc = num_free_procs -
5791 (minimum_num_procs_required_for_rest_of_parts);
5792 }
5793
5794 // reduce the free processor count
5795 num_free_procs -= required_proc;
5796
5797 // reduce the free minimum processor count required for the rest of the
5798 // part by 1.
5799 --minimum_num_procs_required_for_rest_of_parts;
5800
5801 // part (i) is assigned to (required_proc) processors.
5802 num_procs_assigned_to_each_part[i] = required_proc;
5803
5804 // because of the roundings some processors might be left as unassigned.
5805 // we want to assign those processors to the part with most imbalance.
5806 // find the part with the maximum imbalance here.
5807 double imbalance_wrt_ideal =
5808 (scalar_required_proc - required_proc) / required_proc;
5809 if(imbalance_wrt_ideal > max_imbalance_difference) {
5810 max_imbalance_difference = imbalance_wrt_ideal;
5811 max_differing_part = i;
5812 }
5813 }
5814
5815 // assign extra processors to the part with maximum imbalance
5816 // than the ideal.
5817 if(num_free_procs > 0) {
5818 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5819 }
5820
5821 // now find what are the best processors with least migration for each part.
5822
5823 // part_assignment_proc_begin_indices ([i]) is the array that holds the
5824 // beginning index of a processor that processor sends its data for part - i
5825 mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5826
5827 // the next processor send is found in processor_chains_in_parts,
5828 // in linked list manner.
5829 mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5830 mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5831
5832 // initialize the assignment of each processor.
5833 // this has a linked list implementation.
5834 // the beginning of processors assigned
5835 // to each part is hold at part_assignment_proc_begin_indices[part].
5836 // then the next processor assigned to that part is located at
5837 // proc_part_assignments[part_assign_begins[part]], this is a chain
5838 // until the value of -1 is reached.
5839 for(int i = 0; i < num_procs; ++i ) {
5840 processor_part_assignments[i] = -1;
5841 processor_chains_in_parts[i] = -1;
5842 }
5843 for(int i = 0; i < num_parts; ++i ) {
5844 part_assignment_proc_begin_indices[i] = -1;
5845 }
5846
5847 // std::cout << "Before migration: mig type:" <<
5848 // this->migration_type << std::endl;
5849 // Allocate memory for sorting data structure.
5850 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5851 sort_item_num_part_points_in_procs =
5852 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5853
5854 for(mj_part_t i = 0; i < num_parts; ++i) {
5855 // the algorithm tries to minimize the cost of migration, by assigning the
5856 // processors with highest number of coordinates on that part.
5857 // here we might want to implement a maximum weighted bipartite matching
5858 // algorithm.
5859 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5860 sort_item_num_part_points_in_procs[ii].id = ii;
5861 // if processor is not assigned yet.
5862 // add its num points to the sort data structure.
5863 if(processor_part_assignments[ii] == -1) {
5864 sort_item_num_part_points_in_procs[ii].val =
5865 num_points_in_all_processor_parts[ii * num_parts + i];
5866 // indicate that the processor has positive weight.
5867 sort_item_num_part_points_in_procs[ii].signbit = 1;
5868 }
5869 else {
5870 // if processor is already assigned, insert -nLocal - 1 so that it
5871 // won't be selected again.
5872 // would be same if we simply set it to -1, but more information with
5873 // no extra cost (which is used later) is provided.
5874 // sort_item_num_part_points_in_procs[ii].val =
5875 // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5876
5877 // UPDATE: Since above gets warning when unsigned is used to
5878 // represent, we added extra bit to as sign bit to the sort item.
5879 // It is 1 for positives, 0 for negatives.
5880 sort_item_num_part_points_in_procs[ii].val =
5881 num_points_in_all_processor_parts[ii * num_parts + i];
5882 sort_item_num_part_points_in_procs[ii].signbit = 0;
5883 }
5884 }
5885
5886 // sort the processors in the part.
5887 uqSignsort<mj_part_t, mj_gno_t,char>
5888 (num_procs, sort_item_num_part_points_in_procs);
5889
5890 /*
5891 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5892 std::cout << "ii:" << ii << " " <<
5893 sort_item_num_part_points_in_procs[ii].id <<
5894 " " << sort_item_num_part_points_in_procs[ii].val <<
5895 " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5896 std::endl;
5897 }
5898 */
5899
5900 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5901 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5902 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5903 ceil(total_num_points_in_part / double (required_proc_count)));
5904
5905 // starts sending to least heaviest part.
5906 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5907 mj_part_t next_proc_to_send_id =
5908 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5909 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5910 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5911
5912 // find the processors that will be assigned to this part, which are the
5913 // heaviest non assigned processors.
5914 for(mj_part_t ii = num_procs - 1;
5915 ii >= num_procs - required_proc_count; --ii) {
5916 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5917 // assign processor to part - i.
5918 processor_part_assignments[proc_id] = i;
5919 }
5920
5921 bool did_change_sign = false;
5922 // if processor has a minus count, reverse it.
5923 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5924 // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5925 // TODO: SEE BUG 6194
5926 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5927 did_change_sign = true;
5928 sort_item_num_part_points_in_procs[ii].signbit = 1;
5929 }
5930 else {
5931 break;
5932 }
5933 }
5934
5935 if(did_change_sign) {
5936 // resort the processors in the part for the rest of the processors that
5937 // is not assigned.
5938 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5939 sort_item_num_part_points_in_procs);
5940 }
5941
5942 /*
5943 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5944 std::cout << "after resort ii:" << ii << " " <<
5945 sort_item_num_part_points_in_procs[ii].id <<
5946 " " << sort_item_num_part_points_in_procs[ii].val <<
5947 " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5948 std::endl;
5949 }
5950 */
5951
5952 // check if this processors is one of the procs assigned to this part.
5953 // if it is, then get the group.
5954 if(!did_i_find_my_group) {
5955 for(mj_part_t ii = num_procs - 1; ii >=
5956 num_procs - required_proc_count; --ii) {
5957
5958 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5959
5960 // add the proc to the group.
5961 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5962
5963 if(proc_id_to_assign == this->myRank) {
5964 // if the assigned process is me, then I find my group.
5965 did_i_find_my_group = true;
5966
5967 // set the beginning of part i to my rank.
5968 part_assignment_proc_begin_indices[i] = this->myRank;
5969 processor_chains_in_parts[this->myRank] = -1;
5970
5971 // set send count to myself to the number of points that I have
5972 // in part i.
5973 send_count_to_each_proc[this->myRank] =
5974 sort_item_num_part_points_in_procs[ii].val;
5975
5976 // calculate the shift required for the
5977 // output_part_numbering_begin_index
5978 for(mj_part_t in = 0; in < i; ++in) {
5979 output_part_numbering_begin_index +=
5980 (*next_future_num_parts_in_parts)[in];
5981 }
5982 out_part_index = i;
5983 }
5984 }
5985
5986 // if these was not my group,
5987 // clear the subcomminicator processor array.
5988 if(!did_i_find_my_group) {
5989 processor_ranks_for_subcomm.clear();
5990 }
5991 }
5992
5993 // send points of the nonassigned coordinates to the assigned coordinates.
5994 // starts from the heaviest nonassigned processor.
5995 // TODO we might want to play with this part, that allows more
5996 // computational imbalance but having better communication balance.
5997 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5998 mj_part_t nonassigned_proc_id =
5999 sort_item_num_part_points_in_procs[ii].id;
6000 mj_lno_t num_points_to_sent =
6001 sort_item_num_part_points_in_procs[ii].val;
6002
6003 // we set number of points to -to_sent - 1 for the assigned processors.
6004 // we reverse it here. This should not happen, as we have already
6005 // reversed them above.
6006#ifdef MJ_DEBUG
6007 if(num_points_to_sent < 0) {
6008 cout << "Migration - processor assignments - for part:" << i
6009 << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
6010 << num_points_to_sent << std::endl;
6011 std::terminate();
6012 }
6013#endif
6014
6015 switch (migration_type) {
6016 case 0:
6017 {
6018 // now sends the points to the assigned processors.
6019 while (num_points_to_sent > 0) {
6020 // if the processor has enough space.
6021 if(num_points_to_sent <= space_left_in_sent_proc) {
6022 // reduce the space left in the processor.
6023 space_left_in_sent_proc -= num_points_to_sent;
6024 // if my rank is the one that is sending the coordinates.
6025 if(this->myRank == nonassigned_proc_id) {
6026 // set my sent count to the sent processor.
6027 send_count_to_each_proc[next_proc_to_send_id] =
6028 num_points_to_sent;
6029 // save the processor in the list (processor_chains_in_parts
6030 // and part_assignment_proc_begin_indices)
6031 // that the processor will send its point in part-i.
6032 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6033 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6034 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6035 }
6036 num_points_to_sent = 0;
6037 }
6038 else {
6039 // there might be no space left in the processor.
6040 if(space_left_in_sent_proc > 0) {
6041 num_points_to_sent -= space_left_in_sent_proc;
6042
6043 //send as the space left in the processor.
6044 if(this->myRank == nonassigned_proc_id) {
6045 // send as much as the space in this case.
6046 send_count_to_each_proc[next_proc_to_send_id] =
6047 space_left_in_sent_proc;
6048 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6049 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6050 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6051 }
6052 }
6053 // change the sent part
6054 ++next_proc_to_send_index;
6055
6056#ifdef MJ_DEBUG
6057 if(next_part_to_send_index < nprocs - required_proc_count ) {
6058 cout << "Migration - processor assignments - for part:"
6059 << i
6060 << " next_part_to_send :" << next_part_to_send_index
6061 << " nprocs:" << nprocs
6062 << " required_proc_count:" << required_proc_count
6063 << " Error: next_part_to_send_index <" <<
6064 << " nprocs - required_proc_count" << std::endl;
6065 std::terminate();
6066 }
6067#endif
6068 // send the new id.
6069 next_proc_to_send_id =
6070 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6071 // set the new space in the processor.
6072 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6073 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6074 }
6075 }
6076 }
6077 break;
6078 default:
6079 {
6080 // to minimize messages, we want each processor to send its
6081 // coordinates to only a single point.
6082 // we do not respect imbalances here, we send all points to the
6083 // next processor.
6084 if(this->myRank == nonassigned_proc_id) {
6085 // set my sent count to the sent processor.
6086 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6087 // save the processor in the list (processor_chains_in_parts and
6088 // part_assignment_proc_begin_indices)
6089 // that the processor will send its point in part-i.
6090 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6091 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6092 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6093 }
6094 num_points_to_sent = 0;
6095 ++next_proc_to_send_index;
6096
6097 // if we made it to the heaviest processor we round robin and
6098 // go to beginning
6099 if(next_proc_to_send_index == num_procs) {
6100 next_proc_to_send_index = num_procs - required_proc_count;
6101 }
6102 // send the new id.
6103 next_proc_to_send_id =
6104 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6105 // set the new space in the processor.
6106 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6107 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6108 }
6109 }
6110 }
6111 }
6112
6113 /*
6114 for(int i = 0; i < num_procs;++i) {
6115 std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6116 send_count_to_each_proc[i] << std::endl;
6117 }
6118 */
6119
6120 this->assign_send_destinations(
6121 num_parts,
6122 part_assignment_proc_begin_indices,
6123 processor_chains_in_parts,
6124 send_count_to_each_proc,
6125 coordinate_destinations);
6126 delete [] part_assignment_proc_begin_indices;
6127 delete [] processor_chains_in_parts;
6128 delete [] processor_part_assignments;
6129 delete [] sort_item_num_part_points_in_procs;
6130 delete [] num_procs_assigned_to_each_part;
6131}
6132
6148template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6149 typename mj_part_t, typename mj_node_t>
6150void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6151 assign_send_destinations2(
6152 mj_part_t num_parts,
6153 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6154 int *coordinate_destinations,
6155 mj_part_t &output_part_numbering_begin_index,
6156 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6157{
6158 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6159 mj_part_t previous_processor = -1;
6160
6161 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6162 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6163
6164 auto local_new_coordinate_permutations =
6165 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6166 Kokkos::deep_copy(local_new_coordinate_permutations,
6167 this->new_coordinate_permutations);
6168
6169 for(mj_part_t i = 0; i < num_parts; ++i) {
6170 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6171
6172 // assigned processors are sorted.
6173 mj_lno_t part_begin_index = 0;
6174
6175 if(p > 0) {
6176 part_begin_index = local_new_part_xadj(p - 1);
6177 }
6178
6179 mj_lno_t part_end_index = local_new_part_xadj(p);
6180
6181 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6182 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6183 output_part_numbering_begin_index = part_shift_amount;
6184 }
6185 previous_processor = assigned_proc;
6186 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6187
6188 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6189 mj_lno_t localInd = local_new_coordinate_permutations(j);
6190 coordinate_destinations[localInd] = assigned_proc;
6191 }
6192 }
6193}
6194
// Assigns one or more whole parts to each processor when
// num_parts >= num_procs. Parts are distributed greedily, heaviest part
// first, each to the processor with the most remaining capacity; ties are
// broken toward the processor that already owns the most coordinates of
// that part (to minimize migration), then toward the lower rank (for
// reproducibility). Every processor is forced to receive at least one
// part. Outputs: send_count_to_each_proc (incremented), out_num_part and
// out_part_indices for this rank, output_part_numbering_begin_index, and
// per-coordinate destination ranks via assign_send_destinations2.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_assign_parts_to_procs(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_num_part,
    std::vector<mj_part_t> &out_part_indices,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations) {

  out_num_part = 0;
  // the global (summed over processors) part counts are appended after the
  // num_procs * num_parts per-processor counts.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * num_parts;
  out_part_indices.clear();

  // to sort the parts that is assigned to the processors.
  // id is the part number, sort value is the assigned processor id.
  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
    new uSortItem<mj_part_t, mj_part_t>[num_parts];
  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
    new uSortItem<mj_part_t, mj_gno_t>[num_procs];

  // calculate the optimal number of coordinates that should be assigned
  // to each processor.
  mj_lno_t work_each =
    mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);

  // to hold the left space as the number of coordinates to the optimal
  // number in each proc.
  mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];

  // initialize left space in each.
  for(mj_part_t i = 0; i < num_procs; ++i) {
    space_in_each_processor[i] = work_each;
  }

  // we keep track of how many parts each processor is assigned to.
  // because in some weird inputs, it might be possible that some
  // processors is not assigned to any part. Using these variables,
  // we force each processor to have at least one part.
  mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
  int empty_proc_count = num_procs;

  // to sort the parts with decreasing order of their coordiantes.
  // id are the part numbers, sort value is the number of points in each.
  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
    new uSortItem<mj_part_t, mj_gno_t>[num_parts];

  // initially we will sort the parts according to the number of coordinates
  // they have, so that we will start assigning with the part that has the most
  // number of coordinates.
  for(mj_part_t i = 0; i < num_parts; ++i) {
    sort_item_point_counts_in_parts[i].id = i;
    sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
  }

  // sort parts with increasing order of loads.
  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);

  // assigning parts to the processors
  // traverse the part with decreasing order of load.
  // first assign the heaviest part.
  for(mj_part_t j = 0; j < num_parts; ++j) {
    // sorted with increasing order, traverse inverse.
    mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;

    // load of the part
    mj_gno_t load = global_num_points_in_parts[i];

    // assigned processors
    mj_part_t assigned_proc = -1;

    // sort processors with increasing number of points in this part.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      sort_item_num_points_of_proc_in_part_i[ii].id = ii;

      // if there are still enough parts to fill empty processors, than proceed
      // normally, but if empty processor count is equal to the number of part,
      // then we force to part assignments only to empty processors.
      if(empty_proc_count < num_parts - j ||
        num_parts_proc_assigned[ii] == 0) {
        // how many points processor ii has in part i?
        sort_item_num_points_of_proc_in_part_i[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
      }
      else {
        // mark already-loaded processors ineligible so that each remaining
        // part goes to a still-empty processor.
        sort_item_num_points_of_proc_in_part_i[ii].val = -1;
      }
    }

    uqsort<mj_part_t, mj_gno_t>(num_procs,
      sort_item_num_points_of_proc_in_part_i);

    // traverse all processors with decreasing load.
    for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
      mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
      if(assigned_proc == -1 ||
        (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
        assigned_proc = ii;
      }
      else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
        if(ii < assigned_proc) {
          // ties go to lower proc
          // not necessary for a valid result but allows testing to compare
          // MPI results and have parts numbers assigned to the same boxes.
          // We don't break here because we may have more ties still to check.
          // The indeterminate state before this is due to Cuda using
          // atomics to refill the permutation array. So non-cuda runs don't
          // actualy need this since they will always have the same pattern.
          assigned_proc = ii;
        }
      }
      else {
        break; // now we can break - we have our part and no more ties.
      }
    }

    // first part landing on this processor: it is no longer empty.
    if(num_parts_proc_assigned[assigned_proc]++ == 0) {
      --empty_proc_count;
    }

    space_in_each_processor[assigned_proc] -= load;
    //to sort later, part-i is assigned to the proccessor - assignment.
    sort_item_part_to_proc_assignment[j].id = i; //part i

    // assigned to processor - assignment.
    sort_item_part_to_proc_assignment[j].val = assigned_proc;

    // if assigned processor is me, increase the number.
    if(assigned_proc == this->myRank) {
      out_num_part++;//assigned_part_count;
      out_part_indices.push_back(i);
    }

    // increase the send to that processor by the number of points in that
    // part, as everyone send their coordiantes in this part to the
    // processor assigned to this part.
    send_count_to_each_proc[assigned_proc] +=
      num_points_in_all_processor_parts[this->myRank * num_parts + i];
  }

  delete [] num_parts_proc_assigned;
  delete [] sort_item_num_points_of_proc_in_part_i;
  delete [] sort_item_point_counts_in_parts;
  delete [] space_in_each_processor;

  // sort assignments with respect to the assigned processors.
  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);

  // fill sendBuf.
  this->assign_send_destinations2(
    num_parts,
    sort_item_part_to_proc_assignment,
    coordinate_destinations,
    output_part_numbering_begin_index,
    next_future_num_parts_in_parts);

  delete [] sort_item_part_to_proc_assignment;
}
6380
6381
6405template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6406 typename mj_part_t, typename mj_node_t>
6407void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6408 mj_migration_part_proc_assignment(
6409 mj_gno_t * num_points_in_all_processor_parts,
6410 mj_part_t num_parts,
6411 mj_part_t num_procs,
6412 mj_lno_t *send_count_to_each_proc,
6413 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6414 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6415 mj_part_t &out_num_part,
6416 std::vector<mj_part_t> &out_part_indices,
6417 mj_part_t &output_part_numbering_begin_index,
6418 int *coordinate_destinations)
6419{
6420 processor_ranks_for_subcomm.clear();
6421 // if(this->num_local_coords > 0)
6422 if(num_procs > num_parts) {
6423 // if there are more processors than the number of current part
6424 // then processors share the existing parts.
6425 // at the end each processor will have a single part,
6426 // but a part will be shared by a group of processors.
6427 mj_part_t out_part_index = 0;
6428
6429 this->mj_assign_proc_to_parts(
6430 num_points_in_all_processor_parts,
6431 num_parts,
6432 num_procs,
6433 send_count_to_each_proc,
6434 processor_ranks_for_subcomm,
6435 next_future_num_parts_in_parts,
6436 out_part_index,
6437 output_part_numbering_begin_index,
6438 coordinate_destinations
6439 );
6440
6441 out_num_part = 1;
6442 out_part_indices.clear();
6443 out_part_indices.push_back(out_part_index);
6444 }
6445 else {
6446
6447 // there are more parts than the processors.
6448 // therefore a processor will be assigned multiple parts,
6449 // the subcommunicators will only have a single processor.
6450 processor_ranks_for_subcomm.push_back(this->myRank);
6451
6452 // since there are more parts then procs,
6453 // assign multiple parts to processors.
6454
6455 this->mj_assign_parts_to_procs(
6456 num_points_in_all_processor_parts,
6457 num_parts,
6458 num_procs,
6459 send_count_to_each_proc,
6460 next_future_num_parts_in_parts,
6461 out_num_part,
6462 out_part_indices,
6463 output_part_numbering_begin_index,
6464 coordinate_destinations);
6465 }
6466}
6467
6481template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6482 typename mj_part_t, typename mj_node_t>
6483void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6484 mj_migrate_coords(
6485 mj_part_t num_procs,
6486 mj_lno_t &num_new_local_points,
6487 std::string iteration,
6488 int *coordinate_destinations,
6489 mj_part_t num_parts)
6490{
6491
6492#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6493 if(sizeof(mj_lno_t) <= sizeof(int)) {
6494 // Cannot use Zoltan_Comm with local ordinals larger than ints.
6495 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6496 // may overflow.
6497 ZOLTAN_COMM_OBJ *plan = NULL;
6498 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6499 int num_incoming_gnos = 0;
6500 int message_tag = 7859;
6501
6502 this->mj_env->timerStart(MACRO_TIMERS,
6503 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6504 int ierr = Zoltan_Comm_Create(
6505 &plan,
6506 int(this->num_local_coords),
6507 coordinate_destinations,
6508 mpi_comm,
6509 message_tag,
6510 &num_incoming_gnos);
6511
6512 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6513 this->mj_env->timerStop(MACRO_TIMERS,
6514 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6515
6516 this->mj_env->timerStart(MACRO_TIMERS,
6517 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6518
6519 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6520
6521 // migrate gnos.
6522 {
6523 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6524 Kokkos::HostSpace(), this->current_mj_gnos);
6525 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6526 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6527 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6528 auto host_dst_gnos = Kokkos::create_mirror_view(
6529 Kokkos::HostSpace(), dst_gnos);
6530 message_tag++;
6531 ierr = Zoltan_Comm_Do(
6532 plan,
6533 message_tag,
6534 (char *) host_current_mj_gnos.data(),
6535 sizeof(mj_gno_t),
6536 (char *) host_dst_gnos.data());
6537 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6538 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6539 this->current_mj_gnos = dst_gnos;
6540 }
6541
6542 //migrate coordinates
6543 {
6544 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6545 auto host_src_coordinates = Kokkos::create_mirror_view(
6546 Kokkos::HostSpace(), this->mj_coordinates);
6547 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6548 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6549 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6550 num_incoming_gnos, this->coord_dim);
6551 auto host_dst_coordinates = Kokkos::create_mirror_view(
6552 Kokkos::HostSpace(), dst_coordinates);
6553 for(int i = 0; i < this->coord_dim; ++i) {
6554 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6555 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6556 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6557 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6558 // Note Layout Left means we can do these in contiguous blocks
6559 message_tag++;
6560 ierr = Zoltan_Comm_Do(
6561 plan,
6562 message_tag,
6563 (char *) sub_host_src_coordinates.data(),
6564 sizeof(mj_scalar_t),
6565 (char *) sub_host_dst_coordinates.data());
6566 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6567 }
6568 deep_copy(dst_coordinates, host_dst_coordinates);
6569 this->mj_coordinates = dst_coordinates;
6570 }
6571
6572 // migrate weights.
6573 {
6574 auto host_src_weights = Kokkos::create_mirror_view(
6575 Kokkos::HostSpace(), this->mj_weights);
6576 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6577 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6578 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6579 num_incoming_gnos, this->num_weights_per_coord);
6580 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6581 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6582 auto sub_host_src_weights
6583 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6584 auto sub_host_dst_weights
6585 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6586 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6587 // Copy because of layout
6588 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6589 sent_weight[n] = sub_host_src_weights(n);
6590 }
6591 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6592 message_tag++;
6593 ierr = Zoltan_Comm_Do(
6594 plan,
6595 message_tag,
6596 (char *) sent_weight.getRawPtr(),
6597 sizeof(mj_scalar_t),
6598 (char *) received_weight.getRawPtr());
6599 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6600 // Again we copy by index due to layout
6601 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6602 sub_host_dst_weights(n) = received_weight[n];
6603 }
6604 }
6605 deep_copy(dst_weights, host_dst_weights);
6606 this->mj_weights = dst_weights;
6607 }
6608
6609 // migrate owners.
6610 {
6611 // Note that owners we kept on Serial
6612 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6613 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6614 num_incoming_gnos);
6615 message_tag++;
6616 ierr = Zoltan_Comm_Do(
6617 plan,
6618 message_tag,
6619 (char *) owner_of_coordinate.data(),
6620 sizeof(int),
6621 (char *) dst_owners_of_coordinate.data());
6622 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6623 this->owner_of_coordinate = dst_owners_of_coordinate;
6624 }
6625
6626 // if num procs is less than num parts,
6627 // we need the part assigment arrays as well, since
6628 // there will be multiple parts in processor.
6629 {
6630 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6631 Kokkos::HostSpace(), this->assigned_part_ids);
6632 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6633 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6634 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6635 num_incoming_gnos);
6636 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6637 Kokkos::HostSpace(), dst_assigned_part_ids);
6638 mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6639 if(num_procs < num_parts) {
6640 message_tag++;
6641 ierr = Zoltan_Comm_Do(
6642 plan,
6643 message_tag,
6644 (char *) host_src_assigned_part_ids.data(),
6645 sizeof(mj_part_t),
6646 (char *) host_dst_assigned_part_ids.data());
6647 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6648 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6649 }
6650 // In original code this would just assign to an uninitialized array
6651 // if num_procs < num_parts. We're doing the same here.
6652 this->assigned_part_ids = dst_assigned_part_ids;
6653 }
6654
6655 ierr = Zoltan_Comm_Destroy(&plan);
6656 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6657 num_new_local_points = num_incoming_gnos;
6658 this->mj_env->timerStop(MACRO_TIMERS,
6659 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6660 }
6661 else
6662#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6663 {
6664 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6665 "Migration DistributorPlanCreating-" + iteration);
6666
6667 Tpetra::Distributor distributor(this->comm);
6668 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6669 this->num_local_coords);
6670 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6671 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6672 "Migration DistributorPlanCreating-" + iteration);
6673 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6674 "Migration DistributorMigration-" + iteration);
6675
6676 // note MPI buffers should all be on Kokkos::HostSpace and not
6677 // Kokkos::CudaUVMSpace.
6678
6679 // migrate gnos.
6680 {
6681 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6682 auto src_host_current_mj_gnos =
6683 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
6684 Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
6685 ArrayView<mj_gno_t> sent_gnos(
6686 src_host_current_mj_gnos.data(), this->num_local_coords);
6687 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6688 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6689 Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6690 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6691 this->current_mj_gnos);
6692 memcpy(host_current_mj_gnos.data(),
6693 received_gnos.getRawPtr(), num_incoming_gnos * sizeof(mj_gno_t));
6694 Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
6695 }
6696
6697 // migrate coordinates
6698 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6699 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6700 dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6701 auto host_dst_coordinates = Kokkos::create_mirror_view(dst_coordinates);
6702 auto host_src_coordinates = Kokkos::create_mirror_view(
6703 Kokkos::HostSpace(), this->mj_coordinates);
6704 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6705 for(int i = 0; i < this->coord_dim; ++i) {
6706 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6707 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6708 auto sub_host_dst_coordinates
6709 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6710 // Note Layout Left means we can do these in contiguous blocks
6711 // This form was causing problems on cuda 10 pascal nodes, issue #6422
6712 // Doing a manual copy clears the error though it seems this is probably
6713 // just shifting some kind of race condition or UVM issue around. The
6714 // bug can be sensitive to simple changes like adding a printf log.
6715
6716 // Using this form will segfault on cuda 10 pascal node
6717 //ArrayView<mj_scalar_t> sent_coord(
6718 // sub_host_src_coordinates.data(), this->num_local_coords);
6719
6720 // Manual copy will clear the error but this is probably just due to
6721 // shifting some kind of race condition.
6722 ArrayRCP<mj_scalar_t> sent_coord(this->num_local_coords);
6723 for(int n = 0; n < this->num_local_coords; ++n) {
6724 sent_coord[n] = sub_host_src_coordinates[n];
6725 }
6726
6727 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6728 distributor.doPostsAndWaits<mj_scalar_t>(
6729 sent_coord(), 1, received_coord());
6730 memcpy(sub_host_dst_coordinates.data(),
6731 received_coord.getRawPtr(), num_incoming_gnos * sizeof(mj_scalar_t));
6732 }
6733 deep_copy(dst_coordinates, host_dst_coordinates);
6734 this->mj_coordinates = dst_coordinates;
6735
6736 // migrate weights.
6737 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6738 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6739 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6740 auto host_src_weights = Kokkos::create_mirror_view(
6741 Kokkos::HostSpace(), this->mj_weights);
6742 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6743 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6744 auto sub_host_src_weights
6745 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6746 auto sub_host_dst_weights
6747 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6748 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6749
6750 // TODO: Layout Right means these are not contiguous
6751 // However we don't have any systems setup with more than 1 weight so
6752 // really I have not tested any of this code with num weights > 1.
6753 // I think this is the right thing to do.
6754 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6755 sent_weight[n] = sub_host_src_weights(n);
6756 }
6757 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6758 distributor.doPostsAndWaits<mj_scalar_t>(
6759 sent_weight(), 1, received_weight());
6760
6761 // Again we copy by index due to layout
6762 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6763 sub_host_dst_weights(n) = received_weight[n];
6764 }
6765 }
6766 Kokkos::deep_copy(dst_weights, host_dst_weights);
6767 this->mj_weights = dst_weights;
6768
6769 // migrate owners
6770 {
6771 // Note owners we kept on Serial
6772 ArrayView<int> sent_owners(
6773 owner_of_coordinate.data(), this->num_local_coords);
6774 ArrayRCP<int> received_owners(num_incoming_gnos);
6775 distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
6776 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>
6777 ("owner_of_coordinate", num_incoming_gnos);
6778 memcpy(this->owner_of_coordinate.data(),
6779 received_owners.getRawPtr(), num_incoming_gnos * sizeof(int));
6780 }
6781
6782 // if num procs is less than num parts,
6783 // we need the part assigment arrays as well, since
6784 // there will be multiple parts in processor.
6785 if(num_procs < num_parts) {
6786 auto src_host_assigned_part_ids =
6787 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
6788 Kokkos::deep_copy(src_host_assigned_part_ids, assigned_part_ids);
6789 ArrayView<mj_part_t> sent_partids(
6790 src_host_assigned_part_ids.data(), this->num_local_coords);
6791 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
6792 distributor.doPostsAndWaits<mj_part_t>(
6793 sent_partids, 1, received_partids());
6794 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6795 ("assigned_part_ids", num_incoming_gnos);
6796 auto host_assigned_part_ids = Kokkos::create_mirror_view(
6797 this->assigned_part_ids);
6798 memcpy(
6799 host_assigned_part_ids.data(),
6800 received_partids.getRawPtr(),
6801 num_incoming_gnos * sizeof(mj_part_t));
6802 Kokkos::deep_copy(this->assigned_part_ids, host_assigned_part_ids);
6803 }
6804 else {
6805 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6806 ("assigned_part_ids", num_incoming_gnos);
6807 }
6808 this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6809 "Migration DistributorMigration-" + iteration);
6810
6811 num_new_local_points = num_incoming_gnos;
6812 }
6813}
6814
6820template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6821 typename mj_part_t, typename mj_node_t>
6822void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6823 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6824{
6825 mj_part_t group_size = processor_ranks_for_subcomm.size();
6826 mj_part_t *ids = new mj_part_t[group_size];
6827 for(mj_part_t i = 0; i < group_size; ++i) {
6828 ids[i] = processor_ranks_for_subcomm[i];
6829 }
6830 ArrayView<const mj_part_t> idView(ids, group_size);
6831 this->comm = this->comm->createSubcommunicator(idView);
6832 delete [] ids;
6833}
6834
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  fill_permutation_array(
  mj_part_t output_num_parts,
  mj_part_t num_parts)
{
  // Builds new_coordinate_permutations / new_part_xadj after a migration:
  // coordinates are grouped by (renumbered) part, and new_part_xadj(p)
  // holds the end index of part p within the permutation array.

  // if there is single output part, then simply fill the permutation array.
  if(output_num_parts == 1) {
    // Identity permutation: every local coordinate belongs to the one part.
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (0, this->num_local_coords),
      KOKKOS_LAMBDA(mj_lno_t i) {
      local_new_coordinate_permutations(i) = i;
    });
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_num_local_coords = this->num_local_coords;
    // Single-iteration kernel: writes the lone part's end index on device.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {
      local_new_part_xadj(0) = local_num_local_coords;
    });
  }
  else {
    auto local_num_local_coords = this->num_local_coords;
    auto local_assigned_part_ids = this->assigned_part_ids;
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;

    // part shift holds the which part number an old part number corresponds to.
    Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);

    // otherwise we need to count how many points are there in each part.
    // we allocate here as num_parts, because the sent partids are up to
    // num_parts, although there are outout_num_parts different part.
    // (labeled View allocation zero-initializes the counters)
    Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
      "num_points_in_parts", num_parts);

    // Single-thread device kernel: the counting, prefix-sum and permutation
    // steps below are inherently sequential (each depends on the previous),
    // so they run in one iteration on the device to avoid host round-trips.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {

      // histogram of local points per (old) part id.
      for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
        mj_part_t ii = local_assigned_part_ids(i);
        ++num_points_in_parts(ii);
      }

      // write the end points of the parts.
      // Only non-empty parts get a new (compacted) part number p;
      // part_shifts maps old part id -> new part id.
      mj_part_t p = 0;
      mj_lno_t prev_index = 0;
      for(mj_part_t i = 0; i < num_parts; ++i) {
        if(num_points_in_parts(i) > 0) {
          local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
          prev_index += num_points_in_parts(i);
          part_shifts(i) = p++;
        }
      }

      // for the rest of the parts write the end index as end point.
      mj_part_t assigned_num_parts = p - 1;
      for(;p < num_parts; ++p) {
        local_new_part_xadj(p) =
          local_new_part_xadj(assigned_num_parts);
      }
      // reuse num_points_in_parts as a cursor array holding, per new part,
      // the (exclusive) end position still available for writing.
      for(mj_part_t i = 0; i < output_num_parts; ++i) {
        num_points_in_parts(i) = local_new_part_xadj(i);
      }

      // write the permutation array here.
      // get the part of the coordinate i, shift it to obtain the new part number.
      // assign it to the end of the new part numbers pointer.
      // Iterating backwards with a pre-decremented cursor fills each part's
      // slice from its end toward its beginning.
      for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
        mj_part_t part =
          part_shifts[mj_part_t(local_assigned_part_ids(i))];
        local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
      }
    });
  }
}
6920
6945template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6946 typename mj_part_t, typename mj_node_t>
6947bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6948 mj_perform_migration(
6949 mj_part_t input_num_parts,
6950 mj_part_t &output_num_parts,
6951 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6952 mj_part_t &output_part_begin_index,
6953 size_t migration_reduce_all_population,
6954 mj_lno_t num_coords_for_last_dim_part,
6955 std::string iteration,
6956 RCP<mj_partBoxVector_t> &input_part_boxes,
6957 RCP<mj_partBoxVector_t> &output_part_boxes)
6958{
6959 mj_part_t num_procs = this->comm->getSize();
6960 this->myRank = this->comm->getRank();
6961
6962 // this array holds how many points each processor has in each part.
6963 // to access how many points processor i has on part j,
6964 // num_points_in_all_processor_parts[i * num_parts + j]
6965 mj_gno_t *num_points_in_all_processor_parts =
6966 new mj_gno_t[input_num_parts * (num_procs + 1)];
6967
6968 // get the number of coordinates in each part in each processor.
6969 this->get_processor_num_points_in_parts(
6970 num_procs,
6971 input_num_parts,
6972 num_points_in_all_processor_parts);
6973
6974 // check if migration will be performed or not.
6975 if(!this->mj_check_to_migrate(
6976 migration_reduce_all_population,
6977 num_coords_for_last_dim_part,
6978 num_procs,
6979 input_num_parts,
6980 num_points_in_all_processor_parts)) {
6981 delete [] num_points_in_all_processor_parts;
6982 return false;
6983 }
6984
6985 mj_lno_t *send_count_to_each_proc = NULL;
6986 int *coordinate_destinations = new int[this->num_local_coords];
6987 send_count_to_each_proc = new mj_lno_t[num_procs];
6988
6989 for(int i = 0; i < num_procs; ++i) {
6990 send_count_to_each_proc[i] = 0;
6991 }
6992
6993 std::vector<mj_part_t> processor_ranks_for_subcomm;
6994 std::vector<mj_part_t> out_part_indices;
6995
6996 // determine which processors are assigned to which parts
6997 this->mj_migration_part_proc_assignment(
6998 num_points_in_all_processor_parts,
6999 input_num_parts,
7000 num_procs,
7001 send_count_to_each_proc,
7002 processor_ranks_for_subcomm,
7003 next_future_num_parts_in_parts,
7004 output_num_parts,
7005 out_part_indices,
7006 output_part_begin_index,
7007 coordinate_destinations);
7008
7009 delete [] send_count_to_each_proc;
7010 std::vector <mj_part_t> tmpv;
7011
7012 std::sort (out_part_indices.begin(), out_part_indices.end());
7013 mj_part_t outP = out_part_indices.size();
7014 mj_gno_t new_global_num_points = 0;
7015 mj_gno_t *global_num_points_in_parts =
7016 num_points_in_all_processor_parts + num_procs * input_num_parts;
7017
7018 if(this->mj_keep_part_boxes) {
7019 input_part_boxes->clear();
7020 }
7021
7022 // now we calculate the new values for next_future_num_parts_in_parts.
7023 // same for the part boxes.
7024 for(mj_part_t i = 0; i < outP; ++i) {
7025 mj_part_t ind = out_part_indices[i];
7026 new_global_num_points += global_num_points_in_parts[ind];
7027 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7028 if(this->mj_keep_part_boxes) {
7029 input_part_boxes->push_back((*output_part_boxes)[ind]);
7030 }
7031 }
7032
7033 // swap the input and output part boxes.
7034 if(this->mj_keep_part_boxes) {
7035 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7036 input_part_boxes = output_part_boxes;
7037 output_part_boxes = tmpPartBoxes;
7038 }
7039 next_future_num_parts_in_parts->clear();
7040 for(mj_part_t i = 0; i < outP; ++i) {
7041 mj_part_t p = tmpv[i];
7042 next_future_num_parts_in_parts->push_back(p);
7043 }
7044
7045 delete [] num_points_in_all_processor_parts;
7046
7047 mj_lno_t num_new_local_points = 0;
7048 //perform the actual migration operation here.
7049 this->mj_migrate_coords(
7050 num_procs,
7051 num_new_local_points,
7052 iteration,
7053 coordinate_destinations,
7054 input_num_parts);
7055
7056 delete [] coordinate_destinations;
7057 if(this->num_local_coords != num_new_local_points) {
7058 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7059 (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7060 num_new_local_points);
7061 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7062 (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7063 num_new_local_points);
7064 }
7065 this->num_local_coords = num_new_local_points;
7066 this->num_global_coords = new_global_num_points;
7067
7068 // create subcommunicator.
7069 this->create_sub_communicator(processor_ranks_for_subcomm);
7070
7071 processor_ranks_for_subcomm.clear();
7072
7073 // fill the new permutation arrays.
7074 this->fill_permutation_array(output_num_parts, input_num_parts);
7075
7076 return true;
7077}
7078
7097template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7098 typename mj_part_t, typename mj_node_t>
7099void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7100 create_consistent_chunks(
7101 mj_part_t num_parts,
7102 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7103 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7104 mj_lno_t coordinate_begin,
7105 mj_lno_t coordinate_end,
7106 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7107 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7108 int coordInd,
7109 bool longest_dim_part,
7110 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7111{
7112 // Note that this method is only used by task mapper
7113 // All code in this file has been verified to run with UVM off by running
7114 // mj tests and task mapper tests with UVM off. However for this particular
7115 // method I did not do much for UVM off. I heavily use device to host copies
7116 // and more or less preserve the original logic. Due to the handling of
7117 // arrays it will be a bit of work to convert this to as better form.
7118 // Since it's only relevant to task mapper and I wasn't sure how much priority
7119 // to give it, I put that on hold until further discussion.
7120 mj_part_t no_cuts = num_parts - 1;
7121
7122 // now if the rectilinear partitioning is allowed we decide how
7123 // much weight each thread should put to left and right.
7124 if(this->distribute_points_on_cut_lines) {
7125 auto local_thread_cut_line_weight_to_put_left =
7126 this->thread_cut_line_weight_to_put_left;
7127 auto local_thread_part_weight_work =
7128 this->thread_part_weight_work;
7129 auto local_sEpsilon = this->sEpsilon;
7130
7131 Kokkos::parallel_for(
7132 Kokkos::RangePolicy<typename mj_node_t::execution_space,
7133 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7134 // the left to be put on the left of the cut.
7135 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7136 if(left_weight > local_sEpsilon) {
7137 // the weight of thread ii on cut.
7138 mj_scalar_t thread_ii_weight_on_cut =
7139 local_thread_part_weight_work(i * 2 + 1) -
7140 local_thread_part_weight_work(i * 2);
7141 if(thread_ii_weight_on_cut < left_weight) {
7142 local_thread_cut_line_weight_to_put_left(i) =
7143 thread_ii_weight_on_cut;
7144 }
7145 else {
7146 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7147 }
7148 }
7149 else {
7150 local_thread_cut_line_weight_to_put_left(i) = 0;
7151 }
7152 });
7153
7154 if(no_cuts > 0) {
7155 auto local_least_signifiance = least_signifiance;
7156 auto local_significance_mul = significance_mul;
7157 Kokkos::parallel_for(
7158 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7159 (0, 1), KOKKOS_LAMBDA (int dummy) {
7160 // this is a special case. If cutlines share the same coordinate,
7161 // their weights are equal.
7162 // we need to adjust the ratio for that.
7163 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7164 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7165 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7166 mj_scalar_t delta = cut2 - cut1;
7167 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7168 if(abs_delta < local_sEpsilon) {
7169 local_thread_cut_line_weight_to_put_left(i) -=
7170 local_thread_cut_line_weight_to_put_left(i - 1);
7171 }
7172 local_thread_cut_line_weight_to_put_left(i) =
7173 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7174 local_least_signifiance) * local_significance_mul) /
7175 static_cast<mj_scalar_t>(local_significance_mul);
7176 }
7177 });
7178 }
7179 }
7180
7181 auto local_thread_point_counts = this->thread_point_counts;
7182 Kokkos::parallel_for(
7183 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7184 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7185 local_thread_point_counts(i) = 0;
7186 });
7187
7188 // for this specific case we dont want to distribute the points along the
7189 // cut position randomly, as we need a specific ordering of them. Instead,
7190 // we put the coordinates into a sort item, where we sort those
7191 // using the coordinates of points on other dimensions and the index.
7192
7193 // some of the cuts might share the same position.
7194 // in this case, if cut i and cut j share the same position
7195 // cut_map[i] = cut_map[j] = sort item index.
7196 mj_part_t *cut_map = new mj_part_t[no_cuts];
7197
7198 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7199 typedef std::vector< multiSItem > multiSVector;
7200 typedef std::vector<multiSVector> multiS2Vector;
7201
7202 // to keep track of the memory allocated.
7203 std::vector<mj_scalar_t *>allocated_memory;
7204
7205 // vector for which the coordinates will be sorted.
7206 multiS2Vector sort_vector_points_on_cut;
7207
7208 // the number of cuts that have different coordinates.
7209 mj_part_t different_cut_count = 1;
7210 cut_map[0] = 0;
7211
7212 // now we insert 1 sort vector for all cuts on the different
7213 // positins.if multiple cuts are on the same position,
7214 // they share sort vectors.
7215 multiSVector tmpMultiSVector;
7216 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7217
7218 auto local_current_concurrent_cut_coordinate =
7219 current_concurrent_cut_coordinate;
7220 auto host_current_concurrent_cut_coordinate =
7221 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7222 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7223 local_current_concurrent_cut_coordinate);
7224
7225 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7226 // if cuts share the same cut coordinates
7227 // set the cutmap accordingly.
7228 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7229 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7230 cut_map[i] = cut_map[i-1];
7231 }
7232 else {
7233 cut_map[i] = different_cut_count++;
7234 multiSVector tmp2MultiSVector;
7235 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7236 }
7237 }
7238 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7239 host_current_concurrent_cut_coordinate);
7240
7241 // now the actual part assigment.
7242 auto host_coordinate_permutations =
7243 Kokkos::create_mirror_view(coordinate_permutations);
7244 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7245
7246 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7247 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7248
7249 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7250 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7251
7252 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7253 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7254
7255 auto local_coord_dim = this->coord_dim;
7256
7257 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7258 mj_lno_t i = host_coordinate_permutations(ii);
7259 mj_part_t pp = host_assigned_part_ids(i);
7260 mj_part_t p = pp / 2;
7261 // if the coordinate is on a cut.
7262 if(pp % 2 == 1 ) {
7263 mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7264 allocated_memory.push_back(vals);
7265
7266 // we insert the coordinates to the sort item here.
7267 int val_ind = 0;
7268
7269 if(longest_dim_part) {
7270 // std::cout << std::endl << std::endl;
7271 for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7272 // uSignedSortItem<int, mj_scalar_t, char>
7273 // *p_coord_dimension_range_sorted
7274 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7275 // std::cout << "next_largest_coord_dim: " <<
7276 // next_largest_coord_dim << " ";
7277 // Note refactor in progress
7278 vals[val_ind++] =
7279 host_mj_coordinates(i,next_largest_coord_dim);
7280 }
7281 }
7282 else {
7283 for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7284 vals[val_ind++] = host_mj_coordinates(i,dim);
7285 }
7286 for(int dim = 0; dim < coordInd; ++dim) {
7287 vals[val_ind++] = host_mj_coordinates(i,dim);
7288 }
7289 }
7290
7291 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7292 //insert the point to the sort vector pointed by the cut_map[p].
7293 mj_part_t cmap = cut_map[p];
7294 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7295 }
7296 else {
7297 //if it is not on the cut, simple sorting.
7298 ++host_thread_point_counts(p);
7299 host_assigned_part_ids(i) = p;
7300 }
7301 }
7302
7303 // sort all the sort vectors.
7304 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7305 std::sort (sort_vector_points_on_cut[i].begin(),
7306 sort_vector_points_on_cut[i].end());
7307 }
7308
7309 mj_part_t previous_cut_map = cut_map[0];
7310
7311 auto host_thread_cut_line_weight_to_put_left =
7312 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7313 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7314 thread_cut_line_weight_to_put_left);
7315
7316 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7317 Kokkos::deep_copy(host_mj_weights, mj_weights);
7318
7319 // this is how much previous part owns the weight of the current part.
7320 // when target part weight is 1.6, and the part on the left is given 2,
7321 // the left has an extra 0.4, while the right has missing 0.4 from the
7322 // previous cut.
7323 // This parameter is used to balance this issues.
7324 // in the above example weight_stolen_from_previous_part will be 0.4.
7325 // if the left part target is 2.2 but it is given 2,
7326 // then weight_stolen_from_previous_part will be -0.2.
7327 mj_scalar_t weight_stolen_from_previous_part = 0;
7328 for(mj_part_t p = 0; p < no_cuts; ++p) {
7329 mj_part_t mapped_cut = cut_map[p];
7330
7331 // if previous cut map is done, and it does not have the same index,
7332 // then assign all points left on that cut to its right.
7333 if(previous_cut_map != mapped_cut) {
7334 mj_lno_t sort_vector_end = (mj_lno_t)
7335 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7336 for(; sort_vector_end >= 0; --sort_vector_end) {
7337 multiSItem t =
7338 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7339 mj_lno_t i = t.index;
7340 ++host_thread_point_counts(p);
7341 host_assigned_part_ids(i) = p;
7342 }
7343 sort_vector_points_on_cut[previous_cut_map].clear();
7344 }
7345
7346 // TODO: MD: I dont remember why I have it reverse order here.
7347 mj_lno_t sort_vector_end = (mj_lno_t)
7348 sort_vector_points_on_cut[mapped_cut].size() - 1;
7349 // mj_lno_t sort_vector_begin= 0;
7350 // mj_lno_t sort_vector_size =
7351 // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7352
7353 // TODO commented for reverse order
7354 for(; sort_vector_end >= 0; --sort_vector_end) {
7355 // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7356 // TODO COMMENTED FOR REVERSE ORDER
7357 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7358 //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7359 mj_lno_t i = t.index;
7360 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7361 this->mj_weights(i,0);
7362 // part p has enough space for point i, then put it to point i.
7363 if(host_thread_cut_line_weight_to_put_left(p) +
7364 weight_stolen_from_previous_part> this->sEpsilon &&
7365 host_thread_cut_line_weight_to_put_left(p) +
7366 weight_stolen_from_previous_part -
7367 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7368 weight_stolen_from_previous_part - w)> this->sEpsilon)
7369 {
7370 host_thread_cut_line_weight_to_put_left(p) -= w;
7371
7372 sort_vector_points_on_cut[mapped_cut].pop_back();
7373
7374 ++host_thread_point_counts(p);
7375 host_assigned_part_ids(i) = p;
7376 // if putting this weight to left overweights the left cut, then
7377 // increase the space for the next cut using
7378 // weight_stolen_from_previous_part.
7379 if(p < no_cuts - 1 &&
7380 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7381 if(mapped_cut == cut_map[p + 1] ) {
7382 // if the cut before the cut indexed at p was also at the same
7383 // position special case, as we handle the weight differently here.
7384 if(previous_cut_map != mapped_cut) {
7385 weight_stolen_from_previous_part =
7386 host_thread_cut_line_weight_to_put_left(p);
7387 }
7388 else {
7389 // if the cut before the cut indexed at p was also at the same
7390 // position we assign extra weights cumulatively in this case.
7391 weight_stolen_from_previous_part +=
7392 host_thread_cut_line_weight_to_put_left(p);
7393 }
7394 }
7395 else{
7396 weight_stolen_from_previous_part =
7397 -host_thread_cut_line_weight_to_put_left(p);
7398 }
7399 // end assignment for part p
7400 break;
7401 }
7402 } else {
7403 // if part p does not have enough space for this point
7404 // and if there is another cut sharing the same positon,
7405 // again increase the space for the next
7406 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7407 if(previous_cut_map != mapped_cut) {
7408 weight_stolen_from_previous_part =
7409 host_thread_cut_line_weight_to_put_left(p);
7410 }
7411 else {
7412 weight_stolen_from_previous_part +=
7413 host_thread_cut_line_weight_to_put_left(p);
7414 }
7415 }
7416 else{
7417 weight_stolen_from_previous_part =
7418 -host_thread_cut_line_weight_to_put_left(p);
7419 }
7420 // end assignment for part p
7421 break;
7422 }
7423 }
7424 previous_cut_map = mapped_cut;
7425 }
7426
7427 // TODO commented for reverse order
7428 // put everything left on the last cut to the last part.
7429 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7430 previous_cut_map].size() - 1;
7431
7432 // mj_lno_t sort_vector_begin= 0;
7433 // mj_lno_t sort_vector_size = (mj_lno_t)
7434 // sort_vector_points_on_cut[previous_cut_map].size();
7435 // TODO commented for reverse order
7436 for(; sort_vector_end >= 0; --sort_vector_end) {
7437 // TODO commented for reverse order
7438 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7439 // multiSItem t =
7440 // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7441 mj_lno_t i = t.index;
7442 ++host_thread_point_counts(no_cuts);
7443 host_assigned_part_ids(i) = no_cuts;
7444 }
7445
7446 sort_vector_points_on_cut[previous_cut_map].clear();
7447 delete [] cut_map;
7448
7449 //free the memory allocated for vertex sort items .
7450 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7451 for(mj_lno_t i = 0; i < vSize; ++i) {
7452 delete [] allocated_memory[i];
7453 }
7454
7455 auto local_out_part_xadj = out_part_xadj;
7456 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7457 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7458
7459 // creation of part_xadj as in usual case.
7460 for(mj_part_t j = 0; j < num_parts; ++j) {
7461 host_out_part_xadj(j) = host_thread_point_counts(j);
7462 host_thread_point_counts(j) = 0;
7463 }
7464
7465 // perform prefix sum for num_points in parts.
7466 for(mj_part_t j = 1; j < num_parts; ++j) {
7467 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7468 }
7469
7470 // shift the num points in threads thread to obtain the
7471 // beginning index of each thread's private space.
7472 for(mj_part_t j = 1; j < num_parts; ++j) {
7473 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7474 }
7475
7476 auto host_new_coordinate_permutations =
7477 Kokkos::create_mirror_view(new_coordinate_permutations);
7478 Kokkos::deep_copy(host_new_coordinate_permutations,
7479 new_coordinate_permutations);
7480
7481 // now thread gets the coordinate and writes the index of coordinate to
7482 // the permutation array using the part index we calculated.
7483 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7484 mj_lno_t i = host_coordinate_permutations(ii);
7485 mj_part_t p = host_assigned_part_ids(i);
7486 host_new_coordinate_permutations(coordinate_begin +
7487 host_thread_point_counts(p)++) = i;
7488 }
7489
7490 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7491 Kokkos::deep_copy(new_coordinate_permutations,
7492 host_new_coordinate_permutations);
7493 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7494}
7495
7505template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7506 typename mj_part_t, typename mj_node_t>
7507void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7508 set_final_parts(
7509 mj_part_t current_num_parts,
7510 mj_part_t output_part_begin_index,
7511 RCP<mj_partBoxVector_t> &output_part_boxes,
7512 bool is_data_ever_migrated)
7513{
7514 this->mj_env->timerStart(MACRO_TIMERS,
7515 mj_timer_base_string + "Part_Assignment");
7516
7517 auto local_part_xadj = part_xadj;
7518 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7519 auto local_coordinate_permutations = coordinate_permutations;
7520 auto local_assigned_part_ids = assigned_part_ids;
7521
7522 if(local_mj_keep_part_boxes) {
7523 for(int i = 0; i < current_num_parts; ++i) {
7524 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7525 }
7526 }
7527
7528 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7529 current_num_parts, Kokkos::AUTO());
7530 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7531 member_type member_type;
7532 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7533 int i = team_member.league_rank();
7534 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7535 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7536 [=] (mj_lno_t ii) {
7537 mj_lno_t k = local_coordinate_permutations(ii);
7538 local_assigned_part_ids(k) = i + output_part_begin_index;
7539 });
7540 });
7541
7542 if(is_data_ever_migrated) {
7543#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7544 if(sizeof(mj_lno_t) <= sizeof(int)) {
7545
7546 // Cannot use Zoltan_Comm with local ordinals larger than ints.
7547 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7548 // may overflow.
7549
7550 // if data is migrated, then send part numbers to the original owners.
7551 ZOLTAN_COMM_OBJ *plan = NULL;
7552 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7553
7554 int incoming = 0;
7555 int message_tag = 7856;
7556
7557 this->mj_env->timerStart(MACRO_TIMERS,
7558 mj_timer_base_string + "Final Z1PlanCreating");
7559
7560 // setup incoming count
7561 int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7562 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7563
7564 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7565 this->mj_env->timerStop(MACRO_TIMERS,
7566 mj_timer_base_string + "Final Z1PlanCreating" );
7567
7568 this->mj_env->timerStart(MACRO_TIMERS,
7569 mj_timer_base_string + "Final Z1PlanComm");
7570
7571 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7572
7573 // migrate gnos to actual owners.
7574 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7575 Kokkos::HostSpace(), this->current_mj_gnos);
7576 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7577 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7578 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7579 auto host_dst_gnos = Kokkos::create_mirror_view(
7580 Kokkos::HostSpace(), dst_gnos);
7581 message_tag++;
7582 ierr = Zoltan_Comm_Do( plan, message_tag,
7583 (char *) host_current_mj_gnos.data(),
7584 sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7585 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7586 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7587 this->current_mj_gnos = dst_gnos;
7588
7589 // migrate part ids to actual owners.
7590 auto host_src_part_ids = Kokkos::create_mirror_view(
7591 Kokkos::HostSpace(), this->assigned_part_ids);
7592 deep_copy(host_src_part_ids, this->assigned_part_ids);
7593 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7594 Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7595 auto host_dst_part_ids = Kokkos::create_mirror_view(
7596 Kokkos::HostSpace(), dst_part_ids);
7597 message_tag++;
7598 ierr = Zoltan_Comm_Do( plan, message_tag,
7599 (char *) host_src_part_ids.data(),
7600 sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7601 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7602 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7603 this->assigned_part_ids = dst_part_ids;
7604
7605 ierr = Zoltan_Comm_Destroy(&plan);
7606 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7607
7608 this->num_local_coords = incoming;
7609
7610 this->mj_env->timerStop(MACRO_TIMERS,
7611 mj_timer_base_string + "Final Z1PlanComm");
7612 }
7613 else
7614#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7615 {
7616 // setup incoming count
7617 this->mj_env->timerStart(MACRO_TIMERS,
7618 mj_timer_base_string + "Final DistributorPlanCreating");
7619 Tpetra::Distributor distributor(this->mj_problemComm);
7620 ArrayView<const mj_part_t> owners_of_coords(
7621 this->owner_of_coordinate.data(), this->num_local_coords);
7622 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7623 this->mj_env->timerStop(MACRO_TIMERS,
7624 mj_timer_base_string + "Final DistributorPlanCreating" );
7625
7626 this->mj_env->timerStart(MACRO_TIMERS,
7627 mj_timer_base_string + "Final DistributorPlanComm");
7628
7629 // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7630
7631 // migrate gnos to actual owners.
7632 auto src_host_current_mj_gnos =
7633 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
7634 Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
7635 ArrayRCP<mj_gno_t> received_gnos(incoming);
7636 ArrayView<mj_gno_t> sent_gnos(src_host_current_mj_gnos.data(),
7637 this->num_local_coords);
7638 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7639 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7640 Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7641 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7642 this->current_mj_gnos);
7643 memcpy(host_current_mj_gnos.data(),
7644 received_gnos.getRawPtr(), incoming * sizeof(mj_gno_t));
7645 Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
7646
7647 // migrate part ids to actual owners.
7648 auto src_host_assigned_part_ids =
7649 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
7650 Kokkos::deep_copy(src_host_assigned_part_ids, this->assigned_part_ids);
7651 ArrayView<mj_part_t> sent_partids(src_host_assigned_part_ids.data(),
7652 this->num_local_coords);
7653 ArrayRCP<mj_part_t> received_partids(incoming);
7654 distributor.doPostsAndWaits<mj_part_t>(
7655 sent_partids, 1, received_partids());
7656 this->assigned_part_ids =
7657 Kokkos::View<mj_part_t*, device_t>(
7658 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7659 incoming);
7660 auto host_assigned_part_ids = Kokkos::create_mirror_view(
7661 this->assigned_part_ids);
7662 memcpy( host_assigned_part_ids.data(),
7663 received_partids.getRawPtr(), incoming * sizeof(mj_part_t));
7664 deep_copy(this->assigned_part_ids, host_assigned_part_ids);
7665 this->num_local_coords = incoming;
7666
7667 this->mj_env->timerStop(MACRO_TIMERS,
7668 mj_timer_base_string + "Final DistributorPlanComm");
7669 }
7670 }
7671
7672 this->mj_env->timerStop(MACRO_TIMERS,
7673 mj_timer_base_string + "Part_Assignment");
7674
7675 this->mj_env->timerStart(MACRO_TIMERS,
7676 mj_timer_base_string + "Solution_Part_Assignment");
7677
7678 // ArrayRCP<mj_part_t> partId;
7679 // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7680
7681 if(this->mj_keep_part_boxes) {
7682 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7683 }
7684
7685 this->mj_env->timerStop(MACRO_TIMERS,
7686 mj_timer_base_string + "Solution_Part_Assignment");
7687}
7688
7701template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7702 typename mj_part_t, typename mj_node_t>
7705 bool distribute_points_on_cut_lines_,
7706 int max_concurrent_part_calculation_,
7707 int check_migrate_avoid_migration_option_,
7708 double minimum_migration_imbalance_,
7709 int migration_type_)
7710{
7711 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7712 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7713 this->check_migrate_avoid_migration_option =
7714 check_migrate_avoid_migration_option_;
7715 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7716 this->migration_type = migration_type_;
7717}
7718
7746template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7747 typename mj_part_t, typename mj_node_t>
7750 const RCP<const Environment> &env,
7751 RCP<const Comm<int> > &problemComm,
7752 double imbalance_tolerance_,
7753 int num_teams_,
7754 size_t num_global_parts_,
7755 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7756 int recursion_depth_,
7757 int coord_dim_,
7758 mj_lno_t num_local_coords_,
7759 mj_gno_t num_global_coords_,
7760 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7761 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7762 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7763 int num_weights_per_coord_,
7764 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7765 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7766 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7767 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7768 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7769{
7770
7771 // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7773 this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7774
7775 this->mj_env = env;
7776 this->mj_problemComm = problemComm;
7777 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7778 this->mj_env->timerStart(MACRO_TIMERS,
7779 mj_timer_base_string + "Total");
7780 this->mj_env->debug(3, "In MultiJagged Jagged");
7781 this->imbalance_tolerance = imbalance_tolerance_;
7782 this->mj_num_teams = num_teams_;
7783 this->num_global_parts = num_global_parts_;
7784 this->part_no_array = part_no_array_;
7785 this->recursion_depth = recursion_depth_;
7786 this->coord_dim = coord_dim_;
7787 this->num_local_coords = num_local_coords_;
7788 this->num_global_coords = num_global_coords_;
7789 this->mj_coordinates = mj_coordinates_;
7790 this->initial_mj_gnos = initial_mj_gnos_;
7791 this->num_weights_per_coord = num_weights_per_coord_;
7792 this->mj_uniform_weights = mj_uniform_weights_;
7793 this->mj_weights = mj_weights_;
7794 this->mj_uniform_parts = mj_uniform_parts_;
7795
7796 // this->set_input_data();
7797
7798 this->set_part_specifications();
7799
7800 this->mj_env->timerStart(MACRO_TIMERS,
7801 mj_timer_base_string + "Allocate Views");
7802 this->allocate_set_work_memory();
7803 this->mj_env->timerStop(MACRO_TIMERS,
7804 mj_timer_base_string + "Allocate Views");
7805
7806 // We duplicate the comm as we create subcommunicators during migration.
7807 // We keep the problemComm as it is, while comm changes after each migration.
7808 this->comm = this->mj_problemComm->duplicate();
7809
7810#ifdef print_debug
7811 if(comm->getRank() == 0) {
7812 std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7813 std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7814 std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7815 }
7816#endif
7817
7818 // initially there is a single partition
7819 mj_part_t current_num_parts = 1;
7820 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7821 this->all_cut_coordinates;
7822 this->mj_env->timerStart(MACRO_TIMERS,
7823 mj_timer_base_string + "Problem_Partitioning");
7824 mj_part_t output_part_begin_index = 0;
7825 mj_part_t future_num_parts = this->total_num_part;
7826 bool is_data_ever_migrated = false;
7827
7828 std::vector<mj_part_t> *future_num_part_in_parts =
7829 new std::vector<mj_part_t> ();
7830 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7831 new std::vector<mj_part_t> ();
7832
7833 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7834
7835 RCP<mj_partBoxVector_t> input_part_boxes;
7836 RCP<mj_partBoxVector_t> output_part_boxes;
7837
7838 if(this->mj_keep_part_boxes) {
7839 input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7840 output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7841 compute_global_box();
7842 this->init_part_boxes(output_part_boxes);
7843 }
7844
7845 auto local_part_xadj = this->part_xadj;
7846
7847 // Need a device counter - how best to allocate?
7848 // Putting this allocation in the loops is very costly so moved out here.
7849 Kokkos::View<mj_part_t*, device_t>
7850 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7851 Kokkos::View<size_t*, device_t>
7852 view_total_reduction_size("view_total_reduction_size", 1);
7853
7854 for(int i = 0; i < this->recursion_depth; ++i) {
7855
7856 // convert i to string to be used for debugging purposes.
7857 std::string istring = std::to_string(i);
7858
7859 // next_future_num_parts_in_parts will be as the size of outnumParts,
7860 // and this will hold how many more parts that each output part
7861 // should be divided. this array will also be used to determine the weight
7862 // ratios of the parts. swap the arrays to use iteratively.
7863 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7864 future_num_part_in_parts = next_future_num_parts_in_parts;
7865 next_future_num_parts_in_parts = tmpPartVect;
7866
7867 // clear next_future_num_parts_in_parts array as
7868 // getPartitionArrays expects it to be empty.
7869 next_future_num_parts_in_parts->clear();
7870 if(this->mj_keep_part_boxes) {
7871 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7872 input_part_boxes = output_part_boxes;
7873 output_part_boxes = tmpPartBoxes;
7874 output_part_boxes->clear();
7875 }
7876
7877 // returns the total no. of output parts for this dimension partitioning.
7878 mj_part_t output_part_count_in_dimension =
7879 this->update_part_num_arrays(
7880 future_num_part_in_parts,
7881 next_future_num_parts_in_parts,
7882 future_num_parts,
7883 current_num_parts,
7884 i,
7885 input_part_boxes,
7886 output_part_boxes, 1);
7887
7888 // if the number of obtained parts equal to current number of parts,
7889 // skip this dimension. For example, this happens when 1 is given in the
7890 // input part array is given. P=4,5,1,2
7891 if(output_part_count_in_dimension == current_num_parts) {
7892 //still need to swap the input output arrays.
7893 tmpPartVect= future_num_part_in_parts;
7894 future_num_part_in_parts = next_future_num_parts_in_parts;
7895 next_future_num_parts_in_parts = tmpPartVect;
7896
7897 if(this->mj_keep_part_boxes) {
7898 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7899 input_part_boxes = output_part_boxes;
7900 output_part_boxes = tmpPartBoxes;
7901 }
7902 continue;
7903 }
7904
7905 // get the coordinate axis along which the partitioning will be done.
7906 int coordInd = i % this->coord_dim;
7907
7908 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7909 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7910
7911 this->mj_env->timerStart(MACRO_TIMERS,
7912 mj_timer_base_string + "Problem_Partitioning_" + istring);
7913
7914 // alloc Memory to point the indices
7915 // of the parts in the permutation array.
7916 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7917 "new part xadj", output_part_count_in_dimension);
7918
7919 // the index where in the new_part_xadj will be written.
7920 mj_part_t output_part_index = 0;
7921
7922 // whatever is written to output_part_index will be added with
7923 // output_coordinate_end_index so that the points will be shifted.
7924 mj_part_t output_coordinate_end_index = 0;
7925
7926 mj_part_t current_work_part = 0;
7928 std::min(current_num_parts - current_work_part,
7929 this->max_concurrent_part_calculation);
7930
7931 mj_part_t obtained_part_index = 0;
7932
7933 auto host_process_local_min_max_coord_total_weight =
7934 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7935 auto host_global_min_max_coord_total_weight =
7936 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7937
7938 // run for all available parts.
7939 for(; current_work_part < current_num_parts;
7941
7943 std::min(current_num_parts - current_work_part,
7944 this->max_concurrent_part_calculation);
7945
7946 int bDoingWork_int; // Can't reduce on bool so use int
7947 auto local_device_num_partitioning_in_current_dim =
7948 device_num_partitioning_in_current_dim;
7949 Kokkos::parallel_reduce("Read bDoingWork",
7950 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7951 KOKKOS_LAMBDA(int dummy, int & set_single) {
7952 set_single = 0;
7953 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7954 if(local_device_num_partitioning_in_current_dim(
7955 current_work_part + kk) != 1) {
7956 set_single = 1;
7957 break;
7958 }
7959 }
7960 }, bDoingWork_int);
7961 bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7962
7963 this->mj_get_local_min_max_coord_totW(
7966 mj_current_dim_coords);
7967
7968 // 1D partitioning
7969 if(bDoingWork) {
7970 // obtain global Min max of the part.
7971 this->mj_get_global_min_max_coord_totW(
7973 this->process_local_min_max_coord_total_weight,
7974 this->global_min_max_coord_total_weight);
7975
7976 // represents the total number of cutlines
7977 // whose coordinate should be determined.
7978 mj_part_t total_incomplete_cut_count = 0;
7979
7980 // Compute weight ratios for parts & cuts:
7981 // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7982 // part0 cut0 part1 cut1 part2 cut2 part3
7983 mj_part_t concurrent_part_cut_shift = 0;
7984 mj_part_t concurrent_part_part_shift = 0;
7985
7986 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7987
7988 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7989 global_min_max_coord_total_weight);
7990
7991 mj_scalar_t min_coordinate =
7992 host_global_min_max_coord_total_weight(kk);
7993 mj_scalar_t max_coordinate =
7994 host_global_min_max_coord_total_weight(
7996
7997 mj_scalar_t global_total_weight =
7998 host_global_min_max_coord_total_weight(
8000
8001 mj_part_t concurrent_current_part_index = current_work_part + kk;
8002
8003 mj_part_t partition_count = host_num_partitioning_in_current_dim(
8004 concurrent_current_part_index);
8005
8006 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
8007 Kokkos::subview(current_cut_coordinates,
8008 std::pair<mj_lno_t, mj_lno_t>(
8009 concurrent_part_cut_shift, current_cut_coordinates.size()));
8010 Kokkos::View<mj_scalar_t *, device_t>
8011 current_target_part_weights =
8012 Kokkos::subview(target_part_weights,
8013 std::pair<mj_lno_t, mj_lno_t>(
8014 concurrent_part_part_shift, target_part_weights.size()));
8015
8016 // shift the usedCutCoordinate array as noCuts.
8017 concurrent_part_cut_shift += partition_count - 1;
8018 // shift the partRatio array as noParts.
8019 concurrent_part_part_shift += partition_count;
8020
8021 // calculate only if part is not empty,
8022 // and part will be further partitioned.
8023 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8024
8025 // increase num_cuts_do_be_determined by the number of cuts of the
8026 // current part's cut line number.
8027 total_incomplete_cut_count += partition_count - 1;
8028
8029 this->incomplete_cut_count(kk) = partition_count - 1;
8030
8031 // get the target weights of the parts
8032 this->mj_get_initial_cut_coords_target_weights(
8033 min_coordinate,
8034 max_coordinate,
8035 partition_count - 1,
8036 global_total_weight,
8037 usedCutCoordinate,
8038 current_target_part_weights,
8039 future_num_part_in_parts,
8040 next_future_num_parts_in_parts,
8041 concurrent_current_part_index,
8042 obtained_part_index);
8043
8044 mj_lno_t coordinate_end_index =
8045 host_part_xadj(concurrent_current_part_index);
8046 mj_lno_t coordinate_begin_index =
8047 concurrent_current_part_index==0 ? 0 :
8048 host_part_xadj(concurrent_current_part_index - 1);
8049
8050 this->set_initial_coordinate_parts(
8051 max_coordinate,
8052 min_coordinate,
8053 coordinate_begin_index, coordinate_end_index,
8054 this->coordinate_permutations,
8055 mj_current_dim_coords,
8056 this->assigned_part_ids,
8057 partition_count);
8058 }
8059 else {
8060 // e.g., if have fewer coordinates than parts, don't need to do
8061 // next dim.
8062 this->incomplete_cut_count(kk) = 0;
8063 }
8064
8065 obtained_part_index += partition_count;
8066 }
8067
8068 // used imbalance, it is always 0, as it is difficult to
8069 // estimate a range.
8070 double used_imbalance = 0;
8071 // Determine cut lines for all concurrent parts parts here.
8072 this->mj_env->timerStart(MACRO_TIMERS,
8073 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8074
8075 this->mj_1D_part(
8076 mj_current_dim_coords,
8077 used_imbalance,
8080 current_cut_coordinates,
8081 total_incomplete_cut_count,
8082 view_rectilinear_cut_count,
8083 view_total_reduction_size);
8084
8085 this->mj_env->timerStop(MACRO_TIMERS,
8086 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8087 }
8088
8089 // create new part chunks
8090 {
8091 mj_part_t output_array_shift = 0;
8092 mj_part_t cut_shift = 0;
8093 size_t tlr_shift = 0;
8094 size_t partweight_array_shift = 0;
8095 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8096
8097 mj_part_t current_concurrent_work_part = current_work_part + kk;
8098
8099 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8100 current_concurrent_work_part);
8101
8102 // if the part is empty, skip the part.
8103 int coordinateA_bigger_than_coordinateB =
8104 host_global_min_max_coord_total_weight(kk) >
8105 host_global_min_max_coord_total_weight(
8107
8108 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8109 // we still need to write the begin and end point of the empty part.
8110 // simply set it zero, the array indices will be shifted later
8111 auto local_new_part_xadj = this->new_part_xadj;
8112 Kokkos::parallel_for(
8113 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8114 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8115 local_new_part_xadj(
8116 output_part_index + output_array_shift + jj) = 0;
8117 });
8118
8119 cut_shift += num_parts - 1;
8120 tlr_shift += (4 *(num_parts - 1) + 1);
8121 output_array_shift += num_parts;
8122 partweight_array_shift += (2 * (num_parts - 1) + 1);
8123 continue;
8124 }
8125
8126 Kokkos::View<mj_scalar_t *, device_t>
8127 current_concurrent_cut_coordinate =
8128 Kokkos::subview(current_cut_coordinates,
8129 std::pair<mj_lno_t, mj_lno_t>(
8130 cut_shift,
8131 current_cut_coordinates.size()));
8132 Kokkos::View<mj_scalar_t *, device_t>
8133 used_local_cut_line_weight_to_left =
8134 Kokkos::subview(process_cut_line_weight_to_put_left,
8135 std::pair<mj_lno_t, mj_lno_t>(
8136 cut_shift,
8137 process_cut_line_weight_to_put_left.size()));
8138
8139 this->thread_part_weight_work =
8140 Kokkos::subview(
8141 this->thread_part_weights,
8142 std::pair<mj_lno_t, mj_lno_t>(
8143 partweight_array_shift,
8144 this->thread_part_weights.extent(0)));
8145
8146 if(num_parts > 1) {
8147 if(this->mj_keep_part_boxes) {
8148 // if part boxes are to be stored update the boundaries.
8149 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8150 mj_scalar_t temp_get_val;
8151 Kokkos::parallel_reduce("Read single",
8152 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8153 KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8154 set_single = current_concurrent_cut_coordinate(j);
8155 }, temp_get_val);
8156 (*output_part_boxes)
8157 [output_array_shift + output_part_index + j].
8158 updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8159 (*output_part_boxes)
8160 [output_array_shift + output_part_index + j + 1].
8161 updateMinMax(temp_get_val, 0 /*update max*/, coordInd);
8162 }
8163 }
8164
8165 // Rewrite the indices based on the computed cuts.
8166 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8167 Kokkos::subview(this->new_part_xadj,
8168 std::pair<mj_lno_t, mj_lno_t>(
8169 output_part_index + output_array_shift,
8170 this->new_part_xadj.size()));
8171
8172 this->mj_create_new_partitions(
8173 num_parts,
8174 current_concurrent_work_part,
8175 mj_current_dim_coords,
8176 current_concurrent_cut_coordinate,
8177 used_local_cut_line_weight_to_left,
8178 sub_new_part_xadj);
8179 }
8180 else {
8181
8182 mj_lno_t coordinate_end = host_part_xadj(
8183 current_concurrent_work_part);
8184 mj_lno_t coordinate_begin =
8185 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8186 current_concurrent_work_part - 1);
8187
8188 // if this part is partitioned into 1 then just copy
8189 // the old values.
8190 mj_lno_t part_size = coordinate_end - coordinate_begin;
8191
8192 // Awkward here to set one value - need some broader
8193 // refactoring to improve this one.
8194 auto local_new_part_xadj = this->new_part_xadj;
8195 Kokkos::parallel_for(
8196 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8197 (0, 1), KOKKOS_LAMBDA (int dummy) {
8198 local_new_part_xadj(
8199 output_part_index + output_array_shift) = part_size;
8200 });
8201
8202 auto subview_new_coordinate_permutations =
8203 Kokkos::subview(this->new_coordinate_permutations,
8204 std::pair<mj_lno_t, mj_lno_t>(
8205 coordinate_begin,
8206 coordinate_begin + part_size));
8207 auto subview_coordinate_permutations =
8208 Kokkos::subview(this->coordinate_permutations,
8209 std::pair<mj_lno_t, mj_lno_t>(
8210 coordinate_begin,
8211 coordinate_begin + part_size));
8212 Kokkos::deep_copy(subview_new_coordinate_permutations,
8213 subview_coordinate_permutations);
8214 }
8215 cut_shift += num_parts - 1;
8216 output_array_shift += num_parts;
8217 partweight_array_shift += (2 * (num_parts - 1) + 1);
8218 }
8219
8220 // shift cut coordinates so that all cut coordinates are stored.
8221 // no shift now because we dont keep the cuts.
8222 // current_cut_coordinates += cut_shift;
8223 // mj_create_new_partitions from coordinates partitioned the parts
8224 // and write the indices as if there were a single part.
8225 // now we need to shift the beginning indices.
8226 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8227 mj_part_t num_parts =
8228 host_num_partitioning_in_current_dim(current_work_part + kk);
8229
8230 // These two kernels are a bit awkward but need broader redesign to
8231 // avoid this situation.
8232 auto local_new_part_xadj = this->new_part_xadj;
8233 Kokkos::parallel_for(
8234 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8235 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8236 local_new_part_xadj(output_part_index+ii) +=
8237 output_coordinate_end_index;
8238 });
8239
8240 // increase the previous count by current end.
8241 mj_part_t temp_get;
8242 Kokkos::parallel_reduce("Read single",
8243 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8244 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8245 set_single =
8246 local_new_part_xadj(output_part_index + num_parts - 1);
8247 }, temp_get);
8248 output_coordinate_end_index = temp_get;
8249 //increase the current out.
8250 output_part_index += num_parts;
8251 }
8252 }
8253 }
8254
8255 // end of this partitioning dimension
8256 int current_world_size = this->comm->getSize();
8257 long migration_reduce_all_population =
8258 this->total_dim_num_reduce_all * current_world_size;
8259 bool is_migrated_in_current_dimension = false;
8260
8261 // we migrate if there are more partitionings to be done after this step
8262 // and if the migration is not forced to be avoided.
8263 // and the operation is not sequential.
8264 if(future_num_parts > 1 &&
8265 this->check_migrate_avoid_migration_option >= 0 &&
8266 current_world_size > 1) {
8267 this->mj_env->timerStart(MACRO_TIMERS,
8268 mj_timer_base_string + "Problem_Migration-" + istring);
8269 mj_part_t num_parts = output_part_count_in_dimension;
8270
8271 if(this->mj_perform_migration(
8272 num_parts,
8273 current_num_parts, //output
8274 next_future_num_parts_in_parts, //output
8275 output_part_begin_index,
8276 migration_reduce_all_population,
8277 this->num_global_coords / (future_num_parts * current_num_parts),
8278 istring,
8279 input_part_boxes, output_part_boxes) )
8280 {
8281 is_migrated_in_current_dimension = true;
8282 is_data_ever_migrated = true;
8283 this->mj_env->timerStop(MACRO_TIMERS,
8284 mj_timer_base_string + "Problem_Migration-" + istring);
8285 // since data is migrated, we reduce the number of reduceAll
8286 // operations for the last part.
8287 this->total_dim_num_reduce_all /= num_parts;
8288 }
8289 else {
8290 is_migrated_in_current_dimension = false;
8291 this->mj_env->timerStop(MACRO_TIMERS,
8292 mj_timer_base_string + "Problem_Migration-" + istring);
8293 }
8294 }
8295
8296 // swap the coordinate permutations for the next dimension.
8297 Kokkos::View<mj_lno_t*, device_t> tmp =
8298 this->coordinate_permutations;
8299 this->coordinate_permutations =
8300 this->new_coordinate_permutations;
8301
8302 this->new_coordinate_permutations = tmp;
8303 if(!is_migrated_in_current_dimension) {
8304 this->total_dim_num_reduce_all -= current_num_parts;
8305 current_num_parts = output_part_count_in_dimension;
8306 }
8307
8308 {
8309 this->part_xadj = this->new_part_xadj;
8310 local_part_xadj = this->new_part_xadj;
8311 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8312 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8313
8314 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8315 this->mj_env->timerStop(MACRO_TIMERS,
8316 mj_timer_base_string + "Problem_Partitioning_" + istring);
8317 }
8318 }
8319
8320 // Partitioning is done
8321 delete future_num_part_in_parts;
8322 delete next_future_num_parts_in_parts;
8323 this->mj_env->timerStop(MACRO_TIMERS,
8324 mj_timer_base_string + "Problem_Partitioning");
8326
8327 //get the final parts of each initial coordinate
8328 //the results will be written to
8329 //this->assigned_part_ids for gnos given in this->current_mj_gnos
8330 this->set_final_parts(
8331 current_num_parts,
8332 output_part_begin_index,
8333 output_part_boxes,
8334 is_data_ever_migrated);
8335
8336 result_assigned_part_ids_ = this->assigned_part_ids;
8337 result_mj_gnos_ = this->current_mj_gnos;
8338 this->mj_env->timerStop(MACRO_TIMERS,
8339 mj_timer_base_string + "Total");
8340 this->mj_env->debug(3, "Out of MultiJagged");
8341}
8342
8343template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8344 typename mj_part_t, typename mj_node_t>
8345RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8346 mj_partBoxVector_t>
8348 get_kept_boxes() const
8349{
8350 if(this->mj_keep_part_boxes) {
8351 return this->kept_boxes;
8352 }
8353 else {
8354 throw std::logic_error("Error: part boxes are not stored.");
8355 }
8356}
8357
// Merge the per-rank part bounding boxes into one global box per part.
// Each rank scatters the extents of the boxes it knows into a flat
// buffer laid out as [ntasks*dim mins | ntasks*dim maxs]; a custom
// all-reduce merges the buffers across ranks; the merged extents are
// repackaged into a new vector of part boxes that is returned.
// NOTE(review): this doxygen listing drops original lines 8362 (the
// "AlgMJ<...>::" qualifier of this member definition) and 8410 (the
// "mj_partBox_t tpb(i, dim," constructor line in the loop below) --
// compare against the repository source before editing this function.
8358template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8359 typename mj_part_t, typename mj_node_t>
8360RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8361 mj_partBoxVector_t>
8363 compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8364{
8365 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8366 mj_part_t ntasks = this->num_global_parts;
8367 int dim = (*localPartBoxes)[0].getDim();
// Flat send buffer: mins for all parts first, then maxs for all parts.
8368 coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8369
8370 memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8371
8372 coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8373 memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8374
8375 coord_t *localPartMins = localPartBoundaries;
8376 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8377
8378 coord_t *globalPartMins = globalPartBoundaries;
8379 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8380
// Scatter each locally-known box's extents into the slots of its part id.
8381 mj_part_t boxCount = localPartBoxes->size();
8382 for(mj_part_t i = 0; i < boxCount; ++i) {
8383 mj_part_t pId = (*localPartBoxes)[i].getpId();
8384
8385 // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8386
8387 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8388 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8389
8390 for(int j = 0; j < dim; ++j) {
8391 localPartMins[dim * pId + j] = lmins[j];
8392 localPartMaxs[dim * pId + j] = lmaxs[j];
8393
8394 /*
8395 std::cout << "me:" << comm->getRank() <<
8396 " dim * pId + j:"<< dim * pId + j <<
8397 " localMin:" << localPartMins[dim * pId + j] <<
8398 " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8399 */
8400 }
8401 }
8402
// Custom reduction op that merges the box-boundary buffers across ranks
// (presumably min-merging the mins half and max-merging the maxs half --
// see Teuchos::Zoltan2_BoxBoundaries for the exact semantics).
8403 Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8404
8405 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8406 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8407
// Rebuild one box per global part from the reduced extents.
8408 RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8409 for(mj_part_t i = 0; i < ntasks; ++i) {
8411 globalPartMins + dim * i,
8412 globalPartMaxs + dim * i);
8413
8414 /*
8415 for(int j = 0; j < dim; ++j) {
8416 std::cout << "me:" << comm->getRank() <<
8417 " dim * pId + j:"<< dim * i + j <<
8418 " globalMin:" << globalPartMins[dim * i + j] <<
8419 " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8420 }
8421 */
8422
8423 pB->push_back(tpb);
8424 }
8425 delete []localPartBoundaries;
8426 delete []globalPartBoundaries;
8427 //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8428 return pB;
8429}
8430
// Zoltan2 front-end for the multi-jagged (MJ) coordinate partitioner.
// Adapts a CoordinateModel to the core AlgMJ engine: extracts coordinates,
// weights and global ids into Kokkos views, reads MJ parameters from the
// environment's ParameterList, optionally premigrates the coordinates to a
// rank subset, runs the partitioner, and writes parts into the solution.
8433template <typename Adapter>
8434class Zoltan2_AlgMJ : public Algorithm<Adapter>
8435{
8436
8437private:
8438
8439#ifndef DOXYGEN_SHOULD_SKIP_THIS
8440 typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
8441
8442 // For coordinates and weights, MJ needs floats or doubles
8443 // But Adapter can provide other scalars, e.g., ints.
8444 // So have separate scalar_t for MJ and adapter.
8445 typedef typename Adapter::scalar_t adapter_scalar_t;
8446
8447 // Provide a default type for mj_scalar_t;
8448 typedef float default_mj_scalar_t;
8449
8450 // If Adapter provided float or double scalar_t, use it (prevents copies).
8451 // Otherwise, use the default type of mj_scalar_t;
8452 typedef typename
8453 std::conditional<
8454 (std::is_same<adapter_scalar_t, float>::value ||
8455 std::is_same<adapter_scalar_t, double>::value),
8456 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8457
8458 typedef typename Adapter::gno_t mj_gno_t;
8459 typedef typename Adapter::lno_t mj_lno_t;
8460 typedef typename Adapter::part_t mj_part_t;
8461 typedef typename Adapter::node_t mj_node_t;
8462 typedef coordinateModelPartBox mj_partBox_t;
8463 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8464 typedef typename mj_node_t::device_type device_t;
8465#endif
8466
// NOTE(review): this doxygen listing drops original line 8467 here --
// the declaration of the core engine member "mj_partitioner" (an
// AlgMJ<...> instance), which the constructor below initializes and
// partition() invokes.  Compare against the repository source.
8468
8469 RCP<const Environment> mj_env; // the environment object
8470 RCP<const Comm<int> > mj_problemComm; // initial comm object
8471 RCP<const coordinateModel_t> mj_coords; // coordinate adapter
8472
8473 // PARAMETERS
8474 double imbalance_tolerance; // input imbalance tolerance.
8475
8476 int num_teams; // how many teams to run main loop with
8477
8478 size_t num_global_parts; // the targeted number of parts
8479
8480 // input part array specifying num part to divide along each dim.
8481 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8482
8483 // the number of steps that partitioning will be solved in.
8484 int recursion_depth;
8485
8486 int coord_dim; // coordinate dimension.
8487 mj_lno_t num_local_coords; //number of local coords.
8488 mj_gno_t num_global_coords; //number of global coords.
8489
8490 // initial global ids of the coordinates.
8491 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8492
8493 // two dimension coordinate array.
8494 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8495 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8496 mj_coordinates;
8497
8498 int num_weights_per_coord; // number of weights per coordinate
8499
8500 // per-criteria flags: true when the weights for that criteria are uniform
8501 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8502
8503 // two dimensional weight array.
8504 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8505
8506 // if the target parts are uniform
8507 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8508
8509 // Nonuniform first level partitioning
8510 // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8511 // machine coordinates and application coordinates.
8512 // An optimization that completely partitions the most important machine
8513 // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8514 // coordinate). The standard MJ alg follows after the nonuniform first level
8515 // partitioning.
8516 // If used, number of parts for the first level partitioning
8517 mj_part_t num_first_level_parts;
8518
8519 // If used, the distribution of parts for the nonuniform
8520 // first level partitioning
8521 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8522
8523 // if partitioning can distribute points on same coordinate to
8524 // different parts.
8525 bool distribute_points_on_cut_lines;
8526
8527 // how many parts we can calculate concurrently.
8528 mj_part_t max_concurrent_part_calculation;
8529
8530 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8531 int check_migrate_avoid_migration_option;
8532
8533 // when doing the migration, 0 will aim for perfect load-imbalance,
8534 int migration_type;
8535
8536 // 1 for minimized messages
8537
8538 // when MJ decides whether to migrate, the minimum imbalance for migration.
8539 double minimum_migration_imbalance;
8540 bool mj_keep_part_boxes; //if the boxes need to be kept.
8541
8542 // if this is set, then recursion depth is adjusted to its maximum value.
8543 bool mj_run_as_rcb;
8544 int mj_premigration_option;
8545 int min_coord_per_rank_for_premigration;
8546
8547 // communication graph xadj
8548 ArrayRCP<mj_part_t> comXAdj_;
8549
8550 // communication graph adj.
8551 ArrayRCP<mj_part_t> comAdj_;
8552
8553 void copy(
8554 const RCP<PartitioningSolution<Adapter> >&solution);
8555
// Reads MJ parameters (tolerance, migration, premigration, ...) from pl.
8556 void set_input_parameters(const Teuchos::ParameterList &p);
8557
8558 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8559
// Gathers the coordinates onto a consecutive subset of ranks before
// partitioning; returns true when the calling rank is in the subset.
8560 bool mj_premigrate_to_subset(
8561 int used_num_ranks,
8562 int migration_selection_option,
8563 RCP<const Environment> mj_env_,
8564 RCP<const Comm<int> > mj_problemComm_,
8565 int coord_dim_,
8566 mj_lno_t num_local_coords_,
8567 mj_gno_t num_global_coords_, size_t num_global_parts_,
8568 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8569 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8570 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8571 mj_coordinates_,
8572 int num_weights_per_coord_,
8573 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8574 //results
8575 RCP<const Comm<int> > &result_problemComm_,
8576 mj_lno_t & result_num_local_coords_,
8577 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8578 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8579 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8580 result_mj_coordinates_,
8581 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8582 int * &result_actual_owner_rank_);
8583
8584public:
8585
8586 Zoltan2_AlgMJ(const RCP<const Environment> &env,
8587 RCP<const Comm<int> > &problemComm,
8588 const RCP<const coordinateModel_t> &coords) :
8589 mj_partitioner(),
8590 mj_env(env),
8591 mj_problemComm(problemComm),
8592 mj_coords(coords),
8593 imbalance_tolerance(0),
8594 num_teams(0),
8595 num_global_parts(1),
8596 recursion_depth(0),
8597 coord_dim(0),
8598 num_local_coords(0),
8599 num_global_coords(0),
8600 num_weights_per_coord(0),
8601 num_first_level_parts(1),
8602 distribute_points_on_cut_lines(true),
8603 max_concurrent_part_calculation(1),
8604 check_migrate_avoid_migration_option(0),
8605 migration_type(0),
8606 minimum_migration_imbalance(0.30),
8607 mj_keep_part_boxes(false),
8608 mj_run_as_rcb(false),
8609 mj_premigration_option(0),
8610 min_coord_per_rank_for_premigration(32000),
8611 comXAdj_(),
8612 comAdj_()
8613 {
8614 }
8615
// NOTE(review): the destructor's signature line (original line 8616,
// "~Zoltan2_AlgMJ()") is missing from this listing; only its empty
// body survives below.
8617 {
8618 }
8619
// Registers MJ's parameters and validators in pl (static; no instance state).
8622 static void getValidParameters(ParameterList & pl)
8623 {
8624 const bool bUnsorted = true; // this clarifies the flag is for unsorted
8625 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8626 Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8627 pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8628 "algorithm. As many as the dimension count.", mj_parts_Validator);
8629
8630 pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8631 "coordinates will be calculated concurently.",
// NOTE(review): the validator argument of this pl.set call (original
// line 8632) is missing from this listing.
8633
8634 pl.set("mj_minimum_migration_imbalance", 1.1,
8635 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8636 "processors to avoid migration",
// NOTE(review): the validator argument of this pl.set call (original
// line 8637) is missing from this listing.
8638
8639 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8640 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8641 pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8642 "depending on the imbalance, 1 for forcing migration, 2 for "
8643 "avoiding migration", mj_migration_option_validator);
8644
8645 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8646 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8647 pl.set("mj_migration_type", 0,
8648 "Migration type, 0 for migration to minimize the imbalance "
8649 "1 for migration to minimize messages exchanged the migration.",
// NOTE(review): this passes mj_migration_option_validator (range 0..2)
// instead of the mj_migration_type_validator (range 0..1) constructed
// just above, which is then never used -- appears to be a typo.
8650 mj_migration_option_validator);
8651
8652 // bool parameter
8653 pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8654 "geometric partitioning.", Environment::getBoolValidator());
8655
8656 // bool parameter
8657 pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
// NOTE(review): the validator argument of this pl.set call (original
// line 8658) is missing from this listing.
8659
8660 pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8661 "greater than 0.", Environment::getAnyIntValidator());
8662
8663 RCP<Teuchos::EnhancedNumberValidator<int>>
8664 mj_num_teams_validator =
8665 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8666 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8667 pl.set("mj_num_teams", 0,
8668 "How many teams for the main kernel loop"
8669 , mj_num_teams_validator);
8670
8671 RCP<Teuchos::EnhancedNumberValidator<int>>
8672 mj_premigration_option_validator =
8673 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8674
8675 pl.set("mj_premigration_option", 0,
8676 "Whether to do premigration or not. 0 for no migration "
8677 "x > 0 for migration to consecutive processors, "
8678 "the subset will be 0,x,2x,3x,...subset ranks."
8679 , mj_premigration_option_validator);
8680
8681 pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8682 "assign each rank in multijagged after premigration"
// NOTE(review): the validator argument and closing of this pl.set call
// (original line 8683) is missing from this listing.
8684 }
8685
8691 void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8692
8693 mj_partBoxVector_t &getPartBoxesView() const
8694 {
8695 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8696 return *pBoxes;
8697 }
8698
8699 mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8700
8701 void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8702 size_t &nPartsFound, mj_part_t **partsFound) const;
8703
8706 void getCommunicationGraph(
8707 const PartitioningSolution<Adapter> *solution,
8708 ArrayRCP<mj_part_t> &comXAdj,
8709 ArrayRCP<mj_part_t> &comAdj);
8710
8711 void set_up_partitioning_data( // public for CUDA
8712 const RCP<PartitioningSolution<Adapter> >&solution);
8713
8714 private:
8715 std::string timer_base_string; // used for making timers
8716
8717 // After loading views from coordinate adapter we may need to copy them
8718 // if mj type is different, but otherwise we just want to assign the view.
8719 // So purpose of this code is to make that assign only happen when the types
8720 // match. The empty case would otherwise not compile.
8721 // If they don't match the internal code handles allocating the new view
8722 // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8723 template<class dst_t, class src_t> // version for same types
8724 typename std::enable_if<std::is_same<typename dst_t::value_type,
8725 typename src_t::value_type>::value>::type
8726 assign_if_same(dst_t & dst, const src_t & src) {
8727 dst = src;
8728 }
8729 template<class dst_t, class src_t> // version for different types
8730 typename std::enable_if<!std::is_same<typename dst_t::value_type,
8731 typename src_t::value_type>::value>::type
8732 assign_if_same(dst_t & dst, const src_t & src) {
8733 // do nothing - handled manually
8734 }
8735};
8736
// Gathers all coordinates, weights and gnos onto a consecutive subset of
// used_num_ranks ranks (group leaders) before partitioning.  Ranks are
// split into used_num_ranks contiguous groups; every rank sends its data
// to its group's first rank via a Tpetra::Distributor.  Outputs are the
// sub-communicator over the group leaders, the received data views, and
// the original owner rank of each received coordinate (needed to send the
// computed part ids back after partitioning).  Returns true iff the
// calling rank is one of the receiving group leaders.
8737template <typename Adapter>
8738bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8739 int used_num_ranks,
8740 int migration_selection_option,
8741 RCP<const Environment> mj_env_,
8742 RCP<const Comm<int> > mj_problemComm_,
8743 int coord_dim_,
8744 mj_lno_t num_local_coords_,
8745 mj_gno_t num_global_coords_, size_t num_global_parts_,
8746 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8747 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8748 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8749 int num_weights_per_coord_,
8750 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8751 //results
8752 RCP<const Comm<int> > & result_problemComm_,
8753 mj_lno_t &result_num_local_coords_,
8754 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8755 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8756 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8757 result_mj_coordinates_,
8758 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8759 int * &result_actual_owner_rank_)
8760{
8761 mj_env_->timerStart(MACRO_TIMERS,
8762 timer_base_string + "PreMigration DistributorPlanCreating");
8763
8764 int myRank = mj_problemComm_->getRank();
8765 int worldSize = mj_problemComm_->getSize();
8766
// Base group size; the first (worldSize % used_num_ranks) groups get one
// extra rank so that the groups exactly cover all ranks.
8767 mj_part_t groupsize = worldSize / used_num_ranks;
8768
8769 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8770
8771 mj_part_t i_am_sending_to = 0;
8772 bool am_i_a_receiver = false;
8773
8774 for(int i = 0; i < used_num_ranks; ++i) {
8775 group_begins[i+ 1] = group_begins[i] + groupsize;
8776 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
// NOTE(review): dead branch -- the loop condition guarantees
// i < used_num_ranks, so this is never taken.  It is harmless because
// the size/remainder arithmetic above already makes the final
// group_begins entry equal worldSize; presumably the intent was
// i == used_num_ranks - 1 as a belt-and-suspenders guard.
8777 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8778 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8779 i_am_sending_to = group_begins[i];
8780 }
8781 if(myRank == group_begins[i]) {
8782 am_i_a_receiver = true;
8783 }
8784 }
8785
// Sub-communicator containing only the group-leader ranks.
8786 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8787 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8788
8789 Tpetra::Distributor distributor(mj_problemComm_);
8790
// Every local coordinate is destined for this rank's group leader.
8791 std::vector<mj_part_t>
8792 coordinate_destinations(num_local_coords_, i_am_sending_to);
8793
8794 ArrayView<const mj_part_t>
8795 destinations(&(coordinate_destinations[0]), num_local_coords_);
8796 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8797 result_num_local_coords_ = num_incoming_gnos;
8798 mj_env_->timerStop(MACRO_TIMERS,
8799 timer_base_string + "PreMigration DistributorPlanCreating");
8800
8801 mj_env_->timerStart(MACRO_TIMERS,
8802 timer_base_string + "PreMigration DistributorMigration");
8803
8804 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8805
8806 // migrate gnos.
8807 {
8808 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
8809 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_initial_mj_gnos(
8810 Kokkos::ViewAllocateWithoutInitializing("host_initial_mj_gnos"),
8811 initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8812 Kokkos::deep_copy(host_initial_mj_gnos, initial_mj_gnos_);
8813 ArrayView<const mj_gno_t> sent_gnos(host_initial_mj_gnos.data(),
8814 num_local_coords_);
8815 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
8816 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8817 Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8818 num_incoming_gnos);
8819 auto host_result_initial_mj_gnos_ = Kokkos::create_mirror_view(
8820 result_initial_mj_gnos_);
8821 memcpy(host_result_initial_mj_gnos_.data(),
8822 received_gnos.getRawPtr(), num_incoming_gnos * sizeof(mj_gno_t));
8823 Kokkos::deep_copy(result_initial_mj_gnos_, host_result_initial_mj_gnos_);
8824 }
8825
8826 // migrate coordinates
8827 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8828 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8829 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8830 num_incoming_gnos, this->coord_dim);
8831 auto host_dst_coordinates = Kokkos::create_mirror_view(
8832 dst_coordinates);
8833 auto host_src_coordinates =
8834 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->mj_coordinates);
8835 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
// One doPostsAndWaits per dimension: LayoutLeft makes each dimension's
// column a contiguous block on the host mirror.
8836 for(int i = 0; i < this->coord_dim; ++i) {
8837 auto sub_host_src_coordinates
8838 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8839 auto sub_host_dst_coordinates
8840 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
8841 // Note Layout Left means we can do these in contiguous blocks
8842 ArrayView<mj_scalar_t> sent_coord(
8843 sub_host_src_coordinates.data(), this->num_local_coords);
8844 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
8845 distributor.doPostsAndWaits<mj_scalar_t>(
8846 sent_coord, 1, received_coord());
8847 memcpy(sub_host_dst_coordinates.data(),
8848 received_coord.getRawPtr(), num_incoming_gnos * sizeof(mj_scalar_t));
8849 }
8850 deep_copy(dst_coordinates, host_dst_coordinates);
8851 result_mj_coordinates_ = dst_coordinates;
8852
8853 // migrate weights.
8854 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8855 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8856 num_incoming_gnos, this->num_weights_per_coord);
8857 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8858 auto host_src_weights = Kokkos::create_mirror_view(this->mj_weights);
8859 Kokkos::deep_copy(host_src_weights, this->mj_weights);
8860 for(int i = 0; i < this->num_weights_per_coord; ++i) {
8861 auto sub_host_src_weights
8862 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8863 auto sub_host_dst_weights
8864 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8865 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
8866
8867 // Layout Right means these are not contiguous
8868 // However we don't have any systems setup with more than 1 weight so
8869 // really I have not tested any of this code with num weights > 1.
8870 // I think this is the right thing to do. Note that there are other
8871 // places in the code which don't handle the possibility of more weights.
8872 // So evaluating all that and adding tests would be another project.
8873 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8874 sent_weight[n] = sub_host_src_weights(n);
8875 }
8876 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
8877 distributor.doPostsAndWaits<mj_scalar_t>(
8878 sent_weight(), 1, received_weight());
8879
8880 // Again we copy by index due to layout
8881 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8882 sub_host_dst_weights(n) = received_weight[n];
8883 }
8884 }
8885 Kokkos::deep_copy(dst_weights, host_dst_weights);
8886 result_mj_weights_ = dst_weights;
8887
8888 // migrate the owners of the coordinates
// Each received coordinate remembers its original rank so the computed
// part ids can be routed back after partitioning (see partition()).
8889 {
8890 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
8891 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
8892 ArrayRCP<int> received_owners(num_incoming_gnos);
8893 distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
8894 result_actual_owner_rank_ = new int[num_incoming_gnos];
8895 memcpy(
8896 result_actual_owner_rank_,
8897 received_owners.getRawPtr(),
8898 num_incoming_gnos * sizeof(int));
8899 }
8900
8901 mj_env_->timerStop(MACRO_TIMERS,
8902 timer_base_string + "PreMigration DistributorMigration");
8903 return am_i_a_receiver;
8904}
8905
// Entry point: partitions the coordinates and writes part ids into
// `solution`.  Steps: (1) extract data and parameters, (2) optionally
// premigrate coordinates onto a rank subset when ranks greatly outnumber
// coordinates, (3) run the core MJ partitioner, (4) reorder results back
// to input order (and back to the original owners if premigrated), then
// call solution->setParts.
// NOTE(review): this doxygen listing drops original lines 8906-8914 --
// the doxygen comment and the signature line
// "void Zoltan2_AlgMJ<Adapter>::partition(" -- as well as line 8923,
// the right-hand side of the execute_counter assignment below (a call
// into a call-tracking counter helper).  Compare against the
// repository source before editing.
8913template <typename Adapter>
8915 const RCP<PartitioningSolution<Adapter> > &solution)
8916{
8917 // purpose of this code is to validate node and UVM status for the tests
8918 // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8919 // << "Execution Space: " << mj_node_t::execution_space::name()
8920 // << std::endl;
8921
// Distinguishes timers when partition() is invoked multiple times.
8922 int execute_counter =
8924 timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8925
8926 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8927 {
8928 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8929
8930 this->set_up_partitioning_data(solution);
8931
8932 this->set_input_parameters(this->mj_env->getParameters());
8933 if(this->mj_keep_part_boxes) {
8934 this->mj_partitioner.set_to_keep_part_boxes();
8935 }
8936
8937 this->mj_partitioner.set_partitioning_parameters(
8938 this->distribute_points_on_cut_lines,
8939 this->max_concurrent_part_calculation,
8940 this->check_migrate_avoid_migration_option,
8941 this->minimum_migration_imbalance, this->migration_type);
8942
// Defaults assume no premigration; overwritten below when premigration runs.
8943 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8944 mj_lno_t result_num_local_coords = this->num_local_coords;
8945 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8946 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8947 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8948 result_mj_coordinates = this->mj_coordinates;
8949 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8950 this->mj_weights;
8951 int *result_actual_owner_rank = NULL;
8952
8953 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8954 this->initial_mj_gnos;
8955
8956 // TODO: MD 08/2017: Further discussion is required.
8957 // MueLu calls MJ when it has very few coordinates per processors,
8958 // such as 10. For example, it begins with 1K processor with 1K coordinate
8959 // in each. Then with coarsening this reduces to 10 coordinate per procesor.
8960 // It calls MJ to repartition these to 10 coordinates.
8961 // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8962 // 10 parts. As expected strong scaling is problem here, because
8963 // computation is almost 0, and communication cost of MJ linearly increases.
8964 // Premigration option gathers the coordinates to 10 parts before MJ starts
8965 // therefore MJ will run with a smalller subset of the problem.
8966 // Below, I am migrating the coordinates if mj_premigration_option is set,
8967 // and the result parts are less than the current part count, and the
8968 // average number of local coordinates is less than some threshold.
8969 // For example, premigration may not help if 1000 processors are
8970 // partitioning data to 10, but each of them already have 1M coordinate.
8971 // In that case, we premigration would not help.
8972 int current_world_size = this->mj_problemComm->getSize();
8973 mj_lno_t threshold_num_local_coords =
8974 this->min_coord_per_rank_for_premigration;
8975 bool is_pre_migrated = false;
8976 bool am_i_in_subset = true;
8977
8978 // Note that we need to add testing for migration and should also cover the
8979 // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8980 // Currently did a minimal test of this code by running mjTest with
8981 // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8982 if(mj_premigration_option > 0 &&
8983 size_t (current_world_size) > this->num_global_parts &&
8984 this->num_global_coords < mj_gno_t (
8985 current_world_size * threshold_num_local_coords))
8986 {
8987 if(this->mj_keep_part_boxes) {
8988 throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8989 "mj_premigration_option are not supported together yet.")
// NOTE(review): the statement above is split across lines 8988-8990 in
// the listing; the original carries the terminating ");" -- the lines
// below are reproduced verbatim.
8990 "mj_premigration_option are not supported together yet.");
8991 }
8992
8993 is_pre_migrated =true;
8994 int migration_selection_option = mj_premigration_option;
8995 if(migration_selection_option * this->num_global_parts >
8996 (size_t) (current_world_size)) {
8997 migration_selection_option =
8998 current_world_size / this->num_global_parts;
8999 }
9000
// Number of ranks that should hold data so the average per-rank
// coordinate count is about threshold_num_local_coords.
9001 int used_num_ranks = int (this->num_global_coords /
9002 float (threshold_num_local_coords) + 0.5);
9003
9004 if(used_num_ranks == 0) {
9005 used_num_ranks = 1;
9006 }
9007
9008 am_i_in_subset = this->mj_premigrate_to_subset(
9009 used_num_ranks,
9010 migration_selection_option,
9011 this->mj_env,
9012 this->mj_problemComm,
9013 this->coord_dim,
9014 this->num_local_coords,
9015 this->num_global_coords,
9016 this->num_global_parts,
9017 this->initial_mj_gnos,
9018 this->mj_coordinates,
9019 this->num_weights_per_coord,
9020 this->mj_weights,
9021 //results
9022 result_problemComm,
9023 result_num_local_coords,
9024 result_initial_mj_gnos,
9025 result_mj_coordinates,
9026 result_mj_weights,
9027 result_actual_owner_rank);
9028
9029 result_initial_mj_gnos_ = result_initial_mj_gnos;
9030 }
9031
9032 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9033 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9034
9035 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9036
// Core MJ run; ranks outside the premigration subset hold no data and skip.
9037 if(am_i_in_subset) {
9038 this->mj_partitioner.multi_jagged_part(
9039 this->mj_env,
9040 result_problemComm, //this->mj_problemComm,
9041 this->imbalance_tolerance,
9042 this->num_teams,
9043 this->num_global_parts,
9044 this->part_no_array,
9045 this->recursion_depth,
9046 this->coord_dim,
9047 result_num_local_coords, //this->num_local_coords,
9048 this->num_global_coords,
9049 result_initial_mj_gnos_,
9050 result_mj_coordinates,
9051 this->num_weights_per_coord,
9052 this->mj_uniform_weights,
9053 result_mj_weights,
9054 this->mj_uniform_parts,
9055 result_assigned_part_ids,
9056 result_mj_gnos
9057 );
9058 }
9059
9060 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9061
9062 // Reorder results so that they match the order of the input
9063 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9064 localGidToLid.reserve(result_num_local_coords);
9065 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9066 Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9067 result_initial_mj_gnos_.size());
9068 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9069 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9070 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9071 }
9072
9073 ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9074 0, result_num_local_coords, true);
9075 auto host_result_assigned_part_ids =
9076 Kokkos::create_mirror_view(result_assigned_part_ids);
9077 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9078 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9079 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9080 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9081 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9082 partId[origLID] = host_result_assigned_part_ids(i);
9083 }
9084
9085 //now the results are reordered. but if premigration occured,
9086 //then we need to send these ids to actual owners again.
9087 if(is_pre_migrated) {
9088 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9089 "PostMigration DistributorPlanCreating");
9090 Tpetra::Distributor distributor(this->mj_problemComm);
9091 ArrayView<const mj_part_t> actual_owner_destinations(
9092 result_actual_owner_rank , result_num_local_coords);
9093 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9094 actual_owner_destinations);
9095 if(num_incoming_gnos != this->num_local_coords) {
9096 throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9097 "num incoming is not equal to num local coords");
9098 }
9099
9100 mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9101 "PostMigration DistributorPlanCreating");
9102 mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9103 "PostMigration DistributorMigration");
9104 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
9105 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
9106 {
9107 ArrayView<const mj_gno_t> sent_gnos(host_result_initial_mj_gnos.data(),
9108 result_num_local_coords);
9109 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
9110 }
9111
9112 {
9113 ArrayView<mj_part_t> sent_partnos(partId());
9114 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1,
9115 received_partids());
9116 }
9117
// Rebuild partId in the ORIGINAL local ordering on the original owners.
9118 partId = arcp(new mj_part_t[this->num_local_coords],
9119 0, this->num_local_coords, true);
9120
9121 {
9122 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9123 localGidToLid2.reserve(this->num_local_coords);
9124 auto host_initial_mj_gnos =
9125 Kokkos::create_mirror_view(this->initial_mj_gnos);
9126 Kokkos::deep_copy(host_initial_mj_gnos,
9127 this->initial_mj_gnos);
9128 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9129 localGidToLid2[host_initial_mj_gnos(i)] = i;
9130 }
9131
9132 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9133 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9134 partId[origLID] = received_partids[i];
9135 }
9136 }
9137
9138 {
9139 delete [] result_actual_owner_rank;
9140 }
9141 mj_env->timerStop(MACRO_TIMERS,
9142 timer_base_string + "PostMigration DistributorMigration");
9143 }
9144 solution->setParts(partId);
9145 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9146 }
9147
9148 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9149}
9149
/* \brief Sets the partitioning data for multijagged algorithm.
 * */
// Pulls coordinates, weights and global ids out of the coordinate model
// into this object's Kokkos views, copying element-by-element only when
// the adapter scalar type differs from mj_scalar_t.  Also initializes the
// per-criteria uniform-weight / uniform-part flags from the solution.
// NOTE(review): this doxygen listing drops original line 9153 -- the
// signature line "void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data("
// -- compare against the repository source before editing.
9152template <typename Adapter>
9154 const RCP<PartitioningSolution<Adapter> > &solution
9155)
9156{
9157 this->coord_dim = this->mj_coords->getCoordinateDim();
9158 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
9159 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
9160 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
// At least one criteria slot even when there are no weights.
9161 int criteria_dim = (this->num_weights_per_coord ?
9162 this->num_weights_per_coord : 1);
9163 // From the Solution we get part information.
9164 // If the part sizes for a given criteria are not uniform,
9165 // then they are values that sum to 1.0.
9166 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9167 // allocate only two dimensional pointer.
9168 // raw pointer addresess will be obtained from multivector.
9169 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9170 "uniform parts", criteria_dim);
9171 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9172 "uniform weights", criteria_dim);
9173
9174 Kokkos::View<const mj_gno_t *, device_t> gnos;
9175 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9176 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9177 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9178 this->mj_coords->getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9179 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9180 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9181 Kokkos::View<mj_scalar_t **, device_t> wgts;
9182
9183 // Now we must get the data from the adapter.
9184 // If the types match we point to the view but if not, we must copy.
9185 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9186 // we can just point the views but we must specialize because this code
9187 // only compiles in this case - for is_same false assign does nothing.
9188 assign_if_same(xyz, xyz_adapter);
9189 assign_if_same(wgts, wgts_adapter);
9190 }
9191 else {
9192 // we only allocate a new view if we are going to copy
9193 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9194 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9195 (Kokkos::ViewAllocateWithoutInitializing(
9196 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9197 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9198 Kokkos::ViewAllocateWithoutInitializing("wgts"),
9199 wgts_adapter.extent(0), wgts_adapter.extent(1));
9200
9201 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
// Element-wise cast on device: one thread per coordinate row.
9202 Kokkos::parallel_for(
9203 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9204 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9205 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9206 xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9207 }
9208 });
9209 Kokkos::parallel_for(
9210 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9211 (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9212 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9213 wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9214 }
9215 });
9216 }
9217
9218 // obtain global ids.
9219 this->initial_mj_gnos = gnos;
9220 // extract coordinates from multivector.
9221 this->mj_coordinates = xyz;
9222 // if no weights are provided set uniform weight.
9223
9224 if(this->num_weights_per_coord == 0) {
9225 this->mj_uniform_weights(0) = true;
9226 Kokkos::resize(this->mj_weights, 0, 0);
9227 }
9228 else{
9229 this->mj_weights = wgts;
9230 for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9231 this->mj_uniform_weights(wdim) = false;
9232 }
9233 }
9234
9235 for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9236 if(solution->criteriaHasUniformPartSizes(wdim)) {
9237 this->mj_uniform_parts(wdim) = true;
9238 }
9239 else {
9240 printf("Error: MJ does not support non uniform target part weights\n");
9241 std::terminate();
9242 }
9243 }
9244}
9245
9246/* \brief Sets the partitioning parameters for multijagged algorithm.
9247 * \param pl: is the parameter list provided to zoltan2 call
9248 * */
9249template <typename Adapter>
9251 const Teuchos::ParameterList &pl)
9252{
9253 const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9254 if(pe) {
9255 double tol;
9256 tol = pe->getValue(&tol);
9257 this->imbalance_tolerance = tol - 1.0;
9258 }
9259
9260 // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
9261 if(this->imbalance_tolerance <= 0) {
9262 this->imbalance_tolerance= 10e-4;
9263 }
9264
9265 // if an input partitioning array is provided.
9266 Kokkos::resize(this->part_no_array, 0);
9267
9268 // the length of the input partitioning array.
9269 this->recursion_depth = 0;
9270
9271 if(pl.getPtr<int>("mj_num_teams")) {
9272 this->num_teams = pl.get<int>("mj_num_teams");
9273 }
9274
9275 if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9276 auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9277 int mj_parts_size = static_cast<int>(mj_parts.size());
9278
9279 // build the view we'll have data on and copy values from host
9280 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9281 "part_no_array", mj_parts_size);
9282 for(int i = 0; i < mj_parts_size; ++i) {
9283 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9284 }
9285
9286 this->recursion_depth = mj_parts_size - 1;
9287 this->mj_env->debug(2, "mj_parts provided by user");
9288 }
9289
9290 // get mj specific parameters.
9291 this->distribute_points_on_cut_lines = true;
9292 this->max_concurrent_part_calculation = 1;
9293
9294 this->mj_run_as_rcb = false;
9295 this->mj_premigration_option = 0;
9296 this->min_coord_per_rank_for_premigration = 32000;
9297
9298 int mj_user_recursion_depth = -1;
9299 this->mj_keep_part_boxes = false;
9300 this->check_migrate_avoid_migration_option = 0;
9301 this->migration_type = 0;
9302 this->minimum_migration_imbalance = 0.35;
9303
9304 pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9305 if(pe) {
9306 double imb;
9307 imb = pe->getValue(&imb);
9308 this->minimum_migration_imbalance = imb - 1.0;
9309 }
9310
9311 pe = pl.getEntryPtr("mj_migration_option");
9312 if(pe) {
9313 this->check_migrate_avoid_migration_option =
9314 pe->getValue(&this->check_migrate_avoid_migration_option);
9315 } else {
9316 this->check_migrate_avoid_migration_option = 0;
9317 }
9318 if(this->check_migrate_avoid_migration_option > 1) {
9319 this->check_migrate_avoid_migration_option = -1;
9320 }
9321
9323 pe = pl.getEntryPtr("mj_migration_type");
9324 if(pe) {
9325 this->migration_type = pe->getValue(&this->migration_type);
9326 } else {
9327 this->migration_type = 0;
9328 }
9329
9330 //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9332
9333 pe = pl.getEntryPtr("mj_concurrent_part_count");
9334 if(pe) {
9335 this->max_concurrent_part_calculation =
9336 pe->getValue(&this->max_concurrent_part_calculation);
9337 } else {
9338 this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9339 }
9340
9341 pe = pl.getEntryPtr("mj_keep_part_boxes");
9342 if(pe) {
9343 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9344 } else {
9345 this->mj_keep_part_boxes = false; // Set to invalid value
9346 }
9347
9348 // For now, need keep_part_boxes to do pointAssign and boxAssign.
9349 // pe = pl.getEntryPtr("keep_cuts");
9350 // if(pe) {
9351 // int tmp = pe->getValue(&tmp);
9352 // if(tmp) this->mj_keep_part_boxes = true;
9353 // }
9354
9355 //need to keep part boxes if mapping type is geometric.
9356 if(this->mj_keep_part_boxes == false) {
9357 pe = pl.getEntryPtr("mapping_type");
9358 if(pe) {
9359 int mapping_type = -1;
9360 mapping_type = pe->getValue(&mapping_type);
9361 if(mapping_type == 0) {
9362 mj_keep_part_boxes = true;
9363 }
9364 }
9365 }
9366
9367 // need to keep part boxes if mapping type is geometric.
9368 pe = pl.getEntryPtr("mj_enable_rcb");
9369 if(pe) {
9370 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9371 } else {
9372 this->mj_run_as_rcb = false; // Set to invalid value
9373 }
9374
9375 pe = pl.getEntryPtr("mj_premigration_option");
9376 if(pe) {
9377 mj_premigration_option = pe->getValue(&mj_premigration_option);
9378 } else {
9379 mj_premigration_option = 0;
9380 }
9381
9382 pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9383 if(pe) {
9384 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9385 } else {
9386 min_coord_per_rank_for_premigration = 32000;
9387 }
9388
9389 pe = pl.getEntryPtr("mj_recursion_depth");
9390 if(pe) {
9391 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9392 } else {
9393 mj_user_recursion_depth = -1; // Set to invalid value
9394 }
9395
9396 bool val = false;
9397 pe = pl.getEntryPtr("rectilinear");
9398 if(pe) {
9399 val = pe->getValue(&val);
9400 }
9401 if(val) {
9402 this->distribute_points_on_cut_lines = false;
9403 } else {
9404 this->distribute_points_on_cut_lines = true;
9405 }
9406
9407 if(this->mj_run_as_rcb) {
9408 mj_user_recursion_depth =
9409 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9410 }
9411 if(this->recursion_depth < 1) {
9412 if(mj_user_recursion_depth > 0) {
9413 this->recursion_depth = mj_user_recursion_depth;
9414 }
9415 else {
9416 this->recursion_depth = this->coord_dim;
9417 }
9418 }
9419}
9420
9422template <typename Adapter>
9424 int dim,
9425 adapter_scalar_t *lower,
9426 adapter_scalar_t *upper,
9427 size_t &nPartsFound,
9428 typename Adapter::part_t **partsFound) const
9429{
9430 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9431 // TODO: complexity. Or at least do a search through the boxes, using
9432 // TODO: p x q x r x ... if possible.
9433
9434 nPartsFound = 0;
9435 *partsFound = NULL;
9436
9437 if(this->mj_keep_part_boxes) {
9438
9439 // Get vector of part boxes
9440 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9441
9442 size_t nBoxes = (*partBoxes).size();
9443 if(nBoxes == 0) {
9444 throw std::logic_error("no part boxes exist");
9445 }
9446
9447 // Determine whether the box overlaps the globalBox at all
9448 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9449
9450 if(globalBox->boxesOverlap(dim, lower, upper)) {
9451
9452 std::vector<typename Adapter::part_t> partlist;
9453
9454 // box overlaps the global box; find specific overlapping boxes
9455 for(size_t i = 0; i < nBoxes; i++) {
9456 try {
9457 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9458 nPartsFound++;
9459 partlist.push_back((*partBoxes)[i].getpId());
9460 /*
9461 std::cout << "Given box (";
9462 for(int j = 0; j < dim; j++)
9463 std::cout << lower[j] << " ";
9464 std::cout << ") x (";
9465 for(int j = 0; j < dim; j++)
9466 std::cout << upper[j] << " ";
9467 std::cout << ") overlaps PartBox "
9468 << (*partBoxes)[i].getpId() << " (";
9469 for(int j = 0; j < dim; j++)
9470 std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9471 std::cout << ") x (";
9472 for(int j = 0; j < dim; j++)
9473 std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9474 std::cout << ")" << std::endl;
9475 */
9476 }
9477 }
9479 }
9480 if(nPartsFound) {
9481 *partsFound = new mj_part_t[nPartsFound];
9482 for(size_t i = 0; i < nPartsFound; i++)
9483 (*partsFound)[i] = partlist[i];
9484 }
9485 }
9486 else {
9487 // Box does not overlap the domain at all. Find the closest part
9488 // Not sure how to perform this operation for MJ without having the
9489 // cuts. With the RCB cuts, the concept of a part extending to
9490 // infinity was natural. With the boxes, it is much more difficult.
9491 // TODO: For now, return information indicating NO OVERLAP.
9492 }
9493 }
9494 else {
9495 throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9496 }
9497}
9498
9500template <typename Adapter>
9502 int dim,
9503 adapter_scalar_t *point) const
9504{
9505 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9506 // TODO: complexity. Or at least do a search through the boxes, using
9507 // TODO: p x q x r x ... if possible.
9508
9509 if(this->mj_keep_part_boxes) {
9510 typename Adapter::part_t foundPart = -1;
9511
9512 // Get vector of part boxes
9513 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9514
9515 size_t nBoxes = (*partBoxes).size();
9516 if(nBoxes == 0) {
9517 throw std::logic_error("no part boxes exist");
9518 }
9519
9520 // Determine whether the point is within the global domain
9521 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9522
9523 if(globalBox->pointInBox(dim, point)) {
9524
9525 // point is in the global domain; determine in which part it is.
9526 size_t i;
9527 for(i = 0; i < nBoxes; i++) {
9528 try {
9529 if((*partBoxes)[i].pointInBox(dim, point)) {
9530 foundPart = (*partBoxes)[i].getpId();
9531 // std::cout << "Point (";
9532 // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9533 // std::cout << ") found in box " << i << " part " << foundPart
9534 // << std::endl;
9535 // (*partBoxes)[i].print();
9536 break;
9537 }
9538 }
9540 }
9541
9542 if(i == nBoxes) {
9543 // This error should never occur
9544 std::ostringstream oss;
9545 oss << "Point (";
9546 for(int j = 0; j < dim; j++) oss << point[j] << " ";
9547 oss << ") not found in domain";
9548 throw std::logic_error(oss.str());
9549 }
9550 }
9551
9552 else {
9553 // Point is outside the global domain.
9554 // Determine to which part it is closest.
9555 // TODO: with cuts, would not need this special case
9556
9557 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9558 size_t closestBox = 0;
9559 coord_t minDistance = std::numeric_limits<coord_t>::max();
9560 coord_t *centroid = new coord_t[dim];
9561 for(size_t i = 0; i < nBoxes; i++) {
9562 (*partBoxes)[i].computeCentroid(centroid);
9563 coord_t sum = 0.;
9564 coord_t diff;
9565 for(int j = 0; j < dim; j++) {
9566 diff = centroid[j] - point[j];
9567 sum += diff * diff;
9568 }
9569 if(sum < minDistance) {
9570 minDistance = sum;
9571 closestBox = i;
9572 }
9573 }
9574 foundPart = (*partBoxes)[closestBox].getpId();
9575 delete [] centroid;
9576 }
9577
9578 return foundPart;
9579 }
9580 else {
9581 throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9582 }
9583}
9584
9585template <typename Adapter>
9587 const PartitioningSolution<Adapter> *solution,
9588 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9589 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9590{
9591 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9592 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9593 mj_part_t ntasks = (*pBoxes).size();
9594 int dim = (*pBoxes)[0].getDim();
9595 GridHash grid(pBoxes, ntasks, dim);
9596 grid.getAdjArrays(comXAdj_, comAdj_);
9597 }
9598 comAdj = comAdj_;
9599 comXAdj = comXAdj_;
9600}
9601
9602template <typename Adapter>
9603RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9605{
9606 return this->mj_partitioner.get_kept_boxes();
9607}
9608} // namespace Zoltan2
9609
9610#endif
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
#define epsilon
Definition: nd.cpp:82
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.