Zoltan2
Zoltan2_AlgHybrid2GL.hpp
Source listing of this file (see the Doxygen documentation page for full details).
1#ifndef _ZOLTAN2_2GHOSTLAYER_HPP_
2#define _ZOLTAN2_2GHOSTLAYER_HPP_
3
#include <vector>
#include <unordered_map>
#include <iostream>
#include <queue>
#include <chrono>   // steady_clock, used by timer()
#include <climits>  // INT_MAX, used for MPI message-size limits
#ifdef _WIN32
#include <time.h>
#else
#include <sys/time.h>
#endif

#include "Zoltan2_Algorithm.hpp"
#include "Zoltan2_GraphModel.hpp"
#include "Zoltan2_Util.hpp"
#include "Zoltan2_TPLTraits.hpp"
#include "Zoltan2_AlltoAll.hpp"


#include "Tpetra_Core.hpp"
#include "Teuchos_RCP.hpp"
#include "Tpetra_Import.hpp"
#include "Tpetra_FEMultiVector.hpp"

#include "KokkosKernels_Handle.hpp"
#include "KokkosKernels_IOUtils.hpp"
#include "KokkosGraph_Distance1Color.hpp"
#include "KokkosGraph_Distance1ColorHandle.hpp"
31
// This file contains the base class and communication helpers for hybrid
// distributed graph-coloring methods that use two layers of ghosts.

37
38namespace Zoltan2{
39
40template <typename Adapter>
41class AlgTwoGhostLayer : public Algorithm<Adapter> {
42
43 public:
44
  // Type aliases pulled from the adapter's traits and Tpetra defaults.
  using lno_t = typename Adapter::lno_t;        // local ordinal (vertex LID)
  using gno_t = typename Adapter::gno_t;        // global ordinal (vertex GID)
  using offset_t = typename Adapter::offset_t;  // CSR offset type
  using scalar_t = typename Adapter::scalar_t;  // vertex/edge weight type
  using map_t = Tpetra::Map<lno_t,gno_t>;
  using femv_scalar_t = int;                    // colors are stored as ints
  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
  // Device/host spaces follow Tpetra's default node configuration.
  using device_type = Tpetra::Map<>::device_type;
  using execution_space = Tpetra::Map<>::execution_space;
  using memory_space = Tpetra::Map<>::memory_space;
  using host_exec = typename Kokkos::View<device_type>::HostMirror::execution_space;
  using host_mem = typename Kokkos::View<device_type>::HostMirror::memory_space;
58
59 double timer(){
60 struct timeval tp;
61 gettimeofday(&tp, NULL);
62 return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
63 }
64 private:

  // Builds the GraphModel (this->model) from the input adapter.
  // Implementation is not visible in this chunk of the file.
  void buildModel(modelFlag_t &flags);
67
  //Entry point for parallel local coloring
  // nVtx is the number of vertices owned by the current process
  //
  // adjs_view is the adjacencies, indexed by offset_view
  //
  // offset_view is the CSR row map, used to index the adjs_view
  //
  // femv is the FEMultiVector that holds the colors for the vertices
  //      the colors will change in this function.
  //
  // vertex_list is a list of vertices to recolor
  //
  // vertex_list_size is the size of the list of vertices to recolor
  //                  vertex_list_size = 0 means recolor all uncolored vertices
  //
  // recolor decides which KokkosKernels algorithm to use.
  //
  virtual void colorInterior(const size_t nVtx,
                             Kokkos::View<lno_t*, device_type > adjs_view,
                             Kokkos::View<offset_t*,device_type > offset_view,
                             Teuchos::RCP<femv_t> femv,
                             Kokkos::View<lno_t*, device_type> vertex_list,
                             size_t vertex_list_size = 0,
                             bool recolor=false) = 0;

  //Entry point for serial local coloring.
  //Same contract as colorInterior, but runs on the host and takes
  //HostMirror views of the same data.
  virtual void colorInterior_serial(const size_t nVtx,
                  typename Kokkos::View<lno_t*, device_type >::HostMirror adjs_view,
                  typename Kokkos::View<offset_t*,device_type >::HostMirror offset_view,
                  Teuchos::RCP<femv_t> femv,
                  typename Kokkos::View<lno_t*, device_type>::HostMirror vertex_list,
                  size_t vertex_list_size = 0,
                  bool recolor=false) = 0;
101 public:
102 //Entry point for parallel conflict detection
103 //INPUT ARGS
104 // n_local is the number of vertices owned by the current process
105 //
106 // dist_offsets_dev is the device view that holds the CSR offsets
107 // It holds offsets for all vertices, owned and ghosts.
108 // It is organized with owned first, and then ghost layers:
109 // -----------------------------------------------
110 // | owned | first | second |
111 // | vtx | ghost | ghost |
112 // | offsets | layer | layer |
113 // -----------------------------------------------
114 // The allocated length is equal to the sum of owned
115 // and ghost vertices + 1. Any vertex with LID >= n_local
116 // is a ghost.
117 //
118 // dist_adjs_dev is the device view that holds CSR adjacencies
119 //
120 // boundary_verts_view holds the local IDs of vertices in the boundary.
121 // By definition, boundary vertices are owned vertices
122 // that have a ghost in their neighborhood.
123 // (distance-1 coloring = 1-hop neighborhood)
124 // (distance-2 coloring = 2-hop neighborhood)
125 // Note: for D1-2GL we do not use this argument.
126 // It is possible to detect all D1 conflicts by only
127 // checking the ghost vertices, without constructing
128 // the boundary.
129 //
130 // rand is a device view that holds random numbers generated from GIDs,
131 // they are consistently generated across processes
132 //
133 // gid is a device view that holds the GIDs of each vertex on this process.
134 // It is indexable by local ID. It stores both owned and ghost
135 // vertices, and LIDs are ordered so that the structure looks like:
136 // -----------------------------------------
137 // | | first | second | owned LIDs are smallest
138 // | owned | ghost | ghost | ghost LIDs increase based
139 // | vtx | layer | layer | on layer (1 < 2)
140 // -----------------------------------------
141 // The allocated size is dist_offsets_dev.extent(0)-1.
142 //
143 //
144 // ghost_degrees is a device view that holds the degrees of ghost vertices only.
145 // A ghost with local ID n_local will be the first entry in this view.
146 // So, ghost_degrees[i] is the degree of the vtx with
147 // GID = gid[n_local+i].
148 //
149 // recolor_degrees is a boolean that determines whether or not we factor in vertex
150 // degrees on recoloring
151 //
152 //OUTPUT ARGS
153 // femv_colors is the device color view.
154 // After this function call, conflicts between vertices will
155 // be resolved via setting a vertex's color to 0. The vertex
156 // to be uncolored is determined by rules that are consistent
157 // across multiple processes.
158 //
159 // verts_to_recolor_view is a device view that holds the list
160 // of vertices to recolor. Any vertex
161 // uncolored by this function will appear in this
162 // view after the function returns.
163 //
164 // verts_to_recolor_size_atomic is an atomic device view that holds the
165 // size of verts_to_recolor_view.
166 // Effectively it represents how many
167 // vertices need to be recolored after
168 // conflict detection.
169 // It differs in size from the allocated size of
170 // verts_to_recolor_view.
171 // Atomicity is required for building
172 // verts_to_recolor_view in parallel, which
173 // is the reason this is a view instead of
174 // just an integer type.
175 //
176 // verts_to_send_view is a device view that holds the list of vertices
177 // that will need to be sent to remotes after recoloring
178 // Note: Only locally owned vertices should ever be in
179 // this view. A process cannot color ghosts correctly.
180 //
181 // verts_to_send_size_atomic is an atomic device view that holds the size of
182 // verts_to_send_view. It differs in size from the
183 // allocated size of verts_to_send_view.
184 // Atomicity is required for building
185 // verts_to_send_view in parallel,
186 // which is the reason this is a view instead of
187 // just an integer type.
188 //
189 // recoloringSize is a device view that stores the total number of
190 // vertices that were uncolored by the conflict detection.
191 //
192 virtual void detectConflicts(const size_t n_local,
193 Kokkos::View<offset_t*, device_type > dist_offsets_dev,
194 Kokkos::View<lno_t*, device_type > dist_adjs_dev,
195 Kokkos::View<int*,device_type > femv_colors,
196 Kokkos::View<lno_t*, device_type > boundary_verts_view,
197 Kokkos::View<lno_t*,
198 device_type > verts_to_recolor_view,
199 Kokkos::View<int*,
201 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic,
202 Kokkos::View<lno_t*,
203 device_type > verts_to_send_view,
204 Kokkos::View<size_t*,
206 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
207 Kokkos::View<size_t*, device_type> recoloringSize,
208 Kokkos::View<int*,
209 device_type> rand,
210 Kokkos::View<gno_t*,
211 device_type> gid,
212 Kokkos::View<gno_t*,
213 device_type> ghost_degrees,
214 bool recolor_degrees) = 0;
215
216 //Entry point for serial conflict detection
217 virtual void detectConflicts_serial(const size_t n_local,
218 typename Kokkos::View<offset_t*, device_type >::HostMirror dist_offsets_host,
219 typename Kokkos::View<lno_t*, device_type >::HostMirror dist_adjs_host,
220 typename Kokkos::View<int*,device_type >::HostMirror femv_colors,
221 typename Kokkos::View<lno_t*, device_type >::HostMirror boundary_verts_view,
222 typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_recolor_view,
223 typename Kokkos::View<int*,device_type>::HostMirror verts_to_recolor_size_atomic,
224 typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send_view,
225 typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_atomic,
226 typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize,
227 typename Kokkos::View<int*, device_type>::HostMirror rand,
228 typename Kokkos::View<gno_t*,device_type>::HostMirror gid,
229 typename Kokkos::View<gno_t*,device_type>::HostMirror ghost_degrees,
230 bool recolor_degrees) = 0;
231 //Entry point for the construction of the boundary
232 //INPUT ARGS
233 // n_local is the number of vertices owned by the current process
234 //
235 // dist_offsets_dev is the device view that holds the CSR offsets
236 //
237 // dist_adjs_dev is the device view that holds CSR adjacencies
238 //
239 // dist_offsets_host is the hostmirror that holds the CSR offsets
240 //
241 // dist_adjs_host is the hostmirror that holds the CSR adjacencies
242 //
243 //OUTPUT ARGS
244 // boundary_verts is an unallocated device view that will hold the
245 // list of boundary vertices.
246 //
247 // verts_to_send_view will hold the list of vertices to send
248 //
249 // verts_to_send_size_atomic will hold the number of vertices to
250 // send currently held in verts_to_send_view.
251 // verts_to_send_size_atomic differs
252 // from the allocated size of verts_to_send_view
253 // Atomicity is required for building
254 // verts_to_send_view in parallel, which is
255 // the reason this is a view instead of an
256 // integer type.
257 //
258 virtual void constructBoundary(const size_t n_local,
259 Kokkos::View<offset_t*, device_type> dist_offsets_dev,
260 Kokkos::View<lno_t*, device_type> dist_adjs_dev,
261 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host,
262 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host,
263 Kokkos::View<lno_t*, device_type>& boundary_verts,
264 Kokkos::View<lno_t*,
265 device_type > verts_to_send_view,
266 Kokkos::View<size_t*,
268 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic) = 0;
269
270 protected:
271 RCP<const base_adapter_t> adapter;
272 RCP<GraphModel<base_adapter_t> > model;
273 RCP<Teuchos::ParameterList> pl;
274 RCP<Environment> env;
275 RCP<const Teuchos::Comm<int> > comm;
277 bool timing;
278
279 private:
280 //This function constructs a CSR with complete adjacency information for
281 //first-layer ghost vertices. This CSR can contain edges from:
282 // first-layer ghosts to owned;
283 // first-layer ghosts to first-layer ghosts;
284 // first-layer ghosts to second-layer ghosts;
285 //1) Each process sends a list of ghost GIDs back to their owning process
286 // to request full adjacency information for that vertex.
287 //
288 //2) Initially each process sends back degree information to the requesting
289 // process.
290 //
291 //3) Then, each process can construct send buffers and receive schedules for
292 // the adjacency information for each requested GID it received,
293 // in addition to building the 2GL offsets and relabeling ghost LIDs
294 // to make the final CSR construction easier.
295 //
296 // 3a) We can construct the 2GL offsets here because each process has
297 // already received the degree information for each first-layer
298 // ghost vertex.
299 //
300 // 3b) The original ghost LIDs may not correspond to the order in which
301 // we will receive the adjacency information. Instead of reordering
302 // the received adjacencies, we relabel the GIDs of first-layer
303 // ghosts with new LIDs that correspond to the order in which we
304 // receive the adjacency information. Only first-layer ghost LIDs
305 // are changed.
306 //
307 //4) Due to limitations on the size of an MPI message, we split sending and
308 // receiving into rounds to accommodate arbitrarily large graphs.
309 //
310 //5) As send rounds progress, we construct the 2GL adjacency structure.
311 // Because we relabel ghost LIDs, we do not need to reorder the
312 // data after we receive it.
313 //
314 //
315 //
316 //OUTPUT ARGS
317 // ownedPlusGhosts Initially holds the list of GIDs for owned and ghost
318 // vertices. The only hard constraint on the ordering is
319 // that owned vertices come before ghost vertices.
320 // This function can re-assign the LIDs of ghost vertices
321 // in order to make the final construction of the 2GL
322 // CSR easier. After this function call, some ghost
323 // LIDs may be changed, but they will still be greater
324 // than owned LIDs. No owned LIDs will be changed.
325 //
326 // adjs_2GL holds the second ghost layer CSR adjacencies. The 2GL CSR
327 // contains complete adjacency information for the first-layer
328 // ghosts. These adjacencies can hold both owned vertices and
329 // second-layer ghosts, as well as first-layer ghosts.
330 //
331 // offsets_2GL holds CSR offsets for vertices in the first ghost layer only.
332 // That is, a vertex with GID = gid[n_local] will be the first
333 // vertex with information in this offset structure.
334 //
335 //
336 //INPUT ARGS
337 // owners holds the owning proc for a given vertex, indexed by local ID.
338 //
339 // adjs is the CSR adjacencies of the local graph with a single ghost layer.
340 //
341 // offsets is the CSR offsets of the local graph with a single ghost layer.
342 //
343 // mapOwned translates from Owned GID to LID. We only need this translation
344 // for owned vertices.
345 //
346 //TODO: This function uses many vectors of size comm->getSize();
347 // consider changes to increase memory scalability.
  // See the large comment block above for the full description of the
  // algorithm, its inputs, and its outputs.
  void constructSecondGhostLayer(std::vector<gno_t>& ownedPlusGhosts,
                                 const std::vector<int>& owners,
                                 ArrayView<const gno_t> adjs,
                                 ArrayView<const offset_t> offsets,
                                 RCP<const map_t> mapOwned,
                                 std::vector< gno_t>& adjs_2GL,
                                 std::vector< offset_t>& offsets_2GL) {

    //first, we send ghost GIDs back to owners to receive the
    //degrees of each first-layer ghost.
    std::vector<int> sendcounts(comm->getSize(),0);
    std::vector<size_t> sdispls(comm->getSize()+1,0);
    //loop through owners, count how many vertices we'll send to each processor
    //We send each ghost GID back to its owning process.
    if(verbose) std::cout<<comm->getRank()<<": building sendcounts\n";
    for(size_t i = 0; i < owners.size(); i++){
      // owners[i] == -1 marks a vertex with no known owner; skip it.
      if(owners[i] != comm->getRank()&& owners[i] !=-1) sendcounts[owners[i]]++;
    }
    //construct sdispls (for building sendbuf), and sum the total sendcount
    if(verbose) std::cout<<comm->getRank()<<": building sdispls\n";
    size_t sendcount = 0;
    for(int i = 1; i < comm->getSize()+1; i++){
      sdispls[i] = sdispls[i-1] + sendcounts[i-1];
      sendcount += sendcounts[i-1];
    }

    if(verbose) std::cout<<comm->getRank()<<": building idx\n";
    // idx[p] is the next free slot in sendbuf for process p.
    std::vector<gno_t> idx(comm->getSize(),0);
    for(int i = 0; i < comm->getSize(); i++){
      idx[i]=sdispls[i];
    }
    //construct sendbuf to send GIDs to owning processes
    if(verbose) std::cout<<comm->getRank()<<": building sendbuf\n";

    // Ghost LIDs start at offsets.size()-1, i.e. the number of owned vertices.
    std::vector<gno_t> sendbuf(sendcount,0);
    for(size_t i = offsets.size()-1; i < owners.size(); i++){
      if(owners[i] != comm->getRank() && owners[i] != -1){
        sendbuf[idx[owners[i]]++] = ownedPlusGhosts[i];
      }
    }

    //communicate GIDs to owners
    if(verbose) std::cout<<comm->getRank()<<": requesting GIDs from owners\n";
    Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
    Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
    Teuchos::ArrayRCP<gno_t> recvbuf;
    std::vector<int> recvcounts(comm->getSize(),0);
    Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
    Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);

    if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
    //replace entries in recvGIDs with their degrees

    if(verbose) std::cout<<comm->getRank()<<": building rdispls\n";
    gno_t recvcounttotal = 0;
    std::vector<int> rdispls(comm->getSize()+1,0);
    for(size_t i = 1; i<recvcounts.size()+1; i++){
      rdispls[i] = rdispls[i-1] + recvcounts[i-1];
      recvcounttotal += recvcounts[i-1];
    }
    //send back the degrees to the requesting processes,
    //build the adjacency counts
    std::vector<offset_t> sendDegrees(recvcounttotal,0);
    gno_t adj_len = 0;
    std::vector<int> adjsendcounts(comm->getSize(),0);
    if(verbose) std::cout<<comm->getRank()<<": building adjacency counts\n";
    for(int i = 0; i < comm->getSize(); i++){
      adjsendcounts[i] = 0;
      for(int j = rdispls[i]; j < rdispls[i+1]; j++){
        // Each received GID is owned locally, so this lookup is valid.
        lno_t lid = mapOwned->getLocalElement(recvbuf[j]);
        offset_t degree = offsets[lid+1] - offsets[lid];
        sendDegrees[j] = degree;
        adj_len +=degree;
        adjsendcounts[i] += degree;
      }
    }
    //communicate the degrees back to the requesting processes
    if(verbose) std::cout<<comm->getRank()<<": sending degrees back to requestors\n";
    Teuchos::ArrayView<offset_t> sendDegrees_view = Teuchos::arrayViewFromVector(sendDegrees);
    Teuchos::ArrayRCP<offset_t> recvDegrees;
    std::vector<int> recvDegreesCount(comm->getSize(),0);
    Teuchos::ArrayView<int> recvDegreesCount_view = Teuchos::arrayViewFromVector(recvDegreesCount);
    Zoltan2::AlltoAllv<offset_t>(*comm, *env, sendDegrees_view, recvcounts_view, recvDegrees, recvDegreesCount_view);

    //calculate number of rounds of AlltoAllv's that are necessary on this process
    //(each AlltoAllv message must stay below INT_MAX bytes)
    if(verbose) std::cout<<comm->getRank()<<": determining number of rounds necessary\n";
    int rounds = 1;
    for(int i = 0; i < comm->getSize(); i++){
      if(adjsendcounts[i]*sizeof(gno_t)/ INT_MAX > (size_t)rounds){
        rounds = (adjsendcounts[i]*sizeof(gno_t)/INT_MAX)+1;
      }
    }

    //see what the max number of rounds will be globally
    int max_rounds = 0;
    Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_MAX, 1, &rounds, &max_rounds);

    if(verbose) std::cout<<comm->getRank()<<": building per_proc sums\n";
    //compute send and receive schedules to and from each process
    std::vector<std::vector<uint64_t>> per_proc_round_adj_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
    std::vector<std::vector<uint64_t>> per_proc_round_vtx_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));

    if(verbose) std::cout<<comm->getRank()<<": filling per_proc sums\n";
    //fill out the send schedules
    for(int proc_to_send = 0; proc_to_send < comm->getSize(); proc_to_send++){
      int curr_round = 0;
      for(size_t j = sdispls[proc_to_send]; j < sdispls[proc_to_send+1]; j++){
        // Roll over to the next round when this vertex's adjacency list
        // would push the round's message past the INT_MAX byte limit.
        if((per_proc_round_adj_sums[curr_round][proc_to_send] + recvDegrees[j])*sizeof(gno_t) > INT_MAX){
          curr_round++;
        }
        per_proc_round_adj_sums[curr_round][proc_to_send] += recvDegrees[j];
        per_proc_round_vtx_sums[curr_round][proc_to_send]++;
      }
    }

    if(verbose) std::cout<<comm->getRank()<<": building recv GID schedule\n";

    //A 3D vector to hold the GIDs so we can see how this process will receive
    //the vertices in each round, from each process. This way we can reorder things
    //locally so that the data we receive will automatically construct the CSR correctly
    //without any other computation. We do the reordering before we start receiving.
    std::vector<std::vector<std::vector<gno_t>>> recv_GID_per_proc_per_round(
        max_rounds+1,std::vector<std::vector<gno_t>>(
          comm->getSize(),std::vector<gno_t>(0)));
    // NOTE(review): only rounds [0, max_rounds) are allocated here, while the
    // schedule vectors above are sized max_rounds+1; confirm curr_round below
    // can never reach max_rounds, otherwise this indexes an empty vector.
    for(int i = 0; i < max_rounds; i++){
      for(int j = 0; j < comm->getSize(); j++){
        recv_GID_per_proc_per_round[i][j] = std::vector<gno_t>(sendcounts[j],0);
      }
    }

    if(verbose) std::cout<<comm->getRank()<<": filling out recv GID schedule\n";
    for(int i = 0; i < comm->getSize(); i++){
      int curr_round = 0;
      size_t curr_idx = 0;
      // NOTE(review): possible off-by-one — this uses '>' rather than '>='
      // against the per-round vertex count, so a round may be assigned one
      // more entry than per_proc_round_vtx_sums records; confirm against the
      // reorder loop below, which reads exactly vtx_sums entries per round.
      for(size_t j = sdispls[i]; j < sdispls[i+1]; j++){
        if(curr_idx > per_proc_round_vtx_sums[curr_round][i]){
          curr_round++;
          curr_idx = 0;
        }
        // Store the index into sendbuf (not the GID itself); it is used
        // below to look up both the GID and its received degree.
        recv_GID_per_proc_per_round[curr_round][i][curr_idx++] = j;
      }
    }

    if(verbose) std::cout<<comm->getRank()<<": reordering gids and degrees in the order they'll be received\n";
    //reorder the GIDs and degrees locally:
    // - The way we previously stored GIDs and degrees had no relation
    //   to the order that we'll be receiving the adjacency data.
    //   For the CSR to be complete (and usable), we reorder the LIDs of
    //   ghosts so that they are in the order we will receive the adjacency
    //   data.
    //
    // - For graphs that are not large enough to trigger sending in multiple
    //   rounds of communication, the LIDs of the ghosts will be ordered
    //   by owning process. If multiple rounds are used, the LIDs of
    //   ghosts will be ordered by rounds in addition to owning process.
    //
    // - final_gid_vec and final_degree_vec hold the reorganized gids and
    //   degrees, the final re-mapping happens further down
    std::vector<gno_t> final_gid_vec(sendcount, 0);
    std::vector<offset_t> final_degree_vec(sendcount,0);
    gno_t reorder_idx = 0;
    for(int i = 0; i < max_rounds; i++){
      for(int j = 0; j < comm->getSize(); j++){
        for(size_t k = 0; k < per_proc_round_vtx_sums[i][j]; k++){
          final_gid_vec[reorder_idx] = sendbuf[recv_GID_per_proc_per_round[i][j][k]];
          final_degree_vec[reorder_idx++] = recvDegrees[recv_GID_per_proc_per_round[i][j][k]];
        }
      }
    }

    //check to see if the reorganization has happened correctly
    if(verbose){
      //do a quick check to see if we ended up reorganizing anything
      bool reorganized = false;
      for(size_t i = 0; i < sendcount; i++){
        if(final_gid_vec[i] != sendbuf[i]) reorganized = true;
      }

      //if we have more than a single round of communication, we need to reorganize.
      //this alerts of unnecessary reorganization, and a failure to perform necessary
      //reorganization.
      if(!reorganized && (max_rounds > 1)) std::cout<<comm->getRank()<<": did not reorgainze GIDs, but probably should have\n";
      if(reorganized && (max_rounds == 1)) std::cout<<comm->getRank()<<": reorganized GIDs, but probably should not have\n";
    }
    //remap the GIDs so we receive the adjacencies in the same order as the current processes LIDs
    //  originally, the GID->LID mapping has no relevance to how we'll be receiving adj data from
    //  remote processes. Here we make it so that the GID->LID mapping does correspond to the
    //  order we have received degree info and will receive adjacencies. ( LID n_local
    //  corresponds to the first GID whose adjacency info we will receive, LID n_local+1 the
    //  second, etc.)
    //  That way, we don't need to reorder the adjacencies after we receive them.
    //
    //  This next loop is the final mapping step.

    for (size_t i = 0; i < sendcount; i++){
      ownedPlusGhosts[i+offsets.size()-1] = final_gid_vec[i];
    }

    //status report
    if(verbose) {
      std::cout<<comm->getRank()<<": done remapping\n";
      std::cout<<comm->getRank()<<": building ghost offsets\n";
    }
    //start building the second ghost layer's offsets (prefix sum of degrees)
    std::vector<offset_t> ghost_offsets(sendcount+1,0);
    for(size_t i = 1; i < sendcount+1; i++){
      ghost_offsets[i] = ghost_offsets[i-1] + final_degree_vec[i-1];
    }


    if(verbose) std::cout<<comm->getRank()<<": going through the sending rounds\n";
    //set up counters to keep track of where we are in the sending order
    std::vector<uint64_t> curr_idx_per_proc(comm->getSize(),0);
    for(int i = 0; i < comm->getSize(); i++) curr_idx_per_proc[i] = rdispls[i];
    for(int round = 0; round < max_rounds; round++){
      //send buffers
      std::vector<gno_t> send_adj;
      std::vector<int> send_adj_counts(comm->getSize(),0);
      if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", constructing send_adj\n";
      //construct the adjacencies to send for this round
      for(int curr_proc = 0; curr_proc < comm->getSize(); curr_proc++){
        uint64_t curr_adj_sum = 0;
        //keep going through adjacencies to send to this process until we're done
        while( curr_idx_per_proc[curr_proc] < (size_t)rdispls[curr_proc+1]){
          lno_t lid = mapOwned->getLocalElement(recvbuf[curr_idx_per_proc[curr_proc]++]);

          //if the next adjacency would push us over the MPI message size max,
          //stop for this round
          if((curr_adj_sum + (offsets[lid+1]-offsets[lid]))*sizeof(gno_t) >= INT_MAX){
            break;
          }

          //add the adjacencies to the send buffer
          curr_adj_sum += (offsets[lid+1] - offsets[lid]);
          for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
            send_adj.push_back(adjs[j]);
          }
        }
        //update the send counts for this round
        send_adj_counts[curr_proc] = curr_adj_sum;
      }
      if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", sending...\n";
      //do the sending...
      Teuchos::ArrayView<gno_t> send_adjs_view = Teuchos::arrayViewFromVector(send_adj);
      Teuchos::ArrayView<int> adjsendcounts_view = Teuchos::arrayViewFromVector(send_adj_counts);
      Teuchos::ArrayRCP<gno_t> ghost_adjs;
      uint64_t recv_adjs_count = 0;
      for(int i = 0; i < comm->getSize(); i++){
        recv_adjs_count += per_proc_round_adj_sums[round][i];
      }
      std::vector<int> adjrecvcounts(comm->getSize(),0);
      Teuchos::ArrayView<int> adjsrecvcounts_view = Teuchos::arrayViewFromVector(adjrecvcounts);
      Zoltan2::AlltoAllv<gno_t>(*comm, *env, send_adjs_view, adjsendcounts_view, ghost_adjs, adjsrecvcounts_view);
      //Because of the previous reordering, these adjacencies are
      //in the correct order as they arrive on the process.
      for(offset_t i = 0; i< (offset_t)ghost_adjs.size(); i++){
        adjs_2GL.push_back(ghost_adjs[i]);
      }
    }
    if(verbose) std::cout<<comm->getRank()<<": constructing offsets\n";
    //put the offsets we computed into the output argument.
    for(size_t i = 0; i < sendcount+1; i++){
      offsets_2GL.push_back(ghost_offsets[i]);
    }
    if(verbose) std::cout<<comm->getRank()<<": done building 2nd ghost layer\n";
  }
615
616 //Communicates owned vertex colors to remote ghost copies.
617 //
618 //returns: the amount of time the communication call took.
619 //
620 //OUTPUT ARGS:
621 // colors: the owned vertices' colors are not changed,
622 // ghost vertex colors are updated from their owners.
623 //
624 // total_sent: reports the total size of the send buffer
625 //
626 // total_recv: reports the total size of the recv buffer
627 //
628 //INPUT ARGS:
629 //
630 // mapOwnedPlusGhosts: maps global IDs to local IDs and vice-versa.
631 //
632 // nVtx: the number of owned vertices on this process
633 //
634 // verts_to_send: hostmirror of the list of vertices to send.
635 // This list will only contain local vertices
636 // that are ghosted on a remote process.
637 //
638 // verts_to_send_size: hostmirror of the size of verts_to_send
639 //
640 // procs_to_send: map that takes a local ID and gives a vector of
641 // process IDs which have a ghost copy of that vertex.
642 //
643 double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
644 size_t nVtx,
645 typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send,
646 typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size,
647 Teuchos::RCP<femv_t> femv,
648 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
649 gno_t& total_sent, gno_t& total_recvd){
650
651 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
652 auto colors = subview(femvColors, Kokkos::ALL, 0);
653 //create vectors to hold send information
654 int nprocs = comm->getSize();
655 std::vector<int> sendcnts(comm->getSize(), 0);
656 std::vector<gno_t> sdispls(comm->getSize()+1, 0);
657
658 //calculate how much data we're sending to each process
659 for(size_t i = 0; i < verts_to_send_size(0); i++){
660 for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
661 sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
662 }
663 }
664 //calculate sendsize and sdispls
665 sdispls[0] = 0;
666 gno_t sendsize = 0;
667 std::vector<int> sentcount(nprocs, 0);
668
669 for(int i = 1; i < comm->getSize()+1; i++){
670 sdispls[i] = sdispls[i-1] + sendcnts[i-1];
671 sendsize += sendcnts[i-1];
672 }
673 total_sent = sendsize;
674 std::vector<gno_t> sendbuf(sendsize,0);
675 //construct sendbuf, send each owned vertex's GID
676 //and its color to the processes that have a
677 //ghost copy of that vertex. If a vertex is not ghosted,
678 //it does not get sent anywhere.
679 for(size_t i = 0; i < verts_to_send_size(0); i++){
680 std::vector<int> procs = procs_to_send.at(verts_to_send(i));
681 for(size_t j = 0; j < procs.size(); j++){
682 size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
683 sentcount[procs[j]] += 2;
684 sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
685 sendbuf[idx] = colors(verts_to_send(i));
686 }
687 }
688
689 Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
690 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
691 Teuchos::ArrayRCP<gno_t> recvbuf;
692 std::vector<int> recvcnts(comm->getSize(), 0);
693 Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
694
695 //if we're reporting times, remove the computation imbalance from the comm timer
696 if(timing) comm->barrier();
697 double comm_total = 0.0;
698 double comm_temp = timer();
699
700 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
701 comm_total += timer() - comm_temp;
702
703 //compute total recvsize for updating local ghost colors
704 gno_t recvsize = 0;
705 for(int i = 0; i < recvcnts_view.size(); i++){
706 recvsize += recvcnts_view[i];
707 }
708 total_recvd = recvsize;
709 //update the local ghost copies with the color we just received.
710 for(int i = 0; i < recvsize; i+=2){
711 size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
712 colors(lid) = recvbuf[i+1];
713 }
714
715 return comm_total;
716 }
717
718 public:
719 //constructor
721 const RCP<const base_adapter_t> &adapter_,
722 const RCP<Teuchos::ParameterList> &pl_,
723 const RCP<Environment> &env_,
724 const RCP<const Teuchos::Comm<int> > &comm_)
725 : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
726 verbose = pl->get<bool>("verbose",false);
727 timing = pl->get<bool>("timing", false);
728 modelFlag_t flags;
729 flags.reset();
730 buildModel(flags);
731 }
    //Main entry point for graph coloring.
    //
    // Builds a local view of the distributed graph consisting of the
    // owned vertices plus two layers of ghost vertices, runs the
    // distributed coloring (twoGhostLayer), and copies the colors of
    // the owned vertices into the solution.
    //
    // solution: output argument; on return, solution->getColorsRCP()
    //           holds one color per locally owned vertex, indexed by
    //           local ID.
    void color( const RCP<ColoringSolution<Adapter> > &solution){
      //convert from global graph to local graph
      ArrayView<const gno_t> vtxIDs;
      ArrayView<StridedData<lno_t, scalar_t> > vwgts;
      size_t nVtx = model->getVertexList(vtxIDs, vwgts);
      // the weights are not used at this point.

      //get the adjacencies in a view that holds GIDs
      ArrayView<const gno_t> adjs;
      //get the CSR offsets
      ArrayView<const offset_t> offsets;
      ArrayView<StridedData<lno_t, scalar_t> > ewgts;
      model->getEdgeList(adjs, offsets, ewgts);

      //This map is used to map GID to LID initially
      std::unordered_map<gno_t,lno_t> globalToLocal;
      //this vector holds GIDs for owned and ghost vertices
      //The final order of the gids is:
      //  -----------------------------------
      //  |       | first  | second  |
      //  | owned | layer  | layer   |
      //  |       | ghosts | ghosts  |
      //  -----------------------------------
      //
      //  constructSecondGhostLayer reorders the first layer ghost
      //  GIDs in the particular order that the communication requires.
      std::vector<gno_t> ownedPlusGhosts;

      //this vector holds the owning process for
      //owned vertices and the first ghost layer.
      //owners[i] gives the processor owning ownedPlusGhosts[i],
      //for owned vertices and first layer ghosts.
      //
      //This should only be used BEFORE the call to constructSecondGhostLayer
      std::vector<int> owners;

      //fill these structures using owned vertices;
      //owned vertices get LIDs 0..nVtx-1 in input order.
      for(int i = 0; i < vtxIDs.size(); i++){
        globalToLocal[vtxIDs[i]] = i;
        ownedPlusGhosts.push_back(vtxIDs[i]);
        owners.push_back(comm->getRank());
      }

      //count the initial number of ghosts,
      //map them to a local ID.
      //The globalToLocal map has first layer ghosts
      //from here on.
      int nGhosts = 0;
      std::vector<lno_t> local_adjs;
      for(int i = 0; i < adjs.size(); i++){
        if(globalToLocal.count(adjs[i])==0){
          //first time we see this neighbor GID: it is a first-layer
          //ghost, assign it the next LID after the owned vertices.
          ownedPlusGhosts.push_back(adjs[i]);
          globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
          nGhosts++;
        }
        local_adjs.push_back(globalToLocal[adjs[i]]);
      }

      //construct a Tpetra map for the FEMultiVector;
      //"dummy" lets Tpetra compute the global element count itself.
      Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
                                         <Tpetra::global_size_t>::invalid();
      RCP<const map_t> mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));

      //GID and owner lookup for ghost vertices;
      //owners are initialized to -1 and filled in by getRemoteIndexList.
      std::vector<gno_t> ghosts;
      std::vector<int> ghostowners;
      for(size_t i = nVtx; i < nVtx+nGhosts; i++){
        ghosts.push_back(ownedPlusGhosts[i]);
        ghostowners.push_back(-1);
      }

      //get the owners of the ghost vertices
      ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
      ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
      mapOwned->getRemoteIndexList(gids, owningProcs);

      //add ghost owners to the total owner vector
      for(size_t i = 0; i < ghostowners.size(); i++){
        owners.push_back(ghostowners[i]);
      }

      //construct the second ghost layer.
      //NOTE: this may reorder the LIDs of the ghosts.
      //
      //these vectors hold the CSR representation of the
      // first ghost layer. This CSR contains:
      //   - edges from first layer ghosts to:
      //       - owned vertices
      //       - first layer ghosts
      //       - second layer ghosts
      //
      //   - first_layer_ghost_adjs uses GIDs that need to be
      //     translated to LIDs.
      //
      //   - first_layer_ghost_offsets are indexed by LID,
      //     first_layer_ghost_offsets[i] corresponds to
      //     the vertex with GID = ownedPlusGhosts[i+nVtx].
      //     first_layer_ghost_offsets.size() = #first layer ghosts + 1
      //
      std::vector< gno_t> first_layer_ghost_adjs;
      std::vector< offset_t> first_layer_ghost_offsets;
      constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
                                first_layer_ghost_adjs, first_layer_ghost_offsets);

      //we potentially reordered the local IDs of the ghost vertices, so we need
      //to re-insert the GIDs into the global to local ID mapping.
      globalToLocal.clear();
      for(size_t i = 0; i < ownedPlusGhosts.size(); i++){
        globalToLocal[ownedPlusGhosts[i]] = i;
      }

      //use updated globalToLocal map to translate
      //adjacency GIDs to LIDs
      for(int i = 0 ; i < adjs.size(); i++){
        local_adjs[i] = globalToLocal[adjs[i]];
      }

      //at this point, we have ownedPlusGhosts with 1layer ghosts' GIDs.
      //Need to map the second ghost layer GIDs to new local IDs,
      //and add them to the map, as well as translating 2layer ghost
      //adjacencies to use those new LIDs.
      size_t n2Ghosts = 0;

      //local_ghost_adjs is the LID translation of first_layer_ghost_adjs.
      std::vector<lno_t> local_ghost_adjs;
      for(size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
        if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
          //second-layer ghosts get LIDs after owned verts and
          //first-layer ghosts.
          ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
          globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
          n2Ghosts++;
        }
        local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
      }

      //construct data structures necessary for FEMultiVector
      if(verbose) std::cout<<comm->getRank()<<": constructing Tpetra map with copies\n";
      dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
      //mapWithCopies covers owned vertices plus ALL ghosts (both layers).
      RCP<const map_t> mapWithCopies = rcp(new map_t(dummy,
                                           Teuchos::arrayViewFromVector(ownedPlusGhosts),
                                           0, comm));
      if(verbose) std::cout<<comm->getRank()<<": done constructing map with copies\n";

      using import_t = Tpetra::Import<lno_t, gno_t>;
      Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
                                                         mapWithCopies));
      if(verbose) std::cout<<comm->getRank()<<": done constructing importer\n";
      //single-column FEMultiVector holds one color per vertex
      //(owned + ghost copies).
      Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
                                                 importer, 1, true));
      if(verbose) std::cout<<comm->getRank()<<": done constructing femv\n";

      //Create random numbers seeded on global IDs to resolve conflicts
      //consistently between processes.
      //Seeding srand per-GID makes every process generate the same
      //number for the same vertex, regardless of which rank holds it.
      std::vector<int> rand(ownedPlusGhosts.size());
      for(size_t i = 0; i < rand.size(); i++){
        std::srand(ownedPlusGhosts[i]);
        rand[i] = std::rand();
      }

      // get up-to-date owners for all ghosts.
      std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
      std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
      for(size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
      Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
      Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
      mapOwned->getRemoteIndexList(ghostGIDs,owners2);
      if(verbose) std::cout<<comm->getRank()<<": done getting ghost owners\n";

      //calculations for how many 2GL verts are in the boundary of another process, only
      //do this if it's going to be displayed.
      //This whole block is statistics-only; it has no effect on the coloring.
      if(verbose) {
        std::cout<<comm->getRank()<<": calculating 2GL stats...\n";

        std::vector<int> sendcounts(comm->getSize(),0);
        std::vector<gno_t> sdispls(comm->getSize()+1,0);
        //loop through owners, count how many vertices we'll send to each processor.
        //Indices >= nGhosts in ghostGIDs are the second-layer ghosts.
        for(int i = nGhosts; i < ghostGIDs.size(); i++){
          if(owners2[i] != comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
        }
        //construct sdispls (for building sendbuf), and sum the total sendcount
        gno_t sendcount = 0;
        for(int i = 1; i < comm->getSize()+1; i++){
          sdispls[i] = sdispls[i-1] + sendcounts[i-1];
          sendcount += sendcounts[i-1];
        }
        //idx tracks the next write position per destination rank
        std::vector<gno_t> idx(comm->getSize(),0);
        for(int i = 0; i < comm->getSize(); i++){
          idx[i]=sdispls[i];
        }
        //construct sendbuf to send GIDs to owning processes
        std::vector<gno_t> sendbuf(sendcount,0);
        for(size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
          if(owners2[i] != comm->getRank() && owners2[i] != -1){
            sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
          }
        }
        //do the communication
        Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
        Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
        Teuchos::ArrayRCP<gno_t> recvbuf;
        std::vector<int> recvcounts(comm->getSize(),0);
        Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
        Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
        std::vector<int> is_bndry_send(recvbuf.size(),0);

        //send back how many received vertices are in the boundary.
        //A vertex is "boundary" if any neighbor is a ghost (LID >= nVtx).
        for(int i = 0; i < recvbuf.size(); i++){
          size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
          is_bndry_send[i] = 0;
          if(lid < nVtx){
            //owned vertex: scan its adjacency in the owned CSR
            for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
              if((size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
            }
          } else{
            //first-layer ghost: scan the ghost-layer CSR.
            //NOTE(review): first_layer_ghost_offsets is indexed here with
            //the raw lid rather than lid-nVtx as its documentation above
            //suggests — confirm intent; this only affects verbose stats.
            for(offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
              if((size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
            }
          }
        }

        //send back the boundary flags
        Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
        Teuchos::ArrayRCP<int> is_bndry_recv;
        std::vector<int> bndry_recvcounts(comm->getSize(),0);
        Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
        Zoltan2::AlltoAllv<int> (*comm, *env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);

        //add together the flags to compute how many boundary vertices are in the second ghost layer
        int boundaryverts = 0;
        for(int i = 0; i < is_bndry_recv.size(); i++){
          boundaryverts += is_bndry_recv[i];
        }
        //this cout is guarded by a verbose check.
        std::cout<<comm->getRank()<<": "<<boundaryverts<<" boundary verts out of "<<n2Ghosts<<" verts in 2GL\n";
      }


      //local CSR representation for the owned vertices:
      Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
      //NOTE: the offset ArrayView was constructed earlier, and is up-to-date.
      //
      //first ghost layer CSR representation:
      Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
      Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
      Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
      Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
      //these ArrayViews contain LIDs that are ghosted remotely,
      //and the Process IDs that have those ghost copies.
      //An LID may appear in the exportLIDs more than once.
      Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
      Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();

      //construct a quick lookup datastructure to map from LID to a
      //list of PIDs we need to send data to.
      std::unordered_map<lno_t, std::vector<int>> procs_to_send;
      for(int i = 0; i < exportLIDs.size(); i++){
        procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
      }

      //call the coloring algorithm
      twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
                    femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);

      //copy colors to the output array
      ArrayRCP<int> colors = solution->getColorsRCP();
      auto femvdata = femv->getData(0);
      for(size_t i=0; i<nVtx; i++){
        colors[i] = femvdata[i];
      }

    }
1004// private:
1005
1006 //This is the coloring logic for all 2GL-based methods.
1007 //
1008 //INPUT ARGS:
1009 //
1010 // n_local: the number of local owned vertices
1011 //
1012 // n_total: the number of local owned vertices plus first layer ghosts
1013 //
1014 // adjs: the CSR adjacencies for the graph, only including owned vertices
1015 // and first layer ghosts
1016 //
1017 // offsets: the CSR offsets for the graph, has owned offsets into adjacencies
1018 //
1019 // ghost_adjs: the adjacency information for first-layer ghost vertices.
1020 // Includes all neighbors (owned, first-layer ghost,
1021 // second-layer ghost) for each first-layer ghost.
1022 //
1023 //
1024 // ghost_offsets: the offsets into ghost_adjs, first layer ghost LIDs
1025 // minus n_local are used to index this
1026 // datastructure. I.E. ghost_offsets(0)
1027 // corresponds to the ghost with LID n_local
1028 //
1029 // gids: a vector that holds GIDs for all vertices on this process
1030 // (includes owned, and all ghosts, indexable by LID)
1031 // The GIDs are ordered like this:
1032 // ----------------------------------
1033 // | | first | second |
1034 // | owned | layer | layer |
1035 // | | ghosts | ghosts |
1036 // ----------------------------------
1037 // ^ ^
1038 // n_local n_total
1039 //
1040 // rand: a vector that holds random numbers generated on GID for tie
1041 // breaking. Indexed by LID.
1042 //
1043 // ghost_owners: contains the process ID for the owner of each remote vertex.
1044 // Indexable by LID. owners[i] = the owning process for vertex
1045 // with GID = gids[i+n_local]
1046 //
1047 // mapOwnedPlusGhosts: map that can translate between GID and LID
1048 //
1049 // procs_to_send: for each vertex that is copied on a remote process,
1050 // this map contains the list of processes that have
1051 // a copy of a given vertex. Input LID, get the list
1052 // of PIDs that have a ghost copy of that LID.
1053 //
1054 //OUTPUT ARGS:
1055 //
1056 // femv: an FEMultiVector that holds a dual view of the colors.
1057 // After this call, femv contains updated colors.
1058 //
1059 void twoGhostLayer(const size_t n_local, const size_t n_total,
1060 const Teuchos::ArrayView<const lno_t>& adjs,
1061 const Teuchos::ArrayView<const offset_t>& offsets,
1062 const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1063 const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1064 const Teuchos::RCP<femv_t>& femv,
1065 const Teuchos::ArrayView<const gno_t>& gids,
1066 const Teuchos::ArrayView<const int>& rand,
1067 const Teuchos::ArrayView<const int>& ghost_owners,
1068 RCP<const map_t> mapOwnedPlusGhosts,
1069 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
1070 //initialize timing variables
1071 double total_time = 0.0;
1072 double interior_time = 0.0;
1073 double comm_time = 0.0;
1074 double comp_time = 0.0;
1075 double recoloring_time=0.0;
1076 double conflict_detection = 0.0;
1077
1078 //Number of rounds we are saving statistics for
1079 //100 is a decent default. Reporting requires --verbose argument.
1080 const int numStatisticRecordingRounds = 100;
1081
1082 //includes all ghosts, including the second layer.
1083 const size_t n_ghosts = rand.size() - n_local;
1084
1085
1086 //Get the degrees of all ghost vertices
1087 //This is necessary for recoloring based on degrees,
1088 //we need ghost vertex degrees for consistency.
1089 std::vector<int> deg_send_cnts(comm->getSize(),0);
1090 std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
1091 for(int i = 0; i < ghost_owners.size(); i++){
1092 deg_send_cnts[ghost_owners[i]]++;
1093 }
1094 deg_sdispls[0] = 0;
1095 gno_t deg_sendsize = 0;
1096 std::vector<int> deg_sentcount(comm->getSize(),0);
1097 for(int i = 1; i < comm->getSize()+1; i++){
1098 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1099 deg_sendsize += deg_send_cnts[i-1];
1100 }
1101 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1102 for(int i = 0; i < ghost_owners.size(); i++){
1103 size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1104 deg_sentcount[ghost_owners[i]]++;
1105 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1106 }
1107 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1108 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1109 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1110 std::vector<int> deg_recvcnts(comm->getSize(),0);
1111 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1112 Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1113
1114 //replace GID with the degree the owning process has for this vertex.
1115 //(this will include all edges present for this vertex globally)
1116 //This is safe to do, since we sent ghosts to their owners.
1117 for(int i = 0; i < deg_recvbuf.size(); i++){
1118 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1119 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1120 }
1121 //send modified buffer back
1122 ArrayRCP<gno_t> ghost_degrees;
1123 Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1124
1125 //create the ghost degree views, for use on host and device.
1126 Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
1127 typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1128 for(int i = 0; i < ghost_degrees.size(); i++){
1129 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1130 ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1131 }
1132 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1133
1134 //track the size of sends and receives through coloring rounds.
1135 gno_t recvPerRound[numStatisticRecordingRounds];
1136 gno_t sentPerRound[numStatisticRecordingRounds];
1137
1138 //find global max degree to determine which
1139 //coloring algorithm will be the most efficient
1140 //(For D1-2GL this is important, D2 and PD2 should always use NBBIT
1141 offset_t local_max_degree = 0;
1142 offset_t global_max_degree = 0;
1143 for(size_t i = 0; i < n_local; i++){
1144 offset_t curr_degree = offsets[i+1] - offsets[i];
1145 if(curr_degree > local_max_degree){
1146 local_max_degree = curr_degree;
1147 }
1148 }
1149 Teuchos::reduceAll<int, offset_t>(*comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1150 if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
1151
1152 if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for initial coloring\n";
1153
1154 //the initial coloring is able to use a standard CSR representation, so we construct that here.
1155 //This is a direct translation of the offsets and adjs arguments into Kokkos::Views.
1156 Kokkos::View<offset_t*, device_type> offsets_dev("Host Offset View", offsets.size());
1157 typename Kokkos::View<offset_t*, device_type>::HostMirror offsets_host = Kokkos::create_mirror(offsets_dev);
1158 Kokkos::View<lno_t*, device_type> adjs_dev("Host Adjacencies View", adjs.size());
1159 typename Kokkos::View<lno_t*, device_type>::HostMirror adjs_host = Kokkos::create_mirror(adjs_dev);
1160 for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1161 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1162 Kokkos::deep_copy(offsets_dev,offsets_host);
1163 Kokkos::deep_copy(adjs_dev, adjs_host);
1164
1165
1166 //create the graph structures which allow KokkosKernels to recolor the conflicting vertices
1167 if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for recoloring\n";
1168
1169 //in order to correctly recolor, all ghost vertices on this process need an entry in the CSR offsets.
1170 //Otherwise, the color of the ghosts will be ignored, and the coloring will not converge.
1171 Kokkos::View<offset_t*, device_type> dist_degrees_dev("Owned+Ghost degree view",rand.size());
1172 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1173
1174 //calculate the local degrees for the owned, first layer ghosts, and second layer ghosts
1175 //to be used to construct the CSR representation of the local graph.
1176 //owned
1177 for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1178 //first layer ghosts
1179 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1180 //second layer ghosts
1181 for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1182 //second layer ghosts have LID >= n_total
1183 if((size_t)ghost_adjs[i] >= n_total ){
1184 dist_degrees_host(ghost_adjs[i])++;
1185 }
1186 }
1187
1188 //The structure of the final CSR constructed here looks like:
1189 //
1190 // Index by LID--v
1191 // --------------------------------------------
1192 // | | first |second |
1193 // dist_offsets_dev/host |owned verts | layer |layer |
1194 // | | ghosts |ghosts |
1195 // --------------------------------------------
1196 // |indexes
1197 // v
1198 // --------------------------------------------
1199 // | |first |second |
1200 // dist_adjs_dev/host |owned adjs |layer |layer |
1201 // | |ghost adjs |ghost adjs |
1202 // --------------------------------------------
1203 //
1204 // We end up with a CSR that has adjacency information for all the vertices on
1205 // this process. We include only edges on the process, so ghosts have only partial
1206 // adjacency information.
1207 //
1208 // We symmetrize the local graph representation as well, for
1209 // KokkosKernels to behave as we need it to. This means that edges to
1210 // second layer ghosts must be symmetrized, so we end up with offsets
1211 // that correspond to second layer ghosts.
1212 Kokkos::View<offset_t*, device_type> dist_offsets_dev("Owned+Ghost Offset view", rand.size()+1);
1213 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1214
1215 //we can construct the entire offsets using the degrees we computed above
1216 dist_offsets_host(0) = 0;
1217 offset_t total_adjs = 0;
1218 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1219 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1220 total_adjs += dist_degrees_host(i-1);
1221 }
1222 Kokkos::View<lno_t*, device_type> dist_adjs_dev("Owned+Ghost adjacency view", total_adjs);
1223 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1224
1225 //zero out the degrees to use it as a counter.
1226 //The offsets now allow us to compute degrees,
1227 //so we aren't losing anything.
1228 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1229 dist_degrees_host(i) = 0;
1230 }
1231 //We can just copy the adjacencies for the owned verts and first layer ghosts
1232 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1233 for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1234
1235 //We have to symmetrize edges that involve a second layer ghost.
1236 //Add edges from the second layer ghosts to their neighbors.
1237 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1238 //loop through each first layer ghost
1239 for(offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1240 //if an adjacency is a second layer ghost
1241 if((size_t)ghost_adjs[j] >= n_total){
1242 //compute the offset to place the symmetric edge, and place it.
1243 dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1244 //increment the counter to keep track of how many adjacencies
1245 //have been placed.
1246 dist_degrees_host(ghost_adjs[j])++;
1247 }
1248 }
1249 }
1250 //copy the host views back to the device views
1251 Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1252 Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1253 Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1254
1255 //this view represents how many conflicts were found
1256 Kokkos::View<size_t*, device_type> recoloringSize("Recoloring Queue Size",1);
1257 typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1258 recoloringSize_host(0) = 0;
1259 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1260
1261 //create a view for the tie-breaking random numbers.
1262 if(verbose) std::cout<<comm->getRank()<<": constructing rand and GIDs views\n";
1263 Kokkos::View<int*, device_type> rand_dev("Random View", rand.size());
1264 typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
1265 for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1266 Kokkos::deep_copy(rand_dev,rand_host);
1267
1268 //create a view for the global IDs of each vertex.
1269 Kokkos::View<gno_t*, device_type> gid_dev("GIDs", gids.size());
1270 typename Kokkos::View<gno_t*, device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
1271 for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1272 Kokkos::deep_copy(gid_dev,gid_host);
1273
1274 //These variables will be initialized by constructBoundary()
1275 //
1276 //boundary_verts_dev holds all owned vertices that could possibly conflict
1277 //with a remote vertex. Stores LIDs.
1278 Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1279 //this is the number of vertices in the boundary.
1280 if(verbose) std::cout<<comm->getRank()<<": constructing communication and recoloring lists\n";
1281
1282 //We keep track of the vertices that need to get recolored
1283 //this list can contain ghost vertices, so it needs to be initialized
1284 //to the number of all vertices on the local process.
1285 //rand has an entry for each vertex, so its size corresponds to what is needed.
1286 Kokkos::View<lno_t*, device_type> verts_to_recolor_view("verts to recolor", rand.size());
1287 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1288
1289 //This view keeps track of the size of the list of vertices to recolor.
1290 Kokkos::View<int*, device_type> verts_to_recolor_size("verts to recolor size",1);
1291 Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1292 typename Kokkos::View<int*, device_type>::HostMirror verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1293
1294 //initialize the host view
1295 verts_to_recolor_size_host(0) = 0;
1296 //copy to device
1297 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1298
1299 //verts to send can only include locally owned vertices,
1300 //so we can safely allocate the view to size n_local.
1301 //
1302 //This view is a list of local vertices that are going to be
1303 //recolored, and need to be sent to their remote copies.
1304 Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send", n_local);
1305 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
1306
1307 //this view keeps track of the size of verts_to_send.
1308 Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
1309 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1310 typename Kokkos::View<size_t*, device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
1311
1312 verts_to_send_size_host(0) = 0;
1313 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1314
1315 if(verbose) std::cout<<comm->getRank()<<": Constructing the boundary\n";
1316
1317 //this function allocates and initializes the boundary_verts_dev view,
1318 //sets the boundary_size accordingly, and adds vertices to the
1319 //verts_to_send_atomic view, updating the size view as well.
1320 constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1321 verts_to_send_view, verts_to_send_size_atomic);
1322
1323 //this boolean chooses which KokkosKernels algorithm to use.
1324 //It was experimentally chosen for distance-1 coloring.
1325 //Should be disregarded for distance-2 colorings.
1326 bool use_vbbit = (global_max_degree < 6000);
1327 //Done initializing, start coloring!
1328
1329 //use a barrier if we are reporting timing info
1330 if(timing) comm->barrier();
1331 interior_time = timer();
1332 total_time = timer();
1333 //give the entire local graph to KokkosKernels to color
1334 this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1335 interior_time = timer() - interior_time;
1336 comp_time = interior_time;
1337
1338 //ghost_colors holds the colors of only ghost vertices.
1339 //ghost_colors(0) holds the color of a vertex with LID n_local.
1340 //To index this with LIDs, subtract n_local. If an LID is < n_local,
1341 //it will not have a color stored in this view.
1342 Kokkos::View<int*,device_type> ghost_colors("ghost color backups", n_ghosts);
1343
1344 //communicate the initial coloring.
1345 if(verbose) std::cout<<comm->getRank()<<": communicating\n";
1346
1347 //update the host views for the verts to send,
1348 //doOwnedToGhosts needs host memory views, but doesn't
1349 //change the values in them, so no need to copy afterwards
1350 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1351 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1352 gno_t recv,sent;
1353 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1354 sentPerRound[0] = sent;
1355 recvPerRound[0] = recv;
1356
1357 //store ghost colors so we can restore them after recoloring.
1358 //the local process can't color ghosts correctly, so we
1359 //reset the colors to avoid consistency issues.
1360 //get the color view from the FEMultiVector
1361 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1362 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1363 Kokkos::parallel_for(n_ghosts, KOKKOS_LAMBDA(const int& i){
1364 ghost_colors(i) = femv_colors(i+n_local);
1365 });
1366 Kokkos::fence();
1367
1368 double temp = timer();
1369 //detect conflicts only for ghost vertices
1370 bool recolor_degrees = this->pl->template get<bool>("recolor_degrees",false);
1371 if(verbose) std::cout<<comm->getRank()<<": detecting conflicts\n";
1372
1373 //these sizes will be updated by detectConflicts, zero them out beforehand
1374 verts_to_send_size_host(0) = 0;
1375 verts_to_recolor_size_host(0) = 0;
1376 recoloringSize_host(0) = 0;
1377 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1378 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1379 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1380
1381 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1382 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1383 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1384
1385 //these sizes were updated by detectConflicts,
1386 //copy the device views back into host memory
1387 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1388 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1389 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1390 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1391
1392 if(comm->getSize() > 1){
1393 conflict_detection = timer() - temp;
1394 comp_time += conflict_detection;
1395 }
1396 //all conflicts detected!
1397 if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
1398 //variables for statistics
1399 double totalPerRound[numStatisticRecordingRounds];
1400 double commPerRound[numStatisticRecordingRounds];
1401 double compPerRound[numStatisticRecordingRounds];
1402 double recoloringPerRound[numStatisticRecordingRounds];
1403 double conflictDetectionPerRound[numStatisticRecordingRounds];
1404 uint64_t vertsPerRound[numStatisticRecordingRounds];
1405 uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds];
1406 int distributedRounds = 1;
1407 totalPerRound[0] = interior_time + comm_time + conflict_detection;
1408 recoloringPerRound[0] = 0;
1409 commPerRound[0] = comm_time;
1410 compPerRound[0] = interior_time + conflict_detection;
1411 conflictDetectionPerRound[0] = conflict_detection;
1412 recoloringPerRound[0] = 0;
1413 vertsPerRound[0] = 0;
1414 incorrectGhostsPerRound[0]=0;
1415 typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
1416 typename Kokkos::View<lno_t*, device_type>::HostMirror boundary_verts_host;
1417 size_t serial_threshold = this->pl->template get<int>("serial_threshold",0);
1418 //see if recoloring is necessary.
1419 size_t totalConflicts = 0;
1420 size_t localConflicts = recoloringSize_host(0);
1421 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1422 bool done = !totalConflicts;
1423 if(comm->getSize()==1) done = true;
1424
1425 //recolor until no conflicts are left
1426 while(!done){
1427 //if the number of vertices left to recolor is less than
1428 //the serial threshold set by the user, finish the coloring
1429 //only using host views in a serial execution space.
1430 if(recoloringSize_host(0) < serial_threshold) break;
1431 if(distributedRounds < numStatisticRecordingRounds) {
1432 vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1433 }
1434
1435 if(timing) comm->barrier();
1436 double recolor_temp = timer();
1437 //recolor using KokkosKernels' coloring function
1438 if(verts_to_recolor_size_host(0) > 0){
1439 this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1440 }
1441
1442 if(distributedRounds < numStatisticRecordingRounds){
1443 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1444 recoloring_time += recoloringPerRound[distributedRounds];
1445 comp_time += recoloringPerRound[distributedRounds];
1446 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1447 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1448 } else if(timing){
1449 double recoloring_round_time = timer() - recolor_temp;
1450 recoloring_time += recoloring_round_time;
1451 comp_time += recoloring_round_time;
1452 }
1453
1454 //reset the ghost colors to what they were before recoloring
1455 //the recoloring does not have enough information to color
1456 //ghosts correctly, so we set the colors to what they were before
1457 //to avoid consistency issues.
1458 Kokkos::parallel_for(n_ghosts, KOKKOS_LAMBDA(const int& i){
1459 femv_colors(i+n_local) = ghost_colors(i);
1460 });
1461 Kokkos::fence();
1462
1463 //send views are up-to-date, they were copied after conflict detection.
1464 //communicate the new colors
1465
1466 // Reset device views
1467 femvColors = decltype(femvColors)();
1468 femv_colors = decltype(femv_colors)();
1469 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1470 comm_time += curr_comm_time;
1471
1472 if(distributedRounds < numStatisticRecordingRounds){
1473 commPerRound[distributedRounds] = curr_comm_time;
1474 recvPerRound[distributedRounds] = recv;
1475 sentPerRound[distributedRounds] = sent;
1476 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1477 }
1478
1479 //store ghost colors before we uncolor and recolor them.
1480 //this process doesn't have enough info to correctly color
1481 //ghosts, so we set them back to what they were before to
1482 //remove consistency issues.
1483 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1484 femv_colors = subview(femvColors, Kokkos::ALL, 0);
1485 Kokkos::parallel_for(n_ghosts, KOKKOS_LAMBDA(const int& i){
1486 ghost_colors(i) = femv_colors(i+n_local);
1487 });
1488 Kokkos::fence();
1489
1490
1491 //these views will change in detectConflicts, they need
1492 //to be zeroed out beforehand
1493 verts_to_send_size_host(0) = 0;
1494 verts_to_recolor_size_host(0) = 0;
1495 recoloringSize_host(0) = 0;
1496 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1497 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1498 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1499
1500 //check for further conflicts
1501 double detection_temp = timer();
1502
1503 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1504 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1505 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1506
1507 //copy the updated device views back into host memory where necessary
1508 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1509 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1510 //we only use the list of verts to recolor on device, no need to copy to host.
1511 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1512 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1513
1514 if(distributedRounds < numStatisticRecordingRounds){
1515 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1516 conflict_detection += conflictDetectionPerRound[distributedRounds];
1517 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1518 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1519 comp_time += conflictDetectionPerRound[distributedRounds];
1520 } else if(timing){
1521 double conflict_detection_round_time = timer() - detection_temp;
1522 conflict_detection += conflict_detection_round_time;
1523 comp_time += conflict_detection_round_time;
1524 }
1525
1526 distributedRounds++;
1527 size_t localDone = recoloringSize_host(0);
1528 size_t globalDone = 0;
1529 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1530 done = !globalDone;
1531
1532 }//end device coloring
1533
1534
1535 //If we reach this point and still have vertices to color,
1536 //the rest of the coloring is happening in serial on the host.
1537 //
1538 //First, we need to copy device-only views to host mirrors
1539 if(recoloringSize_host(0) > 0 || !done){
1540 ghost_colors_host = Kokkos::create_mirror_view_and_copy(host_mem(),ghost_colors,"ghost_colors host mirror");
1541 boundary_verts_host = Kokkos::create_mirror_view_and_copy(host_mem(),boundary_verts_dev,"boundary_verts host mirror");
1542 }
1543
1544 //Now we do a similar coloring loop to before,
1545 //but using only host views in a serial execution space.
1546 // Reset device views
1547 femvColors = decltype(femvColors)();
1548 femv_colors = decltype(femv_colors)();
1549 while(recoloringSize_host(0) > 0 || !done){
1550 auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1551 auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1552 if(distributedRounds < numStatisticRecordingRounds){
1553 vertsPerRound[distributedRounds] = recoloringSize_host(0);
1554 }
1555 if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
1556 if(timing) comm->barrier();
1557
1558 double recolor_temp = timer();
1559 if(verts_to_recolor_size_host(0) > 0){
1560 this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1561 verts_to_recolor_host, verts_to_recolor_size_host(0), true);
1562 }
1563 if(distributedRounds < numStatisticRecordingRounds){
1564 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1565 recoloring_time += recoloringPerRound[distributedRounds];
1566 comp_time += recoloringPerRound[distributedRounds];
1567 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1568 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1569 } else if(timing){
1570 double recoloring_serial_round_time = timer() - recolor_temp;
1571 recoloring_time += recoloring_serial_round_time;
1572 comp_time += recoloring_serial_round_time;
1573 }
1574
1575 //reset the ghost colors to their previous values to avoid
1576 //consistency issues.
1577 for(size_t i = 0; i < n_ghosts; i++){
1578 colors_host(i+n_local) = ghost_colors_host(i);
1579 }
1580
1581 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1582 comm_time += curr_comm_time;
1583
1584 if(distributedRounds < numStatisticRecordingRounds){
1585 commPerRound[distributedRounds] = curr_comm_time;
1586 recvPerRound[distributedRounds] = recv;
1587 sentPerRound[distributedRounds] = sent;
1588 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1589 }
1590
1591 //store updated ghost colors we received from their owners
1592 //before conflict detection and recoloring changes them locally.
1593 for(size_t i = 0; i < n_ghosts; i++){
1594 ghost_colors_host(i) = colors_host(i+n_local);
1595 }
1596
1597 if(timing) comm->barrier();
1598 double detection_temp = timer();
1599
1600 //zero these out, they'll be updated by detectConflicts_serial
1601 verts_to_recolor_size_host(0) = 0;
1602 verts_to_send_size_host(0) = 0;
1603 recoloringSize_host(0) = 0;
1604
1605 detectConflicts_serial(n_local,dist_offsets_host, dist_adjs_host, colors_host, boundary_verts_host,
1606 verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1607 recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1608
1609 //no need to update the host views, we're only using host views now.
1610
1611 if(distributedRounds < numStatisticRecordingRounds){
1612 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1613 conflict_detection += conflictDetectionPerRound[distributedRounds];
1614 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1615 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1616 comp_time += conflictDetectionPerRound[distributedRounds];
1617 } else if(timing){
1618 double conflict_detection_serial_round_time = timer() - detection_temp;
1619 conflict_detection += conflict_detection_serial_round_time;
1620 comp_time += conflict_detection_serial_round_time;
1621 }
1622
1623 size_t globalDone = 0;
1624 size_t localDone = recoloringSize_host(0);
1625 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1626 distributedRounds++;
1627 done = !globalDone;
1628 }
1629
1630 total_time = timer() - total_time;
1631 //only compute stats if they will be displayed
1632 if(verbose){
1633 uint64_t localBoundaryVertices = 0;
1634 for(size_t i = 0; i < n_local; i++){
1635 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1636 if((size_t)adjs[j] >= n_local){
1637 localBoundaryVertices++;
1638 break;
1639 }
1640 }
1641 }
1642
1643 if(comm->getRank() == 0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1644 uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1645 uint64_t totalBoundarySize = 0;
1646 uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1647 double finalTotalPerRound[numStatisticRecordingRounds];
1648 double maxRecoloringPerRound[numStatisticRecordingRounds];
1649 double minRecoloringPerRound[numStatisticRecordingRounds];
1650 double finalCommPerRound[numStatisticRecordingRounds];
1651 double finalCompPerRound[numStatisticRecordingRounds];
1652 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1653 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1654 gno_t finalSentPerRound[numStatisticRecordingRounds];
1655 for(int i = 0; i < numStatisticRecordingRounds; i++){
1656 totalVertsPerRound[i] = 0;
1657 finalTotalPerRound[i] = 0.0;
1658 maxRecoloringPerRound[i] = 0.0;
1659 minRecoloringPerRound[i] = 0.0;
1660 finalCommPerRound[i] = 0.0;
1661 finalCompPerRound[i] = 0.0;
1662 finalConflictDetectionPerRound[i] = 0.0;
1663 finalRecvPerRound[i] = 0;
1664 finalSentPerRound[i] = 0;
1665 }
1666 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1667 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1668 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1669 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1670 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1671 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1672 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1673 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1674 Teuchos::reduceAll<int,double>(*comm,
1675 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1676 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1677 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1678 printf("Rank %d: boundary size: %ld\n",comm->getRank(),localBoundaryVertices);
1679 if(comm->getRank() == 0) printf("Total boundary size: %ld\n",totalBoundarySize);
1680 for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
1681 printf("Rank %d: recolor %ld vertices in round %d\n",comm->getRank(), vertsPerRound[i],i);
1682 printf("Rank %d: sentbuf had %lld entries in round %d\n", comm->getRank(), sentPerRound[i],i);
1683 if(comm->getRank()==0){
1684 printf("recolored %ld vertices in round %d\n",totalVertsPerRound[i], i);
1685 printf("%ld inconsistent ghosts in round %d\n",totalIncorrectGhostsPerRound[i],i);
1686 printf("total time in round %d: %f\n",i,finalTotalPerRound[i]);
1687 printf("recoloring time in round %d: %f\n",i,maxRecoloringPerRound[i]);
1688 printf("min recoloring time in round %d: %f\n",i,minRecoloringPerRound[i]);
1689 printf("conflict detection time in round %d: %f\n",i,finalConflictDetectionPerRound[i]);
1690 printf("comm time in round %d: %f\n",i,finalCommPerRound[i]);
1691 printf("recvbuf size in round %d: %lld\n",i,finalRecvPerRound[i]);
1692 printf("sendbuf size in round %d: %lld\n",i,finalSentPerRound[i]);
1693 printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
1694 }
1695 }
1696 } else if (timing){
1697 double global_total_time = 0.0;
1698 double global_recoloring_time = 0.0;
1699 double global_min_recoloring_time = 0.0;
1700 double global_conflict_detection=0.0;
1701 double global_comm_time=0.0;
1702 double global_comp_time=0.0;
1703 double global_interior_time=0.0;
1704 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1705 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1706 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1707 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1708 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1709 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1710 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1711 comm->barrier();
1712 fflush(stdout);
1713 if(comm->getRank()==0){
1714 printf("Total Time: %f\n",global_total_time);
1715 printf("Interior Time: %f\n",global_interior_time);
1716 printf("Recoloring Time: %f\n",global_recoloring_time);
1717 printf("Min Recoloring Time: %f\n",global_min_recoloring_time);
1718 printf("Conflict Detection Time: %f\n",global_conflict_detection);
1719 printf("Comm Time: %f\n",global_comm_time);
1720 printf("Comp Time: %f\n",global_comp_time);
1721 }
1722 }
1723 }
1724}; //end class
1725
1726template <typename Adapter>
1727void AlgTwoGhostLayer<Adapter>::buildModel(modelFlag_t &flags){
1728 flags.set(REMOVE_SELF_EDGES);
1729
1730 this->env->debug(DETAILED_STATUS, " building graph model");
1731 this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
1732 this->comm, flags));
1733 this->env->debug(DETAILED_STATUS, " graph model built");
1734}
1735
1736}//end namespace Zoltan2
1737
1738#endif
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
Tpetra::Map<>::memory_space memory_space
typename Kokkos::View< device_type >::HostMirror::execution_space host_exec
Tpetra::Map<>::device_type device_type
Tpetra::Map<>::execution_space execution_space
virtual void constructBoundary(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, Kokkos::View< lno_t *, device_type > &boundary_verts, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic)=0
typename Adapter::base_adapter_t base_adapter_t
void twoGhostLayer(const size_t n_local, const size_t n_total, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::ArrayView< const lno_t > &ghost_adjs, const Teuchos::ArrayView< const offset_t > &ghost_offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &ghost_owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int > > &procs_to_send)
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
typename Adapter::offset_t offset_t
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
virtual void detectConflicts(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, Kokkos::View< int *, device_type > femv_colors, Kokkos::View< lno_t *, device_type > boundary_verts_view, Kokkos::View< lno_t *, device_type > verts_to_recolor_view, Kokkos::View< int *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_recolor_size_atomic, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic, Kokkos::View< size_t *, device_type > recoloringSize, Kokkos::View< int *, device_type > rand, Kokkos::View< gno_t *, device_type > gid, Kokkos::View< gno_t *, device_type > ghost_degrees, bool recolor_degrees)=0
typename Kokkos::View< device_type >::HostMirror::memory_space host_mem
virtual void detectConflicts_serial(const size_t n_local, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, typename Kokkos::View< int *, device_type >::HostMirror femv_colors, typename Kokkos::View< lno_t *, device_type >::HostMirror boundary_verts_view, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_recolor_view, typename Kokkos::View< int *, device_type >::HostMirror verts_to_recolor_size_atomic, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_send_view, typename Kokkos::View< size_t *, device_type >::HostMirror verts_to_send_size_atomic, typename Kokkos::View< size_t *, device_type >::HostMirror recoloringSize, typename Kokkos::View< int *, device_type >::HostMirror rand, typename Kokkos::View< gno_t *, device_type >::HostMirror gid, typename Kokkos::View< gno_t *, device_type >::HostMirror ghost_degrees, bool recolor_degrees)=0
RCP< Teuchos::ParameterList > pl
RCP< GraphModel< base_adapter_t > > model
RCP< const base_adapter_t > adapter
AlgTwoGhostLayer(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
typename Adapter::lno_t lno_t
Tpetra::Map< lno_t, gno_t > map_t
RCP< const Teuchos::Comm< int > > comm
Algorithm defines the base class for all algorithms.
The class containing coloring solution.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ DETAILED_STATUS
sub-steps, each method's entry and exit
@ REMOVE_SELF_EDGES
algorithm requires no self edges