1#ifndef _ZOLTAN2_ALGHYBRIDD1_HPP_
2#define _ZOLTAN2_ALGHYBRIDD1_HPP_
5#include <unordered_map>
23#include "Tpetra_Core.hpp"
24#include "Teuchos_RCP.hpp"
25#include "Tpetra_Import.hpp"
26#include "Tpetra_FEMultiVector.hpp"
28#include "KokkosKernels_Handle.hpp"
29#include "KokkosKernels_IOUtils.hpp"
30#include "KokkosGraph_Distance1Color.hpp"
31#include "KokkosGraph_Distance1ColorHandle.hpp"
40template <
typename Adapter>
50 using map_t = Tpetra::Map<lno_t, gno_t>;
52 using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
56 using host_exec =
typename Kokkos::View<device_type>::HostMirror::execution_space;
57 using host_mem =
typename Kokkos::View<device_type>::HostMirror::memory_space;
60 gettimeofday(&tp, NULL);
61 return ((
double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
93 template <
class ExecutionSpace,
typename MemorySpace>
94 void colorInterior(
const size_t nVtx,
95 Kokkos::View<
lno_t*, Kokkos::Device<ExecutionSpace,MemorySpace> > adjs_view,
96 Kokkos::View<
offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > offset_view,
97 Teuchos::RCP<femv_t> femv,
98 Kokkos::View<
lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > vertex_list,
99 size_t vertex_list_size = 0,
103 using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle
106 using lno_row_view_t = Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
107 using lno_nnz_view_t = Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
115 kh.create_graph_coloring_handle(KokkosGraph::COLORING_VBBIT);
117 kh.create_graph_coloring_handle(KokkosGraph::COLORING_EB);
122 if(vertex_list_size != 0){
123 kh.get_graph_coloring_handle()->set_vertex_list(vertex_list,vertex_list_size);
126 kh.set_verbose(verbose);
130 auto femvColors = femv->template getLocalView<Kokkos::Device<ExecutionSpace,MemorySpace> >(Tpetra::Access::ReadWrite);
131 auto sv = subview(femvColors, Kokkos::ALL, 0);
132 kh.get_graph_coloring_handle()->set_vertex_colors(sv);
133 kh.get_graph_coloring_handle()->set_tictoc(verbose);
134 KokkosGraph::Experimental::graph_color_symbolic<KernelHandle, lno_row_view_t, lno_nnz_view_t>
135 (&kh, nVtx, nVtx, offset_view, adjs_view);
139 std::cout<<
"\nKokkosKernels Coloring: "<<kh.get_graph_coloring_handle()->get_overall_coloring_time()<<
" iterations: "<<kh.get_graph_coloring_handle()->get_num_phases()<<
"\n\n";
182 template <
class ExecutionSpace,
typename MemorySpace>
184 Kokkos::View<
offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_offsets,
185 Kokkos::View<
lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_adjs,
186 Kokkos::View<
int*, Kokkos::Device<ExecutionSpace, MemorySpace>> femv_colors,
188 Kokkos::Device<ExecutionSpace, MemorySpace>> verts_to_send_view,
189 Kokkos::View<
size_t*,
190 Kokkos::Device<ExecutionSpace, MemorySpace>,
191 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
192 Kokkos::View<
gno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> recoloringSize,
194 Kokkos::Device<ExecutionSpace, MemorySpace>> rand,
196 Kokkos::Device<ExecutionSpace, MemorySpace>> gid,
198 Kokkos::Device<ExecutionSpace, MemorySpace>> ghost_degrees,
199 bool recolor_degrees){
200 gno_t local_recoloring_size;
201 Kokkos::parallel_reduce(
"Conflict Detection",
202 Kokkos::RangePolicy<ExecutionSpace>(0,n_ghosts),
203 KOKKOS_LAMBDA(
const int& i,
204 gno_t& recoloring_size){
205 lno_t lid = i+n_local;
206 int currColor = femv_colors(lid);
207 int currDegree = ghost_degrees(i);
208 for(
offset_t j = dist_offsets(lid); j < dist_offsets(lid+1); j++){
209 int nborColor = femv_colors(dist_adjs(j));
211 if((
size_t)dist_adjs(j) < n_local) nborDegree = dist_offsets(dist_adjs(j)+1) - dist_offsets(dist_adjs(j));
212 else nborDegree = ghost_degrees(dist_adjs(j) - n_local);
213 if(currColor == nborColor ){
214 if(currDegree < nborDegree && recolor_degrees){
215 femv_colors(lid) = 0;
217 }
else if (nborDegree < currDegree && recolor_degrees){
218 femv_colors(dist_adjs(j)) = 0;
220 }
else if(rand(lid) > rand(dist_adjs(j))){
221 femv_colors(lid) = 0;
224 }
else if (rand(dist_adjs(j)) > rand(lid)){
225 femv_colors(dist_adjs(j)) = 0;
228 if (gid(lid) >= gid(dist_adjs(j))){
229 femv_colors(lid) = 0;
233 femv_colors(dist_adjs(j)) = 0;
239 },local_recoloring_size);
240 Kokkos::deep_copy(recoloringSize, local_recoloring_size);
242 Kokkos::parallel_for(
"Rebuild verts_to_send_view",
243 Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
244 KOKKOS_LAMBDA(
const int& i){
245 if(femv_colors(i) == 0){
246 verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
284 double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
286 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send,
287 typename Kokkos::View<size_t*, device_type>::HostMirror& verts_to_send_size,
288 Teuchos::RCP<femv_t> femv,
289 const std::unordered_map<
lno_t, std::vector<int>>& procs_to_send,
291 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
292 auto colors = subview(femvColors, Kokkos::ALL, 0);
293 int nprocs = comm->getSize();
295 std::vector<int> sendcnts(comm->getSize(), 0);
296 std::vector<gno_t> sdispls(comm->getSize()+1, 0);
297 for(
size_t i = 0; i < verts_to_send_size(0); i++){
298 for(
size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
299 sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
305 std::vector<int> sentcount(nprocs, 0);
307 for(
int i = 1; i < comm->getSize()+1; i++){
308 sdispls[i] = sdispls[i-1] + sendcnts[i-1];
309 sendsize += sendcnts[i-1];
312 std::vector<gno_t> sendbuf(sendsize,0);
314 for(
size_t i = 0; i < verts_to_send_size(0); i++){
315 std::vector<int> procs = procs_to_send.at(verts_to_send(i));
316 for(
size_t j = 0; j < procs.size(); j++){
317 size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
318 sentcount[procs[j]] += 2;
319 sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
320 sendbuf[idx] = colors(verts_to_send(i));
324 Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
325 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
326 Teuchos::ArrayRCP<gno_t> recvbuf;
327 std::vector<int> recvcnts(comm->getSize(), 0);
328 Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
331 if(timing) comm->barrier();
332 double comm_total = 0.0;
333 double comm_temp =
timer();
335 AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
336 comm_total +=
timer() - comm_temp;
340 for(
int i = 0; i < recvcnts_view.size(); i++){
341 recvsize += recvcnts_view[i];
344 for(
int i = 0; i < recvsize; i+=2){
345 size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
346 if(lid<nVtx && verbose) std::cout<<comm->getRank()<<
": received a locally owned vertex, somehow\n";
347 colors(lid) = recvbuf[i+1];
353 RCP<const base_adapter_t> adapter;
354 RCP<GraphModel<base_adapter_t> > model;
355 RCP<Teuchos::ParameterList> pl;
356 RCP<Environment> env;
357 RCP<const Teuchos::Comm<int> > comm;
363 const RCP<const base_adapter_t> &adapter_,
364 const RCP<Teuchos::ParameterList> &pl_,
365 const RCP<Environment> &env_,
366 const RCP<
const Teuchos::Comm<int> > &comm_)
367 : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
368 verbose = pl->get<
bool>(
"verbose",
false);
369 timing = pl->get<
bool>(
"timing",
false);
370 if(verbose) std::cout<<comm->getRank()<<
": inside coloring constructor\n";
374 if(verbose) std::cout<<comm->getRank()<<
": done constructing coloring class\n";
380 if(verbose) std::cout<<comm->getRank()<<
": inside coloring function\n";
384 if(verbose) std::cout<<comm->getRank()<<
": getting owned vtxIDs\n";
385 ArrayView<const gno_t> vtxIDs;
386 ArrayView<StridedData<lno_t, scalar_t> > vwgts;
387 size_t nVtx = model->getVertexList(vtxIDs, vwgts);
389 if(verbose) std::cout<<comm->getRank()<<
": getting edge list\n";
391 ArrayView<const gno_t> adjs;
392 ArrayView<const offset_t> offsets;
393 ArrayView<StridedData<lno_t, scalar_t> > ewgts;
394 model->getEdgeList(adjs, offsets, ewgts);
397 RCP<const map_t> mapOwned;
398 RCP<const map_t> mapWithCopies;
400 std::vector<gno_t> finalGIDs;
401 std::vector<offset_t> finalOffset_vec;
402 std::vector<lno_t> finalAdjs_vec;
404 std::vector<lno_t> reorderToLocal;
405 for(
size_t i = 0; i< nVtx; i++) reorderToLocal.push_back(i);
406 if(verbose) std::cout<<comm->getRank()<<
": Setting up local datastructures\n";
408 std::unordered_map<gno_t,lno_t> globalToLocal;
409 std::vector<gno_t> ownedPlusGhosts;
410 for (
gno_t i = 0; i < vtxIDs.size(); i++){
411 if(vtxIDs[i] < 0 && verbose) std::cout<<comm->getRank()<<
": found a negative GID\n";
412 globalToLocal[vtxIDs[i]] = i;
413 ownedPlusGhosts.push_back(vtxIDs[i]);
416 for (
int i = 0; i < adjs.size(); i++){
417 if(globalToLocal.count(adjs[i]) == 0){
419 if(adjs[i] < 0 && verbose) std::cout<<comm->getRank()<<
": found a negative adjacency\n";
420 ownedPlusGhosts.push_back(adjs[i]);
421 globalToLocal[adjs[i]] = vtxIDs.size() + nghosts;
426 if(verbose) std::cout<<comm->getRank()<<
": vec.max_size() = "<<finalAdjs_vec.max_size()<<
", adjs.size() = "<<adjs.size()<<
"\n";
427 finalAdjs_vec.resize(adjs.size());
428 for(
size_t i = 0; i < finalAdjs_vec.size();i++){
429 finalAdjs_vec[i] = globalToLocal[adjs[i]];
431 for(
int i = 0; i < offsets.size(); i++) finalOffset_vec.push_back(offsets[i]);
432 finalGIDs = ownedPlusGhosts;
435 if(verbose) std::cout<<comm->getRank()<<
": creating Tpetra Maps\n";
438 mapOwned = rcp(
new map_t(dummy, vtxIDs, 0, comm));
440 dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
441 mapWithCopies = rcp(
new map_t(dummy,
442 Teuchos::arrayViewFromVector(ownedPlusGhosts),
448 if(verbose)std::cout<<comm->getRank()<<
": creating FEMultiVector\n";
449 typedef Tpetra::Import<lno_t, gno_t> import_t;
450 Teuchos::RCP<import_t> importer = rcp(
new import_t(mapOwned,
452 Teuchos::RCP<femv_t> femv = rcp(
new femv_t(mapOwned,
455 ArrayRCP<int> colors = solution->getColorsRCP();
456 for(
size_t i=0; i<nVtx; i++){
464 std::vector<int> rand(finalGIDs.size());
465 for(
size_t i = 0; i < finalGIDs.size(); i++){
466 std::srand(finalGIDs[i]);
467 rand[i] = std::rand();
471 std::vector<int> ghostOwners(finalGIDs.size() - nVtx);
472 std::vector<gno_t> ghosts(finalGIDs.size() - nVtx);
473 for(
size_t i = nVtx; i < finalGIDs.size(); i++) ghosts[i-nVtx] = finalGIDs[i];
474 ArrayView<int> owners = Teuchos::arrayViewFromVector(ghostOwners);
475 ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts);
476 mapOwned->getRemoteIndexList(ghostGIDs, owners);
479 ArrayView<const offset_t> finalOffsets = Teuchos::arrayViewFromVector(finalOffset_vec);
480 ArrayView<const lno_t> finalAdjs = Teuchos::arrayViewFromVector(finalAdjs_vec);
481 ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
482 ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(finalGIDs);
485 Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
486 Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
489 std::unordered_map<lno_t, std::vector<int>> procs_to_send;
490 for(
int i = 0; i < exportLIDs.size(); i++){
491 procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
495 hybridGMB(nVtx, finalAdjs, finalOffsets,femv,gid_view,rand_view,owners,mapWithCopies, procs_to_send);
498 auto femvdata = femv->getData(0);
499 for(
int i = 0; i < colors.size(); i++){
500 colors[reorderToLocal[i]] = femvdata[i];
537 const Teuchos::ArrayView<const lno_t>& adjs,
538 const Teuchos::ArrayView<const offset_t>& offsets,
539 const Teuchos::RCP<femv_t>& femv,
540 const Teuchos::ArrayView<const gno_t>& gids,
541 const Teuchos::ArrayView<const int>& rand,
542 const Teuchos::ArrayView<const int>& owners,
543 RCP<const map_t> mapOwnedPlusGhosts,
544 const std::unordered_map<
lno_t, std::vector<int>>& procs_to_send){
545 if(verbose) std::cout<<comm->getRank()<<
": inside coloring algorithm\n";
548 double total_time = 0.0;
549 double interior_time = 0.0;
550 double comm_time = 0.0;
551 double comp_time = 0.0;
552 double recoloring_time = 0.0;
553 double conflict_detection = 0.0;
555 const int numStatisticRecordingRounds = 100;
559 std::vector<int> deg_send_cnts(comm->getSize(),0);
560 std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
561 for(
int i = 0; i < owners.size(); i++){
562 deg_send_cnts[owners[i]]++;
565 gno_t deg_sendsize = 0;
566 std::vector<int> deg_sentcount(comm->getSize(),0);
567 for(
int i = 1; i < comm->getSize()+1; i++){
568 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
569 deg_sendsize += deg_send_cnts[i-1];
571 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
572 for(
int i = 0; i < owners.size(); i++){
573 size_t idx = deg_sdispls[owners[i]] + deg_sentcount[owners[i]];
574 deg_sentcount[owners[i]]++;
575 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+nVtx);
577 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
578 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
579 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
580 std::vector<int> deg_recvcnts(comm->getSize(),0);
581 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
582 AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
585 for(
int i = 0; i < deg_recvbuf.size(); i++){
586 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
587 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
590 ArrayRCP<gno_t> ghost_degrees;
591 AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
593 Kokkos::View<gno_t*, device_type> ghost_degrees_dev(
"ghost degree view",ghost_degrees.size());
594 typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
595 for(
int i = 0; i < ghost_degrees.size(); i++){
596 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
597 ghost_degrees_host(lid-nVtx) = ghost_degrees[i];
599 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
603 for(
size_t i = 0; i < nVtx; i++){
604 offset_t curr_degree = offsets[i+1] - offsets[i];
605 if(curr_degree > local_max_degree){
606 local_max_degree = curr_degree;
609 Teuchos::reduceAll<int, offset_t>(*comm,Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
610 if(comm->getRank() == 0 && verbose) std::cout<<
"Input has max degree "<<global_max_degree<<
"\n";
611 if(verbose)std::cout<<comm->getRank()<<
": creating Kokkos Views\n";
613 Kokkos::View<offset_t*, device_type> dist_degrees(
"Owned+Ghost degree view",rand.size());
614 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees);
616 for(
int i = 0; i < adjs.size(); i++){
617 if((
size_t)adjs[i] < nVtx)
continue;
618 dist_degrees_host(adjs[i])++;
621 for(
int i = 0; i < offsets.size()-1; i++){
622 dist_degrees_host(i) = offsets[i+1] - offsets[i];
625 Kokkos::View<offset_t*, device_type> dist_offsets(
"Owned+Ghost Offset view", rand.size()+1);
626 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets);
629 dist_offsets_host(0) = 0;
630 uint64_t total_adjs = 0;
631 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
632 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
633 total_adjs+= dist_degrees_host(i-1);
636 Kokkos::View<lno_t*, device_type> dist_adjs(
"Owned+Ghost adjacency view", total_adjs);
637 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs);
639 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
640 dist_degrees_host(i) = 0;
642 for(
int i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
643 if(comm->getSize() > 1){
644 for(
size_t i = 0; i < nVtx; i++){
645 for(
offset_t j = offsets[i]; j < offsets[i+1]; j++){
647 if( (
size_t)adjs[j] >= nVtx){
649 dist_adjs_host(dist_offsets_host(adjs[j]) + dist_degrees_host(adjs[j])) = i;
650 dist_degrees_host(adjs[j])++;
656 if(verbose) std::cout<<comm->getRank()<<
": copying host mirrors to device views\n";
658 Kokkos::deep_copy(dist_degrees, dist_degrees_host);
659 Kokkos::deep_copy(dist_offsets, dist_offsets_host);
660 Kokkos::deep_copy(dist_adjs, dist_adjs_host);
661 if(verbose) std::cout<<comm->getRank()<<
": done copying to device\n";
664 Kokkos::View<gno_t*, device_type> recoloringSize(
"Recoloring Queue Size",1);
665 typename Kokkos::View<gno_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
666 recoloringSize_host(0) = 0;
667 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
670 Kokkos::View<int*,device_type> rand_dev(
"randVec",rand.size());
671 typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
672 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
673 rand_host(i) = rand[i];
677 Kokkos::View<gno_t*, device_type> gid_dev(
"GIDs",gids.size());
678 typename Kokkos::View<gno_t*,device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
679 for(Teuchos_Ordinal i = 0; i < gids.size(); i++){
680 gid_host(i) = gids[i];
684 Kokkos::deep_copy(rand_dev,rand_host);
685 Kokkos::deep_copy(gid_dev, gid_host);
687 if(verbose)std::cout<<comm->getRank()<<
": done creating recoloring datastructures\n";
690 for(
size_t i = 0; i < nVtx; i++){
691 for(
offset_t j = offsets[i]; j < offsets[i+1]; j++){
692 if((
size_t)adjs[j] >= nVtx) {
698 if(verbose)std::cout<<comm->getRank()<<
": creating send views\n";
701 Kokkos::View<lno_t*, device_type> verts_to_send_view(
"verts to send",boundary_size);
702 Kokkos::parallel_for(boundary_size, KOKKOS_LAMBDA(
const int& i){
703 verts_to_send_view(i) = -1;
707 Kokkos::View<size_t*, device_type> verts_to_send_size(
"verts to send size",1);
708 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic> > verts_to_send_size_atomic = verts_to_send_size;
709 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
710 typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
712 verts_to_send_size_host(0) = 0;
713 deep_copy(verts_to_send_size, verts_to_send_size_host);
715 if(verbose)std::cout<<comm->getRank()<<
": Done creating send views, initializing...\n";
716 if(verbose)std::cout<<comm->getRank()<<
": boundary_size = "<<boundary_size<<
" verts_to_send_size_atomic(0) = "<<verts_to_send_size_atomic(0)<<
"\n";
718 Kokkos::parallel_for(
"Initialize verts_to_send",nVtx, KOKKOS_LAMBDA(
const int&i){
719 for(
offset_t j = dist_offsets(i); j < dist_offsets(i+1); j++){
720 if((
size_t)dist_adjs(j) >= nVtx){
721 verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
730 Kokkos::View<int*, device_type> ghost_colors(
"ghost color backups", rand.size()-nVtx);
731 if(verbose)std::cout<<comm->getRank()<<
": Done initializing\n";
732 gno_t sentPerRound[numStatisticRecordingRounds];
733 gno_t recvPerRound[numStatisticRecordingRounds];
735 if(verbose) std::cout<<comm->getRank()<<
": Coloring interior\n";
738 if(timing) comm->barrier();
739 interior_time =
timer();
740 total_time =
timer();
742 bool use_vbbit = (global_max_degree < 6000);
743 this->colorInterior<execution_space,memory_space>
744 (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
746 interior_time =
timer() - interior_time;
747 comp_time = interior_time;
749 if(verbose) std::cout<<comm->getRank()<<
": Going to recolor\n";
750 bool recolor_degrees = this->pl->template get<bool>(
"recolor_degrees",
true);
753 if(comm->getSize() > 1){
755 if(verbose)std::cout<<comm->getRank()<<
": going to communicate\n";
758 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
759 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
761 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
764 verts_to_send_size_host,
769 sentPerRound[0] = sent;
770 recvPerRound[0] = recv;
771 if(verbose) std::cout<<comm->getRank()<<
": done communicating\n";
772 verts_to_send_size_host(0) = 0;
773 deep_copy(verts_to_send_size, verts_to_send_size_host);
776 Kokkos::View<int**, Kokkos::LayoutLeft, device_type> femvColors =
777 femv->template getLocalView<device_type>(Tpetra::Access::ReadWrite);
778 Kokkos::View<int*, device_type> femv_colors = subview(femvColors, Kokkos::ALL, 0);
779 Kokkos::parallel_for(rand.size()-nVtx,KOKKOS_LAMBDA(
const int& i){
780 ghost_colors(i) = femv_colors(i+nVtx);
784 double temp =
timer();
785 detectConflicts<execution_space, memory_space>(nVtx,
791 verts_to_send_size_atomic,
797 deep_copy(recoloringSize_host, recoloringSize);
798 conflict_detection +=
timer() - temp;
799 comp_time += conflict_detection;
802 if(verbose)std::cout<<comm->getRank()<<
": done initial recoloring, begin recoloring loop\n";
803 double totalPerRound[numStatisticRecordingRounds];
804 double commPerRound[numStatisticRecordingRounds];
805 double compPerRound[numStatisticRecordingRounds];
806 double recoloringPerRound[numStatisticRecordingRounds];
807 double conflictDetectionPerRound[numStatisticRecordingRounds];
808 double serialRecoloringPerRound[numStatisticRecordingRounds];
809 int vertsPerRound[numStatisticRecordingRounds];
811 if(comm->getSize() == 1) done =
true;
812 totalPerRound[0] = interior_time + comm_time + conflict_detection;
813 recoloringPerRound[0] = 0;
814 commPerRound[0] = comm_time;
815 compPerRound[0] = interior_time + conflict_detection;
816 conflictDetectionPerRound[0] = conflict_detection;
817 recoloringPerRound[0] = 0;
818 vertsPerRound[0] = 0;
819 int distributedRounds = 1;
820 int serial_threshold = this->pl->template get<int>(
"serial_threshold",0);
822 Kokkos::View<lno_t*, device_type> verts_to_recolor(
"verts_to_recolor", boundary_size);
823 typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
825 while(recoloringSize_host(0) > 0 || !done){
826 if(recoloringSize_host(0) < serial_threshold)
break;
828 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
829 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
831 if(distributedRounds < numStatisticRecordingRounds) {
832 vertsPerRound[distributedRounds] = recoloringSize_host(0);
838 Kokkos::deep_copy(verts_to_recolor, verts_to_send_view);
840 double recolor_temp =
timer();
842 deep_copy(verts_to_send_size_host, verts_to_send_size);
843 if(verts_to_send_size_host(0) > 0){
846 dist_adjs,dist_offsets,
849 verts_to_send_size_host(0),
853 if(distributedRounds < numStatisticRecordingRounds){
854 recoloringPerRound[distributedRounds] =
timer() - recolor_temp;
855 recoloring_time += recoloringPerRound[distributedRounds];
856 comp_time += recoloringPerRound[distributedRounds];
857 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
858 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
860 double recolor_round_time =
timer() - recolor_temp;
861 recoloring_time += recolor_round_time;
862 comp_time += recolor_round_time;
867 recoloringSize_host(0) = 0;
868 Kokkos::deep_copy(recoloringSize,recoloringSize_host);
870 Kokkos::parallel_for(rand.size()-nVtx, KOKKOS_LAMBDA(
const int& i){
871 femv_colors(i+nVtx) = ghost_colors(i);
875 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
876 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
879 femvColors =
decltype(femvColors)();
880 femv_colors =
decltype(femv_colors)();
882 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
885 verts_to_send_size_host,
890 comm_time += curr_comm_time;
891 if(distributedRounds < numStatisticRecordingRounds){
892 commPerRound[distributedRounds] = curr_comm_time;
893 sentPerRound[distributedRounds] = sent;
894 recvPerRound[distributedRounds] = recv;
895 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
901 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
902 femv_colors = subview(femvColors, Kokkos::ALL, 0);
903 Kokkos::parallel_for(rand.size()-nVtx, KOKKOS_LAMBDA(
const int& i){
904 ghost_colors(i) = femv_colors(i+nVtx);
907 verts_to_send_size_host(0) = 0;
908 deep_copy(verts_to_send_size, verts_to_send_size_host);
909 double detection_temp =
timer();
910 detectConflicts<execution_space, memory_space>(nVtx,
916 verts_to_send_size_atomic,
923 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
925 if(distributedRounds < numStatisticRecordingRounds){
926 conflictDetectionPerRound[distributedRounds] =
timer() - detection_temp;
927 conflict_detection += conflictDetectionPerRound[distributedRounds];
928 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
929 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
930 comp_time += conflictDetectionPerRound[distributedRounds];
932 double conflict_detection_round_time =
timer()- detection_temp;
933 conflict_detection += conflict_detection_round_time;
934 comp_time += conflict_detection_round_time;
938 int localDone = recoloringSize_host(0);
939 Teuchos::reduceAll<int, int>(*comm,Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
946 if(recoloringSize_host(0) > 0 || !done){
947 ghost_colors_host = Kokkos::create_mirror_view(ghost_colors);
948 deep_copy(ghost_colors_host, ghost_colors);
949 deep_copy(verts_to_send_host, verts_to_send_view);
950 deep_copy(verts_to_send_size_host, verts_to_send_size);
955 while(recoloringSize_host(0) > 0 || !done){
957 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
958 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
961 if(distributedRounds < 100){
962 vertsPerRound[distributedRounds] = recoloringSize_host(0);
965 double recolor_temp =
timer();
967 if(verts_to_send_size_host(0) > 0){
970 (femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, verts_to_send_host, verts_to_send_size_host(0),
true);
973 if(distributedRounds < numStatisticRecordingRounds){
974 recoloringPerRound[distributedRounds] =
timer() - recolor_temp;
975 recoloring_time += recoloringPerRound[distributedRounds];
976 comp_time += recoloringPerRound[distributedRounds];
977 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
978 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
980 double recolor_serial_round_time =
timer() - recolor_temp;
981 recoloring_time += recolor_serial_round_time;
982 comp_time += recolor_serial_round_time;
985 recoloringSize_host(0) = 0;
987 for(
size_t i = 0; i < rand.size() -nVtx; i++){
988 femv_colors(i+nVtx) = ghost_colors_host(i);
992 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
995 verts_to_send_size_host,
1000 comm_time += curr_comm_time;
1002 if(distributedRounds < numStatisticRecordingRounds){
1003 commPerRound[distributedRounds] = curr_comm_time;
1004 sentPerRound[distributedRounds] = sent;
1005 recvPerRound[distributedRounds] = recv;
1006 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1008 for(
size_t i = 0; i < rand.size()-nVtx; i++){
1009 ghost_colors_host(i) = femv_colors(i+nVtx);
1012 verts_to_send_size_host(0) = 0;
1013 double detection_temp =
timer();
1014 detectConflicts<host_exec, host_mem>(nVtx,
1020 verts_to_send_size_host,
1021 recoloringSize_host,
1026 if(distributedRounds < numStatisticRecordingRounds){
1027 conflictDetectionPerRound[distributedRounds] =
timer() - detection_temp;
1028 conflict_detection += conflictDetectionPerRound[distributedRounds];
1029 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1030 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1031 comp_time += conflictDetectionPerRound[distributedRounds];
1033 double conflict_detection_serial_round_time =
timer() - detection_temp;
1034 conflict_detection += conflict_detection_serial_round_time;
1035 comp_time += conflict_detection_serial_round_time;
1039 int localDone = recoloringSize_host(0);
1040 Teuchos::reduceAll<int, int>(*comm, Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
1041 distributedRounds++;
1044 total_time =
timer() - total_time;
1048 std::cout<<comm->getRank()<<
": done recoloring loop, computing statistics\n";
1049 int localBoundaryVertices = 0;
1050 for(
size_t i = 0; i < nVtx; i++){
1051 for(
offset_t j = offsets[i]; j < offsets[i+1]; j++){
1052 if((
size_t)adjs[j] >= nVtx){
1053 localBoundaryVertices++;
1059 if(comm->getRank()==0) printf(
"did %d rounds of distributed coloring\n", distributedRounds);
1060 int totalBoundarySize = 0;
1061 int totalVertsPerRound[numStatisticRecordingRounds];
1062 double finalTotalPerRound[numStatisticRecordingRounds];
1063 double maxRecoloringPerRound[numStatisticRecordingRounds];
1064 double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
1065 double minRecoloringPerRound[numStatisticRecordingRounds];
1066 double finalCommPerRound[numStatisticRecordingRounds];
1067 double finalCompPerRound[numStatisticRecordingRounds];
1068 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1069 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1070 gno_t finalSentPerRound[numStatisticRecordingRounds];
1071 for(
int i = 0; i < numStatisticRecordingRounds; i++) {
1072 totalVertsPerRound[i] = 0;
1073 finalTotalPerRound[i] = 0.0;
1074 maxRecoloringPerRound[i] = 0.0;
1075 minRecoloringPerRound[i] = 0.0;
1076 finalCommPerRound[i] = 0.0;
1077 finalCompPerRound[i] = 0.0;
1078 finalConflictDetectionPerRound[i] = 0.0;
1079 finalSentPerRound[i] = 0;
1080 finalRecvPerRound[i] = 0;
1082 Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,1, &localBoundaryVertices,&totalBoundarySize);
1083 Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1084 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound,finalTotalPerRound);
1085 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1086 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1087 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,serialRecoloringPerRound,finalSerialRecoloringPerRound);
1088 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1089 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1090 Teuchos::reduceAll<int,double>(*comm,
1091 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1092 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound, finalRecvPerRound);
1093 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
1095 printf(
"Rank %d: boundary size: %d\n",comm->getRank(),localBoundaryVertices);
1096 if(comm->getRank()==0) printf(
"Total boundary size: %d\n",totalBoundarySize);
1097 for(
int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
1098 printf(
"Rank %d: recolor %d vertices in round %d\n",comm->getRank(),vertsPerRound[i],i);
1099 if(comm->getRank()==0) printf(
"recolored %d vertices in round %d\n",totalVertsPerRound[i],i);
1100 if(comm->getRank()==0) printf(
"total time in round %d: %f\n",i,finalTotalPerRound[i]);
1101 if(comm->getRank()==0) printf(
"recoloring time in round %d: %f\n",i,maxRecoloringPerRound[i]);
1102 if(comm->getRank()==0) printf(
"serial recoloring time in round %d: %f\n",i,finalSerialRecoloringPerRound[i]);
1103 if(comm->getRank()==0) printf(
"min recoloring time in round %d: %f\n",i,minRecoloringPerRound[i]);
1104 if(comm->getRank()==0) printf(
"conflict detection time in round %d: %f\n",i,finalConflictDetectionPerRound[i]);
1105 if(comm->getRank()==0) printf(
"comm time in round %d: %f\n",i,finalCommPerRound[i]);
1106 if(comm->getRank()==0) printf(
"total sent in round %d: %lld\n",i,finalSentPerRound[i]);
1107 if(comm->getRank()==0) printf(
"total recv in round %d: %lld\n",i,finalRecvPerRound[i]);
1108 if(comm->getRank()==0) printf(
"comp time in round %d: %f\n",i,finalCompPerRound[i]);
1111 double global_total_time = 0.0;
1112 double global_recoloring_time=0.0;
1113 double global_min_recoloring_time=0.0;
1114 double global_conflict_detection=0.0;
1115 double global_comm_time=0.0;
1116 double global_comp_time=0.0;
1117 double global_interior_time = 0.0;
1118 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1119 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1120 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1121 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1122 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1123 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1124 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1127 if(comm->getRank()==0){
1128 printf(
"Total Time: %f\n",global_total_time);
1129 printf(
"Interior Time: %f\n",global_interior_time);
1130 printf(
"Recoloring Time: %f\n",global_recoloring_time);
1131 printf(
"Min Recoloring Time: %f\n",global_min_recoloring_time);
1132 printf(
"Conflict Detection Time: %f\n",global_conflict_detection);
1133 printf(
"Comm Time: %f\n",global_comm_time);
1134 printf(
"Comp Time: %f\n",global_comp_time);
1137 if(verbose) std::cout<<comm->getRank()<<
": exiting coloring\n";
1141template <
typename Adapter>
1142void AlgDistance1<Adapter>::buildModel(
modelFlag_t &flags){
1146 if(verbose) std::cout<<comm->getRank()<<
": starting to construct graph model\n";
1147 this->model = rcp(
new GraphModel<base_adapter_t>(this->adapter, this->env,
1148 this->comm, flags));
1149 if(verbose) std::cout<<comm->getRank()<<
": done constructing graph model\n";
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
Tpetra::Map< lno_t, gno_t > map_t
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
typename Kokkos::View< device_type >::HostMirror::memory_space host_mem
void detectConflicts(const size_t n_local, const size_t n_ghosts, Kokkos::View< offset_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > dist_offsets, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > dist_adjs, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace > > femv_colors, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > verts_to_send_view, Kokkos::View< size_t *, Kokkos::Device< ExecutionSpace, MemorySpace >, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > recoloringSize, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace > > rand, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > gid, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > ghost_degrees, bool recolor_degrees)
typename Adapter::base_adapter_t base_adapter_t
typename Adapter::lno_t lno_t
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
void hybridGMB(const size_t nVtx, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int > > &procs_to_send)
AlgDistance1(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
Tpetra::Map<>::memory_space memory_space
Tpetra::Map<>::device_type device_type
typename Kokkos::View< device_type >::HostMirror::execution_space host_exec
typename Adapter::offset_t offset_t
Tpetra::Map<>::execution_space execution_space
Algorithm defines the base class for all algorithms.
The class containing coloring solution.
map_t::local_ordinal_type lno_t
map_t::global_ordinal_type gno_t
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ DETAILED_STATUS
sub-steps, each method's entry and exit
@ REMOVE_SELF_EDGES
algorithm requires no self edges