#ifndef MPI_FUNCS_H
#define MPI_FUNCS_H

#include <stdlib.h>
#include <cmath>             // std::floor
#include <iostream>          // std::cout, std::cerr
#include <string>
#include <initializer_list>
#include <mpi.h>
#include <cstring>
#include <algorithm>
#include <vector>
#include <numeric>
#include "utils.h"
#include "config.h"

#ifdef USE_OMP
#include <omp.h>
#endif

// Forward declaration for Grid class
class Grid;

inline void initialize_mpi();
inline void finalize_mpi();
inline void cleanup_all_mpi_windows(); // free/cleanup of all MPI windows before MPI_Finalize

inline void mpi_debug(char const* str);

template<typename T>
inline std::vector<size_t> node_reordering(const std::vector<T>&);
inline int check_total_nprocs_and_ndiv();
inline void synchronize_all();
inline void synchronize_all_inter_sim();
inline void synchronize_all_sim();
inline void synchronize_all_sub();
inline void synchronize_all_inter();
inline void synchronize_all_world();
inline void send_bool_single_sim(bool*, int);
inline void recv_bool_single_sim(bool*, int);
inline void send_i_single_sim(int*, int);
inline void recv_i_single_sim(int*, int);
inline void send_cr(CUSTOMREAL*, int, int);
inline void recv_cr(CUSTOMREAL*, int, int);
inline void isend_cr(CUSTOMREAL*, int, int, MPI_Request&);
inline void irecv_cr(CUSTOMREAL*, int, int, MPI_Request&);
inline void send_cr_single_sim(CUSTOMREAL*, int);
inline void recv_cr_single_sim(CUSTOMREAL*, int);
inline void send_str_sim(std::string, int);
inline void recv_str_sim(std::string&, int);
inline void send_str(std::string, int);
inline void recv_str(std::string&, int);
inline void allreduce_i_single(int&, int&);
inline void allreduce_cr_single(CUSTOMREAL&, CUSTOMREAL&);
inline void allreduce_i_inplace(int*, int);
inline void allreduce_i_sim_single_inplace(int&);
inline void allreduce_bool_inplace_inter_sim(bool*, int);
inline void allreduce_bool_inplace(bool*, int);
inline void allreduce_bool_inplace_sub(bool*, int);
inline void allreduce_bool_single_inplace(bool&);
inline void allreduce_bool_single_inplace_world(bool&);
inline void allreduce_cr_inplace(CUSTOMREAL*, int);
inline void allreduce_cr_sim(CUSTOMREAL*, int, CUSTOMREAL*);
inline void allreduce_cr_sim_inplace(CUSTOMREAL*, int);
inline void allreduce_cr_sim_single_inplace(CUSTOMREAL&);
inline void allgather_i_single(int*, int*);
inline void allgather_cr_single(CUSTOMREAL*, CUSTOMREAL*);
inline void allgather_str(const std::string&, std::vector<std::string>&);
inline void broadcast_bool_single(bool&, int);
inline void broadcast_bool_single_sub(bool&, int);
inline void broadcast_bool_single_inter_sim(bool&, int);
inline void broadcast_bool_inter_and_intra_sim(bool&, int);
inline void broadcast_i_single(int&, int);
inline void broadcast_i_single_inter_sim(int&, int);
inline void broadcast_i_single_sub(int&, int);
inline void broadcast_i_single_intra_sim(int&, int);
inline void broadcast_i_single_inter_and_intra_sim(int&, int);
inline void broadcast_f_single(float&, int);
inline void broadcast_cr(CUSTOMREAL*, int, int);
inline void broadcast_cr_single(CUSTOMREAL&, int);
inline void broadcast_cr_single_inplace(CUSTOMREAL&, int);
inline void broadcast_cr_inter_sim(CUSTOMREAL*, int, int);
inline void broadcast_str(std::string&, int);
inline void broadcast_str_inter_sim(std::string&, int);
inline void broadcast_str_sub(std::string&, int);
inline void broadcast_str_inter_and_intra_sim(std::string&, int);
inline void broadcast_cr_single_sub(CUSTOMREAL&, int);
inline void broadcast_cr_sub(CUSTOMREAL*, int, int);
inline void broadcast_cr_single_inter_and_intra_sim(CUSTOMREAL&, int);
inline void prepare_shm_array_cr(int, CUSTOMREAL*&, MPI_Win&);
inline void prepare_shm_array_bool(int, bool*&, MPI_Win&);
inline void cleanup_mpi_win(MPI_Win&);
inline void cleanup_mpi_wins(std::initializer_list<MPI_Win*>);
inline void init_mpi_wins(std::initializer_list<MPI_Win*>);

inline void wait_req(MPI_Request&);
inline void shm_fence(MPI_Win&);

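/*
 * Typical call sequence (a minimal sketch of a hypothetical driver, not part of this header):
 * the driver would initialize MPI, read the input parameters, check the processor layout,
 * split the communicators, and finalize at the end. Only the functions declared above are
 * used; the input-reading step is assumed.
 *
 *   #include "mpi_funcs.h"
 *
 *   int main(int argc, char** argv){
 *       initialize_mpi();                 // MPI_Init(_thread) + world rank/size
 *       // ... read input file here (n_sims, ndiv_i/j/k, n_subprocs, ...) ...
 *       check_total_nprocs_and_ndiv();    // abort if nprocs != n_sims*ndiv_i*ndiv_j*ndiv_k*n_subprocs
 *       split_mpi_comm();                 // build simulation/subdomain/inter-subdomain communicators
 *       // ... solver work ...
 *       finalize_mpi();                   // barrier, window-cleanup hook, MPI_Finalize
 *       return 0;
 *   }
 */
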
inline void initialize_mpi(){
    // Initialize the MPI environment
#ifndef USE_OMP
    MPI_Init(NULL, NULL);
#else
    int provided;
    MPI_Init_thread(NULL, NULL, MPI_THREAD_FUNNELED, &provided);
    // the provided thread level may be higher than the requested one,
    // so only fail if it is lower than MPI_THREAD_FUNNELED
    if (provided < MPI_THREAD_FUNNELED){
        std::cerr << "MPI_THREAD_FUNNELED is not supported" << std::endl;
        exit(1);
    }

    // currently no routine except the src/rec weight calculation is parallelized by OpenMP,
    // thus use 1 thread per process
    omp_set_num_threads(1);

    // check the number of threads
    int nthreads = omp_get_max_threads();

    // error check
    if (nthreads != 1){
        std::cerr << "number of threads is not 1" << std::endl;
        exit(1);
    }

    //if (world_rank == 0)
    //    std::cout << "Number of threads = " << nthreads << std::endl;
#endif

    // Get the number of processes
    MPI_Comm_size(MPI_COMM_WORLD, &world_nprocs);
    // Get the rank of the process
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    // temporarily use these parameters for reading the input file
    nprocs         = world_nprocs;
    myrank         = world_rank;
    inter_sub_comm = MPI_COMM_WORLD;

    stdout_by_rank_zero("mpi initialized.");
}

inline void finalize_mpi(){

    synchronize_all_world();

    // Clean up all MPI windows before finalizing MPI
    cleanup_all_mpi_windows();

    // free communicators if necessary
    //if (sub_comm != MPI_COMM_NULL)
    //    MPI_Comm_free(&sub_comm);
    //if (inter_sub_comm != MPI_COMM_WORLD)
    //    MPI_Comm_free(&inter_sub_comm);
    //if (sim_comm != MPI_COMM_NULL)
    //    MPI_Comm_free(&sim_comm);

    // Finalize the MPI environment.
    MPI_Finalize();

    stdout_by_rank_zero("mpi finalized.");
}

inline void mpi_debug(char const* str){
    std::cout << "rank: " << world_rank << ", " << str << std::endl;
    synchronize_all_world();
}

template<typename T>
inline std::vector<size_t> node_reordering(const std::vector<T>& vec){
    // return the indices that sort vec in descending order
    // (used below to sort ranks by node name, so that processes on the same node become contiguous)
    std::vector<size_t> idx(vec.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&vec](size_t i1, size_t i2) {return vec[i1] > vec[i2];});
    return idx;
}

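/*
 * Example (illustrative values only): for vec = {"nodeB", "nodeA", "nodeB", "nodeA"},
 * node_reordering(vec) returns an index permutation such as {0, 2, 1, 3}: the two
 * "nodeB" entries come first (descending order), then the two "nodeA" entries,
 * so equal node names end up contiguous in the reordered list.
 */
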
inline std::vector<int> define_node_ids(std::vector<std::string>& mpi_node_names_pre){
    std::vector<int> mpi_node_ids(world_nprocs);
    std::vector<std::string> mpi_node_names_unique = mpi_node_names_pre;
    std::sort(mpi_node_names_unique.begin(), mpi_node_names_unique.end());
    mpi_node_names_unique.erase(std::unique(mpi_node_names_unique.begin(), mpi_node_names_unique.end()), mpi_node_names_unique.end());

    for (int irank = 0; irank < world_nprocs; irank++){
        for (long unsigned int i = 0; i < mpi_node_names_unique.size(); i++){
            if (mpi_node_names_pre[irank] == mpi_node_names_unique[i]){
                mpi_node_ids[irank] = i;
                break;
            }
        }
    }

    return mpi_node_ids;
}

inline int count_number_compute_nodes(std::vector<std::string> mpi_node_names) {
    std::vector<std::string> mpi_node_names_unique = mpi_node_names;
    std::sort(mpi_node_names_unique.begin(), mpi_node_names_unique.end());
    mpi_node_names_unique.erase(std::unique(mpi_node_names_unique.begin(), mpi_node_names_unique.end()), mpi_node_names_unique.end());
    return mpi_node_names_unique.size();
}

inline void split_mpi_comm(){
    /*
    This function splits MPI_COMM_WORLD into 3 layers of MPI communication groups:
      - simulation groups
      - subdomain groups
      - a group containing only the subdomain leaders

    The subdomain leader is the first process of each subdomain within a simulation group;
    the remaining processes are assigned as sub-processes of that subdomain.

    # memo
    The h5 grid file is written only by the first group of the simultaneous runs.
    The h5 data is written by the subdomain main processes of all the simultaneous runs.
    The xdmf file is written only by the first subdomain's main process of all the simultaneous runs.
    */

    // make a list of mpi node names and redefine the world_rank
    std::vector<std::string> mpi_node_names(world_nprocs, "not_initialized"), mpi_node_names_pre(world_nprocs, "not_initialized");
    std::vector<int> mpi_node_ids(world_nprocs, -1); // we put an id for each node

    // get the processor name of this process
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
    mpi_node_names_pre[world_rank] = std::string(processor_name);

    //
    // DEBUG ONLY: rename mpi node names into 4 different names
    //
    // //if (world_nprocs != 32){
    // //    std::cout << "ERROR: world_nprocs is not 64. (for processor assignment test)" << std::endl;
    // //    exit(1);
    // //}
    // if (world_nprocs%2 != 0){
    //     std::cout << "ERROR: world_nprocs is not even. (for processor assignment test)" << std::endl;
    //     exit(1);
    // }
    //
    // if (world_rank %2 == 0) {
    //     mpi_node_names_pre[world_rank] = "node_inu";
    // //} else if (world_rank % 4 == 1) {
    // //    mpi_node_names_pre[world_rank] = "node_neko";
    // //} else if (world_rank % 4 == 2) {
    // //    mpi_node_names_pre[world_rank] = "node_kani";
    // } else {
    //     mpi_node_names_pre[world_rank] = "node_usagi";
    // }
    //
    // // debug out
    // if (world_rank == 0){
    //     for (int irank = 0; irank < world_nprocs; irank++){
    //         std::cout << "rank: " << irank << ", node name: " << mpi_node_names_pre[irank] << std::endl;
    //     }
    // }
    //
    // synchronize_all_world();
    //
    // DEBUG node fake name assignment done
    //

    // gather all the node names into a std::vector
    allgather_str(mpi_node_names_pre[world_rank], mpi_node_names_pre);

    // define a node id for each node
    mpi_node_ids = define_node_ids(mpi_node_names_pre);

    // total number of (unique) compute nodes
    int n_compute_nodes = count_number_compute_nodes(mpi_node_names_pre);

    // reorder if more than one compute node is used
    if (n_compute_nodes > 1){

        // sort mpi_node_names and change the world rank accordingly
        std::vector<size_t> node_reorder = node_reordering(mpi_node_names_pre);
        std::vector<std::string> mpi_node_names_sorted(world_nprocs);
        std::vector<int> mpi_node_ids_sorted(world_nprocs);
        for (int irank = 0; irank < world_nprocs; irank++){
            mpi_node_names_sorted[irank] = mpi_node_names_pre[node_reorder[irank]];
            mpi_node_ids_sorted[irank]   = mpi_node_ids[node_reorder[irank]];
        }
        mpi_node_names = mpi_node_names_sorted;
        mpi_node_ids   = mpi_node_ids_sorted;

        // renumber this process's rank
        world_rank = node_reorder[world_rank];

    } else {
        mpi_node_names = mpi_node_names_pre;
        //mpi_node_ids = mpi_node_ids;
    }

    // debug out
    // if (world_rank == 0){
    //     for (int irank = 0; irank < world_nprocs; irank++){
    //         std::cout << "apres rank: " << irank << ", node name: " << mpi_node_names[irank] << ", node id : " << mpi_node_ids[irank] << std::endl;
    //     }
    // }
    // synchronize_all_world();

    // show a summary of node names and process counts
    if (world_rank == 0){
        std::cout << "\n\n Node name summary\n" << std::endl;

        std::cout << "Total number of compute nodes: " << n_compute_nodes << std::endl;

        for (long unsigned int i = 0; i < mpi_node_names.size(); i++){
            // show each node name only once
            if (i == 0 || mpi_node_names[i] != mpi_node_names[i-1]){
                std::cout << "node name: " << mpi_node_names[i] << ", number: " << std::count(mpi_node_names.begin(), mpi_node_names.end(), mpi_node_names[i]) << std::endl;
            }
        }
        std::cout << "\n" << std::endl;
    }

    // create a communicator for each group in simultaneous-run mode
    if(n_sims > 1) {
        // set a simultaneous run id
        if (world_nprocs%n_sims == 0) {
            n_procs_each_sim = static_cast<int>(world_nprocs/n_sims);
            id_sim = std::floor(world_rank/n_procs_each_sim);
            //id_sim = static_cast<int>(world_rank/n_procs_each_sim);
        } else {
            stdout_by_main("Error: requested nproc is not divisible by n_sims.");
            finalize_mpi();
            exit(1);
        }

        // create a communicator for the simulation group
        MPI_Comm_split(MPI_COMM_WORLD, id_sim, world_rank, &sim_comm);
        MPI_Comm_rank(sim_comm, &sim_rank);   // rank in the simulation group
        MPI_Comm_size(sim_comm, &sim_nprocs); // number of processes in the simulation group

        // recreate node_names and node_ids for this simulation group
        std::vector<std::string> mpi_node_names_tmp(sim_nprocs);
        std::vector<int> mpi_node_ids_tmp(sim_nprocs);

        for (int irank = 0; irank < sim_nprocs; irank++){
            mpi_node_names_tmp[irank] = mpi_node_names[n_procs_each_sim*id_sim + irank];
            mpi_node_ids_tmp[irank]   = mpi_node_ids[ n_procs_each_sim*id_sim + irank];
        }

        // re-assign node_names and node_ids
        mpi_node_names = mpi_node_names_tmp;
        mpi_node_ids   = mpi_node_ids_tmp;

        // count the number of (unique) compute nodes again
        n_compute_nodes = count_number_compute_nodes(mpi_node_names);

    } else {
        // this is not a simultaneous run
        n_procs_each_sim = world_nprocs;
        id_sim           = 0;

        sim_comm   = MPI_COMM_WORLD;
        sim_rank   = world_rank;
        sim_nprocs = world_nprocs;
    }

    // inter-communicator (between simulation groups)
    MPI_Comm_split(MPI_COMM_WORLD, sim_rank, id_sim, &inter_sim_comm);
    MPI_Comm_rank(inter_sim_comm, &inter_sim_rank);

    // number of subdomains
    n_subdomains = ndiv_i*ndiv_j*ndiv_k;

    // check that the number of subdomains is a multiple of the number of compute nodes
    if (n_subdomains % n_compute_nodes != 0){
        std::cout << "ERROR: n_subdomains is not a multiple of n_compute_nodes" << std::endl;
        exit(1);
    }

    // number of subdomains per node
    //int n_subdomains_per_node = static_cast<int>(n_subdomains/n_compute_nodes);
    int n_procs_per_subdomain = static_cast<int>(sim_nprocs/n_subdomains);
    id_subdomain = std::floor(sim_rank/n_procs_per_subdomain);
    int id_proc_in_subdomain = sim_rank - id_subdomain*n_procs_per_subdomain;

    // assign the first rank of each subdomain as the subdomain's main process
    // and create a communicator for the main processes of the subdomains
    if (id_proc_in_subdomain == 0) {
        //std::cout << "my sim_rank: " << sim_rank << std::endl;
        subdom_main = true;
        MPI_Comm_split(sim_comm, 0, sim_rank, &inter_sub_comm);
        MPI_Comm_rank(inter_sub_comm, &inter_sub_rank);
        MPI_Comm_size(inter_sub_comm, &inter_sub_nprocs);

        // the calculation in the subdomain will then be done only by the processes
        // with subdom_main == true, using the inter_sub_comm communicator

        // use the rank number within inter_sub_comm as myrank
        myrank = inter_sub_rank;
        nprocs = inter_sub_nprocs;

    } else {
        //std::cout << "my sim_rank to sub : " << sim_rank << std::endl;
        MPI_Comm_split(sim_comm, 1, sim_rank, &inter_sub_comm);

        // assign these ranks as sub-processes of each subdomain
        subdom_main = false;
        myrank = -9999;
        nprocs = -9999;
    }

    synchronize_all_sim();

    MPI_Comm_split(sim_comm, id_subdomain, sim_rank, &sub_comm);
    MPI_Comm_rank(sub_comm, &sub_rank);
    MPI_Comm_size(sub_comm, &sub_nprocs);

    // convert sub_comm to a communicator on which shared memory is available
    MPI_Comm tmp_comm;
    MPI_Comm_split_type(sub_comm, MPI_COMM_TYPE_SHARED, id_subdomain, MPI_INFO_NULL, &tmp_comm);
    sub_comm = tmp_comm;
    MPI_Comm_rank(sub_comm, &sub_rank);
    MPI_Comm_size(sub_comm, &sub_nprocs);

    synchronize_all_world();

    // check processors
    for (int irank = 0; irank < world_nprocs; irank++){
        synchronize_all_world();
        if (irank == world_rank)
            std::cout << "global rank: " << world_rank << ", node name: " << mpi_node_names[world_rank]
                      << " | i_simul/total: " << id_sim << "/" << n_sims << ", n_procs_each_sim: " << n_procs_each_sim
                      << " | i_subdom/total: " << id_subdomain << "/" << n_subdomains << ", subdom_main: " << subdom_main
                      << ", sub_rank/total: " << sub_rank << "/" << sub_nprocs
                      << std::endl;
    }

    synchronize_all_world();

    // // check processors (more detailed debug output)
    // for (int irank = 0; irank < world_nprocs; irank++){
    //     synchronize_all_world();
    //     if (irank == world_rank) {
    //         std::cout << "global rank: " << world_rank << ", node name: " << mpi_node_names[world_rank]
    //                   << ", world_rank: " << world_rank << ", world_nprocs: " << world_nprocs
    //                   << ", sim_rank: " << sim_rank << ", sim_nprocs: " << sim_nprocs
    //                   << ", sub_rank: " << sub_rank << ", sub_nprocs: " << sub_nprocs
    //                   << ", inter_sim_rank: " << inter_sim_rank
    //                   << ", inter_sub_rank: " << inter_sub_rank << ", inter_sub_nprocs: " << inter_sub_nprocs
    //                   << ", id_sim: " << id_sim << ", id_subdomain: " << id_subdomain
    //                   << ", subdom_main: " << subdom_main
    //                   << ", id_proc_in_subdomain: " << id_proc_in_subdomain
    //                   << ", id_subdomain: " << id_subdomain
    //                   << std::endl;
    //         std::cout << std::endl;
    //     }
    // }
    // synchronize_all_world();

}

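/*
 * Worked example (illustrative numbers, not a requirement of the code):
 * with world_nprocs = 8, n_sims = 2, ndiv_i = ndiv_j = 1, ndiv_k = 2 and n_subprocs = 2,
 * split_mpi_comm() gives n_procs_each_sim = 8/2 = 4, n_subdomains = 2 and
 * n_procs_per_subdomain = 4/2 = 2, so the ranks are grouped as
 *
 *   world_rank : 0 1 2 3 | 4 5 6 7
 *   id_sim     : 0 0 0 0 | 1 1 1 1
 *   id_subdom  : 0 0 1 1 | 0 0 1 1
 *   subdom_main: T F T F | T F T F
 *
 * (the subdomain leaders are the ranks with id_proc_in_subdomain == 0).
 */
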
inline int check_total_nprocs_and_ndiv(){
    if (world_nprocs != (n_sims*ndiv_i*ndiv_j*ndiv_k*n_subprocs)){
        stdout_by_main("ERROR: the number of requested processors must equal n_sims*ndiv_rtp[0]*ndiv_rtp[1]*ndiv_rtp[2]*nproc_sub in input_params.yml.");
        if (world_rank==0){
            // print all params
            std::cout << "n_sims: " << n_sims << std::endl;
            std::cout << "ndiv_rtp: " << ndiv_k << " " << ndiv_j << " " << ndiv_i << std::endl;
            std::cout << "n_subprocs: " << n_subprocs << std::endl;
            std::cout << "nprocs should be n_sims*ndiv_p*ndiv_t*ndiv_r*n_subprocs = " << n_sims*ndiv_i*ndiv_j*ndiv_k*n_subprocs << std::endl;
            std::cout << "but the actual nprocs = " << nprocs << std::endl;
        }
        finalize_mpi();
        exit(EXIT_FAILURE);
    }
    return 0;
}

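/*
 * Example of the constraint checked above (numbers are illustrative only):
 * with n_sims = 2, ndiv_i = ndiv_j = 1, ndiv_k = 2 and n_subprocs = 2, the run
 * must be launched with  2 * 1 * 1 * 2 * 2 = 8  MPI processes, e.g.
 *
 *   mpirun -np 8 <executable> [options]
 *
 * (the exact command-line options of the executable are not defined in this header).
 */
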
inline void synchronize_all(){
    MPI_Barrier(sim_comm);
}


inline void synchronize_all_sim(){
    MPI_Barrier(sim_comm);
}


inline void synchronize_all_sub(){
    MPI_Barrier(sub_comm);
}


inline void synchronize_all_inter(){
    MPI_Barrier(inter_sub_comm);
}


inline void synchronize_all_inter_sim(){
    MPI_Barrier(inter_sim_comm);
}


inline void synchronize_all_world(){
    MPI_Barrier(MPI_COMM_WORLD);
}

inline void send_bool_single_sim(bool* b, int dest){
    const int n = 1;
    MPI_Send(b, n, MPI_C_BOOL, dest, MPI_DUMMY_TAG, inter_sim_comm);
}


inline void recv_bool_single_sim(bool* b, int src){
    const int n = 1;
    MPI_Recv(b, n, MPI_C_BOOL, src, MPI_DUMMY_TAG, inter_sim_comm, MPI_STATUS_IGNORE);
}


inline void send_i_single_sim(int* i, int dest){
    const int n = 1;
    MPI_Send(i, n, MPI_INT, dest, MPI_DUMMY_TAG, inter_sim_comm);
}


inline void recv_i_single_sim(int* i, int src){
    const int n = 1;
    MPI_Recv(i, n, MPI_INT, src, MPI_DUMMY_TAG, inter_sim_comm, MPI_STATUS_IGNORE);
}


inline void send_cr_single_sim(CUSTOMREAL *cr, int dest){
    const int n = 1;
    MPI_Send(cr, n, MPI_CR, dest, MPI_DUMMY_TAG, inter_sim_comm);
}


inline void recv_cr_single_sim(CUSTOMREAL *cr, int src){
    const int n = 1;
    MPI_Recv(cr, n, MPI_CR, src, MPI_DUMMY_TAG, inter_sim_comm, MPI_STATUS_IGNORE);
}

inline void send_str_sim(std::string str, int dest){
    const int n = str.size();
    char* cstr = new char[n+1];
    strcpy(cstr, str.c_str());
    MPI_Send(cstr, n, MPI_CHAR, dest, MPI_DUMMY_TAG, inter_sim_comm);
    delete[] cstr;
}


inline void recv_str_sim(std::string& str, int src){
    MPI_Status status;
    int n;
    MPI_Probe(src, MPI_DUMMY_TAG, inter_sim_comm, &status);
    MPI_Get_count(&status, MPI_CHAR, &n);
    char* cstr = new char[n+1];
    MPI_Recv(cstr, n, MPI_CHAR, src, MPI_DUMMY_TAG, inter_sim_comm, MPI_STATUS_IGNORE);
    cstr[n] = '\0';
    str = std::string(cstr);
    delete[] cstr;
}


inline void send_str(std::string str, int dest){
    const int n = str.size();
    char* cstr = new char[n+1];
    strcpy(cstr, str.c_str());
    MPI_Send(cstr, n, MPI_CHAR, dest, MPI_DUMMY_TAG, inter_sub_comm);
    delete[] cstr;
}


inline void recv_str(std::string& str, int src){
    MPI_Status status;
    int n;
    MPI_Probe(src, MPI_DUMMY_TAG, inter_sub_comm, &status);
    MPI_Get_count(&status, MPI_CHAR, &n);
    char* cstr = new char[n+1];
    MPI_Recv(cstr, n, MPI_CHAR, src, MPI_DUMMY_TAG, inter_sub_comm, MPI_STATUS_IGNORE);
    cstr[n] = '\0';
    str = std::string(cstr);
    delete[] cstr;
}

inline void send_cr(CUSTOMREAL *cr, int n, int dest){
    MPI_Send(cr, n, MPI_CR, dest, MPI_DUMMY_TAG, inter_sub_comm);
}


inline void recv_cr(CUSTOMREAL *cr, int n, int src){
    MPI_Recv(cr, n, MPI_CR, src, MPI_DUMMY_TAG, inter_sub_comm, MPI_STATUS_IGNORE);
}


inline void isend_cr(CUSTOMREAL* buf, int count, int dest, MPI_Request& request){
    //MPI_Request request = MPI_REQUEST_NULL;
    //std::cout << "sending from : " << inter_sub_rank << ", to : " << dest << ", size : " << count << std::endl;
    MPI_Isend(buf, count, MPI_CR, dest, MPI_DUMMY_TAG, inter_sub_comm, &request);
}


inline void irecv_cr(CUSTOMREAL* buf, int count, int source, MPI_Request& request){
    //MPI_Request request = MPI_REQUEST_NULL;
    //std::cout << "receiving by : " << inter_sub_rank << ", from : " << source << ", size : " << count << std::endl;
    MPI_Irecv(buf, count, MPI_CR, source, MPI_DUMMY_TAG, inter_sub_comm, &request);
}

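/*
 * Sketch of a non-blocking exchange with the wrappers above (hypothetical buffers
 * and neighbor rank; wait_req() is defined further below in this header):
 *
 *   CUSTOMREAL send_buf[N], recv_buf[N];          // N and neighbor_rank are assumptions of this example
 *   MPI_Request req_s, req_r;
 *   irecv_cr(recv_buf, N, neighbor_rank, req_r);  // post the receive first
 *   isend_cr(send_buf, N, neighbor_rank, req_s);
 *   // ... overlap computation here ...
 *   wait_req(req_s);
 *   wait_req(req_r);
 */
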
inline void allreduce_i_single(int& value, int& result){
    int count = 1;
    MPI_Allreduce(&value, &result, count, MPI_INT, MPI_SUM, inter_sub_comm);
}


inline void allreduce_i_sim_single_inplace(int& value){
    int count = 1;
    MPI_Allreduce(MPI_IN_PLACE, &value, count, MPI_INT, MPI_SUM, inter_sim_comm);
}


inline void allreduce_cr_single(CUSTOMREAL& loc_buf, CUSTOMREAL& all_buf){
    int count = 1;
    MPI_Allreduce(&loc_buf, &all_buf, count, MPI_CR, MPI_SUM, inter_sub_comm);
}


inline void allreduce_bool_inplace_inter_sim(bool* loc_buf, int count){
    // each element becomes true if it is true on any of the processes
    MPI_Allreduce(MPI_IN_PLACE, loc_buf, count, MPI_CXX_BOOL, MPI_LOR, inter_sim_comm);
}


inline void allreduce_bool_inplace(bool* loc_buf, int count){
    // each element becomes true if it is true on any of the processes
    MPI_Allreduce(MPI_IN_PLACE, loc_buf, count, MPI_CXX_BOOL, MPI_LOR, inter_sub_comm);
}


inline void allreduce_bool_inplace_sub(bool* loc_buf, int count){
    // each element becomes true if it is true on any of the processes
    MPI_Allreduce(MPI_IN_PLACE, loc_buf, count, MPI_CXX_BOOL, MPI_LOR, sub_comm);
}


// true only if the value is true on all processes
inline void allreduce_bool_single_inplace(bool& loc_buf){
    MPI_Allreduce(MPI_IN_PLACE, &loc_buf, 1, MPI_CXX_BOOL, MPI_LAND, inter_sub_comm);
}


// true only if the value is true on all processes
inline void allreduce_bool_single_inplace_sim(bool& loc_buf){
    MPI_Allreduce(MPI_IN_PLACE, &loc_buf, 1, MPI_CXX_BOOL, MPI_LAND, inter_sim_comm);
}


inline void allreduce_i_inplace(int* loc_buf, int count){
    MPI_Allreduce(MPI_IN_PLACE, loc_buf, count, MPI_INT, MPI_SUM, inter_sub_comm);
}


inline void allreduce_cr_inplace(CUSTOMREAL* loc_buf, int count){
    MPI_Allreduce(MPI_IN_PLACE, loc_buf, count, MPI_CR, MPI_SUM, inter_sub_comm);
}


inline void allreduce_cr_sim(CUSTOMREAL* loc_buf, int count, CUSTOMREAL* all_buf){
    MPI_Allreduce(loc_buf, all_buf, count, MPI_CR, MPI_SUM, inter_sim_comm);
}


inline void allreduce_cr_sim_single(CUSTOMREAL& loc_buf, CUSTOMREAL& all_buf){
    int count = 1;
    MPI_Allreduce(&loc_buf, &all_buf, count, MPI_CR, MPI_SUM, inter_sim_comm);
}


inline void allreduce_cr_sim_single_inplace(CUSTOMREAL& loc_buf){
    int count = 1;
    MPI_Allreduce(MPI_IN_PLACE, &loc_buf, count, MPI_CR, MPI_SUM, inter_sim_comm);
}


inline void allreduce_cr_sim_inplace(CUSTOMREAL* loc_buf, int count){
    MPI_Allreduce(MPI_IN_PLACE, loc_buf, count, MPI_CR, MPI_SUM, inter_sim_comm);
}


// duplicate of allreduce_cr_sim_single_inplace; the misspelled name is kept
// in case existing call sites still use it
inline void alleduce_cr_sim_single_inplace(CUSTOMREAL& loc_buf){
    int count = 1;
    MPI_Allreduce(MPI_IN_PLACE, &loc_buf, count, MPI_CR, MPI_SUM, inter_sim_comm);
}


inline void allreduce_cr_single_max(CUSTOMREAL& loc_buf, CUSTOMREAL& all_buf){
    int count = 1;
    MPI_Allreduce(&loc_buf, &all_buf, count, MPI_CR, MPI_MAX, inter_sub_comm);
}

inline void allgather_i_single(int* loc_buf, int* all_buf){
    int count = 1;
    MPI_Allgather(loc_buf, count, MPI_INT, all_buf, count, MPI_INT, inter_sub_comm);
}


inline void allgather_cr_single(CUSTOMREAL* loc_buf, CUSTOMREAL* all_buf){
    int count = 1;
    MPI_Allgather(loc_buf, count, MPI_CR, all_buf, count, MPI_CR, inter_sub_comm);
}


inline void allgather_bool_single(bool* loc_buf, bool* all_buf){
    int count = 1;
    MPI_Allgather(loc_buf, count, MPI_CXX_BOOL, all_buf, count, MPI_CXX_BOOL, inter_sub_comm);
}

inline void broadcast_bool_single(bool& value, int root){
    int count = 1;
    MPI_Bcast(&value, count, MPI_CXX_BOOL, root, inter_sub_comm);
}


inline void broadcast_bool_single_sub(bool& value, int root){
    int count = 1;
    MPI_Bcast(&value, count, MPI_CXX_BOOL, root, sub_comm);
}


inline void broadcast_bool_single_inter_sim(bool& value, int root){
    int count = 1;
    MPI_Bcast(&value, count, MPI_CXX_BOOL, root, inter_sim_comm);
}


inline void broadcast_bool_inter_and_intra_sim(bool& value, int root){
    broadcast_bool_single_inter_sim(value, root); // broadcast across the simultaneous-run groups (inter_sim_comm)
    broadcast_bool_single(value, root);           // broadcast across the subdomain leaders (inter_sub_comm)
    broadcast_bool_single_sub(value, root);       // broadcast within each subdomain (sub_comm)
}


inline void broadcast_i_single(int& value, int root){
    int count = 1;
    MPI_Bcast(&value, count, MPI_INT, root, inter_sub_comm);
}


inline void broadcast_i_single_inter_sim(int& value, int root){
    int count = 1;
    MPI_Bcast(&value, count, MPI_INT, root, inter_sim_comm);
}


inline void broadcast_i_single_intra_sim(int& value, int root){
    broadcast_i_single(value, root);     // broadcast across the subdomain leaders (inter_sub_comm)
    broadcast_i_single_sub(value, root); // broadcast within each subdomain (sub_comm)
}


inline void broadcast_i_single_inter_and_intra_sim(int& value, int root){
    broadcast_i_single_inter_sim(value, root); // broadcast across the simultaneous-run groups (inter_sim_comm)
    broadcast_i_single(value, root);           // broadcast across the subdomain leaders (inter_sub_comm)
    broadcast_i_single_sub(value, root);       // broadcast within each subdomain (sub_comm)
}

inline void broadcast_f_single(float& value, int root){ // !!!! FOR READING PARAMETERS ONLY !!!!
    int count = 1;
    MPI_Bcast(&value, count, MPI_FLOAT, root, inter_sub_comm);
}


inline void broadcast_cr_single(CUSTOMREAL& buf, int root){
    MPI_Bcast(&buf, 1, MPI_CR, root, inter_sub_comm);
}


inline void broadcast_cr(CUSTOMREAL* buf, int count, int root){
    MPI_Bcast(buf, count, MPI_CR, root, inter_sub_comm);
}


inline void broadcast_cr_single_inplace(CUSTOMREAL& buf, int root){
    MPI_Bcast(&buf, 1, MPI_CR, root, inter_sub_comm);
}


inline void broadcast_cr_inter_sim(CUSTOMREAL* buf, int count, int root){
    MPI_Bcast(buf, count, MPI_CR, root, inter_sim_comm);
}


inline void broadcast_cr_single_inter_sim(CUSTOMREAL& buf, int root){
    int count = 1;
    MPI_Bcast(&buf, count, MPI_CR, root, inter_sim_comm);
}


inline void broadcast_cr_single_inter_and_intra_sim(CUSTOMREAL& buf, int root){
    broadcast_cr_single_inter_sim(buf, root); // broadcast across the simultaneous-run groups (inter_sim_comm)
    broadcast_cr_single(buf, root);           // broadcast across the subdomain leaders (inter_sub_comm)
    broadcast_cr_single_sub(buf, root);       // broadcast within each subdomain (sub_comm)
}

inline void broadcast_i_single_sub(int& value, int root){
    int count = 1;
    MPI_Bcast(&value, count, MPI_INT, root, sub_comm);
}


//inline void broadcast_f_single_sub(float& value, int root){ // !!!! FOR READING PARAMETERS ONLY !!!!
//    int count = 1;
//    MPI_Bcast(&value, count, MPI_FLOAT, root, sub_comm);
//}


inline void broadcast_cr_single_sub(CUSTOMREAL& buf, int root){
    int count = 1;
    MPI_Bcast(&buf, count, MPI_CR, root, sub_comm);
}


inline void broadcast_cr_sub(CUSTOMREAL* buf, int count, int root){
    MPI_Bcast(buf, count, MPI_CR, root, sub_comm);
}

inline void broadcast_str(std::string& str, int root) {
    int count = str.size();
    MPI_Bcast(&count, 1, MPI_INT, root, inter_sub_comm);
    char* buf = new char[count+1];
    if (root == inter_sub_rank) {
        std::strcpy(buf, str.c_str());
    }
    MPI_Bcast(buf, count+1, MPI_CHAR, root, inter_sub_comm);
    str = buf;
    delete[] buf;
}


inline void broadcast_str_sub(std::string& str, int root) {
    int count = str.size();
    MPI_Bcast(&count, 1, MPI_INT, root, sub_comm);
    char* buf = new char[count+1];
    if (root == sub_rank) {
        std::strcpy(buf, str.c_str());
    }
    MPI_Bcast(buf, count+1, MPI_CHAR, root, sub_comm);
    str = buf;
    delete[] buf;
}


inline void broadcast_str_inter_sim(std::string& str, int root) {
    int count = str.size();
    MPI_Bcast(&count, 1, MPI_INT, root, inter_sim_comm);
    char* buf = new char[count+1];
    if (root == inter_sim_rank) {
        std::strcpy(buf, str.c_str());
    }
    MPI_Bcast(buf, count+1, MPI_CHAR, root, inter_sim_comm);
    str = buf;
    delete[] buf;
}


inline void broadcast_str_inter_and_intra_sim(std::string& str, int root) {
    broadcast_str_inter_sim(str, root);
    broadcast_str(str, root);
    broadcast_str_sub(str, root);
}

inline void allgather_str(const std::string &str, std::vector<std::string> &result) {
    MPI_Comm comm = MPI_COMM_WORLD;
    int size = world_nprocs;

    int str_size = str.size();
    std::vector<int> str_sizes(size);
    MPI_Allgather(&str_size, 1, MPI_INT, str_sizes.data(), 1, MPI_INT, comm);

    int total_size = 0;
    std::vector<int> displs(size);
    for (int i = 0; i < size; i++) {
        total_size += str_sizes[i];
        displs[i] = (i == 0) ? 0 : (displs[i - 1] + str_sizes[i - 1]);
    }

    std::vector<char> data(total_size);
    MPI_Allgatherv(str.data(), str_size, MPI_CHAR, data.data(), str_sizes.data(), displs.data(), MPI_CHAR, comm);

    // resize (not reserve) so that result[i] below is always a valid element
    result.resize(size);
    int start = 0;
    for (int i = 0; i < size; i++) {
        result[i] = std::string(&data[start], str_sizes[i]);
        start += str_sizes[i];
    }
}

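/*
 * Usage sketch (as done in split_mpi_comm() above): every rank contributes its own
 * processor name and receives the full list, ordered by world rank.
 *
 *   std::vector<std::string> names(world_nprocs);
 *   char pname[MPI_MAX_PROCESSOR_NAME]; int len;
 *   MPI_Get_processor_name(pname, &len);
 *   allgather_str(std::string(pname), names);   // names[r] == processor name of world rank r
 */
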
inline void prepare_shm_array_cr(int n_elms, CUSTOMREAL* &buf, MPI_Win& win){
    int* model;
    int flag;
    MPI_Aint winsize_dummy;
    int windisp_dummy;

    // Allocate shared memory for the array; only the subdomain's main process
    // contributes the storage (callers pass n_elms == 0 on the other processes)
    MPI_Win_allocate_shared(n_elms*sizeof(CUSTOMREAL), sizeof(CUSTOMREAL), MPI_INFO_NULL, sub_comm, &buf, &win);
    // get the window memory-model attribute
    MPI_Win_get_attr(win, MPI_WIN_MODEL, &model, &flag);
    // query the pointer to the shared segment owned by rank 0 of sub_comm
    if (sub_rank != 0){
        MPI_Win_shared_query(win, 0, &winsize_dummy, &windisp_dummy, &buf);
    }

    shm_fence(win);
}

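/*
 * Usage sketch (hypothetical array name and size): a shared array visible to all
 * processes of sub_comm is allocated once, used, and freed before MPI_Finalize.
 *
 *   CUSTOMREAL* field = nullptr;                 // "field" and "ngrid" are assumptions of this example
 *   MPI_Win win_field = MPI_WIN_NULL;
 *   int n_total = (sub_rank == 0) ? ngrid : 0;   // only the subdomain main process provides storage
 *   prepare_shm_array_cr(n_total, field, win_field);
 *   // ... all ranks of sub_comm may now access field[], with shm_fence(win_field)
 *   //     (or another synchronization) between access epochs ...
 *   cleanup_mpi_win(win_field);                  // frees the window and the shared segment
 */
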
/*
inline void prepare_shm_single_i(int*& value, MPI_Win& win){
    int* model;
    int flag;
    MPI_Aint winsize_dummy;
    int windisp_dummy;
    int mpi_size;
    if (sub_rank == 0){
        mpi_size = sizeof(int)*1;
    } else {
        mpi_size = 0;
    }

    // Allocate shared memory for the value on only the subdomain's main process
    MPI_Win_allocate_shared(mpi_size, sizeof(int), MPI_INFO_NULL, sub_comm, &value, &win);
    // get attribute
    MPI_Win_get_attr(win, MPI_WIN_MODEL, &model, &flag);
    // shared query
    if (sub_rank != 0){
        MPI_Win_shared_query(win, 0, &winsize_dummy, &windisp_dummy, &value);
    }

    shm_fence(win);
}
*/

/*
inline void prepare_shm_single_cr(CUSTOMREAL*& value, MPI_Win& win){
    int* model;
    int flag;
    MPI_Aint winsize_dummy;
    int windisp_dummy;
    int mpi_size;
    if (sub_rank == 0){
        mpi_size = sizeof(CUSTOMREAL)*1;
    } else {
        mpi_size = 0;
    }

    // Allocate shared memory for the value on only the subdomain's main process
    MPI_Win_allocate_shared(mpi_size, sizeof(CUSTOMREAL), MPI_INFO_NULL, sub_comm, &value, &win);
    // get attribute
    MPI_Win_get_attr(win, MPI_WIN_MODEL, &model, &flag);
    // shared query
    if (sub_rank != 0){
        MPI_Win_shared_query(win, 0, &winsize_dummy, &windisp_dummy, &value);
    }

    shm_fence(win);
}
*/

inline void prepare_shm_array_bool(int n_elms, bool* &buf, MPI_Win& win){
    int* model;
    int flag;
    MPI_Aint winsize_dummy;
    int windisp_dummy;

    // Allocate shared memory for the array; only the subdomain's main process
    // contributes the storage (callers pass n_elms == 0 on the other processes)
    MPI_Win_allocate_shared(n_elms*sizeof(bool), sizeof(bool), MPI_INFO_NULL, sub_comm, &buf, &win);
    // get the window memory-model attribute
    MPI_Win_get_attr(win, MPI_WIN_MODEL, &model, &flag);
    // query the pointer to the shared segment owned by rank 0 of sub_comm
    if (sub_rank != 0){
        MPI_Win_shared_query(win, 0, &winsize_dummy, &windisp_dummy, &buf);
    }

    shm_fence(win);
}

inline void wait_req(MPI_Request& req){
    MPI_Status status;
    MPI_Wait(&req, &status);
}


inline void shm_fence(MPI_Win& win){
    MPI_Win_fence(0, win);
}

// Safely free a single MPI window
inline void cleanup_mpi_win(MPI_Win& win) {
    if (win != MPI_WIN_NULL) {
        int mpi_error = MPI_Win_free(&win);
        if (mpi_error != MPI_SUCCESS) {
            char error_string[MPI_MAX_ERROR_STRING];
            int length_of_error_string;
            MPI_Error_string(mpi_error, error_string, &length_of_error_string);
            std::cerr << "ERROR: MPI_Win_free failed with error code " << mpi_error
                      << ": " << error_string << std::endl;
            std::cerr << "This may indicate issues with Intel OneAPI MPI shared memory cleanup." << std::endl;
            // Don't call MPI_Abort here as we're likely in the cleanup phase
        } else {
            win = MPI_WIN_NULL;
        }
    }
}

// Safely free multiple MPI windows
inline void cleanup_mpi_wins(std::initializer_list<MPI_Win*> wins) {
    int error_count = 0;
    int total_windows = 0;

    for (MPI_Win* win : wins) {
        if (win != nullptr) {
            total_windows++;
            if (*win != MPI_WIN_NULL) {
                int mpi_error = MPI_Win_free(win);
                if (mpi_error != MPI_SUCCESS) {
                    error_count++;
                    char error_string[MPI_MAX_ERROR_STRING];
                    int length_of_error_string;
                    MPI_Error_string(mpi_error, error_string, &length_of_error_string);
                    std::cerr << "ERROR: MPI_Win_free failed for window " << total_windows
                              << " with error code " << mpi_error
                              << ": " << error_string << std::endl;
                } else {
                    *win = MPI_WIN_NULL;
                }
            }
        }
    }

    if (error_count > 0) {
        std::cerr << "WARNING: " << error_count << " out of " << total_windows
                  << " MPI windows failed to free properly." << std::endl;
        std::cerr << "This may indicate Intel OneAPI MPI shared memory cleanup issues." << std::endl;
    } else if (total_windows > 0) {
        // Only show the success message in verbose mode
        if (if_verbose && world_rank == 0) {
            std::cout << "Successfully freed " << total_windows << " MPI shared memory windows." << std::endl;
        }
    }
}

// Initialize multiple MPI windows to MPI_WIN_NULL
inline void init_mpi_wins(std::initializer_list<MPI_Win*> wins) {
    int initialized_count = 0;

    for (MPI_Win* win : wins) {
        if (win != nullptr) {
            *win = MPI_WIN_NULL;
            initialized_count++;
        } else {
            std::cerr << "WARNING: Null pointer passed to init_mpi_wins, skipping." << std::endl;
        }
    }

    if (if_verbose && world_rank == 0 && initialized_count > 0) {
        std::cout << "Initialized " << initialized_count << " MPI windows to MPI_WIN_NULL." << std::endl;
    }
}

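/*
 * Window lifecycle sketch (hypothetical member windows of some owning object, e.g. a Grid):
 *
 *   MPI_Win win_a, win_b;                       // names are assumptions of this example
 *   init_mpi_wins({&win_a, &win_b});            // start from MPI_WIN_NULL so cleanup is always safe
 *   prepare_shm_array_cr(n_a, buf_a, win_a);    // allocate shared arrays as needed
 *   prepare_shm_array_cr(n_b, buf_b, win_b);
 *   // ... use the shared arrays ...
 *   cleanup_mpi_wins({&win_a, &win_b});         // must happen before MPI_Finalize
 */
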
// Simple flag to track if MPI windows have been cleaned up
inline bool& get_mpi_windows_cleaned_flag() {
    static bool mpi_windows_cleaned = false;
    return mpi_windows_cleaned;
}

// Clean up all MPI windows before MPI_Finalize (called once).
// The actual freeing is expected to be done elsewhere via cleanup_mpi_win()/cleanup_mpi_wins();
// this function only marks (and optionally reports) that the cleanup phase has run.
inline void cleanup_all_mpi_windows() {
    bool& cleaned = get_mpi_windows_cleaned_flag();

    if (!cleaned) {
        if (world_rank == 0 && if_verbose) {
            std::cout << "Cleaning up all MPI windows before MPI_Finalize..." << std::endl;
        }
        cleaned = true;

        if (world_rank == 0 && if_verbose) {
            std::cout << "All MPI windows cleaned up successfully." << std::endl;
        }
    }
}


#endif // MPI_FUNCS_H