Days since OpenMPI+UCX trashed my cluster: 0 (#13818)
This commit is contained in:
parent
8b85a6ca14
commit
260a4c4904
@ -0,0 +1,254 @@
|
|||||||
|
diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h
|
||||||
|
index 3875679443..0e4ec9a949 100644
|
||||||
|
--- a/opal/mca/btl/uct/btl_uct.h
|
||||||
|
+++ b/opal/mca/btl/uct/btl_uct.h
|
||||||
|
@@ -12,6 +12,7 @@
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
|
||||||
|
* reserved.
|
||||||
|
+ * Copyright (c) 2019 Google, LLC. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
@@ -85,6 +86,10 @@ struct mca_btl_uct_module_t {
|
||||||
|
/** array containing the am_tl and rdma_tl */
|
||||||
|
mca_btl_uct_tl_t *comm_tls[2];
|
||||||
|
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ uct_component_h uct_component;
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
/** registration cache */
|
||||||
|
mca_rcache_base_module_t *rcache;
|
||||||
|
|
||||||
|
diff --git a/opal/mca/btl/uct/btl_uct_amo.c b/opal/mca/btl/uct/btl_uct_amo.c
|
||||||
|
index f7d0232688..72398ce736 100644
|
||||||
|
--- a/opal/mca/btl/uct/btl_uct_amo.c
|
||||||
|
+++ b/opal/mca/btl/uct/btl_uct_amo.c
|
||||||
|
@@ -110,7 +110,7 @@ int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
|
||||||
|
mca_btl_uct_uct_completion_release (comp);
|
||||||
|
}
|
||||||
|
|
||||||
|
- uct_rkey_release (&rkey);
|
||||||
|
+ mca_btl_uct_rkey_release (uct_btl, &rkey);
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
@@ -184,7 +184,7 @@ int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
|
||||||
|
mca_btl_uct_uct_completion_release (comp);
|
||||||
|
}
|
||||||
|
|
||||||
|
- uct_rkey_release (&rkey);
|
||||||
|
+ mca_btl_uct_rkey_release (uct_btl, &rkey);
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c
|
||||||
|
index f968cb9c31..bff160736b 100644
|
||||||
|
--- a/opal/mca/btl/uct/btl_uct_component.c
|
||||||
|
+++ b/opal/mca/btl/uct/btl_uct_component.c
|
||||||
|
@@ -314,7 +314,12 @@ ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsig
|
||||||
|
return UCS_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+static int mca_btl_uct_component_process_uct_md (uct_component_h component, uct_md_resource_desc_t *md_desc,
|
||||||
|
+ char **allowed_ifaces)
|
||||||
|
+#else
|
||||||
|
static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc, char **allowed_ifaces)
|
||||||
|
+#endif
|
||||||
|
{
|
||||||
|
mca_rcache_base_resources_t rcache_resources;
|
||||||
|
uct_tl_resource_desc_t *tl_desc;
|
||||||
|
@@ -348,8 +353,14 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc
|
||||||
|
|
||||||
|
md = OBJ_NEW(mca_btl_uct_md_t);
|
||||||
|
|
||||||
|
+
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ uct_md_config_read (component, NULL, NULL, &uct_config);
|
||||||
|
+ uct_md_open (component, md_desc->md_name, uct_config, &md->uct_md);
|
||||||
|
+#else
|
||||||
|
uct_md_config_read (md_desc->md_name, NULL, NULL, &uct_config);
|
||||||
|
uct_md_open (md_desc->md_name, uct_config, &md->uct_md);
|
||||||
|
+#endif
|
||||||
|
uct_config_release (uct_config);
|
||||||
|
|
||||||
|
uct_md_query (md->uct_md, &md_attr);
|
||||||
|
@@ -375,6 +386,10 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc
|
||||||
|
return OPAL_ERR_NOT_AVAILABLE;
|
||||||
|
}
|
||||||
|
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ module->uct_component = component;
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module;
|
||||||
|
|
||||||
|
/* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable
|
||||||
|
@@ -400,6 +415,42 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc
|
||||||
|
return OPAL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+static int mca_btl_uct_component_process_uct_component (uct_component_h component, char **allowed_ifaces)
|
||||||
|
+{
|
||||||
|
+ uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME |
|
||||||
|
+ UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT};
|
||||||
|
+ ucs_status_t ucs_status;
|
||||||
|
+ int rc;
|
||||||
|
+
|
||||||
|
+ ucs_status = uct_component_query (component, &attr);
|
||||||
|
+ if (UCS_OK != ucs_status) {
|
||||||
|
+ return OPAL_ERROR;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ BTL_VERBOSE(("processing uct component %s", attr.name));
|
||||||
|
+
|
||||||
|
+ attr.md_resources = calloc (attr.md_resource_count, sizeof (*attr.md_resources));
|
||||||
|
+ attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES;
|
||||||
|
+ ucs_status = uct_component_query (component, &attr);
|
||||||
|
+ if (UCS_OK != ucs_status) {
|
||||||
|
+ return OPAL_ERROR;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ for (int i = 0 ; i < attr.md_resource_count ; ++i) {
|
||||||
|
+ rc = mca_btl_uct_component_process_uct_md (component, attr.md_resources + i,
|
||||||
|
+ allowed_ifaces);
|
||||||
|
+ if (OPAL_SUCCESS != rc) {
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ free (attr.md_resources);
|
||||||
|
+
|
||||||
|
+ return OPAL_SUCCESS;
|
||||||
|
+}
|
||||||
|
+#endif /* UCT_API >= UCT_VERSION(1, 7) */
|
||||||
|
+
|
||||||
|
/*
|
||||||
|
* UCT component initialization:
|
||||||
|
* (1) read interface list from kernel and compare against component parameters
|
||||||
|
@@ -415,6 +466,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules,
|
||||||
|
struct mca_btl_base_module_t **base_modules;
|
||||||
|
uct_md_resource_desc_t *resources;
|
||||||
|
unsigned resource_count;
|
||||||
|
+ ucs_status_t ucs_status;
|
||||||
|
char **allowed_ifaces;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
@@ -431,10 +483,32 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules,
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
- uct_query_md_resources (&resources, &resource_count);
|
||||||
|
-
|
||||||
|
mca_btl_uct_component.module_count = 0;
|
||||||
|
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ uct_component_h *components;
|
||||||
|
+ unsigned num_components;
|
||||||
|
+
|
||||||
|
+ ucs_status = uct_query_components(&components, &num_components);
|
||||||
|
+ if (UCS_OK != ucs_status) {
|
||||||
|
+ BTL_ERROR(("could not query UCT components"));
|
||||||
|
+ return NULL;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* generate all suitable btl modules */
|
||||||
|
+ for (unsigned i = 0 ; i < num_components ; ++i) {
|
||||||
|
+ rc = mca_btl_uct_component_process_uct_component (components[i], allowed_ifaces);
|
||||||
|
+ if (OPAL_SUCCESS != rc) {
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ uct_release_component_list (components);
|
||||||
|
+
|
||||||
|
+#else /* UCT 1.6 and older */
|
||||||
|
+
|
||||||
|
+ uct_query_md_resources (&resources, &resource_count);
|
||||||
|
+
|
||||||
|
/* generate all suitable btl modules */
|
||||||
|
for (unsigned i = 0 ; i < resource_count ; ++i) {
|
||||||
|
rc = mca_btl_uct_component_process_uct_md (resources + i, allowed_ifaces);
|
||||||
|
@@ -443,9 +517,11 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
- opal_argv_free (allowed_ifaces);
|
||||||
|
uct_release_md_resource_list (resources);
|
||||||
|
|
||||||
|
+#endif /* UCT_API >= UCT_VERSION(1, 7) */
|
||||||
|
+
|
||||||
|
+ opal_argv_free (allowed_ifaces);
|
||||||
|
mca_btl_uct_modex_send ();
|
||||||
|
|
||||||
|
/* pass module array back to caller */
|
||||||
|
diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c
|
||||||
|
index 2d2d1c3f04..9ee9530f26 100644
|
||||||
|
--- a/opal/mca/btl/uct/btl_uct_rdma.c
|
||||||
|
+++ b/opal/mca/btl/uct/btl_uct_rdma.c
|
||||||
|
@@ -132,7 +132,7 @@ int mca_btl_uct_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
|
||||||
|
|
||||||
|
BTL_VERBOSE(("get issued. status = %d", ucs_status));
|
||||||
|
|
||||||
|
- uct_rkey_release (&rkey);
|
||||||
|
+ mca_btl_uct_rkey_release (uct_btl, &rkey);
|
||||||
|
|
||||||
|
return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY;
|
||||||
|
}
|
||||||
|
@@ -237,7 +237,7 @@ int mca_btl_uct_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
|
||||||
|
mca_btl_uct_uct_completion_release (comp);
|
||||||
|
}
|
||||||
|
|
||||||
|
- uct_rkey_release (&rkey);
|
||||||
|
+ mca_btl_uct_rkey_release (uct_btl, &rkey);
|
||||||
|
|
||||||
|
return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY;
|
||||||
|
}
|
||||||
|
diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h
|
||||||
|
index e9b0d6b19d..ab790371af 100644
|
||||||
|
--- a/opal/mca/btl/uct/btl_uct_rdma.h
|
||||||
|
+++ b/opal/mca/btl/uct/btl_uct_rdma.h
|
||||||
|
@@ -55,8 +55,22 @@ static inline int mca_btl_uct_get_rkey (mca_btl_uct_module_t *module,
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ ucs_status = uct_rkey_unpack (module->uct_component, (void *) remote_handle, rkey);
|
||||||
|
+#else
|
||||||
|
ucs_status = uct_rkey_unpack ((void *) remote_handle, rkey);
|
||||||
|
+#endif
|
||||||
|
return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
+static inline void mca_btl_uct_rkey_release (mca_btl_uct_module_t *uct_btl, uct_rkey_bundle_t *rkey)
|
||||||
|
+{
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ uct_rkey_release (uct_btl->uct_component, rkey);
|
||||||
|
+#else
|
||||||
|
+ (void) uct_btl;
|
||||||
|
+ uct_rkey_release (rkey);
|
||||||
|
+#endif
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
#endif /* !defined(BTL_UCT_RDMA_H) */
|
||||||
|
diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c
|
||||||
|
index a711a41ce9..e69c769b41 100644
|
||||||
|
--- a/opal/mca/btl/uct/btl_uct_tl.c
|
||||||
|
+++ b/opal/mca/btl/uct/btl_uct_tl.c
|
||||||
|
@@ -516,7 +516,13 @@ static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl
|
||||||
|
* come up with a better estimate. */
|
||||||
|
|
||||||
|
/* UCT bandwidth is in bytes/sec, BTL is in MB/sec */
|
||||||
|
+#if UCT_API >= UCT_VERSION(1, 7)
|
||||||
|
+ module->super.btl_bandwidth = (uint32_t) ((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated +
|
||||||
|
+ MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared /
|
||||||
|
+ (opal_process_info.num_local_peers + 1)) / 1048576.0);
|
||||||
|
+#else
|
||||||
|
module->super.btl_bandwidth = (uint32_t) (MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0);
|
||||||
|
+#endif
|
||||||
|
/* TODO -- figure out how to translate UCT latency to us */
|
||||||
|
module->super.btl_latency = 1;
|
||||||
|
}
|
@ -189,6 +189,7 @@ class Openmpi(AutotoolsPackage):
|
|||||||
patch('llnl-platforms.patch', when="@1.6.5")
|
patch('llnl-platforms.patch', when="@1.6.5")
|
||||||
patch('configure.patch', when="@1.10.1")
|
patch('configure.patch', when="@1.10.1")
|
||||||
patch('fix_multidef_pmi_class.patch', when="@2.0.0:2.0.1")
|
patch('fix_multidef_pmi_class.patch', when="@2.0.0:2.0.1")
|
||||||
|
patch('fix-ucx-1.7.0-api-instability.patch', when='@4.0.0:4.0.3')
|
||||||
|
|
||||||
# Vader Bug: https://github.com/open-mpi/ompi/issues/5375
|
# Vader Bug: https://github.com/open-mpi/ompi/issues/5375
|
||||||
# Haven't release fix for 2.1.x
|
# Haven't release fix for 2.1.x
|
||||||
|
Loading…
Reference in New Issue
Block a user