py-horovod: fix compilation of ~cuda (#15719)
* py-horovod: fix compilation of ~cuda
* Rewrite py-horovod with only 3 variants
* Add upstream patch to work around a compilation issue
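The rewrite collapses the old per-framework and per-backend switches into three variants: frameworks, controllers, and tensor_ops. A minimal sketch of how they combine in a spec (illustration only, not part of the commit; it assumes Spack's Python modules are importable, and the variant names and values are taken from the package.py diff below):

    # Illustration only: abstract specs built from the three rewritten variants.
    from spack.spec import Spec

    # CPU-only PyTorch build (the ~cuda case this commit targets): MPI handles both
    # coordination and tensor operations, so neither NCCL nor CUDA is pulled in.
    cpu = Spec('py-horovod frameworks=pytorch controllers=mpi tensor_ops=mpi')

    # GPU build: TensorFlow and Keras front ends, Gloo controller, NCCL tensor ops.
    gpu = Spec('py-horovod frameworks=tensorflow,keras controllers=gloo tensor_ops=nccl')

    print(cpu)
    print(gpu)

The same spec strings work on the command line, e.g. spack install py-horovod frameworks=pytorch tensor_ops=mpi.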
Files changed:
  var/spack/repos/builtin/packages/py-horovod/fma.patch   (new file, 52 lines)
  var/spack/repos/builtin/packages/py-horovod/package.py
var/spack/repos/builtin/packages/py-horovod/fma.patch (new file)
@@ -0,0 +1,52 @@
From 717e72f91f02d1dc3c859719ef1d804b10f88017 Mon Sep 17 00:00:00 2001
From: Nicolas V Castet <nvcastet@us.ibm.com>
Date: Mon, 30 Mar 2020 12:47:50 -0500
Subject: [PATCH] Add extra preprocessor guard for FMA optimization

Fixes #1832

Signed-off-by: Nicolas V Castet <nvcastet@us.ibm.com>
---
 horovod/common/ops/adasum/adasum.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/horovod/common/ops/adasum/adasum.h b/horovod/common/ops/adasum/adasum.h
index 0330f5850..876f7f12b 100644
--- a/horovod/common/ops/adasum/adasum.h
+++ b/horovod/common/ops/adasum/adasum.h
@@ -19,7 +19,7 @@
 #include <cstring>
 #include <float.h>
 
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
 #include <emmintrin.h>
 #include <immintrin.h>
 #endif
@@ -104,7 +104,7 @@ template <typename Communicator_type> class Adasum {
                                               int count, double& dotProduct,
                                               double& anormsq, double& bnormsq,
                                               int layerid) {
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
     if (horovod_datatype == DataType::HOROVOD_FLOAT16) {
       ComputeDotAndNormSqrdsfp16((uint16_t*)a, (uint16_t*)b, count, dotProduct,
                                  anormsq, bnormsq, layerid);
@@ -125,7 +125,7 @@ template <typename Communicator_type> class Adasum {
                                  double acoeff, void* __restrict__ a,
                                  double bcoeff, void* __restrict__ b,
                                  int layerid) {
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
     if (horovod_datatype == DataType::HOROVOD_FLOAT16) {
       ScaledAddfp16(count, acoeff, (uint16_t*)a, bcoeff, (uint16_t*)b, layerid);
     } else
@@ -425,7 +425,7 @@ template <typename Communicator_type> class Adasum {
   }
 
 
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
   inline void ComputeDotAndNormSqrdsfp16(const uint16_t* __restrict__ a,
                                          const uint16_t* __restrict__ b,
                                          int len, double& dotProduct,
var/spack/repos/builtin/packages/py-horovod/package.py
@@ -25,26 +25,17 @@ class PyHorovod(PythonPackage):
     version('0.16.3', tag='v0.16.3', submodules=True)
     version('0.16.2', tag='v0.16.2', submodules=True)
 
-    # Deep learning frameworks
-    variant('pytorch',    default=True,  description='Enables PyTorch')
-    variant('tensorflow', default=False, description='Enables TensorFlow')
-    variant('mxnet',      default=False, description='Enables Apache MXNet')
-
-    # Distributed support
-    variant('gloo', default=False, description='Enables features related to distributed support')
-    variant('mpi',  default=True,  description='Enables MPI build')
-
-    # GPU support
-    variant('cuda', default=True, description='Enables CUDA build')
-    variant('gpu_allreduce', default='mpi',
-            description='Backend to use for GPU_ALLREDUCE',
-            values=('mpi', 'nccl'), multi=False)  # DDL support is deprecated
-    variant('gpu_allgather', default='mpi',
-            description='Backend to use for GPU_ALLGATHER',
-            values=('mpi',), multi=False)
-    variant('gpu_broadcast', default='mpi',
-            description='Backend to use for GPU_BROADCAST',
-            values=('mpi', 'nccl'), multi=False)
+    # https://github.com/horovod/horovod/blob/master/docs/install.rst
+    variant('frameworks', default='pytorch',
+            description='Deep learning frameworks to build support for',
+            values=('tensorflow', 'pytorch', 'mxnet', 'keras', 'spark'),
+            multi=True)
+    variant('controllers', default='mpi',
+            description='Controllers to coordinate work between processes',
+            values=('mpi', 'gloo'), multi=True)
+    variant('tensor_ops', default='nccl',
+            description='Framework to use for GPU/CPU operations',
+            values=('nccl', 'mpi', 'gloo', 'ccl'), multi=False)
 
     # Required dependencies
     depends_on('py-setuptools',  type='build')
@@ -53,68 +44,87 @@ class PyHorovod(PythonPackage):
     depends_on('py-pyyaml',      type=('build', 'run'))
     depends_on('py-six',         type=('build', 'run'))
 
-    # Deep learning frameworks
-    depends_on('py-torch@0.4.0:', type=('build', 'run'), when='+pytorch')
-    depends_on('py-torch+cuda', type=('build', 'run'), when='+pytorch+cuda')
-    depends_on('py-cffi@1.4.0:', type=('build', 'run'), when='+pytorch')
-    depends_on('py-tensorflow@1.1.0:', type=('build', 'link', 'run'), when='+tensorflow')
-    depends_on('mxnet@1.4.0:+python', type=('build', 'link', 'run'), when='+mxnet')
-    depends_on('mxnet+cuda', type=('build', 'link', 'run'), when='+mxnet+cuda')
+    # Framework dependencies
+    depends_on('py-tensorflow@1.1.0:',  type=('build', 'link', 'run'), when='frameworks=tensorflow')
+    depends_on('py-torch@0.4.0:',       type=('build', 'run'),         when='frameworks=pytorch')
+    depends_on('py-torchvision',        type=('build', 'run'),         when='frameworks=pytorch')
+    depends_on('py-cffi@1.4.0:',        type=('build', 'run'),         when='frameworks=pytorch')
+    depends_on('mxnet@1.4.1:+python',   type=('build', 'link', 'run'), when='frameworks=mxnet')
+    depends_on('py-keras@2.0.8,2.1.2:', type=('build', 'run'),         when='frameworks=keras')
+    depends_on('py-h5py@2.9:',          type=('build', 'run'),         when='frameworks=spark')
+    depends_on('py-numpy',              type=('build', 'run'),         when='frameworks=spark')
+    depends_on('py-petastorm@0.8.2',    type=('build', 'run'),         when='frameworks=spark')
+    depends_on('py-pyarrow@0.15.0:',    type=('build', 'run'),         when='frameworks=spark')
+    depends_on('py-pyspark@2.3.2:',     type=('build', 'run'),         when='frameworks=spark')
 
-    # Distributed support
+    # Controller dependencies
+    depends_on('mpi', when='controllers=mpi')
     # There does not appear to be a way to use an external Gloo installation
-    depends_on('cmake', type='build', when='+gloo')
-    depends_on('mpi', when='+mpi')
-    depends_on('mpi', when='gpu_allreduce=mpi')
-    depends_on('mpi', when='gpu_allgather=mpi')
-    depends_on('mpi', when='gpu_broadcast=mpi')
+    depends_on('cmake', type='build', when='controllers=gloo')
 
-    # GPU support
-    depends_on('cuda', when='+cuda')
-    depends_on('nccl@2.0:', when='gpu_allreduce=nccl')
-    depends_on('nccl@2.0:', when='gpu_broadcast=nccl')
+    # Tensor Operations dependencies
+    depends_on('nccl', when='tensor_ops=nccl')
+    depends_on('mpi', when='tensor_ops=mpi')
+    # There does not appear to be a way to use an external Gloo installation
+    depends_on('cmake', type='build', when='tensor_ops=gloo')
 
     # Test dependencies
     depends_on('py-mock', type='test')
     depends_on('py-pytest', type='test')
     depends_on('py-pytest-forked', type='test')
 
-    conflicts('+gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS')
-    conflicts('~gloo~mpi', msg='One of Gloo or MPI are required for Horovod to run')
-    conflicts('~pytorch~tensorflow~mxnet', msg='At least one deep learning backend is required')
+    conflicts('controllers=gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS')
+
+    # https://github.com/horovod/horovod/pull/1835
+    patch('fma.patch', when='@0.19.0:0.19.1')
 
     def setup_build_environment(self, env):
-        # Deep learning frameworks
-        if '~pytorch' in self.spec:
-            env.set('HOROVOD_WITHOUT_PYTORCH', 1)
-        if '~tensorflow' in self.spec:
+        # Frameworks
+        if 'frameworks=tensorflow' in self.spec:
+            env.set('HOROVOD_WITH_TENSORFLOW', 1)
+        else:
             env.set('HOROVOD_WITHOUT_TENSORFLOW', 1)
-        if '~mxnet' in self.spec:
+        if 'frameworks=pytorch' in self.spec:
+            env.set('HOROVOD_WITH_PYTORCH', 1)
+        else:
+            env.set('HOROVOD_WITHOUT_PYTORCH', 1)
+        if 'frameworks=mxnet' in self.spec:
+            env.set('HOROVOD_WITH_MXNET', 1)
+        else:
             env.set('HOROVOD_WITHOUT_MXNET', 1)
 
-        # Distributed support
-        if '~gloo' in self.spec:
-            env.set('HOROVOD_WITHOUT_GLOO', 1)
-        if '+mpi' in self.spec:
+        # Controllers
+        if 'controllers=mpi' in self.spec:
             env.set('HOROVOD_WITH_MPI', 1)
         else:
             env.set('HOROVOD_WITHOUT_MPI', 1)
+        if 'controllers=gloo' in self.spec:
+            env.set('HOROVOD_WITH_GLOO', 1)
+        else:
+            env.set('HOROVOD_WITHOUT_GLOO', 1)
+
+        # Tensor Operations
+        if 'tensor_ops=nccl' in self.spec:
+            env.set('HOROVOD_GPU', 'CUDA')
 
-        # GPU support
-        if '+cuda' in self.spec:
             env.set('HOROVOD_CUDA_HOME', self.spec['cuda'].prefix)
             env.set('HOROVOD_CUDA_INCLUDE',
                     self.spec['cuda'].headers.directories[0])
             env.set('HOROVOD_CUDA_LIB', self.spec['cuda'].libs.directories[0])
-        if '^nccl' in self.spec:
+
             env.set('HOROVOD_NCCL_HOME', self.spec['nccl'].prefix)
             env.set('HOROVOD_NCCL_INCLUDE',
                     self.spec['nccl'].headers.directories[0])
             env.set('HOROVOD_NCCL_LIB', self.spec['nccl'].libs.directories[0])
-        env.set('HOROVOD_GPU_ALLREDUCE',
-                self.spec.variants['gpu_allreduce'].value.upper())
-        env.set('HOROVOD_GPU_ALLGATHER',
-                self.spec.variants['gpu_allgather'].value.upper())
-        env.set('HOROVOD_GPU_BROADCAST',
-                self.spec.variants['gpu_broadcast'].value.upper())
-        env.set('HOROVOD_ALLOW_MIXED_GPU_IMPL', 1)
+
+            env.set('HOROVOD_GPU_ALLREDUCE', 'NCCL')
+            env.set('HOROVOD_GPU_BROADCAST', 'NCCL')
+        else:
+            env.set('HOROVOD_CPU_OPERATIONS',
+                    self.spec.variants['tensor_ops'].value.upper())
+
+    @run_after('install')
+    @on_package_attributes(run_tests=True)
+    def install_test(self):
+        horovodrun = Executable(self.prefix.bin.horovodrun)
+        horovodrun('--check-build')
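Taken together, the new setup_build_environment is a small mapping from the three variants to Horovod's HOROVOD_* build flags. The standalone sketch below mirrors that mapping for reference; it is not part of the commit, the helper name and the plain dict standing in for Spack's env object are hypothetical, and the CUDA/NCCL path variables (which need the concretized spec) are omitted.

    # Standalone sketch of the variant -> build-flag mapping used above.
    # Only the HOROVOD_* names come from the recipe; everything else is illustrative.
    def horovod_build_env(frameworks, controllers, tensor_ops):
        env = {}
        # Frameworks: explicitly enable or disable each toggled framework.
        for fw in ('tensorflow', 'pytorch', 'mxnet'):
            prefix = 'HOROVOD_WITH_' if fw in frameworks else 'HOROVOD_WITHOUT_'
            env[prefix + fw.upper()] = 1
        # Controllers: same WITH/WITHOUT pattern for MPI and Gloo.
        for ctl in ('mpi', 'gloo'):
            prefix = 'HOROVOD_WITH_' if ctl in controllers else 'HOROVOD_WITHOUT_'
            env[prefix + ctl.upper()] = 1
        # Tensor operations: NCCL implies a CUDA build; anything else is a CPU backend.
        if tensor_ops == 'nccl':
            env['HOROVOD_GPU'] = 'CUDA'
            env['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
            env['HOROVOD_GPU_BROADCAST'] = 'NCCL'
        else:
            env['HOROVOD_CPU_OPERATIONS'] = tensor_ops.upper()
        return env

    # Example: a CPU-only PyTorch build, i.e. py-horovod frameworks=pytorch tensor_ops=mpi.
    print(horovod_build_env({'pytorch'}, {'mpi'}, 'mpi'))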
Author: Adam J. Stewart