CI: add ML ROCm stack (#45302)
* add ML ROCm stack
* add suggested changes
* remove py-torch and py-tensorflow-estimator
* add TF_ROCM_AMDGPU_TARGETS env variable and remove packages from pipeline
* remove py-jax and py-xgboost
This commit is contained in:
		@@ -726,6 +726,29 @@ ml-linux-x86_64-cuda-build:
 | 
			
		||||
    - artifacts: True
 | 
			
		||||
      job: ml-linux-x86_64-cuda-generate
 | 
			
		||||
 | 
			
		||||
########################################
 | 
			
		||||
# Machine Learning - Linux x86_64 (ROCm)
 | 
			
		||||
########################################
 | 
			
		||||
# Hidden template for the ML ROCm stack: inherits `.linux_x86_64_v3` and sets
# the stack name. Extended by the rocm generate and build jobs.
.ml-linux-x86_64-rocm:
  extends:
    - ".linux_x86_64_v3"
  variables:
    SPACK_CI_STACK_NAME: ml-linux-x86_64-rocm
 | 
			
		||||
 | 
			
		||||
# Generate job for the ML ROCm stack. `ml-linux-x86_64-rocm-build` consumes
# the `jobs_scratch_dir/cloud-ci-pipeline.yml` artifact from this job.
ml-linux-x86_64-rocm-generate:
  extends:
    - ".generate-x86_64"
    - ".ml-linux-x86_64-rocm"
    - ".tags-x86_64_v4"
  image: "ghcr.io/spack/ubuntu-22.04:v2024-05-07"
 | 
			
		||||
 | 
			
		||||
# Build job for the ML ROCm stack: triggers the child pipeline described by the
# artifact that `ml-linux-x86_64-rocm-generate` produced.
ml-linux-x86_64-rocm-build:
  extends:
    - ".build"
    - ".ml-linux-x86_64-rocm"
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-linux-x86_64-rocm-generate
    strategy: depend
  needs:
    # Fetch the generate job's artifacts so the include above can resolve.
    - artifacts: true
      job: ml-linux-x86_64-rocm-generate
 | 
			
		||||
 | 
			
		||||
#########################################
 | 
			
		||||
# Machine Learning - Darwin aarch64 (MPS)
 | 
			
		||||
#########################################
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,93 @@
 | 
			
		||||
# Spack environment for the Machine Learning ROCm CI stack.
spack:
  view: false

  packages:
    all:
      # Requirements listed for `all`: x86_64_v3 target, CUDA off, ROCm on,
      # and a single AMD GPU target (gfx90a).
      require:
        - "target=x86_64_v3"
        - "~cuda"
        - "+rocm"
        - "amdgpu_target=gfx90a"
    gl:
      require: "osmesa"
    mpi:
      require: "openmpi"

  specs:
    # Horovod
    # - py-horovod

    # Hugging Face
    - py-transformers

    # JAX
    # Does not yet support Spack-installed ROCm
    # - py-jax
    # - py-jaxlib

    # Keras
    - py-keras backend=tensorflow
    # - py-keras backend=jax
    # - py-keras backend=torch
    - py-keras-applications
    - py-keras-preprocessing
    - py-keras2onnx

    # PyTorch
    # Does not yet support Spack-installed ROCm
    # - py-botorch
    # - py-efficientnet-pytorch
    # - py-gpytorch
    # - py-kornia
    # - py-lightning
    # - py-pytorch-gradual-warmup-lr
    # - py-pytorch-lightning
    # - py-segmentation-models-pytorch
    # - py-timm
    # - py-torch
    # - py-torch-cluster
    # - py-torch-geometric
    # - py-torch-nvidia-apex
    # - py-torch-scatter
    # - py-torch-sparse
    # - py-torch-spline-conv
    # - py-torchaudio
    # - py-torchdata
    # - py-torchfile
    # - py-torchgeo
    # - py-torchmetrics
    # - py-torchtext
    # - py-torchvision
    # - py-vector-quantize-pytorch

    # scikit-learn
    - py-scikit-learn
    - py-scikit-learn-extra

    # TensorBoard
    - py-tensorboard
    - py-tensorboard-data-server
    - py-tensorboard-plugin-wit
    - py-tensorboardx

    # TensorFlow
    - py-tensorflow
    - py-tensorflow-datasets
    # version 2.16 is not available
    # - py-tensorflow-estimator
    - py-tensorflow-hub
    - py-tensorflow-metadata
    - py-tensorflow-probability

    # XGBoost
    # Does not yet support Spack-installed ROCm
    # - py-xgboost

  ci:
    pipeline-gen:
      # Build jobs run in the pinned Ubuntu 22.04 image with the entrypoint
      # set to a single empty string.
      - build-job:
          image:
            name: "ghcr.io/spack/ubuntu-22.04:v2024-05-07"
            entrypoint: [""]

  cdash:
    build-group: "Machine Learning"
 | 
			
		||||
@@ -562,6 +562,7 @@ def setup_build_environment(self, env):
 | 
			
		||||
            for pkg_dep in rocm_dependencies:
 | 
			
		||||
                pkg_dep_cap = pkg_dep.upper().replace("-", "_")
 | 
			
		||||
                env.set(f"{pkg_dep_cap}_PATH", spec[pkg_dep].prefix)
 | 
			
		||||
            env.set("TF_ROCM_AMDGPU_TARGETS", ",".join(self.spec.variants["amdgpu_target"].value))
 | 
			
		||||
        else:
 | 
			
		||||
            env.set("TF_NEED_ROCM", "0")
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user