CI: add ML ROCm stack (#45302)

* add ML ROCm stack

* add suggested changes

* remove py-torch and py-tensorflow-estimator

* add TF_ROCM_AMDGPU_TARGETS env variable and remove packages from pipeline

* remove py-jax and py-xgboost
This commit is contained in:
afzpatel 2024-07-24 10:16:15 -04:00 committed by GitHub
parent 1b5dc396e3
commit e529a454eb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 117 additions and 0 deletions

View File

@ -726,6 +726,29 @@ ml-linux-x86_64-cuda-build:
- artifacts: True
job: ml-linux-x86_64-cuda-generate
########################################
# Machine Learning - Linux x86_64 (ROCm)
########################################

# Shared template for the ROCm ML jobs: builds on the .linux_x86_64_v3
# template and sets the stack name.
.ml-linux-x86_64-rocm:
  extends:
    - ".linux_x86_64_v3"
  variables:
    SPACK_CI_STACK_NAME: ml-linux-x86_64-rocm

# Generate job for the ROCm ML stack.
ml-linux-x86_64-rocm-generate:
  extends:
    - ".generate-x86_64"
    - ".ml-linux-x86_64-rocm"
    - ".tags-x86_64_v4"
  image: ghcr.io/spack/ubuntu-22.04:v2024-05-07

# Build job: triggers a child pipeline from the cloud-ci-pipeline.yml artifact
# of the generate job and mirrors that pipeline's status (strategy: depend).
ml-linux-x86_64-rocm-build:
  extends:
    - ".build"
    - ".ml-linux-x86_64-rocm"
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-linux-x86_64-rocm-generate
    strategy: depend
  needs:
    - artifacts: True
      job: ml-linux-x86_64-rocm-generate
#########################################
# Machine Learning - Darwin aarch64 (MPS)
#########################################

View File

@ -0,0 +1,93 @@
# Spack environment for the Machine Learning ROCm stack.
spack:
  view: false

  packages:
    # Requirements applied to every package: x86_64_v3 target, CUDA off,
    # ROCm on, built for the gfx90a AMD GPU target.
    all:
      require:
        - target=x86_64_v3
        - ~cuda
        - +rocm
        - amdgpu_target=gfx90a
    gl:
      require: "osmesa"
    mpi:
      require: openmpi

  # Entries that are commented out are intentionally disabled; the reason, where
  # one is recorded, is given at the top of the group or next to the entry.
  specs:
    # Horovod
    # - py-horovod

    # Hugging Face
    - py-transformers

    # JAX
    # Does not yet support Spack-installed ROCm
    # - py-jax
    # - py-jaxlib

    # Keras
    - py-keras backend=tensorflow
    # - py-keras backend=jax
    # - py-keras backend=torch
    - py-keras-applications
    - py-keras-preprocessing
    - py-keras2onnx

    # PyTorch
    # Does not yet support Spack-installed ROCm
    # - py-botorch
    # - py-efficientnet-pytorch
    # - py-gpytorch
    # - py-kornia
    # - py-lightning
    # - py-pytorch-gradual-warmup-lr
    # - py-pytorch-lightning
    # - py-segmentation-models-pytorch
    # - py-timm
    # - py-torch
    # - py-torch-cluster
    # - py-torch-geometric
    # - py-torch-nvidia-apex
    # - py-torch-scatter
    # - py-torch-sparse
    # - py-torch-spline-conv
    # - py-torchaudio
    # - py-torchdata
    # - py-torchfile
    # - py-torchgeo
    # - py-torchmetrics
    # - py-torchtext
    # - py-torchvision
    # - py-vector-quantize-pytorch

    # scikit-learn
    - py-scikit-learn
    - py-scikit-learn-extra

    # TensorBoard
    - py-tensorboard
    - py-tensorboard-data-server
    - py-tensorboard-plugin-wit
    - py-tensorboardx

    # TensorFlow
    - py-tensorflow
    - py-tensorflow-datasets
    # py-tensorflow-estimator is disabled: version 2.16 is not available
    # - py-tensorflow-estimator
    - py-tensorflow-hub
    - py-tensorflow-metadata
    - py-tensorflow-probability

    # XGBoost
    # Does not yet support Spack-installed ROCm
    # - py-xgboost

  ci:
    pipeline-gen:
      # Container image for the build jobs; same image reference as the
      # stack's generate job in the CI configuration.
      - build-job:
          image:
            name: ghcr.io/spack/ubuntu-22.04:v2024-05-07
            entrypoint:
              - ''

  cdash:
    build-group: Machine Learning

View File

@ -562,6 +562,7 @@ def setup_build_environment(self, env):
for pkg_dep in rocm_dependencies:
pkg_dep_cap = pkg_dep.upper().replace("-", "_")
env.set(f"{pkg_dep_cap}_PATH", spec[pkg_dep].prefix)
env.set("TF_ROCM_AMDGPU_TARGETS", ",".join(self.spec.variants["amdgpu_target"].value))
else:
env.set("TF_NEED_ROCM", "0")