ML CI: Linux x86_64 (#34299)

* ML CI: Linux x86_64

* Update comments

* Rename again

* Rename comments

* Update to match other arches

* No compiler

* Compiler was wrong anyway

* Faster TF
This commit is contained in:
Adam J. Stewart 2022-12-22 11:31:40 -06:00 committed by GitHub
parent 371268a9aa
commit eb67497020
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 280 additions and 253 deletions

View File

@ -760,122 +760,122 @@ tutorial-protected-build:
- artifacts: True
job: tutorial-protected-generate
########################################
# Machine Learning (CPU)
########################################
.ml-cpu:
#######################################
# Machine Learning - Linux x86_64 (CPU)
#######################################
.ml-linux-x86_64-cpu:
variables:
SPACK_CI_STACK_NAME: ml-cpu
SPACK_CI_STACK_NAME: ml-linux-x86_64-cpu
.ml-cpu-generate:
extends: .ml-cpu
.ml-linux-x86_64-cpu-generate:
extends: .ml-linux-x86_64-cpu
image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
tags: ["spack", "aws", "public", "medium", "x86_64_v4"]
ml-cpu-pr-generate:
extends: [ ".ml-cpu-generate", ".pr-generate"]
ml-linux-x86_64-cpu-pr-generate:
extends: [ ".ml-linux-x86_64-cpu-generate", ".pr-generate"]
ml-cpu-protected-generate:
extends: [ ".ml-cpu-generate", ".protected-generate"]
ml-linux-x86_64-cpu-protected-generate:
extends: [ ".ml-linux-x86_64-cpu-generate", ".protected-generate"]
ml-cpu-pr-build:
extends: [ ".ml-cpu", ".pr-build" ]
ml-linux-x86_64-cpu-pr-build:
extends: [ ".ml-linux-x86_64-cpu", ".pr-build" ]
trigger:
include:
- artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
job: ml-cpu-pr-generate
job: ml-linux-x86_64-cpu-pr-generate
strategy: depend
needs:
- artifacts: True
job: ml-cpu-pr-generate
job: ml-linux-x86_64-cpu-pr-generate
ml-cpu-protected-build:
extends: [ ".ml-cpu", ".protected-build" ]
ml-linux-x86_64-cpu-protected-build:
extends: [ ".ml-linux-x86_64-cpu", ".protected-build" ]
trigger:
include:
- artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
job: ml-cpu-protected-generate
job: ml-linux-x86_64-cpu-protected-generate
strategy: depend
needs:
- artifacts: True
job: ml-cpu-protected-generate
job: ml-linux-x86_64-cpu-protected-generate
########################################
# Machine Learning (CUDA)
# Machine Learning - Linux x86_64 (CUDA)
########################################
.ml-cuda:
.ml-linux-x86_64-cuda:
variables:
SPACK_CI_STACK_NAME: ml-cuda
SPACK_CI_STACK_NAME: ml-linux-x86_64-cuda
.ml-cuda-generate:
extends: .ml-cuda
.ml-linux-x86_64-cuda-generate:
extends: .ml-linux-x86_64-cuda
image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
tags: ["spack", "aws", "public", "medium", "x86_64_v4"]
ml-cuda-pr-generate:
extends: [ ".ml-cuda-generate", ".pr-generate"]
ml-linux-x86_64-cuda-pr-generate:
extends: [ ".ml-linux-x86_64-cuda-generate", ".pr-generate"]
ml-cuda-protected-generate:
extends: [ ".ml-cuda-generate", ".protected-generate"]
ml-linux-x86_64-cuda-protected-generate:
extends: [ ".ml-linux-x86_64-cuda-generate", ".protected-generate"]
ml-cuda-pr-build:
extends: [ ".ml-cuda", ".pr-build" ]
ml-linux-x86_64-cuda-pr-build:
extends: [ ".ml-linux-x86_64-cuda", ".pr-build" ]
trigger:
include:
- artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
job: ml-cuda-pr-generate
job: ml-linux-x86_64-cuda-pr-generate
strategy: depend
needs:
- artifacts: True
job: ml-cuda-pr-generate
job: ml-linux-x86_64-cuda-pr-generate
ml-cuda-protected-build:
extends: [ ".ml-cuda", ".protected-build" ]
ml-linux-x86_64-cuda-protected-build:
extends: [ ".ml-linux-x86_64-cuda", ".protected-build" ]
trigger:
include:
- artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
job: ml-cuda-protected-generate
job: ml-linux-x86_64-cuda-protected-generate
strategy: depend
needs:
- artifacts: True
job: ml-cuda-protected-generate
job: ml-linux-x86_64-cuda-protected-generate
########################################
# Machine Learning (ROCm)
# Machine Learning - Linux x86_64 (ROCm)
########################################
.ml-rocm:
.ml-linux-x86_64-rocm:
variables:
SPACK_CI_STACK_NAME: ml-rocm
SPACK_CI_STACK_NAME: ml-linux-x86_64-rocm
.ml-rocm-generate:
extends: .ml-rocm
.ml-linux-x86_64-rocm-generate:
extends: .ml-linux-x86_64-rocm
image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
tags: ["spack", "aws", "public", "medium", "x86_64_v4"]
ml-rocm-pr-generate:
extends: [ ".ml-rocm-generate", ".pr-generate"]
ml-linux-x86_64-rocm-pr-generate:
extends: [ ".ml-linux-x86_64-rocm-generate", ".pr-generate"]
ml-rocm-protected-generate:
extends: [ ".ml-rocm-generate", ".protected-generate"]
ml-linux-x86_64-rocm-protected-generate:
extends: [ ".ml-linux-x86_64-rocm-generate", ".protected-generate"]
ml-rocm-pr-build:
extends: [ ".ml-rocm", ".pr-build" ]
ml-linux-x86_64-rocm-pr-build:
extends: [ ".ml-linux-x86_64-rocm", ".pr-build" ]
trigger:
include:
- artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
job: ml-rocm-pr-generate
job: ml-linux-x86_64-rocm-pr-generate
strategy: depend
needs:
- artifacts: True
job: ml-rocm-pr-generate
job: ml-linux-x86_64-rocm-pr-generate
ml-rocm-protected-build:
extends: [ ".ml-rocm", ".protected-build" ]
ml-linux-x86_64-rocm-protected-build:
extends: [ ".ml-linux-x86_64-rocm", ".protected-build" ]
trigger:
include:
- artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
job: ml-rocm-protected-generate
job: ml-linux-x86_64-rocm-protected-generate
strategy: depend
needs:
- artifacts: True
job: ml-rocm-protected-generate
job: ml-linux-x86_64-rocm-protected-generate

View File

@ -16,77 +16,85 @@ spack:
packages:
all:
compiler: [gcc@11.2.0]
target: [x86_64_v3]
variants: ~cuda~rocm
definitions:
- packages:
# Horovod
- py-horovod
# Hugging Face
- py-transformers
# JAX
- py-jax
- py-jaxlib
# Keras
- py-keras
- py-keras-applications
- py-keras-preprocessing
- py-keras2onnx
# PyTorch
- py-botorch
- py-efficientnet-pytorch
- py-gpytorch
- py-kornia
- py-pytorch-gradual-warmup-lr
- py-pytorch-lightning
- py-segmentation-models-pytorch
- py-timm
- py-torch
- py-torch-cluster
- py-torch-geometric
- py-torch-nvidia-apex
- py-torch-scatter
- py-torch-sparse
- py-torch-spline-conv
- py-torchaudio
- py-torchdata
- py-torchfile
- py-torchgeo
- py-torchmeta
- py-torchmetrics
- py-torchtext
- py-torchvision
- py-vector-quantize-pytorch
# scikit-learn
- py-scikit-learn
- py-scikit-learn-extra
# TensorBoard
- py-tensorboard
- py-tensorboard-data-server
- py-tensorboard-plugin-wit
- py-tensorboardx
# TensorFlow
- py-tensorflow
- py-tensorflow-datasets
- py-tensorflow-estimator
- py-tensorflow-hub
- py-tensorflow-metadata
- py-tensorflow-probability
# XGBoost
- py-xgboost
# - r-xgboost
- xgboost
- arch:
- target=x86_64_v3
specs:
# Horovod
- py-horovod
- matrix:
- [$packages]
- [$arch]
# Hugging Face
- py-transformers
# JAX
- py-jax
- py-jaxlib
# Keras
- py-keras
- py-keras-applications
- py-keras-preprocessing
- py-keras2onnx
# PyTorch
- py-botorch
- py-efficientnet-pytorch
- py-gpytorch
- py-kornia
- py-pytorch-gradual-warmup-lr
- py-pytorch-lightning
- py-segmentation-models-pytorch
- py-timm
- py-torch
- py-torch-cluster
- py-torch-geometric
- py-torch-nvidia-apex
- py-torch-scatter
- py-torch-sparse
- py-torch-spline-conv
- py-torchaudio
- py-torchdata
- py-torchfile
- py-torchgeo
- py-torchmeta
- py-torchmetrics
- py-torchtext
- py-torchvision
- py-vector-quantize-pytorch
# scikit-learn
- py-scikit-learn
- py-scikit-learn-extra
# TensorBoard
- py-tensorboard
- py-tensorboard-data-server
- py-tensorboard-plugin-wit
- py-tensorboardx
# TensorFlow
- py-tensorflow
- py-tensorflow-datasets
- py-tensorflow-estimator
- py-tensorflow-hub
- py-tensorflow-metadata
- py-tensorflow-probability
# XGBoost
- py-xgboost
# - r-xgboost
- xgboost
mirrors: { "mirror": "s3://spack-binaries/develop/ml-cpu" }
mirrors: { "mirror": "s3://spack-binaries/develop/ml-linux-x86_64-cpu" }
gitlab-ci:
script:
@ -113,6 +121,7 @@ spack:
mappings:
- match:
- llvm
- py-tensorflow
- py-torch
runner-attributes:
tags: [ "spack", "huge", "x86_64_v4" ]

View File

@ -16,80 +16,88 @@ spack:
packages:
all:
compiler: [gcc@11.2.0]
target: [x86_64_v3]
variants: ~rocm+cuda cuda_arch=80
llvm:
# https://github.com/spack/spack/issues/27999
require: ~cuda
definitions:
- packages:
# Horovod
- py-horovod
# Hugging Face
- py-transformers
# JAX
- py-jax
- py-jaxlib
# Keras
- py-keras
- py-keras-applications
- py-keras-preprocessing
- py-keras2onnx
# PyTorch
- py-botorch
- py-efficientnet-pytorch
- py-gpytorch
- py-kornia
- py-pytorch-gradual-warmup-lr
- py-pytorch-lightning
- py-segmentation-models-pytorch
- py-timm
- py-torch
- py-torch-cluster
- py-torch-geometric
- py-torch-nvidia-apex
- py-torch-scatter
- py-torch-sparse
- py-torch-spline-conv
- py-torchaudio
- py-torchdata
- py-torchfile
- py-torchgeo
- py-torchmeta
- py-torchmetrics
- py-torchtext
- py-torchvision
- py-vector-quantize-pytorch
# scikit-learn
- py-scikit-learn
- py-scikit-learn-extra
# TensorBoard
- py-tensorboard
- py-tensorboard-data-server
- py-tensorboard-plugin-wit
- py-tensorboardx
# TensorFlow
- py-tensorflow
- py-tensorflow-datasets
- py-tensorflow-estimator
- py-tensorflow-hub
- py-tensorflow-metadata
- py-tensorflow-probability
# XGBoost
- py-xgboost
# - r-xgboost
- xgboost
- arch:
- target=x86_64_v3
specs:
# Horovod
- py-horovod
- matrix:
- [$packages]
- [$arch]
# Hugging Face
- py-transformers
# JAX
- py-jax
- py-jaxlib
# Keras
- py-keras
- py-keras-applications
- py-keras-preprocessing
- py-keras2onnx
# PyTorch
- py-botorch
- py-efficientnet-pytorch
- py-gpytorch
- py-kornia
- py-pytorch-gradual-warmup-lr
- py-pytorch-lightning
- py-segmentation-models-pytorch
- py-timm
- py-torch
- py-torch-cluster
- py-torch-geometric
- py-torch-nvidia-apex
- py-torch-scatter
- py-torch-sparse
- py-torch-spline-conv
- py-torchaudio
- py-torchdata
- py-torchfile
- py-torchgeo
- py-torchmeta
- py-torchmetrics
- py-torchtext
- py-torchvision
- py-vector-quantize-pytorch
# scikit-learn
- py-scikit-learn
- py-scikit-learn-extra
# TensorBoard
- py-tensorboard
- py-tensorboard-data-server
- py-tensorboard-plugin-wit
- py-tensorboardx
# TensorFlow
- py-tensorflow
- py-tensorflow-datasets
- py-tensorflow-estimator
- py-tensorflow-hub
- py-tensorflow-metadata
- py-tensorflow-probability
# XGBoost
- py-xgboost
# - r-xgboost
- xgboost
mirrors: { "mirror": "s3://spack-binaries/develop/ml-cuda" }
mirrors: { "mirror": "s3://spack-binaries/develop/ml-linux-x86_64-cuda" }
gitlab-ci:
script:
@ -116,6 +124,7 @@ spack:
mappings:
- match:
- llvm
- py-tensorflow
- py-torch
runner-attributes:
tags: [ "spack", "huge", "x86_64_v4" ]

View File

@ -16,7 +16,6 @@ spack:
packages:
all:
compiler: [gcc@11.2.0]
target: [x86_64_v3]
variants: ~cuda+rocm amdgpu_target=gfx90a
gl:
@ -25,74 +24,83 @@ spack:
# Does not yet support Spack-installed ROCm
require: ~rocm
definitions:
- packages:
# Horovod
- py-horovod
# Hugging Face
- py-transformers
# JAX
- py-jax
- py-jaxlib
# Keras
- py-keras
- py-keras-applications
- py-keras-preprocessing
- py-keras2onnx
# PyTorch
# Does not yet support Spack-installed ROCm
# - py-botorch
# - py-efficientnet-pytorch
# - py-gpytorch
# - py-kornia
# - py-pytorch-gradual-warmup-lr
# - py-pytorch-lightning
# - py-segmentation-models-pytorch
# - py-timm
# - py-torch
# - py-torch-cluster
# - py-torch-geometric
# - py-torch-nvidia-apex
# - py-torch-scatter
# - py-torch-sparse
# - py-torch-spline-conv
# - py-torchaudio
# - py-torchdata
# - py-torchfile
# - py-torchgeo
# - py-torchmeta
# - py-torchmetrics
# - py-torchtext
# - py-torchvision
# - py-vector-quantize-pytorch
# scikit-learn
- py-scikit-learn
- py-scikit-learn-extra
# TensorBoard
- py-tensorboard
- py-tensorboard-data-server
- py-tensorboard-plugin-wit
- py-tensorboardx
# TensorFlow
- py-tensorflow
- py-tensorflow-datasets
- py-tensorflow-estimator
- py-tensorflow-hub
- py-tensorflow-metadata
- py-tensorflow-probability
# XGBoost
- py-xgboost
# - r-xgboost
- xgboost
- arch:
- target=x86_64_v3
specs:
# Horovod
- py-horovod
- matrix:
- [$packages]
- [$arch]
# Hugging Face
- py-transformers
# JAX
- py-jax
- py-jaxlib
# Keras
- py-keras
- py-keras-applications
- py-keras-preprocessing
- py-keras2onnx
# PyTorch
# Does not yet support Spack-installed ROCm
# - py-botorch
# - py-efficientnet-pytorch
# - py-gpytorch
# - py-kornia
# - py-pytorch-gradual-warmup-lr
# - py-pytorch-lightning
# - py-segmentation-models-pytorch
# - py-timm
# - py-torch
# - py-torch-cluster
# - py-torch-geometric
# - py-torch-nvidia-apex
# - py-torch-scatter
# - py-torch-sparse
# - py-torch-spline-conv
# - py-torchaudio
# - py-torchdata
# - py-torchfile
# - py-torchgeo
# - py-torchmeta
# - py-torchmetrics
# - py-torchtext
# - py-torchvision
# - py-vector-quantize-pytorch
# scikit-learn
- py-scikit-learn
- py-scikit-learn-extra
# TensorBoard
- py-tensorboard
- py-tensorboard-data-server
- py-tensorboard-plugin-wit
- py-tensorboardx
# TensorFlow
- py-tensorflow
- py-tensorflow-datasets
- py-tensorflow-estimator
- py-tensorflow-hub
- py-tensorflow-metadata
- py-tensorflow-probability
# XGBoost
- py-xgboost
# - r-xgboost
- xgboost
mirrors: { "mirror": "s3://spack-binaries/develop/ml-rocm" }
mirrors: { "mirror": "s3://spack-binaries/develop/ml-linux-x86_64-rocm" }
gitlab-ci:
script:
@ -118,8 +126,9 @@ spack:
match_behavior: first
mappings:
- match:
- llvm-amdgpu
- llvm
- llvm-amdgpu
- py-tensorflow
- py-torch
- rocblas
runner-attributes: