Add CI stack for ML packages (#31592)

Basic stack of ML packages we would like to test and generate binaries for in CI. 

Spack now has a large CI framework in GitLab for PR testing and public binary generation.
We should take advantage of this to test and distribute optimized binaries for popular ML
frameworks.

This is a pretty extensive initial set, including CPU, ROCm, and CUDA versions of a core
`x86_64_v4` stack.

### Core ML frameworks

These are all popular core ML frameworks already available in Spack.

- [x] PyTorch
- [x] TensorFlow
- [x] Scikit-learn
- [x] MXNet
- [x] CNTK
- [x] Caffe
- [x] Chainer
- [x] XGBoost
- [x] Theano

### ML extensions

These are domain libraries and wrappers that build on top of core ML libraries.

- [x] Keras
- [x] TensorBoard
- [x] torchvision
- [x] torchtext
- [x] torchaudio
- [x] TorchGeo
- [x] PyTorch Lightning
- [x] torchmetrics
- [x] GPyTorch
- [x] Horovod

### ML-adjacent libraries

These are libraries that aren't specific to ML but are still core libraries used in ML pipelines.

- [x] numpy
- [x] scipy
- [x] pandas
- [x] ONNX
- [x] bazel

Co-authored-by: Jonathon Anderson <17242663+blue42u@users.noreply.github.com>
This commit is contained in:
Adam J. Stewart 2022-10-09 17:39:47 -05:00 committed by GitHub
parent 4a6aff8bd1
commit 01ede3c595
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 553 additions and 0 deletions

View File

@ -749,3 +749,123 @@ tutorial-protected-build:
needs:
- artifacts: True
job: tutorial-protected-generate
########################################
# Machine Learning (CPU)
########################################
# Hidden template (leading dot): sets the stack name that every ml-cpu job
# below inherits through `extends`.
.ml-cpu:
  variables:
    SPACK_CI_STACK_NAME: ml-cpu

# Settings shared by the two jobs that generate the ml-cpu child pipeline.
.ml-cpu-generate:
  extends: .ml-cpu
  image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
  tags: ["spack", "aws", "public", "medium", "x86_64_v4"]

# The `.pr-*` / `.protected-*` templates are not shown in this section.
ml-cpu-pr-generate:
  extends: [".ml-cpu-generate", ".pr-generate"]

ml-cpu-protected-generate:
  extends: [".ml-cpu-generate", ".protected-generate"]

# Triggers the pipeline file written by ml-cpu-pr-generate as a child
# pipeline; `strategy: depend` ties this job's status to that pipeline.
ml-cpu-pr-build:
  extends: [".ml-cpu", ".pr-build"]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-cpu-pr-generate
    strategy: depend
  needs:
    - artifacts: true
      job: ml-cpu-pr-generate

# Same as ml-cpu-pr-build, but fed by the protected generate job.
ml-cpu-protected-build:
  extends: [".ml-cpu", ".protected-build"]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-cpu-protected-generate
    strategy: depend
  needs:
    - artifacts: true
      job: ml-cpu-protected-generate
########################################
# Machine Learning (CUDA)
########################################
# Hidden template (leading dot): sets the stack name that every ml-cuda job
# below inherits through `extends`.
.ml-cuda:
  variables:
    SPACK_CI_STACK_NAME: ml-cuda

# Settings shared by the two jobs that generate the ml-cuda child pipeline.
.ml-cuda-generate:
  extends: .ml-cuda
  image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
  tags: ["spack", "aws", "public", "medium", "x86_64_v4"]

# The `.pr-*` / `.protected-*` templates are not shown in this section.
ml-cuda-pr-generate:
  extends: [".ml-cuda-generate", ".pr-generate"]

ml-cuda-protected-generate:
  extends: [".ml-cuda-generate", ".protected-generate"]

# Triggers the pipeline file written by ml-cuda-pr-generate as a child
# pipeline; `strategy: depend` ties this job's status to that pipeline.
ml-cuda-pr-build:
  extends: [".ml-cuda", ".pr-build"]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-cuda-pr-generate
    strategy: depend
  needs:
    - artifacts: true
      job: ml-cuda-pr-generate

# Same as ml-cuda-pr-build, but fed by the protected generate job.
ml-cuda-protected-build:
  extends: [".ml-cuda", ".protected-build"]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-cuda-protected-generate
    strategy: depend
  needs:
    - artifacts: true
      job: ml-cuda-protected-generate
########################################
# Machine Learning (ROCm)
########################################
# Hidden template (leading dot): sets the stack name that every ml-rocm job
# below inherits through `extends`.
.ml-rocm:
  variables:
    SPACK_CI_STACK_NAME: ml-rocm

# Settings shared by the two jobs that generate the ml-rocm child pipeline.
.ml-rocm-generate:
  extends: .ml-rocm
  image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
  tags: ["spack", "aws", "public", "medium", "x86_64_v4"]

# The `.pr-*` / `.protected-*` templates are not shown in this section.
ml-rocm-pr-generate:
  extends: [".ml-rocm-generate", ".pr-generate"]

ml-rocm-protected-generate:
  extends: [".ml-rocm-generate", ".protected-generate"]

# Triggers the pipeline file written by ml-rocm-pr-generate as a child
# pipeline; `strategy: depend` ties this job's status to that pipeline.
ml-rocm-pr-build:
  extends: [".ml-rocm", ".pr-build"]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-rocm-pr-generate
    strategy: depend
  needs:
    - artifacts: true
      job: ml-rocm-pr-generate

# Same as ml-rocm-pr-build, but fed by the protected generate job.
ml-rocm-protected-build:
  extends: [".ml-rocm", ".protected-build"]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
        job: ml-rocm-protected-generate
    strategy: depend
  needs:
    - artifacts: true
      job: ml-rocm-protected-generate

View File

@ -0,0 +1,142 @@
# Spack environment for the CPU-only machine learning CI stack.
spack:
  view: false

  concretizer:
    reuse: false
    unify: false

  config:
    concretizer: clingo
    install_tree:
      root: /home/software/spack
      # NOTE(review): presumably pads install prefixes so the resulting
      # binaries can be relocated - confirm against the Spack config docs.
      padded_length: 384
      projections:
        all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}"

  packages:
    all:
      compiler: [gcc@11.2.0]
      target: [x86_64_v4]
      # CPU stack: both GPU variants are switched off by default.
      variants: ~cuda~rocm

  specs:
    # Horovod
    - py-horovod

    # JAX
    # https://github.com/google/jax/issues/12614
    # - py-jax
    # - py-jaxlib

    # Keras
    - py-keras
    - py-keras-applications
    - py-keras-preprocessing
    - py-keras2onnx

    # PyTorch
    - py-botorch
    - py-efficientnet-pytorch
    - py-gpytorch
    - py-kornia
    - py-pytorch-gradual-warmup-lr
    - py-pytorch-lightning
    - py-segmentation-models-pytorch
    - py-timm
    - py-torch
    - py-torch-cluster
    - py-torch-geometric
    # https://github.com/NVIDIA/apex/issues/1498
    # - py-torch-nvidia-apex
    - py-torch-scatter
    - py-torch-sparse
    - py-torch-spline-conv
    - py-torchaudio
    - py-torchdata
    - py-torchfile
    - py-torchgeo
    - py-torchmeta
    - py-torchmetrics
    - py-torchtext
    - py-torchvision
    - py-vector-quantize-pytorch

    # scikit-learn
    - py-scikit-learn
    - py-scikit-learn-extra

    # TensorBoard
    - py-tensorboard
    - py-tensorboard-data-server
    - py-tensorboard-plugin-wit
    - py-tensorboardx

    # TensorFlow
    - py-tensorflow
    - py-tensorflow-datasets
    - py-tensorflow-estimator
    - py-tensorflow-hub
    - py-tensorflow-metadata
    - py-tensorflow-probability

    # XGBoost
    - py-xgboost
    # - r-xgboost
    - xgboost

  mirrors: {"mirror": "s3://spack-binaries/develop/ml-cpu"}

  gitlab-ci:
    # Commands run by each generated rebuild job.
    script:
      - . "./share/spack/setup-env.sh"
      - spack --version
      - cd ${SPACK_CONCRETE_ENV_DIR}
      - spack env activate --without-view .
      # Adds a projection for the package being built that prefixes its
      # install path with "morepadding/".
      - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'"
      - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data
      # Trust the signing keys only when they are mounted and readable.
      - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi
      - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi
      # Rebuild while copying stdout and stderr into the job artifacts.
      - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2)

    # Runner selection per spec. NOTE(review): presumably evaluated in order,
    # so the llvm entry must stay ahead of the catch-all - confirm.
    mappings:
      - match:
          - llvm
        runner-attributes:
          tags: ["spack", "huge", "x86_64_v4"]
          variables:
            CI_JOB_SIZE: huge
            KUBERNETES_CPU_REQUEST: 11000m
            KUBERNETES_MEMORY_REQUEST: 42G

      # "@:" is an unbounded version range, used here as the catch-all.
      - match:
          - "@:"
        runner-attributes:
          tags: ["spack", "large", "x86_64_v4"]
          variables:
            CI_JOB_SIZE: large
            KUBERNETES_CPU_REQUEST: 8000m
            KUBERNETES_MEMORY_REQUEST: 12G

    image: {"name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""]}

    broken-specs-url: "s3://spack-binaries/broken-specs"

    service-job-attributes:
      before_script:
        - . "./share/spack/setup-env.sh"
        - spack --version
      image: {"name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""]}
      tags: ["spack", "public", "x86_64_v4"]

    signing-job-attributes:
      image: {"name": "ghcr.io/spack/notary:latest", "entrypoint": [""]}
      tags: ["spack", "aws"]
      # Pull the spec.json files from the mirror's build cache, sign them,
      # then push only the resulting signature files back.
      script:
        - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp
        - /sign.sh
        - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache

  cdash:
    build-group: Machine Learning
    url: https://cdash.spack.io
    project: Spack Testing
    site: Cloud Gitlab Infrastructure

View File

@ -0,0 +1,144 @@
# Spack environment for the CUDA machine learning CI stack.
spack:
  view: false

  concretizer:
    reuse: false
    unify: false

  config:
    concretizer: clingo
    install_tree:
      root: /home/software/spack
      # NOTE(review): presumably pads install prefixes so the resulting
      # binaries can be relocated - confirm against the Spack config docs.
      padded_length: 384
      projections:
        all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}"

  packages:
    all:
      compiler: [gcc@11.2.0]
      target: [x86_64_v4]
      # CUDA stack: CUDA on (cuda_arch 80), ROCm off, by default.
      variants: ~rocm+cuda cuda_arch=80
    llvm:
      # https://github.com/spack/spack/issues/27999
      require: ~cuda

  specs:
    # Horovod
    - py-horovod

    # JAX
    # https://github.com/google/jax/issues/12614
    # - py-jax
    # - py-jaxlib

    # Keras
    - py-keras
    - py-keras-applications
    - py-keras-preprocessing
    - py-keras2onnx

    # PyTorch
    - py-botorch
    - py-efficientnet-pytorch
    - py-gpytorch
    - py-kornia
    - py-pytorch-gradual-warmup-lr
    - py-pytorch-lightning
    - py-segmentation-models-pytorch
    - py-timm
    - py-torch
    - py-torch-cluster
    - py-torch-geometric
    - py-torch-nvidia-apex
    - py-torch-scatter
    - py-torch-sparse
    - py-torch-spline-conv
    - py-torchaudio
    - py-torchdata
    - py-torchfile
    - py-torchgeo
    - py-torchmeta
    - py-torchmetrics
    - py-torchtext
    - py-torchvision
    - py-vector-quantize-pytorch

    # scikit-learn
    - py-scikit-learn
    - py-scikit-learn-extra

    # TensorBoard
    - py-tensorboard
    - py-tensorboard-data-server
    - py-tensorboard-plugin-wit
    - py-tensorboardx

    # TensorFlow
    - py-tensorflow
    - py-tensorflow-datasets
    - py-tensorflow-estimator
    - py-tensorflow-hub
    - py-tensorflow-metadata
    - py-tensorflow-probability

    # XGBoost
    - py-xgboost
    # - r-xgboost
    - xgboost

  mirrors: {"mirror": "s3://spack-binaries/develop/ml-cuda"}

  gitlab-ci:
    # Commands run by each generated rebuild job.
    script:
      - . "./share/spack/setup-env.sh"
      - spack --version
      - cd ${SPACK_CONCRETE_ENV_DIR}
      - spack env activate --without-view .
      # Adds a projection for the package being built that prefixes its
      # install path with "morepadding/".
      - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'"
      - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data
      # Trust the signing keys only when they are mounted and readable.
      - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi
      - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi
      # Rebuild while copying stdout and stderr into the job artifacts.
      - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2)

    # Runner selection per spec. NOTE(review): presumably evaluated in order,
    # so the llvm entry must stay ahead of the catch-all - confirm.
    mappings:
      - match:
          - llvm
        runner-attributes:
          tags: ["spack", "huge", "x86_64_v4"]
          variables:
            CI_JOB_SIZE: huge
            KUBERNETES_CPU_REQUEST: 11000m
            KUBERNETES_MEMORY_REQUEST: 42G

      # "@:" is an unbounded version range, used here as the catch-all.
      - match:
          - "@:"
        runner-attributes:
          tags: ["spack", "large", "x86_64_v4"]
          variables:
            CI_JOB_SIZE: large
            KUBERNETES_CPU_REQUEST: 8000m
            KUBERNETES_MEMORY_REQUEST: 12G

    image: {"name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""]}

    broken-specs-url: "s3://spack-binaries/broken-specs"

    service-job-attributes:
      before_script:
        - . "./share/spack/setup-env.sh"
        - spack --version
      image: {"name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""]}
      tags: ["spack", "public", "x86_64_v4"]

    signing-job-attributes:
      image: {"name": "ghcr.io/spack/notary:latest", "entrypoint": [""]}
      tags: ["spack", "aws"]
      # Pull the spec.json files from the mirror's build cache, sign them,
      # then push only the resulting signature files back.
      script:
        - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp
        - /sign.sh
        - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache

  cdash:
    build-group: Machine Learning
    url: https://cdash.spack.io
    project: Spack Testing
    site: Cloud Gitlab Infrastructure

View File

@ -0,0 +1,147 @@
# Spack environment for the ROCm machine learning CI stack.
spack:
  view: false

  concretizer:
    reuse: false
    unify: false

  config:
    concretizer: clingo
    install_tree:
      root: /home/software/spack
      # NOTE(review): presumably pads install prefixes so the resulting
      # binaries can be relocated - confirm against the Spack config docs.
      padded_length: 384
      projections:
        all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}"

  packages:
    all:
      compiler: [gcc@11.2.0]
      target: [x86_64_v4]
      # ROCm stack: ROCm on (amdgpu_target gfx90a), CUDA off, by default.
      variants: ~cuda+rocm amdgpu_target=gfx90a
    gl:
      require: "osmesa"
    py-torch:
      # Does not yet support Spack-installed ROCm
      require: ~rocm

  specs:
    # Horovod
    - py-horovod

    # JAX
    # https://github.com/google/jax/issues/12614
    # - py-jax
    # - py-jaxlib

    # Keras
    - py-keras
    - py-keras-applications
    - py-keras-preprocessing
    - py-keras2onnx

    # PyTorch
    # Does not yet support Spack-installed ROCm
    # - py-botorch
    # - py-efficientnet-pytorch
    # - py-gpytorch
    # - py-kornia
    # - py-pytorch-gradual-warmup-lr
    # - py-pytorch-lightning
    # - py-segmentation-models-pytorch
    # - py-timm
    # - py-torch
    # - py-torch-cluster
    # - py-torch-geometric
    # - py-torch-nvidia-apex
    # - py-torch-scatter
    # - py-torch-sparse
    # - py-torch-spline-conv
    # - py-torchaudio
    # - py-torchdata
    # - py-torchfile
    # - py-torchgeo
    # - py-torchmeta
    # - py-torchmetrics
    # - py-torchtext
    # - py-torchvision
    # - py-vector-quantize-pytorch

    # scikit-learn
    - py-scikit-learn
    - py-scikit-learn-extra

    # TensorBoard
    - py-tensorboard
    - py-tensorboard-data-server
    - py-tensorboard-plugin-wit
    - py-tensorboardx

    # TensorFlow
    - py-tensorflow
    - py-tensorflow-datasets
    - py-tensorflow-estimator
    - py-tensorflow-hub
    - py-tensorflow-metadata
    - py-tensorflow-probability

    # XGBoost
    - py-xgboost
    # - r-xgboost
    - xgboost

  mirrors: {"mirror": "s3://spack-binaries/develop/ml-rocm"}

  gitlab-ci:
    # Commands run by each generated rebuild job.
    script:
      - . "./share/spack/setup-env.sh"
      - spack --version
      - cd ${SPACK_CONCRETE_ENV_DIR}
      - spack env activate --without-view .
      # Adds a projection for the package being built that prefixes its
      # install path with "morepadding/".
      - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'"
      - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data
      # Trust the signing keys only when they are mounted and readable.
      - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi
      - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi
      # Rebuild while copying stdout and stderr into the job artifacts.
      - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2)

    # Runner selection per spec. NOTE(review): presumably evaluated in order,
    # so the llvm entry must stay ahead of the catch-all - confirm.
    mappings:
      - match:
          - llvm
        runner-attributes:
          tags: ["spack", "huge", "x86_64_v4"]
          variables:
            CI_JOB_SIZE: huge
            KUBERNETES_CPU_REQUEST: 11000m
            KUBERNETES_MEMORY_REQUEST: 42G

      # "@:" is an unbounded version range, used here as the catch-all.
      - match:
          - "@:"
        runner-attributes:
          tags: ["spack", "large", "x86_64_v4"]
          variables:
            CI_JOB_SIZE: large
            KUBERNETES_CPU_REQUEST: 8000m
            KUBERNETES_MEMORY_REQUEST: 12G

    image: {"name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""]}

    broken-specs-url: "s3://spack-binaries/broken-specs"

    service-job-attributes:
      before_script:
        - . "./share/spack/setup-env.sh"
        - spack --version
      image: {"name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""]}
      tags: ["spack", "public", "x86_64_v4"]

    signing-job-attributes:
      image: {"name": "ghcr.io/spack/notary:latest", "entrypoint": [""]}
      tags: ["spack", "aws"]
      # Pull the spec.json files from the mirror's build cache, sign them,
      # then push only the resulting signature files back.
      script:
        - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp
        - /sign.sh
        - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache

  cdash:
    build-group: Machine Learning
    url: https://cdash.spack.io
    project: Spack Testing
    site: Cloud Gitlab Infrastructure