ML CI: Linux aarch64 (#39666)

* ML CI: Linux aarch64
* Add config files
* No aarch64 tag
* Don't specify image
* Use amazonlinux image

Co-authored-by: kwryankrattiger <80296582+kwryankrattiger@users.noreply.github.com>

* Update and require
* GCC is too old
* Fix some builds
* xgboost doesn't support old GCC + cuda
* Run on newer Ubuntu
* Remove mxnet
* Try aarch64 range
* Use main branch
* Conflict applies to all targets
* cuda only required when +cuda
* Use tagged version
* Comment out tf-estimator
* Add ROCm, use newer Ubuntu
* Remove ROCm

---------

Co-authored-by: kwryankrattiger <80296582+kwryankrattiger@users.noreply.github.com>
2024-10-28 10:30:07 +01:00
parent e83536de38
commit 32ce278a51
3 changed files with 222 additions and 0 deletions
--- a/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
+++ b/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
@@ -735,6 +735,52 @@ ml-linux-x86_64-rocm-build:
    - artifacts: True
      job: ml-linux-x86_64-rocm-generate

+########################################
+# Machine Learning - Linux aarch64 (CPU)
+########################################
+.ml-linux-aarch64-cpu:  # template extended by the -generate and -build jobs of this stack
+  extends: [ ".linux_aarch64" ]
+  variables:
+    SPACK_CI_STACK_NAME: ml-linux-aarch64-cpu  # same name as the stacks/ml-linux-aarch64-cpu directory
+
+ml-linux-aarch64-cpu-generate:  # generation job for the CPU stack; the -build job consumes its artifacts
+  extends: [ ".generate-aarch64", ".ml-linux-aarch64-cpu" ]
+  image: ghcr.io/spack/ubuntu-24.04:v2024-09-05-v2  # same image the stack's spack.yaml sets for build jobs
+
+ml-linux-aarch64-cpu-build:  # triggers the child pipeline produced by the generate job
+  extends: [ ".build", ".ml-linux-aarch64-cpu" ]
+  trigger:
+    include:
+      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml  # pipeline file published by the job named below
+        job: ml-linux-aarch64-cpu-generate
+    strategy: depend  # this job's status follows the triggered pipeline's status
+  needs:
+    - artifacts: True
+      job: ml-linux-aarch64-cpu-generate
+
+#########################################
+# Machine Learning - Linux aarch64 (CUDA)
+#########################################
+.ml-linux-aarch64-cuda:  # template extended by the -generate and -build jobs of this stack
+  extends: [ ".linux_aarch64" ]
+  variables:
+    SPACK_CI_STACK_NAME: ml-linux-aarch64-cuda  # same name as the stacks/ml-linux-aarch64-cuda directory
+
+ml-linux-aarch64-cuda-generate:  # generation job for the CUDA stack; the -build job consumes its artifacts
+  extends: [ ".generate-aarch64", ".ml-linux-aarch64-cuda" ]
+  image: ghcr.io/spack/ubuntu-24.04:v2024-09-05-v2  # same image the stack's spack.yaml sets for build jobs
+
+ml-linux-aarch64-cuda-build:  # triggers the child pipeline produced by the generate job
+  extends: [ ".build", ".ml-linux-aarch64-cuda" ]
+  trigger:
+    include:
+      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml  # pipeline file published by the job named below
+        job: ml-linux-aarch64-cuda-generate
+    strategy: depend  # this job's status follows the triggered pipeline's status
+  needs:
+    - artifacts: True
+      job: ml-linux-aarch64-cuda-generate
+
 #########################################
 # Machine Learning - Darwin aarch64 (MPS)
 #########################################
--- a/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-aarch64-cpu/spack.yaml
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-aarch64-cpu/spack.yaml
@@ -0,0 +1,85 @@
+spack:
+  view: false
+  packages:  # requirements applied while concretizing this stack
+    all:
+      require:
+      - target=aarch64
+      - ~cuda  # CPU-only stack: CUDA variant disabled
+      - ~rocm  # CPU-only stack: ROCm variant disabled
+    mpi:
+      require: openmpi  # openmpi is required wherever mpi is needed
+
+  specs:  # packages built by this stack, grouped by framework
+    # Horovod
+    - py-horovod
+
+    # Hugging Face
+    - py-transformers
+
+    # JAX
+    - py-jax
+    - py-jaxlib
+
+    # Keras
+    - py-keras backend=tensorflow
+    - py-keras backend=jax
+    - py-keras backend=torch
+    - py-keras-applications
+    - py-keras-preprocessing
+    - py-keras2onnx
+
+    # PyTorch
+    - py-botorch
+    - py-efficientnet-pytorch
+    - py-gpytorch
+    - py-kornia
+    - py-lightning
+    - py-pytorch-gradual-warmup-lr
+    - py-pytorch-lightning
+    - py-segmentation-models-pytorch
+    - py-timm
+    - py-torch
+    - py-torch-cluster
+    - py-torch-geometric
+    - py-torch-nvidia-apex
+    - py-torch-scatter
+    - py-torch-sparse
+    - py-torch-spline-conv
+    - py-torchaudio
+    - py-torchdata
+    - py-torchfile
+    - py-torchgeo
+    - py-torchmetrics
+    - py-torchtext
+    - py-torchvision
+    - py-vector-quantize-pytorch
+
+    # scikit-learn
+    - py-scikit-learn
+    - py-scikit-learn-extra
+
+    # TensorBoard
+    - py-tensorboard
+    - py-tensorboard-data-server
+    - py-tensorboard-plugin-wit
+    - py-tensorboardx
+
+    # TensorFlow
+    - py-tensorflow
+    - py-tensorflow-datasets
+    - py-tensorflow-hub
+    - py-tensorflow-metadata
+    - py-tensorflow-probability
+
+    # XGBoost
+    - py-xgboost
+
+  ci:
+    pipeline-gen:
+    - build-job:  # settings applied to the generated build jobs
+        image:
+          name: ghcr.io/spack/ubuntu-24.04:v2024-09-05-v2
+          entrypoint: ['']  # override the image's entrypoint with an empty one
+
+  cdash:
+    build-group: Machine Learning  # CDash build group name used for this stack
--- a/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-aarch64-cuda/spack.yaml
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-aarch64-cuda/spack.yaml
@@ -0,0 +1,91 @@
+spack:
+  view: false
+  packages:  # requirements applied while concretizing this stack
+    all:
+      require:
+      - target=aarch64
+      - ~rocm  # CUDA stack: ROCm variant disabled
+      - +cuda  # CUDA stack: CUDA variant enabled
+      - cuda_arch=80
+    llvm:
+      # https://github.com/spack/spack/issues/27999
+      require: ~cuda  # llvm is built without CUDA; see the issue linked above
+    mpi:
+      require: openmpi  # openmpi is required wherever mpi is needed
+
+  specs:  # packages built by this stack, grouped by framework; disabled entries are commented out
+    # Horovod
+    - py-horovod
+
+    # Hugging Face
+    - py-transformers
+
+    # JAX
+    - py-jax
+    - py-jaxlib
+
+    # Keras
+    - py-keras backend=tensorflow
+    - py-keras backend=jax
+    - py-keras backend=torch
+    - py-keras-applications
+    - py-keras-preprocessing
+    - py-keras2onnx
+
+    # PyTorch
+    - py-botorch
+    - py-efficientnet-pytorch
+    - py-gpytorch
+    - py-kornia
+    - py-lightning
+    - py-pytorch-gradual-warmup-lr
+    - py-pytorch-lightning
+    - py-segmentation-models-pytorch
+    - py-timm
+    - py-torch
+    - py-torch-cluster
+    - py-torch-geometric
+    - py-torch-nvidia-apex
+    - py-torch-scatter
+    - py-torch-sparse
+    - py-torch-spline-conv
+    - py-torchaudio
+    - py-torchdata
+    - py-torchfile
+    - py-torchgeo
+    - py-torchmetrics
+    # torchtext requires older pytorch, which requires older cuda, which doesn't support newer GCC
+    # - py-torchtext
+    - py-torchvision
+    - py-vector-quantize-pytorch
+
+    # scikit-learn
+    - py-scikit-learn
+    - py-scikit-learn-extra
+
+    # TensorBoard
+    - py-tensorboard
+    - py-tensorboard-data-server
+    - py-tensorboard-plugin-wit
+    - py-tensorboardx
+
+    # TensorFlow
+    - py-tensorflow
+    - py-tensorflow-datasets
+    - py-tensorflow-hub
+    - py-tensorflow-metadata
+    - py-tensorflow-probability
+
+    # XGBoost
+    # xgboost requires older cuda, which doesn't support newer GCC
+    # - py-xgboost
+
+  ci:
+    pipeline-gen:
+    - build-job:  # settings applied to the generated build jobs
+        image:
+          name: ghcr.io/spack/ubuntu-24.04:v2024-09-05-v2
+          entrypoint: ['']  # override the image's entrypoint with an empty one
+
+  cdash:
+    build-group: Machine Learning  # CDash build group name used for this stack