ML CI: Linux x86_64 (#34299)

* ML CI: Linux x86_64
* Update comments
* Rename again
* Rename comments
* Update to match other arches
* No compiler
* Compiler was wrong anyway
* Faster TF
2022-12-22 11:31:40 -06:00
parent 371268a9aa
commit eb67497020
4 changed files with 280 additions and 253 deletions
--- a/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
+++ b/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
@@ -760,122 +760,122 @@ tutorial-protected-build:
    - artifacts: True
      job: tutorial-protected-generate

-########################################
-# Machine Learning (CPU)
-########################################
-.ml-cpu:
+#######################################
+# Machine Learning - Linux x86_64 (CPU)
+#######################################
+.ml-linux-x86_64-cpu:
  variables:
-    SPACK_CI_STACK_NAME: ml-cpu
+    SPACK_CI_STACK_NAME: ml-linux-x86_64-cpu

-.ml-cpu-generate:
-  extends: .ml-cpu
+.ml-linux-x86_64-cpu-generate:
+  extends: .ml-linux-x86_64-cpu
  image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
  tags: ["spack", "aws", "public", "medium", "x86_64_v4"]

-ml-cpu-pr-generate:
-  extends: [ ".ml-cpu-generate", ".pr-generate"]
+ml-linux-x86_64-cpu-pr-generate:
+  extends: [ ".ml-linux-x86_64-cpu-generate", ".pr-generate"]

-ml-cpu-protected-generate:
-  extends: [ ".ml-cpu-generate", ".protected-generate"]
+ml-linux-x86_64-cpu-protected-generate:
+  extends: [ ".ml-linux-x86_64-cpu-generate", ".protected-generate"]

-ml-cpu-pr-build:
-  extends: [ ".ml-cpu", ".pr-build" ]
+ml-linux-x86_64-cpu-pr-build:
+  extends: [ ".ml-linux-x86_64-cpu", ".pr-build" ]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
-        job: ml-cpu-pr-generate
+        job: ml-linux-x86_64-cpu-pr-generate
    strategy: depend
  needs:
    - artifacts: True
-      job: ml-cpu-pr-generate
+      job: ml-linux-x86_64-cpu-pr-generate

-ml-cpu-protected-build:
-  extends: [ ".ml-cpu", ".protected-build" ]
+ml-linux-x86_64-cpu-protected-build:
+  extends: [ ".ml-linux-x86_64-cpu", ".protected-build" ]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
-        job: ml-cpu-protected-generate
+        job: ml-linux-x86_64-cpu-protected-generate
    strategy: depend
  needs:
    - artifacts: True
-      job: ml-cpu-protected-generate
+      job: ml-linux-x86_64-cpu-protected-generate

 ########################################
-# Machine Learning (CUDA)
+# Machine Learning - Linux x86_64 (CUDA)
 ########################################
-.ml-cuda:
+.ml-linux-x86_64-cuda:
  variables:
-    SPACK_CI_STACK_NAME: ml-cuda
+    SPACK_CI_STACK_NAME: ml-linux-x86_64-cuda

-.ml-cuda-generate:
-  extends: .ml-cuda
+.ml-linux-x86_64-cuda-generate:
+  extends: .ml-linux-x86_64-cuda
  image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
  tags: ["spack", "aws", "public", "medium", "x86_64_v4"]

-ml-cuda-pr-generate:
-  extends: [ ".ml-cuda-generate", ".pr-generate"]
+ml-linux-x86_64-cuda-pr-generate:
+  extends: [ ".ml-linux-x86_64-cuda-generate", ".pr-generate"]

-ml-cuda-protected-generate:
-  extends: [ ".ml-cuda-generate", ".protected-generate"]
+ml-linux-x86_64-cuda-protected-generate:
+  extends: [ ".ml-linux-x86_64-cuda-generate", ".protected-generate"]

-ml-cuda-pr-build:
-  extends: [ ".ml-cuda", ".pr-build" ]
+ml-linux-x86_64-cuda-pr-build:
+  extends: [ ".ml-linux-x86_64-cuda", ".pr-build" ]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
-        job: ml-cuda-pr-generate
+        job: ml-linux-x86_64-cuda-pr-generate
    strategy: depend
  needs:
    - artifacts: True
-      job: ml-cuda-pr-generate
+      job: ml-linux-x86_64-cuda-pr-generate

-ml-cuda-protected-build:
-  extends: [ ".ml-cuda", ".protected-build" ]
+ml-linux-x86_64-cuda-protected-build:
+  extends: [ ".ml-linux-x86_64-cuda", ".protected-build" ]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
-        job: ml-cuda-protected-generate
+        job: ml-linux-x86_64-cuda-protected-generate
    strategy: depend
  needs:
    - artifacts: True
-      job: ml-cuda-protected-generate
+      job: ml-linux-x86_64-cuda-protected-generate

 ########################################
-# Machine Learning (ROCm)
+# Machine Learning - Linux x86_64 (ROCm)
 ########################################
-.ml-rocm:
+.ml-linux-x86_64-rocm:
  variables:
-    SPACK_CI_STACK_NAME: ml-rocm
+    SPACK_CI_STACK_NAME: ml-linux-x86_64-rocm

-.ml-rocm-generate:
-  extends: .ml-rocm
+.ml-linux-x86_64-rocm-generate:
+  extends: .ml-linux-x86_64-rocm
  image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
  tags: ["spack", "aws", "public", "medium", "x86_64_v4"]

-ml-rocm-pr-generate:
-  extends: [ ".ml-rocm-generate", ".pr-generate"]
+ml-linux-x86_64-rocm-pr-generate:
+  extends: [ ".ml-linux-x86_64-rocm-generate", ".pr-generate"]

-ml-rocm-protected-generate:
-  extends: [ ".ml-rocm-generate", ".protected-generate"]
+ml-linux-x86_64-rocm-protected-generate:
+  extends: [ ".ml-linux-x86_64-rocm-generate", ".protected-generate"]

-ml-rocm-pr-build:
-  extends: [ ".ml-rocm", ".pr-build" ]
+ml-linux-x86_64-rocm-pr-build:
+  extends: [ ".ml-linux-x86_64-rocm", ".pr-build" ]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
-        job: ml-rocm-pr-generate
+        job: ml-linux-x86_64-rocm-pr-generate
    strategy: depend
  needs:
    - artifacts: True
-      job: ml-rocm-pr-generate
+      job: ml-linux-x86_64-rocm-pr-generate

-ml-rocm-protected-build:
-  extends: [ ".ml-rocm", ".protected-build" ]
+ml-linux-x86_64-rocm-protected-build:
+  extends: [ ".ml-linux-x86_64-rocm", ".protected-build" ]
  trigger:
    include:
      - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
-        job: ml-rocm-protected-generate
+        job: ml-linux-x86_64-rocm-protected-generate
    strategy: depend
  needs:
    - artifacts: True
-      job: ml-rocm-protected-generate
+      job: ml-linux-x86_64-rocm-protected-generate
--- a/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-x86_64-cpu/spack.yaml
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-x86_64-cpu/spack.yaml
@@ -16,11 +16,11 @@ spack:

  packages:
    all:
-      compiler: [gcc@11.2.0]
      target: [x86_64_v3]
      variants: ~cuda~rocm

-  specs:
+  definitions:
+    - packages:
      # Horovod
      - py-horovod

@@ -86,7 +86,15 @@ spack:
      # - r-xgboost
      - xgboost

-  mirrors: { "mirror": "s3://spack-binaries/develop/ml-cpu" }
+    - arch:
+      - target=x86_64_v3
+
+  specs:
+    - matrix:
+      - [$packages]
+      - [$arch]
+
+  mirrors: { "mirror": "s3://spack-binaries/develop/ml-linux-x86_64-cpu" }

  gitlab-ci:
    script:
@@ -113,6 +121,7 @@ spack:
    mappings:
      - match:
          - llvm
+          - py-tensorflow
          - py-torch
        runner-attributes:
          tags: [ "spack", "huge", "x86_64_v4" ]
--- a/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-x86_64-cuda/spack.yaml
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-x86_64-cuda/spack.yaml
@@ -16,14 +16,14 @@ spack:

  packages:
    all:
-      compiler: [gcc@11.2.0]
      target: [x86_64_v3]
      variants: ~rocm+cuda cuda_arch=80
    llvm:
      # https://github.com/spack/spack/issues/27999
      require: ~cuda

-  specs:
+  definitions:
+    - packages:
      # Horovod
      - py-horovod

@@ -89,7 +89,15 @@ spack:
      # - r-xgboost
      - xgboost

-  mirrors: { "mirror": "s3://spack-binaries/develop/ml-cuda" }
+    - arch:
+      - target=x86_64_v3
+
+  specs:
+    - matrix:
+      - [$packages]
+      - [$arch]
+
+  mirrors: { "mirror": "s3://spack-binaries/develop/ml-linux-x86_64-cuda" }

  gitlab-ci:
    script:
@@ -116,6 +124,7 @@ spack:
    mappings:
      - match:
          - llvm
+          - py-tensorflow
          - py-torch
        runner-attributes:
          tags: [ "spack", "huge", "x86_64_v4" ]
--- a/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-x86_64-rocm/spack.yaml
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-linux-x86_64-rocm/spack.yaml
@@ -16,7 +16,6 @@ spack:

  packages:
    all:
-      compiler: [gcc@11.2.0]
      target: [x86_64_v3]
      variants: ~cuda+rocm amdgpu_target=gfx90a
    gl:
@@ -25,7 +24,8 @@ spack:
      # Does not yet support Spack-installed ROCm
      require: ~rocm

-  specs:
+  definitions:
+    - packages:
      # Horovod
      - py-horovod

@@ -92,7 +92,15 @@ spack:
      # - r-xgboost
      - xgboost

-  mirrors: { "mirror": "s3://spack-binaries/develop/ml-rocm" }
+    - arch:
+      - target=x86_64_v3
+
+  specs:
+    - matrix:
+      - [$packages]
+      - [$arch]
+
+  mirrors: { "mirror": "s3://spack-binaries/develop/ml-linux-x86_64-rocm" }

  gitlab-ci:
    script:
@@ -118,8 +126,9 @@ spack:
    match_behavior: first
    mappings:
      - match:
-          - llvm-amdgpu
          - llvm
+          - llvm-amdgpu
+          - py-tensorflow
          - py-torch
          - rocblas
        runner-attributes: