add a phist patch to avoid trying to compile SSE code if that is not … (#38806)

* Add a phist patch to avoid trying to compile SSE code if SSE is not available.
* phist: make the avoid-sse patch more robust, because the compiler on ARM systems still tried to compile SSE code.
2023-07-15 06:10:57 +02:00 · 2023-07-15 06:10:57 +02:00 · 5672c64356
commit 5672c64356
parent 1f58ac5ed3
2 changed files with 350 additions and 0 deletions
--- a/var/spack/repos/builtin/packages/phist/avoid-sse.patch
+++ b/var/spack/repos/builtin/packages/phist/avoid-sse.patch
@ -0,0 +1,346 @@
+commit eaef462cc07509fe8f380fbf520a2617b910b139
+Author: Jonas Thies <16190001+jthies@users.noreply.github.com>
+Date:   Sun Jul 9 21:33:30 2023 +0200
+
+    exit early from builtin kernels requiring SSE so that they are not compiled if it is not available
+    (this broke phist compilation on ARM systems, even though we never called these kernels if SSE was disabled)
+
+diff --git a/src/kernels/builtin/axpy_kernels_nt.c b/src/kernels/builtin/axpy_kernels_nt.c
+index 64d5fbd0..17c5024a 100644
+--- a/src/kernels/builtin/axpy_kernels_nt.c
+++ b/src/kernels/builtin/axpy_kernels_nt.c
+@@ -19,7 +19,9 @@
+ #endif
+ #include <stdint.h>
+ #include <stdio.h>
+#ifdef PHIST_HAVE_SSE
+ #include <emmintrin.h>
+#endif
+ #include <stdlib.h>
+ 
+ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count)
+@@ -30,6 +32,10 @@ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count)
+ 
+ void daxpy_nt_2_c(int nrows, const double *restrict alpha, const double *restrict x, double *restrict y)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(y,16) )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
+@@ -54,11 +60,16 @@ void daxpy_nt_2_c(int nrows, const double *restrict alpha, const double *restric
+     // non-temporal store
+     _mm_stream_pd(y+2*i, y_);
+   }
+#endif
+ }
+ 
+ 
+ void daxpy_nt_4_c(int nrows, const double *restrict alpha, const double *restrict x, double *restrict y)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(y,16) )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
+@@ -86,11 +97,16 @@ void daxpy_nt_4_c(int nrows, const double *restrict alpha, const double *restric
+       _mm_stream_pd(y+4*i+2*k, y_);
+     }
+   }
+#endif
+ }
+ 
+ 
+ void daxpy_nt_8_c(int nrows, const double *restrict alpha, const double *restrict x, double *restrict y)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(y,16) )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
+@@ -118,11 +134,16 @@ void daxpy_nt_8_c(int nrows, const double *restrict alpha, const double *restric
+       _mm_stream_pd(y+8*i+2*k, y_);
+     }
+   }
+#endif
+ }
+ 
+ 
+ void daxpy_nt_strided_2_c(int nrows, const double *restrict alpha, const double *restrict x, int ldx, double *restrict y, int ldy)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(y,16) )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
+@@ -140,11 +161,16 @@ void daxpy_nt_strided_2_c(int nrows, const double *restrict alpha, const double
+     // non-temporal store
+     _mm_stream_pd(y+ldy*i, y_);
+   }
+#endif
+ }
+ 
+ 
+ void daxpy_nt_strided_4_c(int nrows, const double *restrict alpha, const double *restrict x, int ldx, double *restrict y, int ldy)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(y,16) || ldy % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
+@@ -165,11 +191,16 @@ void daxpy_nt_strided_4_c(int nrows, const double *restrict alpha, const double
+       _mm_stream_pd(y+ldy*i+2*k, y_);
+     }
+   }
+#endif
+ }
+ 
+ 
+ void daxpy_nt_strided_8_c(int nrows, const double *restrict alpha, const double *restrict x, int ldx, double *restrict y, int ldy)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(y,16) || ldy % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
+@@ -190,11 +221,16 @@ void daxpy_nt_strided_8_c(int nrows, const double *restrict alpha, const double
+       _mm_stream_pd(y+ldy*i+2*k, y_);
+     }
+   }
+#endif
+ }
+ 
+ 
+ void dcopy_general_nt_c(int nrows, int nvec, const double *restrict x, int ldx, double *restrict y, int ldy)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( nvec % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)x);
+@@ -217,5 +253,6 @@ void dcopy_general_nt_c(int nrows, int nvec, const double *restrict x, int ldx,
+       _mm_stream_pd(y+i*ldy+2*j, tmp);
+     }
+   }
+#endif
+ }
+ 
+diff --git a/src/kernels/builtin/spmvm_kernels_nt.c b/src/kernels/builtin/spmvm_kernels_nt.c
+index d4d30bff..5d858878 100644
+--- a/src/kernels/builtin/spmvm_kernels_nt.c
+++ b/src/kernels/builtin/spmvm_kernels_nt.c
+@@ -19,7 +19,9 @@
+ #endif
+ #include <stdint.h>
+ #include <stdio.h>
+#ifdef PHIST_HAVE_SSE
+ #include <emmintrin.h>
+#endif
+ #include <stdlib.h>
+ 
+ #ifdef PHIST_HIGH_PRECISION_KERNELS
+@@ -35,6 +37,10 @@ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count)
+ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                  const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,16) )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -123,7 +129,7 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
+ #endif
+ 
+   // last row
+-#ifdef PHIST_HIGH_PRECISION_KERNELS
+# ifdef PHIST_HIGH_PRECISION_KERNELS
+   if( nrows % 2 != 0 )
+   {
+     double lhs, lhsC;
+@@ -136,7 +142,7 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
+ 
+     lhsv[nrows-1] = alpha*(lhs+lhsC);
+   }
+-#else
+# else
+   if( nrows % 2 != 0 )
+   {
+     lhsv[nrows-1] = shifts[0]*rhsv[nrows-1];
+@@ -146,6 +152,7 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
+       lhsv[nrows-1] += val[j]*halo[ (col_idx[j]-1) ];
+     lhsv[nrows-1] *= alpha;
+   }
+# endif
+ #endif
+ }
+ 
+@@ -153,6 +160,10 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
+ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                  const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv, int ldl)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,32) || ldl % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -176,7 +187,7 @@ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const
+   __m128d shifts_ = _mm_loadu_pd(shifts);
+   __m128d alpha_ = _mm_set1_pd(alpha);
+ 
+-#ifdef PHIST_HIGH_PRECISION_KERNELS
+# ifdef PHIST_HIGH_PRECISION_KERNELS
+ #pragma omp parallel for schedule(static)
+   for(int i = 0; i < nrows; i++)
+   {
+@@ -204,7 +215,7 @@ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const
+     // non-temporal store
+     _mm_stream_pd(lhsv+i*ldl, lhs);
+   }
+-#else
+# else
+ #pragma omp parallel for schedule(static)
+   for(int i = 0; i < nrows; i++)
+   {
+@@ -232,16 +243,21 @@ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const
+     // multiply with alpha
+     __m128d alpha_ = _mm_set1_pd(alpha);
+     lhs_ = _mm_mul_pd(alpha_,lhs_);
+- 
+
+     // non-temporal store
+     _mm_stream_pd(lhsv+i*ldl, lhs_);
+   }
+# endif
+ #endif
+ }
+ 
+ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                  const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv, int ldl)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,32) || ldl % 4 != 0 )
+   {
+     printf("%s: lhsv not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -261,7 +277,7 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
+   }
+ 
+ 
+-#ifdef PHIST_HIGH_PRECISION_KERNELS
+# ifdef PHIST_HIGH_PRECISION_KERNELS
+ 
+   __m256d shifts_ = _mm256_loadu_pd(shifts);
+   __m256d alpha_ = _mm256_set1_pd(alpha);
+@@ -294,7 +310,7 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
+     _mm256_stream_pd(lhsv+i*ldl, lhs);
+   }
+ 
+-#else
+# else
+ 
+   __m128d shifts_[2];
+   shifts_[0] = _mm_loadu_pd(shifts);
+@@ -341,6 +357,7 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
+     }
+   }
+ 
+# endif
+ #endif
+ }
+ 
+@@ -348,6 +365,10 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
+ void dspmvm_nt_8_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                  const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv, int ldl)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -412,12 +433,17 @@ void dspmvm_nt_8_c(int nrows, double alpha, const long *restrict row_ptr, const
+       _mm_stream_pd(lhsv+i*ldl+2*k, lhs_[k]);
+     }
+   }
+#endif
+ }
+ 
+ 
+ void dspmvm_nt_strided_2_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                          const double *restrict shifts, const double *restrict rhsv, int ldr, const double *restrict halo, double *restrict lhsv, int ldl)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -460,15 +486,20 @@ void dspmvm_nt_strided_2_c(int nrows, double alpha, const long *restrict row_ptr
+     // multiply with alpha
+     __m128d alpha_ = _mm_set1_pd(alpha);
+     lhs_ = _mm_mul_pd(alpha_,lhs_);
+- 
+
+     // non-temporal store
+     _mm_stream_pd(lhsv+i*ldl, lhs_);
+   }
+#endif
+ }
+ 
+ void dspmvm_nt_strided_4_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                          const double *restrict shifts, const double *restrict rhsv, int ldr, const double *restrict halo, double *restrict lhsv, int ldl)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -526,11 +557,16 @@ void dspmvm_nt_strided_4_c(int nrows, double alpha, const long *restrict row_ptr
+       _mm_stream_pd(lhsv+i*ldl+2*k, lhs_[k]);
+     }
+   }
+#endif
+ }
+ 
+ void dspmvm_nt_strided_8_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
+                          const double *restrict shifts, const double *restrict rhsv, int ldr, const double *restrict halo, double *restrict lhsv, int ldl)
+ {
+#ifndef PHIST_HAVE_SSE
+  printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+  exit(1);
+#else
+   if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
+   {
+     printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
+@@ -589,6 +625,7 @@ void dspmvm_nt_strided_8_c(int nrows, double alpha, const long *restrict row_ptr
+       _mm_stream_pd(lhsv+i*ldl+2*k, lhs_[k]);
+     }
+   }
+#endif
+ }
+ 
+ 
--- a/var/spack/repos/builtin/packages/phist/package.py
+++ b/var/spack/repos/builtin/packages/phist/package.py
@ -150,6 +150,10 @@ class Phist(CMakePackage):

    # ###################### Patches ##########################

+    # Avoid trying to compile some SSE code if SSE is not available.
+    # This patch will be part of phist 1.11.3 and greater and only affects
+    # the 'builtin' kernel_lib.
+    patch("avoid-sse.patch", when="@:1.11.2 kernel_lib=builtin")
    # Only applies to 1.9.4: While SSE instructions are handled correctly,
    # build fails on ppc64le unless -DNO_WARN_X86_INTRINSICS is defined.
    patch("ppc64_sse.patch", when="@1.9.4")