add a phist patch to avoid trying to compile SSE code if that is not … (#38806)

* add a phist patch to avoid trying to compile SSE code if that is not available.

* phist: make the avoid-sse patch more robust, because the compiler on ARM systems still tried to compile SSE code
This commit is contained in:
Jonas Thies 2023-07-15 06:10:57 +02:00 committed by GitHub
parent 1f58ac5ed3
commit 5672c64356
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 350 additions and 0 deletions

View File

@@ -0,0 +1,346 @@
commit eaef462cc07509fe8f380fbf520a2617b910b139
Author: Jonas Thies <16190001+jthies@users.noreply.github.com>
Date: Sun Jul 9 21:33:30 2023 +0200
exit early from builtin kernels requiring SSE so that they are not compiled if it is not available
(this broke phist compilation on ARM systems, even though we never called these kernels if SSE was disabled)
diff --git a/src/kernels/builtin/axpy_kernels_nt.c b/src/kernels/builtin/axpy_kernels_nt.c
index 64d5fbd0..17c5024a 100644
--- a/src/kernels/builtin/axpy_kernels_nt.c
+++ b/src/kernels/builtin/axpy_kernels_nt.c
@@ -19,7 +19,9 @@
#endif
#include <stdint.h>
#include <stdio.h>
+#ifdef PHIST_HAVE_SSE
#include <emmintrin.h>
+#endif
#include <stdlib.h>
static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count)
@@ -30,6 +32,10 @@ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count)
void daxpy_nt_2_c(int nrows, const double *restrict alpha, const double *restrict x, double *restrict y)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(y,16) )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
@@ -54,11 +60,16 @@ void daxpy_nt_2_c(int nrows, const double *restrict alpha, const double *restric
// non-temporal store
_mm_stream_pd(y+2*i, y_);
}
+#endif
}
void daxpy_nt_4_c(int nrows, const double *restrict alpha, const double *restrict x, double *restrict y)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(y,16) )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
@@ -86,11 +97,16 @@ void daxpy_nt_4_c(int nrows, const double *restrict alpha, const double *restric
_mm_stream_pd(y+4*i+2*k, y_);
}
}
+#endif
}
void daxpy_nt_8_c(int nrows, const double *restrict alpha, const double *restrict x, double *restrict y)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(y,16) )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
@@ -118,11 +134,16 @@ void daxpy_nt_8_c(int nrows, const double *restrict alpha, const double *restric
_mm_stream_pd(y+8*i+2*k, y_);
}
}
+#endif
}
void daxpy_nt_strided_2_c(int nrows, const double *restrict alpha, const double *restrict x, int ldx, double *restrict y, int ldy)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(y,16) )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
@@ -140,11 +161,16 @@ void daxpy_nt_strided_2_c(int nrows, const double *restrict alpha, const double
// non-temporal store
_mm_stream_pd(y+ldy*i, y_);
}
+#endif
}
void daxpy_nt_strided_4_c(int nrows, const double *restrict alpha, const double *restrict x, int ldx, double *restrict y, int ldy)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(y,16) || ldy % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
@@ -165,11 +191,16 @@ void daxpy_nt_strided_4_c(int nrows, const double *restrict alpha, const double
_mm_stream_pd(y+ldy*i+2*k, y_);
}
}
+#endif
}
void daxpy_nt_strided_8_c(int nrows, const double *restrict alpha, const double *restrict x, int ldx, double *restrict y, int ldy)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(y,16) || ldy % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)y);
@@ -190,11 +221,16 @@ void daxpy_nt_strided_8_c(int nrows, const double *restrict alpha, const double
_mm_stream_pd(y+ldy*i+2*k, y_);
}
}
+#endif
}
void dcopy_general_nt_c(int nrows, int nvec, const double *restrict x, int ldx, double *restrict y, int ldy)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( nvec % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)x);
@@ -217,5 +253,6 @@ void dcopy_general_nt_c(int nrows, int nvec, const double *restrict x, int ldx,
_mm_stream_pd(y+i*ldy+2*j, tmp);
}
}
+#endif
}
diff --git a/src/kernels/builtin/spmvm_kernels_nt.c b/src/kernels/builtin/spmvm_kernels_nt.c
index d4d30bff..5d858878 100644
--- a/src/kernels/builtin/spmvm_kernels_nt.c
+++ b/src/kernels/builtin/spmvm_kernels_nt.c
@@ -19,7 +19,9 @@
#endif
#include <stdint.h>
#include <stdio.h>
+#ifdef PHIST_HAVE_SSE
#include <emmintrin.h>
+#endif
#include <stdlib.h>
#ifdef PHIST_HIGH_PRECISION_KERNELS
@@ -35,6 +37,10 @@ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count)
void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,16) )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -123,7 +129,7 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
#endif
// last row
-#ifdef PHIST_HIGH_PRECISION_KERNELS
+# ifdef PHIST_HIGH_PRECISION_KERNELS
if( nrows % 2 != 0 )
{
double lhs, lhsC;
@@ -136,7 +142,7 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
lhsv[nrows-1] = alpha*(lhs+lhsC);
}
-#else
+# else
if( nrows % 2 != 0 )
{
lhsv[nrows-1] = shifts[0]*rhsv[nrows-1];
@@ -146,6 +152,7 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
lhsv[nrows-1] += val[j]*halo[ (col_idx[j]-1) ];
lhsv[nrows-1] *= alpha;
}
+# endif
#endif
}
@@ -153,6 +160,10 @@ void dspmvm_nt_1_c(int nrows, double alpha, const long *restrict row_ptr, const
void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv, int ldl)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,32) || ldl % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -176,7 +187,7 @@ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const
__m128d shifts_ = _mm_loadu_pd(shifts);
__m128d alpha_ = _mm_set1_pd(alpha);
-#ifdef PHIST_HIGH_PRECISION_KERNELS
+# ifdef PHIST_HIGH_PRECISION_KERNELS
#pragma omp parallel for schedule(static)
for(int i = 0; i < nrows; i++)
{
@@ -204,7 +215,7 @@ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const
// non-temporal store
_mm_stream_pd(lhsv+i*ldl, lhs);
}
-#else
+# else
#pragma omp parallel for schedule(static)
for(int i = 0; i < nrows; i++)
{
@@ -232,16 +243,21 @@ void dspmvm_nt_2_c(int nrows, double alpha, const long *restrict row_ptr, const
// multiply with alpha
__m128d alpha_ = _mm_set1_pd(alpha);
lhs_ = _mm_mul_pd(alpha_,lhs_);
-
+
// non-temporal store
_mm_stream_pd(lhsv+i*ldl, lhs_);
}
+# endif
#endif
}
void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv, int ldl)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,32) || ldl % 4 != 0 )
{
printf("%s: lhsv not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -261,7 +277,7 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
}
-#ifdef PHIST_HIGH_PRECISION_KERNELS
+# ifdef PHIST_HIGH_PRECISION_KERNELS
__m256d shifts_ = _mm256_loadu_pd(shifts);
__m256d alpha_ = _mm256_set1_pd(alpha);
@@ -294,7 +310,7 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
_mm256_stream_pd(lhsv+i*ldl, lhs);
}
-#else
+# else
__m128d shifts_[2];
shifts_[0] = _mm_loadu_pd(shifts);
@@ -341,6 +357,7 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
}
}
+# endif
#endif
}
@@ -348,6 +365,10 @@ void dspmvm_nt_4_c(int nrows, double alpha, const long *restrict row_ptr, const
void dspmvm_nt_8_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, const double *restrict halo, double *restrict lhsv, int ldl)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -412,12 +433,17 @@ void dspmvm_nt_8_c(int nrows, double alpha, const long *restrict row_ptr, const
_mm_stream_pd(lhsv+i*ldl+2*k, lhs_[k]);
}
}
+#endif
}
void dspmvm_nt_strided_2_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, int ldr, const double *restrict halo, double *restrict lhsv, int ldl)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -460,15 +486,20 @@ void dspmvm_nt_strided_2_c(int nrows, double alpha, const long *restrict row_ptr
// multiply with alpha
__m128d alpha_ = _mm_set1_pd(alpha);
lhs_ = _mm_mul_pd(alpha_,lhs_);
-
+
// non-temporal store
_mm_stream_pd(lhsv+i*ldl, lhs_);
}
+#endif
}
void dspmvm_nt_strided_4_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, int ldr, const double *restrict halo, double *restrict lhsv, int ldl)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -526,11 +557,16 @@ void dspmvm_nt_strided_4_c(int nrows, double alpha, const long *restrict row_ptr
_mm_stream_pd(lhsv+i*ldl+2*k, lhs_[k]);
}
}
+#endif
}
void dspmvm_nt_strided_8_c(int nrows, double alpha, const long *restrict row_ptr, const long *restrict halo_ptr, const int *restrict col_idx, const double *restrict val,
const double *restrict shifts, const double *restrict rhsv, int ldr, const double *restrict halo, double *restrict lhsv, int ldl)
{
+#ifndef PHIST_HAVE_SSE
+ printf("%s: must not be called on platforms without SSE.", __FUNCTION__);
+ exit(1);
+#else
if( !is_aligned(lhsv,16) || ldl % 2 != 0 )
{
printf("%s: not aligned %lx\n", __FUNCTION__, (uintptr_t)(void*)lhsv);
@@ -589,6 +625,7 @@ void dspmvm_nt_strided_8_c(int nrows, double alpha, const long *restrict row_ptr
_mm_stream_pd(lhsv+i*ldl+2*k, lhs_[k]);
}
}
+#endif
}

View File

@@ -150,6 +150,10 @@ class Phist(CMakePackage):
# ###################### Patches ##########################
# Avoid trying to compile some SSE code if SSE is not available
# This patch will be part of phist 1.11.3 and greater and only affects
# the 'builtin' kernel_lib.
patch("avoid-sse.patch", when="@:1.11.2 kernel_lib=builtin")
# Only applies to 1.9.4: While SSE instructions are handled correctly,
# build fails on ppc64le unless -DNO_WARN_X86_INTRINSICS is defined.
patch("ppc64_sse.patch", when="@1.9.4")