
* Initial commit for v2.0 of the CEED software suite.
* Update Nek packages and gslib
* Help spack concretize the hypre version for ceed-2.0.
* Fix nekcem install error
* Add support for gfortran v8 in nek5000 and nekcem.
* Split Nek5000 into Nek5000 and Nektools
* Get Nektools to build fine in Theta
* Fix travis failure: remove unused 'import numbers' from nek5000.
* Check for gfortran if it is wrapped
* Tweak the detection of gfortran in nek5000.
* Fix Nek packages to add -std=legacy when FC=gcc
* spack install ceed~petsc works fine on Theta
* Fix flake8 errors
* Fix more flake8 tests
* Fix an import issue
* Tweak the suite-sparse package to avoid interaction with existing system
installations of suite-sparse.
* petsc: update superlu-dist dependency
* Updates in the packages: occa, libceed, and ceed.
* In the libceed package, explicitly tell nvcc which host compiler to use.
* Fix python formatting.
* Simplify the test for gfortran in nek* packages.
* ceed: 2.0 uses petsc@3.11.0
* hpgmg-0.4; use from ceed@2.0.0
* Update the hypre dependency for ceed 2.0.
* Disable the superlu-dist dependency (through hypre) when using a
quickbuild of ceed 2.0.
* petsc-3.11.0: add xlf fix
* nekcem: has a build dependency on Python 2.7+
* hpgmg: better setting of compiler options and use python for configure
* libceed: use v0.4 tag
* libceed: fix 0.4 release oops (pkgconfig version)
* Add a patch for magma-2.5.0 that brings it up to the current 'master'.
* In the mfem package, install the examples, miniapps, and data under
$prefix/share/mfem.
* In the magma package, apply a patch to v2.5.0 that disables
magma_sparse - for testing purposes.
* In the magma package, link the 'magma' library with the
'nvToolsExt' library.
* In the magma package, update the 'magma-2.5.0.patch' with the latest
commits from the magma source repository. Also, remove the library
'nvToolsExt' from the 'magma-2.5.0-cmake.patch' - now it is not
needed.
* In the magma package, disable OpenMP when using v2.5.0 with the
IBM XL compiler.
* In the mfem package, add version for the 'laghos-v2.0' tag; also,
prefix the versions `laghos-v*` with their respective development
version numbers -- this way they are properly ordered within spack
relative to the official numbered versions.
* petsc: add version 3.11.1 (#11179)
(cherry picked from commit 1eab6e3c86)
* ceed-2.0: use petsc-3.11.1
* this-is-so-dumb.f -> empty.f
429 lines
15 KiB
Diff
429 lines
15 KiB
Diff
diff -r 89706c0efbdb .hgtags
|
|
--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500
|
|
+++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700
|
|
@@ -1,3 +1,4 @@
|
|
9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
|
|
b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
|
|
04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
|
|
+89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
|
|
diff -r 89706c0efbdb include/magma_types.h
|
|
--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500
|
|
+++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700
|
|
@@ -77,7 +77,7 @@
|
|
typedef magma_int_t magma_device_t;
|
|
|
|
// Half precision in CUDA
|
|
- #if defined(__cplusplus) && CUDA_VERSION > 7500
|
|
+ #if defined(__cplusplus) && CUDA_VERSION >= 7500
|
|
#include <cuda_fp16.h>
|
|
typedef __half magmaHalf;
|
|
#else
|
|
diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
|
|
--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500
|
|
+++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700
|
|
@@ -15,9 +15,12 @@
|
|
|
|
#define PRECISION_z
|
|
|
|
+
|
|
namespace magma_sampleselect {
|
|
|
|
-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
|
|
+__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
|
|
+{
|
|
+#if (__CUDA_ARCH__ >= 350)
|
|
auto idx = threadIdx.x + blockDim.x * blockIdx.x;
|
|
if (idx >= size) {
|
|
return;
|
|
@@ -25,6 +28,7 @@
|
|
|
|
auto v = in[idx];
|
|
out[idx] = real(v) * real(v) + imag(v) * imag(v);
|
|
+#endif
|
|
}
|
|
|
|
} // namespace magma_sampleselect
|
|
@@ -164,36 +168,43 @@
|
|
magma_queue_t queue )
|
|
{
|
|
magma_int_t info = 0;
|
|
+ magma_int_t arch = magma_getdevice_arch();
|
|
|
|
- auto num_blocks = magma_ceildiv(total_size, block_size);
|
|
- auto local_work = (total_size + num_threads - 1) / num_threads;
|
|
- auto required_size = sizeof(double) * (total_size + searchtree_size)
|
|
+ if( arch >= 350 ) {
|
|
+ auto num_blocks = magma_ceildiv(total_size, block_size);
|
|
+ auto local_work = (total_size + num_threads - 1) / num_threads;
|
|
+ auto required_size = sizeof(double) * (total_size + searchtree_size)
|
|
+ sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
|
|
- auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
|
|
+ auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
|
|
|
|
- double* gputmp = (double*)*tmp_ptr;
|
|
- double* gputree = gputmp + total_size;
|
|
- uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
|
|
- int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
|
|
- int32_t* gpucounts = gpurankout + 1;
|
|
- int32_t* gpulocalcounts = gpucounts + searchtree_width;
|
|
- uint32_t bucketidx{};
|
|
+ double* gputmp = (double*)*tmp_ptr;
|
|
+ double* gputree = gputmp + total_size;
|
|
+ uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
|
|
+ int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
|
|
+ int32_t* gpucounts = gpurankout + 1;
|
|
+ int32_t* gpulocalcounts = gpucounts + searchtree_width;
|
|
+ uint32_t bucketidx{};
|
|
|
|
- CHECK(realloc_result);
|
|
+ CHECK(realloc_result);
|
|
|
|
- compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
|
|
- (val, gputmp, total_size);
|
|
- build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
|
|
- (gputmp, gputree, total_size);
|
|
- count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
|
|
- (gputmp, gputree, gpulocalcounts, total_size, local_work);
|
|
- reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
|
|
- (gpulocalcounts, gpucounts, num_grouped_blocks);
|
|
- sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
|
|
- (gpucounts, subset_size, gpubucketidx, gpurankout);
|
|
- magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
|
|
- magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
|
|
- *thrs = std::sqrt(*thrs);
|
|
+ compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
|
|
+ (val, gputmp, total_size);
|
|
+ build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
|
|
+ (gputmp, gputree, total_size);
|
|
+ count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
|
|
+ (gputmp, gputree, gpulocalcounts, total_size, local_work);
|
|
+ reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
|
|
+ (gpulocalcounts, gpucounts, num_grouped_blocks);
|
|
+ sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
|
|
+ (gpucounts, subset_size, gpubucketidx, gpurankout);
|
|
+ magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
|
|
+ magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
|
|
+ *thrs = std::sqrt(*thrs);
|
|
+ }
|
|
+ else {
|
|
+ printf("error: this functionality needs CUDA architecture >= 3.5\n");
|
|
+ info = MAGMA_ERR_NOT_SUPPORTED;
|
|
+ }
|
|
|
|
cleanup:
|
|
return info;
|
|
diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
|
|
--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
|
|
+++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
|
|
@@ -16,6 +16,131 @@
|
|
#include <cuda_fp16.h>
|
|
#endif
|
|
|
|
+#if CUDA_VERSION < 9020
|
|
+// conversion float to half are not defined for host in CUDA version <9.2
|
|
+// thus uses the conversion below when CUDA VERSION is < 9.2.
|
|
+#include <string.h>
|
|
+//
|
|
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
|
|
+//
|
|
+// Redistribution and use in source and binary forms, with or without
|
|
+// modification, are permitted provided that the following conditions
|
|
+// are met:
|
|
+// * Redistributions of source code must retain the above copyright
|
|
+// notice, this list of conditions and the following disclaimer.
|
|
+// * Redistributions in binary form must reproduce the above copyright
|
|
+// notice, this list of conditions and the following disclaimer in the
|
|
+// documentation and/or other materials provided with the distribution.
|
|
+// * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
+// contributors may be used to endorse or promote products derived
|
|
+// from this software without specific prior written permission.
|
|
+//
|
|
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+
|
|
+// This code modified from the public domain code here:
|
|
+// https://gist.github.com/rygorous/2156668
|
|
+// The URL above includes more robust conversion routines
|
|
+// that handle Inf and NaN correctly.
|
|
+//
|
|
+// It is recommended to use the more robust versions in production code.
|
|
+
|
|
+typedef unsigned uint;
|
|
+
|
|
+union FP32
|
|
+{
|
|
+ uint u;
|
|
+ float f;
|
|
+ struct
|
|
+ {
|
|
+ uint Mantissa : 23;
|
|
+ uint Exponent : 8;
|
|
+ uint Sign : 1;
|
|
+ };
|
|
+};
|
|
+
|
|
+union FP16
|
|
+{
|
|
+ unsigned short u;
|
|
+ struct
|
|
+ {
|
|
+ uint Mantissa : 10;
|
|
+ uint Exponent : 5;
|
|
+ uint Sign : 1;
|
|
+ };
|
|
+};
|
|
+
|
|
+// Approximate solution. This is faster but converts some sNaNs to
|
|
+// infinity and doesn't round correctly. Handle with care.
|
|
+// Approximate solution. This is faster but converts some sNaNs to
|
|
+// infinity and doesn't round correctly. Handle with care.
|
|
+static half approx_float_to_half(float fl)
|
|
+{
|
|
+ FP32 f32infty = { 255 << 23 };
|
|
+ FP32 f16max = { (127 + 16) << 23 };
|
|
+ FP32 magic = { 15 << 23 };
|
|
+ FP32 expinf = { (255 ^ 31) << 23 };
|
|
+ uint sign_mask = 0x80000000u;
|
|
+ FP16 o = { 0 };
|
|
+
|
|
+ FP32 f = *((FP32*)&fl);
|
|
+
|
|
+ uint sign = f.u & sign_mask;
|
|
+ f.u ^= sign;
|
|
+
|
|
+ if (!(f.f < f32infty.u)) // Inf or NaN
|
|
+ o.u = f.u ^ expinf.u;
|
|
+ else
|
|
+ {
|
|
+ if (f.f > f16max.f) f.f = f16max.f;
|
|
+ f.f *= magic.f;
|
|
+ }
|
|
+
|
|
+ o.u = f.u >> 13; // Take the mantissa bits
|
|
+ o.u |= sign >> 16;
|
|
+ half tmp;
|
|
+ memcpy(&tmp, &o, sizeof(half));
|
|
+ //return *((half*)&o);
|
|
+ return tmp;
|
|
+}
|
|
+
|
|
+// from half->float code - just for verification.
|
|
+static float half_to_float(half hf)
|
|
+{
|
|
+ FP16 h;
|
|
+ memcpy(&h, &hf, sizeof(half));
|
|
+
|
|
+ static const FP32 magic = { 113 << 23 };
|
|
+ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
|
|
+ FP32 o;
|
|
+
|
|
+ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
|
|
+ uint exp = shifted_exp & o.u; // just the exponent
|
|
+ o.u += (127 - 15) << 23; // exponent adjust
|
|
+
|
|
+ // handle exponent special cases
|
|
+ if (exp == shifted_exp) // Inf/NaN?
|
|
+ o.u += (128 - 16) << 23; // extra exp adjust
|
|
+ else if (exp == 0) // Zero/Denormal?
|
|
+ {
|
|
+ o.u += 1 << 23; // extra exp adjust
|
|
+ o.f -= magic.f; // renormalize
|
|
+ }
|
|
+
|
|
+ o.u |= (h.u & 0x8000) << 16; // sign bit
|
|
+ return o.f;
|
|
+}
|
|
+#endif
|
|
+
|
|
#include "magma_internal.h"
|
|
//#include "nvToolsExt.h"
|
|
|
|
@@ -106,10 +231,13 @@
|
|
float c_one = MAGMA_S_ONE;
|
|
float c_neg_one = MAGMA_S_NEG_ONE;
|
|
#if 1
|
|
+ #if CUDA_VERSION >= 9020
|
|
const magmaHalf h_one = (magmaHalf) 1.0;
|
|
const magmaHalf h_neg_one = (magmaHalf)-1.0;
|
|
- //const magmaHalf h_one = approx_float_to_half(1.0);
|
|
- //const magmaHalf h_neg_one = approx_float_to_half(-1.0);
|
|
+ #else
|
|
+ const magmaHalf h_one = approx_float_to_half(1.0);
|
|
+ const magmaHalf h_neg_one = approx_float_to_half(-1.0);
|
|
+ #endif
|
|
#else
|
|
FP32 float_one = *((FP32*)&c_one);
|
|
FP16 half_one = float_to_half_full(float_one);
|
|
diff -r 89706c0efbdb src/xshgetrf_gpu.cpp
|
|
--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
|
|
+++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
|
|
@@ -92,7 +92,7 @@
|
|
magma_mp_type_t enable_tc,
|
|
magma_mp_type_t mp_algo_type )
|
|
{
|
|
-#if CUDA_VERSION >= 7500
|
|
+#if CUDA_VERSION >= 9000
|
|
#ifdef HAVE_clBLAS
|
|
#define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda)
|
|
#define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
|
|
diff -r 89706c0efbdb testing/testing_hgemm.cpp
|
|
--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500
|
|
+++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700
|
|
@@ -22,6 +22,131 @@
|
|
#include "magma_operators.h"
|
|
#include "testings.h"
|
|
|
|
+#if CUDA_VERSION < 9020
|
|
+// conversion float to half are not defined for host in CUDA version <9.2
|
|
+// thus uses the conversion below when CUDA VERSION is < 9.2.
|
|
+#include <string.h>
|
|
+//
|
|
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
|
|
+//
|
|
+// Redistribution and use in source and binary forms, with or without
|
|
+// modification, are permitted provided that the following conditions
|
|
+// are met:
|
|
+// * Redistributions of source code must retain the above copyright
|
|
+// notice, this list of conditions and the following disclaimer.
|
|
+// * Redistributions in binary form must reproduce the above copyright
|
|
+// notice, this list of conditions and the following disclaimer in the
|
|
+// documentation and/or other materials provided with the distribution.
|
|
+// * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
+// contributors may be used to endorse or promote products derived
|
|
+// from this software without specific prior written permission.
|
|
+//
|
|
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+
|
|
+// This code modified from the public domain code here:
|
|
+// https://gist.github.com/rygorous/2156668
|
|
+// The URL above includes more robust conversion routines
|
|
+// that handle Inf and NaN correctly.
|
|
+//
|
|
+// It is recommended to use the more robust versions in production code.
|
|
+
|
|
+typedef unsigned uint;
|
|
+
|
|
+union FP32
|
|
+{
|
|
+ uint u;
|
|
+ float f;
|
|
+ struct
|
|
+ {
|
|
+ uint Mantissa : 23;
|
|
+ uint Exponent : 8;
|
|
+ uint Sign : 1;
|
|
+ };
|
|
+};
|
|
+
|
|
+union FP16
|
|
+{
|
|
+ unsigned short u;
|
|
+ struct
|
|
+ {
|
|
+ uint Mantissa : 10;
|
|
+ uint Exponent : 5;
|
|
+ uint Sign : 1;
|
|
+ };
|
|
+};
|
|
+
|
|
+// Approximate solution. This is faster but converts some sNaNs to
|
|
+// infinity and doesn't round correctly. Handle with care.
|
|
+// Approximate solution. This is faster but converts some sNaNs to
|
|
+// infinity and doesn't round correctly. Handle with care.
|
|
+static half approx_float_to_half(float fl)
|
|
+{
|
|
+ FP32 f32infty = { 255 << 23 };
|
|
+ FP32 f16max = { (127 + 16) << 23 };
|
|
+ FP32 magic = { 15 << 23 };
|
|
+ FP32 expinf = { (255 ^ 31) << 23 };
|
|
+ uint sign_mask = 0x80000000u;
|
|
+ FP16 o = { 0 };
|
|
+
|
|
+ FP32 f = *((FP32*)&fl);
|
|
+
|
|
+ uint sign = f.u & sign_mask;
|
|
+ f.u ^= sign;
|
|
+
|
|
+ if (!(f.f < f32infty.u)) // Inf or NaN
|
|
+ o.u = f.u ^ expinf.u;
|
|
+ else
|
|
+ {
|
|
+ if (f.f > f16max.f) f.f = f16max.f;
|
|
+ f.f *= magic.f;
|
|
+ }
|
|
+
|
|
+ o.u = f.u >> 13; // Take the mantissa bits
|
|
+ o.u |= sign >> 16;
|
|
+ half tmp;
|
|
+ memcpy(&tmp, &o, sizeof(half));
|
|
+ //return *((half*)&o);
|
|
+ return tmp;
|
|
+}
|
|
+
|
|
+// from half->float code - just for verification.
|
|
+static float half_to_float(half hf)
|
|
+{
|
|
+ FP16 h;
|
|
+ memcpy(&h, &hf, sizeof(half));
|
|
+
|
|
+ static const FP32 magic = { 113 << 23 };
|
|
+ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
|
|
+ FP32 o;
|
|
+
|
|
+ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
|
|
+ uint exp = shifted_exp & o.u; // just the exponent
|
|
+ o.u += (127 - 15) << 23; // exponent adjust
|
|
+
|
|
+ // handle exponent special cases
|
|
+ if (exp == shifted_exp) // Inf/NaN?
|
|
+ o.u += (128 - 16) << 23; // extra exp adjust
|
|
+ else if (exp == 0) // Zero/Denormal?
|
|
+ {
|
|
+ o.u += 1 << 23; // extra exp adjust
|
|
+ o.f -= magic.f; // renormalize
|
|
+ }
|
|
+
|
|
+ o.u |= (h.u & 0x8000) << 16; // sign bit
|
|
+ return o.f;
|
|
+}
|
|
+#endif
|
|
+
|
|
/* ////////////////////////////////////////////////////////////////////////////
|
|
-- Testing sgemm
|
|
*/
|
|
@@ -47,8 +172,13 @@
|
|
float c_neg_one = MAGMA_S_NEG_ONE;
|
|
float alpha = MAGMA_S_MAKE( 0.29, -0.86 );
|
|
float beta = MAGMA_S_MAKE( -0.48, 0.38 );
|
|
- magmaHalf h_alpha = (magmaHalf)alpha;
|
|
- magmaHalf h_beta = (magmaHalf)beta;
|
|
+ #if CUDA_VERSION >= 9020
|
|
+ const magmaHalf h_alpha = (magmaHalf) alpha;
|
|
+ const magmaHalf h_beta = (magmaHalf) beta;
|
|
+ #else
|
|
+ const magmaHalf h_alpha = approx_float_to_half(alpha);
|
|
+ const magmaHalf h_beta = approx_float_to_half(beta);
|
|
+ #endif
|
|
magma_opts opts;
|
|
opts.parse_opts( argc, argv );
|
|
|