spack/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
Veselin Dobrev 49334f006d [WIP] CEED 2.0 (#10903)
* Initial commit for v2.0 of the CEED software suite.

* Update Nek packages and gslib

* Help spack concretize the hypre version for ceed-2.0.

* Fix nekcem install error

* Add support for gfortran v8 in nek5000 and nekcem.

* Split Nek5000 into Nek5000 and Nektools

* Get Nektools to build fine on Theta

* Fix travis failure: remove unused 'import numbers' from nek5000.

* Check for gfortran if it is wrapped

* Tweak the detection of gfortran in nek5000.

* Fix Nek packages to add -std=legacy when FC=gcc

* spack install ceed~petsc works fine on Theta

* Fix flake8 errors

* Fix more flake8 tests

* Fix an import issue

* Tweak the suite-sparse package to avoid interaction with existing system
installations of suite-sparse.

* petsc: update superlu-dist dependency

* Updates in the packages: occa, libceed, and ceed.

* In the libceed package, explicitly tell nvcc which host compiler to use.

* Fix python formatting.

* Simplify the test for gfortran in nek* packages.

* ceed: 2.0 uses petsc@3.11.0

* hpgmg: add v0.4 and use it from ceed@2.0.0

* Update the hypre dependency for ceed 2.0.

* Disable the superlu-dist dependency (through hypre) when using a
quickbuild of ceed 2.0.

* petsc-3.11.0: add xlf fix

* nekcem: has a build dependency on Python 2.7+

* hpgmg: better setting of compiler options and use python for configure

* libceed: use v0.4 tag

* libceed: fix 0.4 release oops (pkgconfig version)

* Add a patch for magma-2.5.0 that brings it up to the current 'master'.

* In the mfem package, install the examples, miniapps, and data under
$prefix/share/mfem.

* In the magma package, apply a patch to v2.5.0 that disables
magma_sparse - for testing purposes.

* In the magma package, link the 'magma' library with the
'nvToolsExt' library.

* In the magma package, update the 'magma-2.5.0.patch' with the latest
commits from the magma source repository. Also, remove the 'nvToolsExt'
library from the 'magma-2.5.0-cmake.patch', since it is no longer
needed there.

* In the magma package, disable OpenMP when using v2.5.0 with the
IBM XL compiler.

* In the mfem package, add a version for the 'laghos-v2.0' tag; also,
prefix the `laghos-v*` versions with their respective development
version numbers, so that they are ordered correctly within Spack
relative to the official numbered versions.

* petsc: add version 3.11.1 (#11179)


(cherry picked from commit 1eab6e3c86)

* ceed-2.0: use petsc-3.11.1

* this-is-so-dumb.f -> empty.f
2019-04-17 17:37:41 -07:00
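
The magma entries in the log above describe version-conditional patching and
compiler-conditional build flags on the Spack side. As a rough sketch of how
that logic is expressed in a Spack package (the checksum placeholder and the
USE_OPENMP option name are assumptions for illustration, not the actual
builtin recipe):

# Minimal sketch, assuming a CMake-based magma package; not the real recipe.
from spack import *

class Magma(CMakePackage):
    """Dense linear algebra on GPUs (illustrative sketch only)."""

    homepage = "https://icl.utk.edu/magma/"

    # Placeholder checksum; the real package pins the actual tarball hash.
    version("2.5.0", sha256="0" * 64)

    # Bring v2.5.0 up to the current 'master': this is the patch shown below.
    patch("magma-2.5.0.patch", when="@2.5.0")

    def cmake_args(self):
        args = []
        # "Disable OpenMP when using v2.5.0 with the IBM XL compiler."
        if self.spec.satisfies("@2.5.0") and self.compiler.name in ("xl", "xl_r"):
            args.append("-DUSE_OPENMP=OFF")
        else:
            args.append("-DUSE_OPENMP=ON")
        return args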

diff -r 89706c0efbdb .hgtags
--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500
+++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700
@@ -1,3 +1,4 @@
9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
+89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
diff -r 89706c0efbdb include/magma_types.h
--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500
+++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700
@@ -77,7 +77,7 @@
typedef magma_int_t magma_device_t;
// Half precision in CUDA
- #if defined(__cplusplus) && CUDA_VERSION > 7500
+ #if defined(__cplusplus) && CUDA_VERSION >= 7500
#include <cuda_fp16.h>
typedef __half magmaHalf;
#else
diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500
+++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700
@@ -15,9 +15,12 @@
#define PRECISION_z
+
namespace magma_sampleselect {
-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
+__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
+{
+#if (__CUDA_ARCH__ >= 350)
auto idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx >= size) {
return;
@@ -25,6 +28,7 @@
auto v = in[idx];
out[idx] = real(v) * real(v) + imag(v) * imag(v);
+#endif
}
} // namespace magma_sampleselect
@@ -164,36 +168,43 @@
magma_queue_t queue )
{
magma_int_t info = 0;
+ magma_int_t arch = magma_getdevice_arch();
- auto num_blocks = magma_ceildiv(total_size, block_size);
- auto local_work = (total_size + num_threads - 1) / num_threads;
- auto required_size = sizeof(double) * (total_size + searchtree_size)
+ if( arch >= 350 ) {
+ auto num_blocks = magma_ceildiv(total_size, block_size);
+ auto local_work = (total_size + num_threads - 1) / num_threads;
+ auto required_size = sizeof(double) * (total_size + searchtree_size)
+ sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
- auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
+ auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
- double* gputmp = (double*)*tmp_ptr;
- double* gputree = gputmp + total_size;
- uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
- int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
- int32_t* gpucounts = gpurankout + 1;
- int32_t* gpulocalcounts = gpucounts + searchtree_width;
- uint32_t bucketidx{};
+ double* gputmp = (double*)*tmp_ptr;
+ double* gputree = gputmp + total_size;
+ uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
+ int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
+ int32_t* gpucounts = gpurankout + 1;
+ int32_t* gpulocalcounts = gpucounts + searchtree_width;
+ uint32_t bucketidx{};
- CHECK(realloc_result);
+ CHECK(realloc_result);
- compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
- (val, gputmp, total_size);
- build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
- (gputmp, gputree, total_size);
- count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
- (gputmp, gputree, gpulocalcounts, total_size, local_work);
- reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
- (gpulocalcounts, gpucounts, num_grouped_blocks);
- sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
- (gpucounts, subset_size, gpubucketidx, gpurankout);
- magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
- magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
- *thrs = std::sqrt(*thrs);
+ compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
+ (val, gputmp, total_size);
+ build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
+ (gputmp, gputree, total_size);
+ count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
+ (gputmp, gputree, gpulocalcounts, total_size, local_work);
+ reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
+ (gpulocalcounts, gpucounts, num_grouped_blocks);
+ sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
+ (gpucounts, subset_size, gpubucketidx, gpurankout);
+ magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
+ magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
+ *thrs = std::sqrt(*thrs);
+ }
+ else {
+ printf("error: this functionality needs CUDA architecture >= 3.5\n");
+ info = MAGMA_ERR_NOT_SUPPORTED;
+ }
cleanup:
return info;
diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
+++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
@@ -16,6 +16,129 @@
#include <cuda_fp16.h>
#endif
+#if CUDA_VERSION < 9020
+// Conversions from float to half are not defined for the host in CUDA versions < 9.2,
+// so we use the conversions below when CUDA_VERSION is < 9.2.
+#include <string.h>
+//
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This code modified from the public domain code here:
+// https://gist.github.com/rygorous/2156668
+// The URL above includes more robust conversion routines
+// that handle Inf and NaN correctly.
+//
+// It is recommended to use the more robust versions in production code.
+
+typedef unsigned uint;
+
+union FP32
+{
+ uint u;
+ float f;
+ struct
+ {
+ uint Mantissa : 23;
+ uint Exponent : 8;
+ uint Sign : 1;
+ };
+};
+
+union FP16
+{
+ unsigned short u;
+ struct
+ {
+ uint Mantissa : 10;
+ uint Exponent : 5;
+ uint Sign : 1;
+ };
+};
+
+// Approximate solution. This is faster but converts some sNaNs to
+// infinity and doesn't round correctly. Handle with care.
+static half approx_float_to_half(float fl)
+{
+ FP32 f32infty = { 255 << 23 };
+ FP32 f16max = { (127 + 16) << 23 };
+ FP32 magic = { 15 << 23 };
+ FP32 expinf = { (255 ^ 31) << 23 };
+ uint sign_mask = 0x80000000u;
+ FP16 o = { 0 };
+
+ FP32 f = *((FP32*)&fl);
+
+ uint sign = f.u & sign_mask;
+ f.u ^= sign;
+
+ if (!(f.f < f32infty.u)) // Inf or NaN
+ o.u = f.u ^ expinf.u;
+ else
+ {
+ if (f.f > f16max.f) f.f = f16max.f;
+ f.f *= magic.f;
+ }
+
+ o.u = f.u >> 13; // Take the mantissa bits
+ o.u |= sign >> 16;
+ half tmp;
+ memcpy(&tmp, &o, sizeof(half));
+ //return *((half*)&o);
+ return tmp;
+}
+
+// from half->float code - just for verification.
+static float half_to_float(half hf)
+{
+ FP16 h;
+ memcpy(&h, &hf, sizeof(half));
+
+ static const FP32 magic = { 113 << 23 };
+ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
+ FP32 o;
+
+ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
+ uint exp = shifted_exp & o.u; // just the exponent
+ o.u += (127 - 15) << 23; // exponent adjust
+
+ // handle exponent special cases
+ if (exp == shifted_exp) // Inf/NaN?
+ o.u += (128 - 16) << 23; // extra exp adjust
+ else if (exp == 0) // Zero/Denormal?
+ {
+ o.u += 1 << 23; // extra exp adjust
+ o.f -= magic.f; // renormalize
+ }
+
+ o.u |= (h.u & 0x8000) << 16; // sign bit
+ return o.f;
+}
+#endif
+
#include "magma_internal.h"
//#include "nvToolsExt.h"
@@ -106,10 +231,13 @@
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
#if 1
+ #if CUDA_VERSION >= 9020
const magmaHalf h_one = (magmaHalf) 1.0;
const magmaHalf h_neg_one = (magmaHalf)-1.0;
- //const magmaHalf h_one = approx_float_to_half(1.0);
- //const magmaHalf h_neg_one = approx_float_to_half(-1.0);
+ #else
+ const magmaHalf h_one = approx_float_to_half(1.0);
+ const magmaHalf h_neg_one = approx_float_to_half(-1.0);
+ #endif
#else
FP32 float_one = *((FP32*)&c_one);
FP16 half_one = float_to_half_full(float_one);
diff -r 89706c0efbdb src/xshgetrf_gpu.cpp
--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
+++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
@@ -92,7 +92,7 @@
magma_mp_type_t enable_tc,
magma_mp_type_t mp_algo_type )
{
-#if CUDA_VERSION >= 7500
+#if CUDA_VERSION >= 9000
#ifdef HAVE_clBLAS
#define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda)
#define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
diff -r 89706c0efbdb testing/testing_hgemm.cpp
--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500
+++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700
@@ -22,6 +22,129 @@
#include "magma_operators.h"
#include "testings.h"
+#if CUDA_VERSION < 9020
+// Conversions from float to half are not defined for the host in CUDA versions < 9.2,
+// so we use the conversions below when CUDA_VERSION is < 9.2.
+#include <string.h>
+//
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This code modified from the public domain code here:
+// https://gist.github.com/rygorous/2156668
+// The URL above includes more robust conversion routines
+// that handle Inf and NaN correctly.
+//
+// It is recommended to use the more robust versions in production code.
+
+typedef unsigned uint;
+
+union FP32
+{
+ uint u;
+ float f;
+ struct
+ {
+ uint Mantissa : 23;
+ uint Exponent : 8;
+ uint Sign : 1;
+ };
+};
+
+union FP16
+{
+ unsigned short u;
+ struct
+ {
+ uint Mantissa : 10;
+ uint Exponent : 5;
+ uint Sign : 1;
+ };
+};
+
+// Approximate solution. This is faster but converts some sNaNs to
+// infinity and doesn't round correctly. Handle with care.
+static half approx_float_to_half(float fl)
+{
+ FP32 f32infty = { 255 << 23 };
+ FP32 f16max = { (127 + 16) << 23 };
+ FP32 magic = { 15 << 23 };
+ FP32 expinf = { (255 ^ 31) << 23 };
+ uint sign_mask = 0x80000000u;
+ FP16 o = { 0 };
+
+ FP32 f = *((FP32*)&fl);
+
+ uint sign = f.u & sign_mask;
+ f.u ^= sign;
+
+ if (!(f.f < f32infty.u)) // Inf or NaN
+ o.u = f.u ^ expinf.u;
+ else
+ {
+ if (f.f > f16max.f) f.f = f16max.f;
+ f.f *= magic.f;
+ }
+
+ o.u = f.u >> 13; // Take the mantissa bits
+ o.u |= sign >> 16;
+ half tmp;
+ memcpy(&tmp, &o, sizeof(half));
+ //return *((half*)&o);
+ return tmp;
+}
+
+// from half->float code - just for verification.
+static float half_to_float(half hf)
+{
+ FP16 h;
+ memcpy(&h, &hf, sizeof(half));
+
+ static const FP32 magic = { 113 << 23 };
+ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
+ FP32 o;
+
+ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
+ uint exp = shifted_exp & o.u; // just the exponent
+ o.u += (127 - 15) << 23; // exponent adjust
+
+ // handle exponent special cases
+ if (exp == shifted_exp) // Inf/NaN?
+ o.u += (128 - 16) << 23; // extra exp adjust
+ else if (exp == 0) // Zero/Denormal?
+ {
+ o.u += 1 << 23; // extra exp adjust
+ o.f -= magic.f; // renormalize
+ }
+
+ o.u |= (h.u & 0x8000) << 16; // sign bit
+ return o.f;
+}
+#endif
+
/* ////////////////////////////////////////////////////////////////////////////
-- Testing sgemm
*/
@@ -47,8 +172,13 @@
float c_neg_one = MAGMA_S_NEG_ONE;
float alpha = MAGMA_S_MAKE( 0.29, -0.86 );
float beta = MAGMA_S_MAKE( -0.48, 0.38 );
- magmaHalf h_alpha = (magmaHalf)alpha;
- magmaHalf h_beta = (magmaHalf)beta;
+ #if CUDA_VERSION >= 9020
+ const magmaHalf h_alpha = (magmaHalf) alpha;
+ const magmaHalf h_beta = (magmaHalf) beta;
+ #else
+ const magmaHalf h_alpha = approx_float_to_half(alpha);
+ const magmaHalf h_beta = approx_float_to_half(beta);
+ #endif
magma_opts opts;
opts.parse_opts( argc, argv );
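
The float/half fallback added above is only exercised for constants such as
+/-1.0, alpha, and beta, so its loose rounding and NaN behavior is harmless
here. To illustrate what the bit manipulation computes, here is a standalone
Python re-implementation of the same algorithm for ordinary finite values (a
sketch for verification only, not MAGMA code; the original's Inf/NaN special
case is replaced by a simple clamp). Since the product of two float32 values
is exact in double precision, packing the Python result back to float32
reproduces the C float arithmetic.

# Standalone sketch of the patch's float<->half fallback (illustration only).
import struct

def _f2u(x):
    # Bit pattern of x as an IEEE-754 binary32 value.
    return struct.unpack("<I", struct.pack("<f", x))[0]

def _u2f(u):
    # IEEE-754 binary32 value with bit pattern u.
    return struct.unpack("<f", struct.pack("<I", u & 0xFFFFFFFF))[0]

def approx_float_to_half(x):
    # Fast float32 -> binary16 bits; does not round correctly, so handle
    # with care. Fine for constants such as +/-1.0.
    f16max = _u2f((127 + 16) << 23)  # largest magnitude with a finite half
    magic = _u2f(15 << 23)           # 2**-112, rebiases exponent 127 -> 15
    u = _f2u(x)
    sign = u & 0x80000000
    f = _u2f(u ^ sign)
    if f > f16max:                   # clamp; overflow becomes infinity
        f = f16max
    f *= magic
    return (_f2u(f) >> 13) | (sign >> 16)  # drop 13 mantissa bits, add sign

def half_to_float(h):
    # Inverse direction, mirroring the patch's half_to_float.
    magic = _u2f(113 << 23)
    o = (h & 0x7FFF) << 13           # move exponent/mantissa into place
    exp = o & (0x7C00 << 13)         # isolate the exponent
    o += (127 - 15) << 23            # exponent rebias
    if exp == (0x7C00 << 13):        # Inf/NaN needs an extra adjustment
        o += (128 - 16) << 23
    elif exp == 0:                   # zero/denormal: renormalize
        o += 1 << 23
        o = _f2u(_u2f(o) - magic)
    o |= (h & 0x8000) << 16          # restore the sign bit
    return _u2f(o)

assert approx_float_to_half(1.0) == 0x3C00   # half-precision 1.0
assert approx_float_to_half(-1.0) == 0xBC00  # half-precision -1.0
assert half_to_float(0x3C00) == 1.0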