
* Added a package for the MDAnalysis toolkit.
* Added a patch that allows Elemental to use cuBLAS internally.
* Added support for LBANN to use the new cuBLAS extension in Elemental.
* Added a proper variant for when LBANN does not want to use cuBLAS in Elemental.
* Added a package for the cnpy project and used it in the lbann package.
* Removed unnecessary comment lines.
* Removed blank lines.
* Removed the debug variant.
* Added support for libjpeg-turbo.
* Added additional variants for OpenCV features. Fixed a bug when linking in TIFF support, where libtiff used the regular JPEG library while OpenCV used libjpeg-turbo; libtiff can now use libjpeg-turbo.
* Removed the variant for getting Elemental to use the cublas variant. Updated the requirements for OpenCV to add new options.
* Fixed a flake8 error in OpenCV and added a path to find cnpy in lbann.
* Fixed a line-too-long flake8 error.
* Added a flag to specify the datatype size in lbann and fixed a flake8 error.
* Added a debug build variant using the new build_type.
* Fixed flake8 errors.
* Fixed how the debug build is pushed to Elemental.
* Fixed a bug in the Elemental package where the BLAS search flags were being overridden by the BLAS link flags. Changed how the sequential initialization variant is implemented in LBANN.
* Added support, via a variant, for explicitly using mkl or openblas. This helps work around variant forwarding problems.
* Updated package files to address pull request comments.
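The Elemental patch below adds a runtime switch, El::GemmUseGPU(min_M, min_N, min_K) and El::GemmUseCPU(), that routes sufficiently large GEMM calls to cuBLAS when the library is built with EL_USE_CUBLAS defined. A minimal sketch of how calling code might drive that switch is shown here; the 512 thresholds, the matrix sizes, and the Gaussian/Zeros helpers are illustrative choices, not part of the patch.

// Sketch only: assumes Elemental was built with the patch below and with
// EL_USE_CUBLAS defined; thresholds and sizes are illustrative values.
#include <El.hpp>

int main( int argc, char* argv[] )
{
    El::Environment env( argc, argv );

    // Send any Gemm with m, n, and k all >= 512 to cuBLAS; smaller
    // products continue to use the host BLAS.
    El::GemmUseGPU( 512, 512, 512 );

    El::Matrix<double> A, B, C;
    El::Gaussian( A, 1024, 1024 );
    El::Gaussian( B, 1024, 1024 );
    El::Zeros( C, 1024, 1024 );

    // Meets all three thresholds, so this dispatches to cublas::Gemm.
    El::Gemm( El::NORMAL, El::NORMAL, 1.0, A, B, 0.0, C );

    // Revert to the CPU path for subsequent calls.
    El::GemmUseCPU();
    return 0;
}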
diff -Naur a/include/El/blas_like/level3.hpp b/include/El/blas_like/level3.hpp
--- a/include/El/blas_like/level3.hpp 2017-06-08 07:30:43.180249917 -0700
+++ b/include/El/blas_like/level3.hpp 2017-06-08 07:35:27.325434602 -0700
@@ -31,6 +31,10 @@
}
using namespace GemmAlgorithmNS;

+void GemmUseGPU(int min_M, int min_N, int min_K);
+
+void GemmUseCPU();
+
template<typename T>
void Gemm
( Orientation orientA, Orientation orientB,
diff -Naur a/include/El/core/imports/blas.hpp b/include/El/core/imports/blas.hpp
--- a/include/El/core/imports/blas.hpp 2017-06-08 07:30:43.522016908 -0700
+++ b/include/El/core/imports/blas.hpp 2017-06-08 07:35:06.834030908 -0700
@@ -916,4 +916,63 @@
} // namespace blas
} // namespace El

+
+#if defined(EL_USE_CUBLAS)
+
+namespace El {
+
+#ifdef EL_USE_64BIT_BLAS_INTS
+typedef long long int BlasInt;
+#else
+typedef int BlasInt;
+#endif
+
+namespace cublas {
+
+// NOTE: templated routines are custom and not wrappers
+
+// Level 3 BLAS
+// ============
+template<typename T>
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const T& alpha,
+ const T* A, BlasInt ALDim,
+ const T* B, BlasInt BLDim,
+ const T& beta,
+ T* C, BlasInt CLDim );
+
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const float& alpha,
+ const float* A, BlasInt ALDim,
+ const float* B, BlasInt BLDim,
+ const float& beta,
+ float* C, BlasInt CLDim );
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const double& alpha,
+ const double* A, BlasInt ALDim,
+ const double* B, BlasInt BLDim,
+ const double& beta,
+ double* C, BlasInt CLDim );
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const scomplex& alpha,
+ const scomplex* A, BlasInt ALDim,
+ const scomplex* B, BlasInt BLDim,
+ const scomplex& beta,
+ scomplex* C, BlasInt CLDim );
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const dcomplex& alpha,
+ const dcomplex* A, BlasInt ALDim,
+ const dcomplex* B, BlasInt BLDim,
+ const dcomplex& beta,
+ dcomplex* C, BlasInt CLDim );
+
+} // namespace cublas
+} // namespace El
+#endif
+
#endif // ifndef EL_IMPORTS_BLAS_DECL_HPP
diff -Naur a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp
--- a/src/blas_like/level3/Gemm.cpp 2017-06-08 07:30:44.307096427 -0700
+++ b/src/blas_like/level3/Gemm.cpp 2017-06-08 07:34:23.062863489 -0700
@@ -16,6 +16,20 @@

namespace El {

+char gemm_cpu_gpu_switch = 'c';
+int min_M = 0, min_N = 0, min_K = 0;
+
+void GemmUseGPU(int _min_M, int _min_N, int _min_K) {
+ gemm_cpu_gpu_switch = 'g';
+ min_M = _min_M;
+ min_N = _min_N;
+ min_K = _min_K;
+}
+
+void GemmUseCPU() {
+ gemm_cpu_gpu_switch = 'c';
+}
+
template<typename T>
void Gemm
( Orientation orientA, Orientation orientB,
@@ -59,11 +73,30 @@
const Int k = ( orientA == NORMAL ? A.Width() : A.Height() );
if( k != 0 )
{
+#if defined(EL_USE_CUBLAS)
+ if (gemm_cpu_gpu_switch == 'g' &&
+ m >= min_M &&
+ n >= min_N &&
+ k >= min_K) {
+ cublas::Gemm
+ ( transA, transB, m, n, k,
+ alpha, A.LockedBuffer(), A.LDim(),
+ B.LockedBuffer(), B.LDim(),
+ beta, C.Buffer(), C.LDim() );
+ } else {
+ blas::Gemm
+ ( transA, transB, m, n, k,
+ alpha, A.LockedBuffer(), A.LDim(),
+ B.LockedBuffer(), B.LDim(),
+ beta, C.Buffer(), C.LDim() );
+ }
+#else
blas::Gemm
( transA, transB, m, n, k,
alpha, A.LockedBuffer(), A.LDim(),
B.LockedBuffer(), B.LDim(),
beta, C.Buffer(), C.LDim() );
+#endif
}
else
{
diff -Naur a/src/core/imports/blas/Gemm.hpp b/src/core/imports/blas/Gemm.hpp
--- a/src/core/imports/blas/Gemm.hpp 2017-06-08 07:30:45.090529967 -0700
+++ b/src/core/imports/blas/Gemm.hpp 2017-06-08 07:34:46.503009958 -0700
@@ -41,6 +41,12 @@

} // extern "C"

+
+#if defined(EL_USE_CUBLAS)
+#include <cublas.h>
+#include <cub/util_allocator.cuh>
+#endif
+
namespace El {
namespace blas {

@@ -515,3 +521,515 @@

} // namespace blas
} // namespace El
+
+
+#if EL_USE_CUBLAS
+
+#define USE_CUB 1
+
+namespace El {
+namespace cublas {
+
+#if USE_CUB
+cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
+#endif
+
+template<typename T>
+void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const T& alpha,
+ const T* A, BlasInt ALDim,
+ const T* B, BlasInt BLDim,
+ const T& beta,
+ T* C, BlasInt CLDim )
+{
+ // put something here
+ printf("integer version \n");
+}
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Int& alpha,
+ const Int* A, BlasInt ALDim,
+ const Int* B, BlasInt BLDim,
+ const Int& beta,
+ Int* C, BlasInt CLDim );
+#ifdef EL_HAVE_QD
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const DoubleDouble& alpha,
+ const DoubleDouble* A, BlasInt ALDim,
+ const DoubleDouble* B, BlasInt BLDim,
+ const DoubleDouble& beta,
+ DoubleDouble* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const QuadDouble& alpha,
+ const QuadDouble* A, BlasInt ALDim,
+ const QuadDouble* B, BlasInt BLDim,
+ const QuadDouble& beta,
+ QuadDouble* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<DoubleDouble>& alpha,
+ const Complex<DoubleDouble>* A, BlasInt ALDim,
+ const Complex<DoubleDouble>* B, BlasInt BLDim,
+ const Complex<DoubleDouble>& beta,
+ Complex<DoubleDouble>* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<QuadDouble>& alpha,
+ const Complex<QuadDouble>* A, BlasInt ALDim,
+ const Complex<QuadDouble>* B, BlasInt BLDim,
+ const Complex<QuadDouble>& beta,
+ Complex<QuadDouble>* C, BlasInt CLDim );
+#endif
+#ifdef EL_HAVE_QUAD
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Quad& alpha,
+ const Quad* A, BlasInt ALDim,
+ const Quad* B, BlasInt BLDim,
+ const Quad& beta,
+ Quad* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<Quad>& alpha,
+ const Complex<Quad>* A, BlasInt ALDim,
+ const Complex<Quad>* B, BlasInt BLDim,
+ const Complex<Quad>& beta,
+ Complex<Quad>* C, BlasInt CLDim );
+#endif
+#ifdef EL_HAVE_MPC
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const BigInt& alpha,
+ const BigInt* A, BlasInt ALDim,
+ const BigInt* B, BlasInt BLDim,
+ const BigInt& beta,
+ BigInt* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const BigFloat& alpha,
+ const BigFloat* A, BlasInt ALDim,
+ const BigFloat* B, BlasInt BLDim,
+ const BigFloat& beta,
+ BigFloat* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<BigFloat>& alpha,
+ const Complex<BigFloat>* A, BlasInt ALDim,
+ const Complex<BigFloat>* B, BlasInt BLDim,
+ const Complex<BigFloat>& beta,
+ Complex<BigFloat>* C, BlasInt CLDim );
+#endif
+
+void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const float& alpha,
+ const float* A, BlasInt ALDim,
+ const float* B, BlasInt BLDim,
+ const float& beta,
+ float* C, BlasInt CLDim )
+{
+ EL_DEBUG_CSE
+ EL_DEBUG_ONLY(
+ if( std::toupper(transA) == 'N' )
+ {
+ if( ALDim < Max(m,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
+ }
+ else
+ {
+ if( ALDim < Max(k,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
+ }
+
+ if( std::toupper(transB) == 'N' )
+ {
+ if( BLDim < Max(k,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
+ }
+ else
+ {
+ if( BLDim < Max(n,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
+ }
+
+ if( CLDim < Max(m,1) )
+ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
+ )
+ const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
+ const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
+
+ const mpi::Comm comm;
+ const Int commRank = mpi::Rank( comm );
+ if (commRank == 0) {
+ //printf("calling cublas Sgemm: m %d n %d k %d\n", m, n, k);
+ }
+
+ BlasInt rowA, colA, rowB, colB, rowC, colC;
+ // device memory size for A, B and C
+ BlasInt sizeA, sizeB, sizeC;
+ float *devA=NULL, *devB=NULL, *devC=NULL;
+
+ rowA = fixedTransA == 'T' ? k : m;
+ colA = fixedTransA == 'T' ? m : k;
+ rowB = fixedTransB == 'T' ? n : k;
+ colB = fixedTransB == 'T' ? k : n;
+ rowC = m;
+ colC = n;
+ sizeA = rowA * colA;
+ sizeB = rowB * colB;
+ sizeC = rowC * colC;
+
+ cublasStatus stat;
+
+#if USE_CUB
+ CubDebugExit(g_allocator.DeviceAllocate((void**)&devA,
+ sizeof(float) * (sizeA+sizeB+sizeC) ));
+#else
+ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(float), (void **) &devA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
+#endif
+
+ devB = devA + sizeA;
+ devC = devB + sizeB;
+
+ // copy matrix A, B and C to device
+ stat = cublasSetMatrix(rowA, colA, sizeof(float), A, ALDim, devA, rowA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
+
+ stat = cublasSetMatrix(rowB, colB, sizeof(float), B, BLDim, devB, rowB);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
+
+ if (beta != 0.0)
+ {
+ stat = cublasSetMatrix(rowC, colC, sizeof(float), C, CLDim, devC, rowC);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
+ }
+
+ // cublas<t>gemm
+ cublasSgemm
+ ( fixedTransA, fixedTransB, m, n, k,
+ alpha, devA, rowA, devB, rowB, beta, devC, rowC );
+
+ // copy matrix C to host
+ stat = cublasGetMatrix(rowC, colC, sizeof(float), devC, rowC, C, CLDim);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
+
+ // free
+#if USE_CUB
+ CubDebugExit(g_allocator.DeviceFree(devA));
+#else
+ cublasFree(devA);
+#endif
+ //printf("CUBLAS float done ...\n");
+}
+
+void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const double& alpha,
+ const double* A, BlasInt ALDim,
+ const double* B, BlasInt BLDim,
+ const double& beta,
+ double* C, BlasInt CLDim )
+{
+ EL_DEBUG_CSE
+ EL_DEBUG_ONLY(
+ if( std::toupper(transA) == 'N' )
+ {
+ if( ALDim < Max(m,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
+ }
+ else
+ {
+ if( ALDim < Max(k,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
+ }
+
+ if( std::toupper(transB) == 'N' )
+ {
+ if( BLDim < Max(k,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
+ }
+ else
+ {
+ if( BLDim < Max(n,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
+ }
+
+ if( CLDim < Max(m,1) )
+ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
+ )
+ const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
+ const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
+
+ const mpi::Comm comm;
+ const Int commRank = mpi::Rank( comm );
+ if (commRank == 0) {
+ //printf("calling cublas Dgemm: m %d n %d k %d\n", m, n, k);
+ }
+
+ BlasInt rowA, colA, rowB, colB, rowC, colC;
+ // device memory size for A, B and C
+ BlasInt sizeA, sizeB, sizeC;
+ double *devA=NULL, *devB=NULL, *devC=NULL;
+
+ rowA = fixedTransA == 'T' ? k : m;
+ colA = fixedTransA == 'T' ? m : k;
+ rowB = fixedTransB == 'T' ? n : k;
+ colB = fixedTransB == 'T' ? k : n;
+ rowC = m;
+ colC = n;
+ sizeA = rowA * colA;
+ sizeB = rowB * colB;
+ sizeC = rowC * colC;
+
+ cublasStatus stat;
+
+#if USE_CUB
+ CubDebugExit(g_allocator.DeviceAllocate((void**)&devA,
+ sizeof(double) * (sizeA+sizeB+sizeC) ));
+#else
+ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(double), (void **) &devA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
+#endif
+
+ devB = devA + sizeA;
+ devC = devB + sizeB;
+
+ // copy matrix A, B and C to device
+ stat = cublasSetMatrix(rowA, colA, sizeof(double), A, ALDim, devA, rowA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
+
+ stat = cublasSetMatrix(rowB, colB, sizeof(double), B, BLDim, devB, rowB);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
+
+ if (beta != 0.0)
+ {
+ stat = cublasSetMatrix(rowC, colC, sizeof(double), C, CLDim, devC, rowC);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
+ }
+
+ // cublas<t>gemm
+ cublasDgemm
+ ( fixedTransA, fixedTransB, m, n, k,
+ alpha, devA, rowA, devB, rowB, beta, devC, rowC );
+
+ // copy matrix C to host
+ stat = cublasGetMatrix(rowC, colC, sizeof(double), devC, rowC, C, CLDim);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
+
+ // free
+#if USE_CUB
+ CubDebugExit(g_allocator.DeviceFree(devA));
+#else
+ cublasFree(devA);
+#endif
+ //printf("CUBLAS double done ...\n");
+}
+
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const scomplex& alpha,
+ const scomplex* A, BlasInt ALDim,
+ const scomplex* B, BlasInt BLDim,
+ const scomplex& beta,
+ scomplex* C, BlasInt CLDim )
+{
+ EL_DEBUG_CSE
+ EL_DEBUG_ONLY(
+ if( std::toupper(transA) == 'N' )
+ {
+ if( ALDim < Max(m,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
+ }
+ else
+ {
+ if( ALDim < Max(k,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
+ }
+
+ if( std::toupper(transB) == 'N' )
+ {
+ if( BLDim < Max(k,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
+ }
+ else
+ {
+ if( BLDim < Max(n,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
+ }
+
+ if( CLDim < Max(m,1) )
+ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
+ )
+
+ const char fixedTransA = transA;
+ const char fixedTransB = transB;
+
+ const mpi::Comm comm;
+ const Int commRank = mpi::Rank( comm );
+ if (commRank == 0) {
+ //printf("calling cublas Cgemm: m %d n %d k %d\n", m, n, k);
+ }
+
+ BlasInt rowA, colA, rowB, colB, rowC, colC;
+ // device memory size for A, B and C
+ BlasInt sizeA, sizeB, sizeC;
+ cuComplex *devA=NULL, *devB=NULL, *devC=NULL;
+
+ rowA = fixedTransA == 'T' ? k : m;
+ colA = fixedTransA == 'T' ? m : k;
+ rowB = fixedTransB == 'T' ? n : k;
+ colB = fixedTransB == 'T' ? k : n;
+ rowC = m;
+ colC = n;
+ sizeA = rowA * colA;
+ sizeB = rowB * colB;
+ sizeC = rowC * colC;
+
+ cublasStatus stat;
+ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuComplex), (void **) &devA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
+
+ devB = devA + sizeA;
+ devC = devB + sizeB;
+
+ // copy matrix A, B and C to device
+ stat = cublasSetMatrix(rowA, colA, sizeof(cuComplex), A, ALDim, devA, rowA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
+
+ stat = cublasSetMatrix(rowB, colB, sizeof(cuComplex), B, BLDim, devB, rowB);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
+
+ if (beta.real() != 0.0 || beta.imag() != 0.0)
+ {
+ stat = cublasSetMatrix(rowC, colC, sizeof(cuComplex), C, CLDim, devC, rowC);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
+ }
+
+ // cublas<t>gemm
+ cublasCgemm
+ ( fixedTransA, fixedTransB, m, n, k,
+ *((cuComplex*) &alpha), devA, rowA, devB, rowB, *((cuComplex*) &beta), devC, rowC );
+
+ // copy matrix C to host
+ stat = cublasGetMatrix(rowC, colC, sizeof(cuComplex), devC, rowC, C, CLDim);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
+
+ // free
+ cublasFree(devA);
+}
+
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const dcomplex& alpha,
+ const dcomplex* A, BlasInt ALDim,
+ const dcomplex* B, BlasInt BLDim,
+ const dcomplex& beta,
+ dcomplex* C, BlasInt CLDim )
+{
+ EL_DEBUG_CSE
+ EL_DEBUG_ONLY(
+ if( std::toupper(transA) == 'N' )
+ {
+ if( ALDim < Max(m,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
+ }
+ else
+ {
+ if( ALDim < Max(k,1) )
+ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
+ }
+
+ if( std::toupper(transB) == 'N' )
+ {
+ if( BLDim < Max(k,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
+ }
+ else
+ {
+ if( BLDim < Max(n,1) )
+ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
+ }
+
+ if( CLDim < Max(m,1) )
+ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
+ )
+
+ const char fixedTransA = transA;
+ const char fixedTransB = transB;
+
+ const mpi::Comm comm;
+ const Int commRank = mpi::Rank( comm );
+ if (commRank == 0) {
+ //printf("calling cublas Zgemm: m %d n %d k %d\n", m, n, k);
+ }
+
+ BlasInt rowA, colA, rowB, colB, rowC, colC;
+ // device memory size for A, B and C
+ BlasInt sizeA, sizeB, sizeC;
+ cuDoubleComplex *devA=NULL, *devB=NULL, *devC=NULL;
+
+ rowA = fixedTransA == 'T' ? k : m;
+ colA = fixedTransA == 'T' ? m : k;
+ rowB = fixedTransB == 'T' ? n : k;
+ colB = fixedTransB == 'T' ? k : n;
+ rowC = m;
+ colC = n;
+ sizeA = rowA * colA;
+ sizeB = rowB * colB;
+ sizeC = rowC * colC;
+
+ cublasStatus stat;
+ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuDoubleComplex), (void **) &devA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
+
+ devB = devA + sizeA;
+ devC = devB + sizeB;
+
+ // copy matrix A, B and C to device
+ stat = cublasSetMatrix(rowA, colA, sizeof(cuDoubleComplex), A, ALDim, devA, rowA);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
+
+ stat = cublasSetMatrix(rowB, colB, sizeof(cuDoubleComplex), B, BLDim, devB, rowB);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
+
+ if (beta.real() != 0.0 || beta.imag() != 0.0)
+ {
+ stat = cublasSetMatrix(rowC, colC, sizeof(cuDoubleComplex), C, CLDim, devC, rowC);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
+ }
+
+ cublasZgemm
+ ( fixedTransA, fixedTransB, m, n, k,
+ *((cuDoubleComplex*) &alpha), devA, rowA, devB, rowB, *((cuDoubleComplex*) &beta),
+ devC, rowC );
+
+ // copy matrix C to host
+ stat = cublasGetMatrix(rowC, colC, sizeof(cuDoubleComplex), devC, rowC, C, CLDim);
+ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
+
+ // free
+ cublasFree(devA);
+}
+
+} // namespace cublas
+} // namespace El
+
+#endif
+