random123: Add support for HIP/rocm. (#46284)

This commit is contained in:
Kelly (KT) Thompson 2024-09-24 08:29:16 -06:00 committed by GitHub
parent 0b575f60a5
commit d5b8b0600a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 299 additions and 0 deletions

View File

@ -16,6 +16,8 @@ class Random123(Package):
homepage = "https://www.deshawresearch.com/resources_random123.html"
url = "https://github.com/DEShawResearch/random123/archive/refs/tags/v1.14.0.tar.gz"
maintainers("KineticTheory")
version("1.14.0", sha256="effafd8656b18030b2a5b995cd3650c51a7c45052e6e1c21e48b9fa7a59d926e")
version(
"1.13.2",
@ -39,6 +41,7 @@ class Random123(Package):
patch("ibmxl.patch", when="@1.09")
patch("arm-gcc.patch", when="@1.09")
patch("v1132-xl161.patch", when="@1.13.2")
patch("v1140-hip.patch", when="@1.14.0")
def install(self, spec, prefix):
# Random123 doesn't have a build system.

View File

@ -0,0 +1,296 @@
warning: refname 'v1.14.0' is ambiguous.
diff --git a/include/Random123/array.h b/include/Random123/array.h
index 8076f23..06650ec 100644
--- a/include/Random123/array.h
+++ b/include/Random123/array.h
@@ -81,7 +81,7 @@ inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
/** @endcond */
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
/* CUDA can't handle std::reverse_iterator. We *could* implement it
ourselves, but let's not bother until somebody really feels a need
to reverse-iterate through an r123array */
@@ -114,8 +114,8 @@ inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
enum {static_size = _N}; \
R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \
R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \
- R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
- R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+ R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) {R123_THROW(std::out_of_range("array index out of range"));}; return (*this)[i]; } \
+ R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) {R123_THROW(std::out_of_range("array index out of range"));}; return (*this)[i]; } \
R123_CUDA_DEVICE size_type size() const { return _N; } \
R123_CUDA_DEVICE size_type max_size() const { return _N; } \
R123_CUDA_DEVICE bool empty() const { return _N==0; }; \
diff --git a/include/Random123/boxmuller.hpp b/include/Random123/boxmuller.hpp
index 9c91cf8..16d91f9 100644
--- a/include/Random123/boxmuller.hpp
+++ b/include/Random123/boxmuller.hpp
@@ -49,7 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// r123::float2 r123::boxmuller(uint32_t u0, uint32_t u1);
// r123::double2 r123::boxmuller(uint64_t u0, uint64_t u1);
-//
+//
// float2 and double2 are identical to their synonymous global-
// namespace structures in CUDA.
//
@@ -68,7 +68,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
namespace r123{
-#if !defined(__CUDACC__)
+#if !(defined(__CUDACC__) || defined(__HIPCC__))
typedef struct { float x, y; } float2;
typedef struct { double x, y; } double2;
#else
diff --git a/include/Random123/features/compilerfeatures.h b/include/Random123/features/compilerfeatures.h
index 0606dee..9ad3f82 100644
--- a/include/Random123/features/compilerfeatures.h
+++ b/include/Random123/features/compilerfeatures.h
@@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The Random123 library is portable across C, C++, CUDA, OpenCL environments,
and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
This level of portability requires the abstraction of some features
-and idioms that are either not standardized (e.g., asm statments), or for which
+and idioms that are either not standardized (e.g., asm statments), or for which
different vendors have their own standards (e.g., SSE intrinsics) or for
which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
@@ -55,7 +55,7 @@ Most of the symbols are boolean valued. In general, they will
Library users can override any value by defining the pp-symbol with a compiler option,
e.g.,
- cc -DR123_USE_MULHILO64_C99
+ cc -DR123_USE_MULHILO64_C99
will use a strictly c99 version of the full-width 64x64->128-bit multiplication
function, even if it would be disabled by default.
@@ -84,8 +84,8 @@ All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.
CXX11_EXPLICIT_CONVERSIONS
CXX11_LONG_LONG
CXX11_STD_ARRAY
- CXX11
-
+ CXX11
+
X86INTRIN_H
IA32INTRIN_H
XMMINTRIN_H
@@ -102,7 +102,7 @@ All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.
MULHILO64_C99
U01_DOUBLE
-
+
@endverbatim
Most have obvious meanings. Some non-obvious ones:
@@ -141,11 +141,11 @@ There are also non-boolean valued symbols:
<ul>
<li>R123_STATIC_INLINE -
According to both C99 and GNU99, the 'static inline' declaration allows
- the compiler to not emit code if the function is not used.
+ the compiler to not emit code if the function is not used.
Note that the semantics of 'inline', 'static' and 'extern' in
gcc have changed over time and are subject to modification by
command line options, e.g., -std=gnu89, -fgnu-inline.
- Nevertheless, it appears that the meaning of 'static inline'
+ Nevertheless, it appears that the meaning of 'static inline'
has not changed over time and (with a little luck) the use of 'static inline'
here will be portable between versions of gcc and to other C99
compilers.
@@ -157,7 +157,7 @@ There are also non-boolean valued symbols:
embellishments to strongly encourage that the declared function be
inlined. If there is no such compiler-specific magic, it should
expand to decl, unadorned.
-
+
<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
sufficiently similar semantics) when CUDA is in use, and expands
to nothing in other cases.
@@ -192,7 +192,7 @@ There are also non-boolean valued symbols:
\cond HIDDEN_FROM_DOXYGEN
*/
-/*
+/*
N.B. When something is added to the list of features, it should be
added to each of the *features.h files, AND to examples/ut_features.cpp.
*/
@@ -204,6 +204,8 @@ added to each of the *features.h files, AND to examples/ut_features.cpp.
#include "openclfeatures.h"
#elif defined(__CUDACC__)
#include "nvccfeatures.h"
+#elif defined(__HIPCC__)
+#include "hipfeatures.h"
#elif defined(__ICC)
#include "iccfeatures.h"
#elif defined(__xlC__) || defined(__ibmxl__)
@@ -292,7 +294,7 @@ added to each of the *features.h files, AND to examples/ut_features.cpp.
#ifndef R123_USE_64BIT
#define R123_USE_64BIT 1
-#endif
+#endif
#ifndef R123_USE_PHILOX_64BIT
#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN))
@@ -327,7 +329,7 @@ added to each of the *features.h files, AND to examples/ut_features.cpp.
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
#define R123_METAL_CONSTANT_ADDRESS_SPACE
#endif
-
+
/*
* Windows.h (and perhaps other "well-meaning" code define min and
* max, so there's a high chance that our definition of min, max
diff --git a/include/Random123/features/hipfeatures.h b/include/Random123/features/hipfeatures.h
new file mode 100644
index 0000000..f3ac0ed
--- /dev/null
+++ b/include/Random123/features/hipfeatures.h
@@ -0,0 +1,129 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __r123_hip_features_dot_h__
+#define __r123_hip_features_dot_h__
+
+#if !(defined(CUDART_VERSION) || defined(HIP_INCLUDE_HIP_HIP_RUNTIME_API_H))
+#error "why are we in hipfeatures.h if neither CUDART_VERSION NOR HIP_PLATFORM?"
+#endif
+
+#if CUDART_VERSION < 4010 && !defined(HIP_INCLUDE_HIP_HIP_RUNTIME_API_H)
+#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 is unsupported. See comments in nvccfeatures.h"
+// This test was added in Random123-1.08 (August, 2013) because we
+// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
+// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
+// earlier. We can't be sure this bug doesn't also affect invocations
+// of other templated functions, e.g., essentially all of Random123.
+// Thus, we no longer trust CUDA versions earlier than 4.1 even though
+// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
+// If you feel lucky or desperate, you can change #error to #warning, but
+// please take extra care to be sure that you are getting correct
+// results.
+#endif
+
+// nvcc falls through to gcc or msvc. So first define
+// a couple of things and then include either gccfeatures.h
+// or msvcfeatures.h
+
+//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
+//for both device and host functions in CUDA by setting compiler flags
+//for the device function
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE __host__ __device__
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 1
+#endif
+
+#ifndef R123_THROW
+// No exceptions in CUDA, at least upto 4.0
+#define R123_THROW(x) R123_ASSERT(0)
+#endif
+
+#ifndef R123_ASSERT
+# if defined(__CUDA_ARCH__)
+# define R123_ASSERT(x) if((x)); else asm("trap;")
+# elif defined(__HIP_DEVICE_COMPILE__)
+# define R123_ASSERT(x) if((x)); else asm("s_trap 2;")
+# endif
+#endif
+
+#else // ! ( defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) )
+// If we're using nvcc not compiling for the CUDA architecture,
+// then we must be compiling for the host. In that case,
+// tell the philox code to use the mulhilo64 asm because
+// nvcc doesn't grok uint128_t.
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#endif // __CUDA_ARCH__
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_ULONG_LONG
+// uint64_t, which is what we'd get without this, is
+// not the same as unsigned long long
+#define R123_ULONG_LONG unsigned long long
+#endif
+
+#if defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#endif
+
+#endif
diff --git a/include/Random123/uniform.hpp b/include/Random123/uniform.hpp
index ee4ddfb..d40d0a4 100644
--- a/include/Random123/uniform.hpp
+++ b/include/Random123/uniform.hpp
@@ -125,7 +125,7 @@ R123_MK_SIGNED_UNSIGNED(__int128_t, __uint128_t);
#undef R123_MK_SIGNED_UNSIGNED
#endif
-#if defined(__CUDACC__) || defined(_LIBCPP_HAS_NO_CONSTEXPR)
+#if defined(__CUDACC__) || defined(_LIBCPP_HAS_NO_CONSTEXPR) || defined(__HIPCC__)
// Amazing! cuda thinks numeric_limits::max() is a __host__ function, so
// we can't use it in a device function.
//